00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby.h"
00013 #include "re.h"
00014 #include <ctype.h>
00015
/* Exception class passed to rb_raise() by rb_reg_raise() when the error is
 * not reported as a compile error. */
00016 static VALUE rb_eRegexpError;
00017
/* Shorthands for a register's start/end offset.  They expand to a member
 * access on something named `regs`, so they are used either with a local
 * `regs` pointer or as RMATCH(match)->BEG(n). */
00018 #define BEG(no) regs->beg[no]
00019 #define END(no) regs->end[no]
00020
#if 'a' == 97   /* the table below assumes an ASCII-compatible character set */
/*
 * Case-folding table indexed by byte value (0..255).  'A'..'Z' map to
 * 'a'..'z'; every other byte maps to itself.
 */
static const char casetable[] = {
    '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
    '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
    '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
    '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
    /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
    '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
    /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
    '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
    /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
    '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
    /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
    '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
    /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
    '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
    /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
    '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
    /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
    '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
    /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
    '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
    /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
    '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
    /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
    '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
    /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
    '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
    /* 'x'     'y'     'z'     '{'     '|'     '}'     '~'  */
    '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
    '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
    '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
    '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
    '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
    '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
    '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
    '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
    '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
    '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
    '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
    '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
    '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
    '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
    '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
    '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
    '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#else
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif

/*
 * Compares the first len bytes of p1 and p2 after folding each byte
 * through casetable (so 'A'..'Z' compare equal to 'a'..'z').
 *
 * Returns 0 when all len folded bytes are equal, otherwise the difference
 * of the first pair of folded table entries that differ.
 */
int
rb_memcicmp(p1, p2, len)
    char *p1, *p2;
    long len;
{
    int tmp;

    while (len--) {
        /* Index through unsigned char: where plain char is signed, a byte
         * >= 0x80 would otherwise convert to a huge unsigned value and
         * read far outside the 256-entry table. */
        tmp = casetable[(unsigned char)*p1++] - casetable[(unsigned char)*p2++];
        if (tmp)
            return tmp;
    }
    return 0;
}
00085
00086 int
00087 rb_memcmp(p1, p2, len)
00088 char *p1, *p2;
00089 long len;
00090 {
00091 if (!ruby_ignorecase) {
00092 return memcmp(p1, p2, len);
00093 }
00094 return rb_memcicmp(p1, p2, len);
00095 }
00096
00097 long
00098 rb_memsearch(x0, m, y0, n)
00099 char *x0, *y0;
00100 long m, n;
00101 {
00102 unsigned char *x = (unsigned char *)x0, *y = (unsigned char *)y0;
00103 unsigned char *s, *e;
00104 long i;
00105 int d;
00106 unsigned long hx, hy;
00107
00108 #define KR_REHASH(a, b, h) (((h) << 1) - (((unsigned long)(a))<<d) + (b))
00109
00110 if (m > n) return -1;
00111 s = y; e = s + n - m;
00112
00113
00114
00115
00116 d = sizeof(hx) * CHAR_BIT - 1;
00117 if (d > m) d = m;
00118
00119 if (ruby_ignorecase) {
00120 if (n == m) {
00121 return rb_memcicmp(x, s, m) == 0 ? 0 : -1;
00122 }
00123
00124 for (hy = hx = i = 0; i < d; ++i) {
00125 hx = KR_REHASH(0, casetable[x[i]], hx);
00126 hy = KR_REHASH(0, casetable[s[i]], hy);
00127 }
00128
00129 while (hx != hy || rb_memcicmp(x, s, m)) {
00130 if (s >= e) return -1;
00131 hy = KR_REHASH(casetable[*s], casetable[*(s+d)], hy);
00132 s++;
00133 }
00134 }
00135 else {
00136 if (n == m) {
00137 return memcmp(x, s, m) == 0 ? 0 : -1;
00138 }
00139
00140 for (hy = hx = i = 0; i < d; ++i) {
00141 hx = KR_REHASH(0, x[i], hx);
00142 hy = KR_REHASH(0, s[i], hy);
00143 }
00144
00145 while (hx != hy || memcmp(x, s, m)) {
00146 if (s >= e) return -1;
00147 hy = KR_REHASH(*s, *(s+d), hy);
00148 s++;
00149 }
00150 }
00151 return s-y;
00152 }
00153
/*
 * Flag bits kept in a Regexp object's flags word.
 *   REG_CASESTATE       - set together with RE_OPTION_IGNORECASE when
 *                         ruby_ignorecase is in effect (rb_reg_initialize,
 *                         rb_reg_prepare_re).
 *   KCODE_EUC/SJIS/UTF8 - encoding selector bits; KCODE_NONE means none set.
 *   KCODE_FIXED         - set by the kcode_*() helpers; tested throughout
 *                         this file to decide whether the regexp's own
 *                         kcode is applied.
 *   KCODE_MASK          - all encoding selector bits.
 */
00154 #define REG_CASESTATE FL_USER0
00155 #define KCODE_NONE 0
00156 #define KCODE_EUC FL_USER1
00157 #define KCODE_SJIS FL_USER2
00158 #define KCODE_UTF8 FL_USER3
00159 #define KCODE_FIXED FL_USER4
00160 #define KCODE_MASK (KCODE_EUC|KCODE_SJIS|KCODE_UTF8)
00161
/* File-wide default kcode; kcode_set_option()/kcode_reset_option() compare
 * against it and rb_reg_prepare_re() copies it into non-fixed regexps. */
00162 static int reg_kcode = DEFAULT_KCODE;
00163
00164 static void
00165 kcode_euc(re)
00166 struct RRegexp *re;
00167 {
00168 FL_UNSET(re, KCODE_MASK);
00169 FL_SET(re, KCODE_EUC);
00170 FL_SET(re, KCODE_FIXED);
00171 }
00172
00173 static void
00174 kcode_sjis(re)
00175 struct RRegexp *re;
00176 {
00177 FL_UNSET(re, KCODE_MASK);
00178 FL_SET(re, KCODE_SJIS);
00179 FL_SET(re, KCODE_FIXED);
00180 }
00181
00182 static void
00183 kcode_utf8(re)
00184 struct RRegexp *re;
00185 {
00186 FL_UNSET(re, KCODE_MASK);
00187 FL_SET(re, KCODE_UTF8);
00188 FL_SET(re, KCODE_FIXED);
00189 }
00190
00191 static void
00192 kcode_none(re)
00193 struct RRegexp *re;
00194 {
00195 FL_UNSET(re, KCODE_MASK);
00196 FL_SET(re, KCODE_FIXED);
00197 }
00198
00199 static int curr_kcode;
00200
00201 static void
00202 kcode_set_option(re)
00203 VALUE re;
00204 {
00205 if (!FL_TEST(re, KCODE_FIXED)) return;
00206
00207 curr_kcode = RBASIC(re)->flags & KCODE_MASK;
00208 if (reg_kcode == curr_kcode) return;
00209 switch (curr_kcode) {
00210 case KCODE_NONE:
00211 re_mbcinit(MBCTYPE_ASCII);
00212 break;
00213 case KCODE_EUC:
00214 re_mbcinit(MBCTYPE_EUC);
00215 break;
00216 case KCODE_SJIS:
00217 re_mbcinit(MBCTYPE_SJIS);
00218 break;
00219 case KCODE_UTF8:
00220 re_mbcinit(MBCTYPE_UTF8);
00221 break;
00222 }
00223 }
00224
00225 static void
00226 kcode_reset_option()
00227 {
00228 if (reg_kcode == curr_kcode) return;
00229 switch (reg_kcode) {
00230 case KCODE_NONE:
00231 re_mbcinit(MBCTYPE_ASCII);
00232 break;
00233 case KCODE_EUC:
00234 re_mbcinit(MBCTYPE_EUC);
00235 break;
00236 case KCODE_SJIS:
00237 re_mbcinit(MBCTYPE_SJIS);
00238 break;
00239 case KCODE_UTF8:
00240 re_mbcinit(MBCTYPE_UTF8);
00241 break;
00242 }
00243 }
00244
00245 int
00246 rb_reg_mbclen2(c, re)
00247 unsigned int c;
00248 VALUE re;
00249 {
00250 int len;
00251
00252 if (!FL_TEST(re, KCODE_FIXED))
00253 return mbclen(c);
00254 kcode_set_option(re);
00255 len = mbclen(c);
00256 kcode_reset_option();
00257 return len;
00258 }
00259
00260 static void
00261 rb_reg_check(re)
00262 VALUE re;
00263 {
00264 if (!RREGEXP(re)->ptr || !RREGEXP(re)->str) {
00265 rb_raise(rb_eTypeError, "uninitialized Regexp");
00266 }
00267 }
00268
/* Defined outside this file.  When nonzero, rb_reg_raise() reports the
 * error through rb_compile_error() instead of raising rb_eRegexpError. */
00269 extern int ruby_in_compile;
00270
00271 static void
00272 rb_reg_expr_str(str, s, len)
00273 VALUE str;
00274 const char *s;
00275 long len;
00276 {
00277 const char *p, *pend;
00278 int need_escape = 0;
00279
00280 p = s; pend = p + len;
00281 while (p<pend) {
00282 if (*p == '/' || (!ISPRINT(*p) && !ismbchar(*p))) {
00283 need_escape = 1;
00284 break;
00285 }
00286 p += mbclen(*p);
00287 }
00288 if (!need_escape) {
00289 rb_str_buf_cat(str, s, len);
00290 }
00291 else {
00292 p = s;
00293 while (p<pend) {
00294 if (*p == '\\') {
00295 int n = mbclen(p[1]) + 1;
00296 rb_str_buf_cat(str, p, n);
00297 p += n;
00298 continue;
00299 }
00300 else if (*p == '/') {
00301 char c = '\\';
00302 rb_str_buf_cat(str, &c, 1);
00303 rb_str_buf_cat(str, p, 1);
00304 }
00305 else if (ismbchar(*p)) {
00306 rb_str_buf_cat(str, p, mbclen(*p));
00307 p += mbclen(*p);
00308 continue;
00309 }
00310 else if (ISPRINT(*p)) {
00311 rb_str_buf_cat(str, p, 1);
00312 }
00313 else if (!ISSPACE(*p)) {
00314 char b[8];
00315
00316 sprintf(b, "\\%03o", *p & 0377);
00317 rb_str_buf_cat(str, b, 4);
00318 }
00319 else {
00320 rb_str_buf_cat(str, p, 1);
00321 }
00322 p++;
00323 }
00324 }
00325 }
00326
00327 static VALUE
00328 rb_reg_desc(s, len, re)
00329 const char *s;
00330 long len;
00331 VALUE re;
00332 {
00333 VALUE str = rb_str_buf_new2("/");
00334
00335 rb_reg_expr_str(str, s, len);
00336 rb_str_buf_cat2(str, "/");
00337 if (re) {
00338 rb_reg_check(re);
00339 if (RREGEXP(re)->ptr->options & RE_OPTION_MULTILINE)
00340 rb_str_buf_cat2(str, "m");
00341 if (RREGEXP(re)->ptr->options & RE_OPTION_IGNORECASE)
00342 rb_str_buf_cat2(str, "i");
00343 if (RREGEXP(re)->ptr->options & RE_OPTION_EXTENDED)
00344 rb_str_buf_cat2(str, "x");
00345
00346 if (FL_TEST(re, KCODE_FIXED)) {
00347 switch ((RBASIC(re)->flags & KCODE_MASK)) {
00348 case KCODE_NONE:
00349 rb_str_buf_cat2(str, "n");
00350 break;
00351 case KCODE_EUC:
00352 rb_str_buf_cat2(str, "e");
00353 break;
00354 case KCODE_SJIS:
00355 rb_str_buf_cat2(str, "s");
00356 break;
00357 case KCODE_UTF8:
00358 rb_str_buf_cat2(str, "u");
00359 break;
00360 }
00361 }
00362 }
00363 OBJ_INFECT(str, re);
00364 return str;
00365 }
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376
00377 static VALUE
00378 rb_reg_source(re)
00379 VALUE re;
00380 {
00381 VALUE str;
00382
00383 rb_reg_check(re);
00384 str = rb_str_new(RREGEXP(re)->str,RREGEXP(re)->len);
00385 if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00386 return str;
00387 }
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400 static VALUE
00401 rb_reg_inspect(re)
00402 VALUE re;
00403 {
00404 rb_reg_check(re);
00405 return rb_reg_desc(RREGEXP(re)->str, RREGEXP(re)->len, re);
00406 }
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
/*
 * Returns a string of the form "(?<on>-<off>:<source>)" for re.
 *
 * <on> lists the letters m/i/x whose options are set, <off> (preceded by
 * '-') lists the ones that are not set and is omitted when all three are
 * set, and <source> is the source escaped by rb_reg_expr_str().
 *
 * If the source itself starts with an inline "(?...)" option prefix, that
 * prefix is parsed: its flags are folded into the reported options and the
 * prefix is dropped from the emitted source.  A "(?flags:...)" wrapper is
 * only unwrapped when the inner text compiles on its own; when unwrapping
 * fails, the regexp's original options and full source are used.
 * The result is infected from re.
 */
00429 static VALUE
00430 rb_reg_to_s(re)
00431 VALUE re;
00432 {
00433 int options;
/* the three options that can be written inline */
00434 const int embeddable = RE_OPTION_MULTILINE|RE_OPTION_IGNORECASE|RE_OPTION_EXTENDED;
00435 long len;
00436 const char* ptr;
00437 VALUE str = rb_str_buf_new2("(?");
00438
00439 rb_reg_check(re);
00440
00441 options = RREGEXP(re)->ptr->options;
00442 ptr = RREGEXP(re)->str;
00443 len = RREGEXP(re)->len;
/* re-entered after each stand-alone "(?flags)" group has been consumed */
00444 again:
00445 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
/* stays 1 unless a "(?flags:...)" wrapper is successfully recompiled below */
00446 int err = 1;
00447 ptr += 2;
00448 if ((len -= 2) > 0) {
/* flags before an optional '-' switch options on */
00449 do {
00450 if (*ptr == 'm') {
00451 options |= RE_OPTION_MULTILINE;
00452 }
00453 else if (*ptr == 'i') {
00454 options |= RE_OPTION_IGNORECASE;
00455 }
00456 else if (*ptr == 'x') {
00457 options |= RE_OPTION_EXTENDED;
00458 }
00459 else break;
00460 ++ptr;
00461 } while (--len > 0);
00462 }
/* flags after '-' switch options off */
00463 if (len > 1 && *ptr == '-') {
00464 ++ptr;
00465 --len;
00466 do {
00467 if (*ptr == 'm') {
00468 options &= ~RE_OPTION_MULTILINE;
00469 }
00470 else if (*ptr == 'i') {
00471 options &= ~RE_OPTION_IGNORECASE;
00472 }
00473 else if (*ptr == 'x') {
00474 options &= ~RE_OPTION_EXTENDED;
00475 }
00476 else break;
00477 ++ptr;
00478 } while (--len > 0);
00479 }
/* "(?flags)" with nothing else: skip the ')' and look for another prefix */
00480 if (*ptr == ')') {
00481 --len;
00482 ++ptr;
00483 goto again;
00484 }
/* "(?flags:...)" spanning the rest of the source: compile the inner text
 * (between ':' and the final ')') into a scratch pattern to see whether it
 * is valid by itself */
00485 if (*ptr == ':' && ptr[len-1] == ')') {
00486 Regexp *rp;
00487 kcode_set_option(re);
00488 rp = ALLOC(Regexp);
00489 MEMZERO((char *)rp, Regexp, 1);
00490 err = re_compile_pattern(++ptr, len -= 2, rp) != 0;
00491 kcode_reset_option();
00492 re_free_pattern(rp);
00493 }
/* unwrapping failed: fall back to the regexp's own options and source */
00494 if (err) {
00495 options = RREGEXP(re)->ptr->options;
00496 ptr = RREGEXP(re)->str;
00497 len = RREGEXP(re)->len;
00498 }
00499 }
00500
/* letters for the options that are set */
00501 if (options & RE_OPTION_MULTILINE) rb_str_buf_cat2(str, "m");
00502 if (options & RE_OPTION_IGNORECASE) rb_str_buf_cat2(str, "i");
00503 if (options & RE_OPTION_EXTENDED) rb_str_buf_cat2(str, "x");
00504
/* '-' plus letters for the options that are not set, unless all are set */
00505 if ((options & embeddable) != embeddable) {
00506 rb_str_buf_cat2(str, "-");
00507 if (!(options & RE_OPTION_MULTILINE)) rb_str_buf_cat2(str, "m");
00508 if (!(options & RE_OPTION_IGNORECASE)) rb_str_buf_cat2(str, "i");
00509 if (!(options & RE_OPTION_EXTENDED)) rb_str_buf_cat2(str, "x");
00510 }
00511
00512 rb_str_buf_cat2(str, ":");
00513 rb_reg_expr_str(str, ptr, len);
00514 rb_str_buf_cat2(str, ")");
00515
00516 OBJ_INFECT(str, re);
00517 return str;
00518 }
00519
00520 static void
00521 rb_reg_raise(s, len, err, re)
00522 const char *s;
00523 long len;
00524 const char *err;
00525 VALUE re;
00526 {
00527 VALUE desc = rb_reg_desc(s, len, re);
00528
00529 if (ruby_in_compile)
00530 rb_compile_error("%s: %s", err, RSTRING(desc)->ptr);
00531 else
00532 rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING(desc)->ptr);
00533 }
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543 static VALUE
00544 rb_reg_casefold_p(re)
00545 VALUE re;
00546 {
00547 rb_reg_check(re);
00548 if (RREGEXP(re)->ptr->options & RE_OPTION_IGNORECASE) return Qtrue;
00549 return Qfalse;
00550 }
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574
00575
00576 static VALUE
00577 rb_reg_options_m(re)
00578 VALUE re;
00579 {
00580 int options = rb_reg_options(re);
00581 return INT2NUM(options);
00582 }
00583
00584
00585
00586
00587
00588
00589
00590
00591
00592 static VALUE
00593 rb_reg_kcode_m(re)
00594 VALUE re;
00595 {
00596 char *kcode;
00597
00598 if (FL_TEST(re, KCODE_FIXED)) {
00599 switch (RBASIC(re)->flags & KCODE_MASK) {
00600 case KCODE_NONE:
00601 kcode = "none"; break;
00602 case KCODE_EUC:
00603 kcode = "euc"; break;
00604 case KCODE_SJIS:
00605 kcode = "sjis"; break;
00606 case KCODE_UTF8:
00607 kcode = "utf8"; break;
00608 default:
00609 rb_bug("unknown kcode - should not happen");
00610 break;
00611 }
00612 return rb_str_new2(kcode);
00613 }
00614 return Qnil;
00615 }
00616
00617 static Regexp*
00618 make_regexp(s, len, flags)
00619 const char *s;
00620 long len;
00621 int flags;
00622 {
00623 Regexp *rp;
00624 char *err;
00625
00626
00627
00628
00629
00630
00631
00632
00633 rp = ALLOC(Regexp);
00634 MEMZERO((char *)rp, Regexp, 1);
00635 rp->buffer = ALLOC_N(char, 16);
00636 rp->allocated = 16;
00637 rp->fastmap = ALLOC_N(char, 256);
00638 if (flags) {
00639 rp->options = flags;
00640 }
00641 err = re_compile_pattern(s, len, rp);
00642
00643 if (err != NULL) {
00644 re_free_pattern(rp);
00645 rb_reg_raise(s, len, err, 0);
00646 return 0;
00647 }
00648 return rp;
00649 }
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663
00664
00665 static VALUE rb_cMatch;
00666
00667 static VALUE match_alloc (VALUE);
00668 static VALUE
00669 match_alloc(klass)
00670 VALUE klass;
00671 {
00672 NEWOBJ(match, struct RMatch);
00673 OBJSETUP(match, klass, T_MATCH);
00674
00675 match->str = 0;
00676 match->regs = 0;
00677 match->regs = ALLOC(struct re_registers);
00678 MEMZERO(match->regs, struct re_registers, 1);
00679
00680 return (VALUE)match;
00681 }
00682
00683
00684 static VALUE
00685 match_init_copy(obj, orig)
00686 VALUE obj, orig;
00687 {
00688 if (obj == orig) return obj;
00689
00690 if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
00691 rb_raise(rb_eTypeError, "wrong argument class");
00692 }
00693 RMATCH(obj)->str = RMATCH(orig)->str;
00694 re_free_registers(RMATCH(obj)->regs);
00695 RMATCH(obj)->regs->allocated = 0;
00696 re_copy_registers(RMATCH(obj)->regs, RMATCH(orig)->regs);
00697
00698 return obj;
00699 }
00700
00701
00702
00703
00704
00705
00706
00707
00708
00709
00710
00711
00712
00713
00714 static VALUE
00715 match_size(match)
00716 VALUE match;
00717 {
00718 return INT2FIX(RMATCH(match)->regs->num_regs);
00719 }
00720
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731
00732
00733
00734 static VALUE
00735 match_offset(match, n)
00736 VALUE match, n;
00737 {
00738 int i = NUM2INT(n);
00739
00740 if (i < 0 || RMATCH(match)->regs->num_regs <= i)
00741 rb_raise(rb_eIndexError, "index %d out of matches", i);
00742
00743 if (RMATCH(match)->regs->beg[i] < 0)
00744 return rb_assoc_new(Qnil, Qnil);
00745
00746 return rb_assoc_new(INT2FIX(RMATCH(match)->regs->beg[i]),
00747 INT2FIX(RMATCH(match)->regs->end[i]));
00748 }
00749
00750
00751
00752
00753
00754
00755
00756
00757
00758
00759
00760
00761
00762
00763 static VALUE
00764 match_begin(match, n)
00765 VALUE match, n;
00766 {
00767 int i = NUM2INT(n);
00768
00769 if (i < 0 || RMATCH(match)->regs->num_regs <= i)
00770 rb_raise(rb_eIndexError, "index %d out of matches", i);
00771
00772 if (RMATCH(match)->regs->beg[i] < 0)
00773 return Qnil;
00774
00775 return INT2FIX(RMATCH(match)->regs->beg[i]);
00776 }
00777
00778
00779
00780
00781
00782
00783
00784
00785
00786
00787
00788
00789
00790
00791 static VALUE
00792 match_end(match, n)
00793 VALUE match, n;
00794 {
00795 int i = NUM2INT(n);
00796
00797 if (i < 0 || RMATCH(match)->regs->num_regs <= i)
00798 rb_raise(rb_eIndexError, "index %d out of matches", i);
00799
00800 if (RMATCH(match)->regs->beg[i] < 0)
00801 return Qnil;
00802
00803 return INT2FIX(RMATCH(match)->regs->end[i]);
00804 }
00805
00806 #define MATCH_BUSY FL_USER2
00807
00808 void
00809 rb_match_busy(match)
00810 VALUE match;
00811 {
00812 FL_SET(match, MATCH_BUSY);
00813 }
00814
/* Global case-insensitivity switch: consulted by rb_memcmp(),
 * rb_memsearch(), rb_reg_prepare_re(), rb_reg_initialize() and
 * rb_reg_regcomp(). */
00815 int ruby_ignorecase;
/* When nonzero, rb_reg_search() and rb_reg_adjust_startpos() call
 * rb_reg_prepare_re() before using a regexp. */
00816 static int may_need_recompile;
00817
00818 static void
00819 rb_reg_prepare_re(re)
00820 VALUE re;
00821 {
00822 int need_recompile = 0;
00823 int state;
00824
00825 rb_reg_check(re);
00826 state = FL_TEST(re, REG_CASESTATE);
00827
00828 if (ruby_ignorecase && !state) {
00829 FL_SET(re, REG_CASESTATE);
00830 RREGEXP(re)->ptr->options |= RE_OPTION_IGNORECASE;
00831 need_recompile = 1;
00832 }
00833 if (!ruby_ignorecase && state) {
00834 FL_UNSET(re, REG_CASESTATE);
00835 RREGEXP(re)->ptr->options &= ~RE_OPTION_IGNORECASE;
00836 need_recompile = 1;
00837 }
00838
00839 if (!FL_TEST(re, KCODE_FIXED) &&
00840 (RBASIC(re)->flags & KCODE_MASK) != reg_kcode) {
00841 need_recompile = 1;
00842 RBASIC(re)->flags &= ~KCODE_MASK;
00843 RBASIC(re)->flags |= reg_kcode;
00844 }
00845
00846 if (need_recompile) {
00847 char *err;
00848
00849 if (FL_TEST(re, KCODE_FIXED))
00850 kcode_set_option(re);
00851 rb_reg_check(re);
00852 RREGEXP(re)->ptr->fastmap_accurate = 0;
00853 err = re_compile_pattern(RREGEXP(re)->str, RREGEXP(re)->len, RREGEXP(re)->ptr);
00854 if (err != NULL) {
00855 rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len, err, re);
00856 }
00857 }
00858 }
00859
00860 long
00861 rb_reg_adjust_startpos(re, str, pos, reverse)
00862 VALUE re, str;
00863 long pos, reverse;
00864 {
00865 long range;
00866
00867 rb_reg_check(re);
00868 if (may_need_recompile) rb_reg_prepare_re(re);
00869
00870 if (FL_TEST(re, KCODE_FIXED))
00871 kcode_set_option(re);
00872 else if (reg_kcode != curr_kcode)
00873 kcode_reset_option();
00874
00875 if (reverse) {
00876 range = -pos;
00877 }
00878 else {
00879 range = RSTRING(str)->len - pos;
00880 }
00881 return re_adjust_startpos(RREGEXP(re)->ptr,
00882 RSTRING(str)->ptr, RSTRING(str)->len,
00883 pos, range);
00884 }
00885
00886 long
00887 rb_reg_search(re, str, pos, reverse)
00888 VALUE re, str;
00889 long pos, reverse;
00890 {
00891 long result;
00892 VALUE match;
00893 static struct re_registers regs;
00894 long range;
00895
00896 if (pos > RSTRING(str)->len || pos < 0) {
00897 rb_backref_set(Qnil);
00898 return -1;
00899 }
00900
00901 rb_reg_check(re);
00902 if (may_need_recompile) rb_reg_prepare_re(re);
00903
00904 if (FL_TEST(re, KCODE_FIXED))
00905 kcode_set_option(re);
00906 else if (reg_kcode != curr_kcode)
00907 kcode_reset_option();
00908
00909 if (reverse) {
00910 range = -pos;
00911 }
00912 else {
00913 range = RSTRING(str)->len - pos;
00914 }
00915 result = re_search(RREGEXP(re)->ptr,RSTRING(str)->ptr,RSTRING(str)->len,
00916 pos, range, ®s);
00917
00918 if (FL_TEST(re, KCODE_FIXED))
00919 kcode_reset_option();
00920
00921 if (result == -2) {
00922 rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len,
00923 "Stack overflow in regexp matcher", re);
00924 }
00925
00926 if (result < 0) {
00927 rb_backref_set(Qnil);
00928 return result;
00929 }
00930
00931 match = rb_backref_get();
00932 if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
00933 match = match_alloc(rb_cMatch);
00934 }
00935 else {
00936 if (rb_safe_level() >= 3)
00937 OBJ_TAINT(match);
00938 else
00939 FL_UNSET(match, FL_TAINT);
00940 }
00941
00942 re_copy_registers(RMATCH(match)->regs, ®s);
00943 RMATCH(match)->str = rb_str_new4(str);
00944 rb_backref_set(match);
00945
00946 OBJ_INFECT(match, re);
00947 OBJ_INFECT(match, str);
00948 return result;
00949 }
00950
00951 VALUE
00952 rb_reg_nth_defined(nth, match)
00953 int nth;
00954 VALUE match;
00955 {
00956 if (NIL_P(match)) return Qnil;
00957 if (nth >= RMATCH(match)->regs->num_regs) {
00958 return Qnil;
00959 }
00960 if (nth < 0) {
00961 nth += RMATCH(match)->regs->num_regs;
00962 if (nth <= 0) return Qnil;
00963 }
00964 if (RMATCH(match)->BEG(nth) == -1) return Qfalse;
00965 return Qtrue;
00966 }
00967
00968 VALUE
00969 rb_reg_nth_match(nth, match)
00970 int nth;
00971 VALUE match;
00972 {
00973 VALUE str;
00974 long start, end, len;
00975
00976 if (NIL_P(match)) return Qnil;
00977 if (nth >= RMATCH(match)->regs->num_regs) {
00978 return Qnil;
00979 }
00980 if (nth < 0) {
00981 nth += RMATCH(match)->regs->num_regs;
00982 if (nth <= 0) return Qnil;
00983 }
00984 start = RMATCH(match)->BEG(nth);
00985 if (start == -1) return Qnil;
00986 end = RMATCH(match)->END(nth);
00987 len = end - start;
00988 str = rb_str_substr(RMATCH(match)->str, start, len);
00989 OBJ_INFECT(str, match);
00990 return str;
00991 }
00992
00993 VALUE
00994 rb_reg_last_match(match)
00995 VALUE match;
00996 {
00997 return rb_reg_nth_match(0, match);
00998 }
00999
01000
01001
01002
01003
01004
01005
01006
01007
01008
01009
01010
01011
01012 VALUE
01013 rb_reg_match_pre(match)
01014 VALUE match;
01015 {
01016 VALUE str;
01017
01018 if (NIL_P(match)) return Qnil;
01019 if (RMATCH(match)->BEG(0) == -1) return Qnil;
01020 str = rb_str_substr(RMATCH(match)->str, 0, RMATCH(match)->BEG(0));
01021 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01022 return str;
01023 }
01024
01025
01026
01027
01028
01029
01030
01031
01032
01033
01034
01035
01036
01037 VALUE
01038 rb_reg_match_post(match)
01039 VALUE match;
01040 {
01041 VALUE str;
01042 long pos;
01043
01044 if (NIL_P(match)) return Qnil;
01045 if (RMATCH(match)->BEG(0) == -1) return Qnil;
01046 str = RMATCH(match)->str;
01047 pos = RMATCH(match)->END(0);
01048 str = rb_str_substr(str, pos, RSTRING(str)->len - pos);
01049 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01050 return str;
01051 }
01052
01053 VALUE
01054 rb_reg_match_last(match)
01055 VALUE match;
01056 {
01057 int i;
01058
01059 if (NIL_P(match)) return Qnil;
01060 if (RMATCH(match)->BEG(0) == -1) return Qnil;
01061
01062 for (i=RMATCH(match)->regs->num_regs-1; RMATCH(match)->BEG(i) == -1 && i > 0; i--)
01063 ;
01064 if (i == 0) return Qnil;
01065 return rb_reg_nth_match(i, match);
01066 }
01067
01068 static VALUE
01069 last_match_getter()
01070 {
01071 return rb_reg_last_match(rb_backref_get());
01072 }
01073
01074 static VALUE
01075 prematch_getter()
01076 {
01077 return rb_reg_match_pre(rb_backref_get());
01078 }
01079
01080 static VALUE
01081 postmatch_getter()
01082 {
01083 return rb_reg_match_post(rb_backref_get());
01084 }
01085
01086 static VALUE
01087 last_paren_match_getter()
01088 {
01089 return rb_reg_match_last(rb_backref_get());
01090 }
01091
01092 static VALUE
01093 match_array(match, start)
01094 VALUE match;
01095 int start;
01096 {
01097 struct re_registers *regs = RMATCH(match)->regs;
01098 VALUE ary = rb_ary_new2(regs->num_regs);
01099 VALUE target = RMATCH(match)->str;
01100 int i;
01101 int taint = OBJ_TAINTED(match);
01102
01103 for (i=start; i<regs->num_regs; i++) {
01104 if (regs->beg[i] == -1) {
01105 rb_ary_push(ary, Qnil);
01106 }
01107 else {
01108 VALUE str = rb_str_substr(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01109 if (taint) OBJ_TAINT(str);
01110 rb_ary_push(ary, str);
01111 }
01112 }
01113 return ary;
01114 }
01115
01116
01117
01118
01119
01120
01121
01122
01123
01124
01125
01126
01127
01128
01129
01130
01131
01132
01133
01134
01135
01136
01137
01138
01139
01140
01141
01142
01143 static VALUE
01144 match_to_a(match)
01145 VALUE match;
01146 {
01147 return match_array(match, 0);
01148 }
01149
01150
01151
01152
01153
01154
01155
01156
01157
01158
01159
01160
01161
01162
01163 static VALUE
01164 match_captures(match)
01165 VALUE match;
01166 {
01167 return match_array(match, 1);
01168 }
01169
01170
01171
01172
01173
01174
01175
01176
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190 static VALUE
01191 match_aref(argc, argv, match)
01192 int argc;
01193 VALUE *argv;
01194 VALUE match;
01195 {
01196 VALUE idx, rest;
01197
01198 rb_scan_args(argc, argv, "11", &idx, &rest);
01199
01200 if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) {
01201 return rb_ary_aref(argc, argv, match_to_a(match));
01202 }
01203 return rb_reg_nth_match(FIX2INT(idx), match);
01204 }
01205
01206 static VALUE match_entry (VALUE, long);
01207 static VALUE
01208 match_entry(match, n)
01209 VALUE match;
01210 long n;
01211 {
01212 return rb_reg_nth_match(n, match);
01213 }
01214
01215
01216
01217
01218
01219
01220
01221
01222
01223
01224
01225
01226
01227
01228 static VALUE
01229 match_values_at(argc, argv, match)
01230 int argc;
01231 VALUE *argv;
01232 VALUE match;
01233 {
01234 return rb_values_at(match, RMATCH(match)->regs->num_regs, argc, argv, match_entry);
01235 }
01236
01237
01238
01239
01240
01241
01242
01243
01244
01245
01246
01247
01248
01249
01250 static VALUE
01251 match_select(argc, argv, match)
01252 int argc;
01253 VALUE *argv;
01254 VALUE match;
01255 {
01256 if (argc > 0) {
01257 rb_raise(rb_eArgError, "wrong number of arguments (%d for 0)", argc);
01258 }
01259 else {
01260 struct re_registers *regs = RMATCH(match)->regs;
01261 VALUE target = RMATCH(match)->str;
01262 VALUE result = rb_ary_new();
01263 int i;
01264 int taint = OBJ_TAINTED(match);
01265
01266 for (i=0; i<regs->num_regs; i++) {
01267 VALUE str = rb_str_substr(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01268 if (taint) OBJ_TAINT(str);
01269 if (RTEST(rb_yield(str))) {
01270 rb_ary_push(result, str);
01271 }
01272 }
01273 return result;
01274 }
01275 }
01276
01277
01278
01279
01280
01281
01282
01283
01284
01285
01286
01287
01288 static VALUE
01289 match_to_s(match)
01290 VALUE match;
01291 {
01292 VALUE str = rb_reg_last_match(match);
01293
01294 if (NIL_P(str)) str = rb_str_new(0,0);
01295 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01296 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01297 return str;
01298 }
01299
01300
01301
01302
01303
01304
01305
01306
01307
01308
01309
01310
01311 static VALUE
01312 match_string(match)
01313 VALUE match;
01314 {
01315 return RMATCH(match)->str;
01316 }
01317
/* Class object that rb_reg_new() passes to rb_reg_s_alloc(). */
01318 VALUE rb_cRegexp;
01319
01320 static void
01321 rb_reg_initialize(obj, s, len, options)
01322 VALUE obj;
01323 const char *s;
01324 long len;
01325 int options;
01326
01327
01328
01329
01330
01331
01332 {
01333 struct RRegexp *re = RREGEXP(obj);
01334
01335 if (re->ptr) re_free_pattern(re->ptr);
01336 if (re->str) free(re->str);
01337 re->ptr = 0;
01338 re->str = 0;
01339
01340 switch (options & ~0xf) {
01341 case 0:
01342 default:
01343 FL_SET(re, reg_kcode);
01344 break;
01345 case 16:
01346 kcode_none(re);
01347 break;
01348 case 32:
01349 kcode_euc(re);
01350 break;
01351 case 48:
01352 kcode_sjis(re);
01353 break;
01354 case 64:
01355 kcode_utf8(re);
01356 break;
01357 }
01358
01359 if (options & ~0xf) {
01360 kcode_set_option((VALUE)re);
01361 }
01362 if (ruby_ignorecase) {
01363 options |= RE_OPTION_IGNORECASE;
01364 FL_SET(re, REG_CASESTATE);
01365 }
01366 re->ptr = make_regexp(s, len, options & 0xf);
01367 re->str = ALLOC_N(char, len+1);
01368 memcpy(re->str, s, len);
01369 re->str[len] = '\0';
01370 re->len = len;
01371 if (options & ~0xf) {
01372 kcode_reset_option();
01373 }
01374 }
01375
01376 static VALUE rb_reg_s_alloc (VALUE);
01377 static VALUE
01378 rb_reg_s_alloc(klass)
01379 VALUE klass;
01380 {
01381 NEWOBJ(re, struct RRegexp);
01382 OBJSETUP(re, klass, T_REGEXP);
01383
01384 re->ptr = 0;
01385 re->len = 0;
01386 re->str = 0;
01387
01388 return (VALUE)re;
01389 }
01390
01391 VALUE
01392 rb_reg_new(s, len, options)
01393 const char *s;
01394 long len;
01395 int options;
01396 {
01397 VALUE re = rb_reg_s_alloc(rb_cRegexp);
01398
01399 rb_reg_initialize(re, s, len, options);
01400 return (VALUE)re;
01401 }
01402
01403 static int case_cache;
01404 static int kcode_cache;
01405 static VALUE reg_cache;
01406
01407 VALUE
01408 rb_reg_regcomp(str)
01409 VALUE str;
01410 {
01411 volatile VALUE save_str = str;
01412 if (reg_cache && RREGEXP(reg_cache)->len == RSTRING(str)->len
01413 && case_cache == ruby_ignorecase
01414 && kcode_cache == reg_kcode
01415 && memcmp(RREGEXP(reg_cache)->str, RSTRING(str)->ptr, RSTRING(str)->len) == 0)
01416 return reg_cache;
01417
01418 case_cache = ruby_ignorecase;
01419 kcode_cache = reg_kcode;
01420 return reg_cache = rb_reg_new(RSTRING(str)->ptr, RSTRING(str)->len,
01421 ruby_ignorecase);
01422 }
01423
01424 static int
01425 rb_reg_cur_kcode(re)
01426 VALUE re;
01427 {
01428 if (FL_TEST(re, KCODE_FIXED)) {
01429 return RBASIC(re)->flags & KCODE_MASK;
01430 }
01431 return 0;
01432 }
01433
01434
01435
01436
01437
01438
01439
01440
01441 static VALUE
01442 rb_reg_hash(re)
01443 VALUE re;
01444 {
01445 int hashval, len;
01446 char *p;
01447
01448 rb_reg_check(re);
01449 hashval = RREGEXP(re)->ptr->options;
01450 len = RREGEXP(re)->len;
01451 p = RREGEXP(re)->str;
01452 while (len--) {
01453 hashval = hashval * 33 + *p++;
01454 }
01455 hashval = hashval + (hashval>>5);
01456
01457 return INT2FIX(hashval);
01458 }
01459
01460
01461
01462
01463
01464
01465
01466
01467
01468
01469
01470
01471
01472
01473
01474
01475 static VALUE
01476 rb_reg_equal(re1, re2)
01477 VALUE re1, re2;
01478 {
01479 if (re1 == re2) return Qtrue;
01480 if (TYPE(re2) != T_REGEXP) return Qfalse;
01481 rb_reg_check(re1); rb_reg_check(re2);
01482 if (RREGEXP(re1)->len != RREGEXP(re2)->len) return Qfalse;
01483 if (memcmp(RREGEXP(re1)->str, RREGEXP(re2)->str, RREGEXP(re1)->len) == 0 &&
01484 rb_reg_cur_kcode(re1) == rb_reg_cur_kcode(re2) &&
01485 RREGEXP(re1)->ptr->options == RREGEXP(re2)->ptr->options) {
01486 return Qtrue;
01487 }
01488 return Qfalse;
01489 }
01490
01491
01492
01493
01494
01495
01496
01497
01498
01499
01500
01501
01502
01503 VALUE
01504 rb_reg_match(re, str)
01505 VALUE re, str;
01506 {
01507 long start;
01508
01509 if (NIL_P(str)) {
01510 rb_backref_set(Qnil);
01511 return Qnil;
01512 }
01513 StringValue(str);
01514 start = rb_reg_search(re, str, 0, 0);
01515 if (start < 0) {
01516 return Qnil;
01517 }
01518 return LONG2FIX(start);
01519 }
01520
01521
01522
01523
01524
01525
01526
01527
01528
01529
01530
01531
01532
01533
01534
01535
01536
01537
01538
01539
01540 VALUE
01541 rb_reg_eqq(re, str)
01542 VALUE re, str;
01543 {
01544 long start;
01545
01546 if (TYPE(str) != T_STRING) {
01547 str = rb_check_string_type(str);
01548 if (NIL_P(str)) {
01549 rb_backref_set(Qnil);
01550 return Qfalse;
01551 }
01552 }
01553 StringValue(str);
01554 start = rb_reg_search(re, str, 0, 0);
01555 if (start < 0) {
01556 return Qfalse;
01557 }
01558 return Qtrue;
01559 }
01560
01561
01562
01563
01564
01565
01566
01567
01568
01569
01570
01571
01572
01573 VALUE
01574 rb_reg_match2(re)
01575 VALUE re;
01576 {
01577 long start;
01578 VALUE line = rb_lastline_get();
01579
01580 if (TYPE(line) != T_STRING) {
01581 rb_backref_set(Qnil);
01582 return Qnil;
01583 }
01584
01585 start = rb_reg_search(re, line, 0, 0);
01586 if (start < 0) {
01587 return Qnil;
01588 }
01589 return LONG2FIX(start);
01590 }
01591
01592
01593
01594
01595
01596
01597
01598
01599
01600
01601
01602
01603
01604 static VALUE
01605 rb_reg_match_m(re, str)
01606 VALUE re, str;
01607 {
01608 VALUE result = rb_reg_match(re, str);
01609
01610 if (NIL_P(result)) return Qnil;
01611 result = rb_backref_get();
01612 rb_match_busy(result);
01613 return result;
01614 }
01615
01616
01617
01618
01619
01620
01621
01622
01623
01624
01625
01626
01627
01628
01629
01630
01631
01632
01633
01634
01635
01636
01637
01638
01639
01640
01641
01642
01643
01644
01645
01646 static VALUE
01647 rb_reg_initialize_m(argc, argv, self)
01648 int argc;
01649 VALUE *argv;
01650 VALUE self;
01651 {
01652 const char *s;
01653 long len;
01654 int flags = 0;
01655
01656 rb_check_frozen(self);
01657 if (argc == 0 || argc > 3) {
01658 rb_raise(rb_eArgError, "wrong number of arguments");
01659 }
01660 if (TYPE(argv[0]) == T_REGEXP) {
01661 if (argc > 1) {
01662 rb_warn("flags%s ignored", (argc == 3) ? " and encoding": "");
01663 }
01664 rb_reg_check(argv[0]);
01665 flags = RREGEXP(argv[0])->ptr->options & 0xf;
01666 if (FL_TEST(argv[0], KCODE_FIXED)) {
01667 switch (RBASIC(argv[0])->flags & KCODE_MASK) {
01668 case KCODE_NONE:
01669 flags |= 16;
01670 break;
01671 case KCODE_EUC:
01672 flags |= 32;
01673 break;
01674 case KCODE_SJIS:
01675 flags |= 48;
01676 break;
01677 case KCODE_UTF8:
01678 flags |= 64;
01679 break;
01680 default:
01681 break;
01682 }
01683 }