Main Page | Modules | Alphabetical List | Data Structures | File List | Data Fields | Globals

re.c

Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   re.c -
00004 
00005   $Author: akr $
00006   created at: Mon Aug  9 18:24:49 JST 1993
00007 
00008   Copyright (C) 1993-2003 Yukihiro Matsumoto
00009 
00010 **********************************************************************/
00011 
00012 #include "ruby.h"
00013 #include "re.h"
00014 #include <ctype.h>
00015 
00016 static VALUE rb_eRegexpError;
00017 
00018 #define BEG(no) regs->beg[no]
00019 #define END(no) regs->end[no]
00020 
#if 'a' == 97   /* it's ascii */
/*
 * Case-folding table indexed by byte value (0..255).  Every byte maps to
 * itself except the ASCII upper-case letters 'A'..'Z', which map to the
 * corresponding lower-case letters.
 */
static const char casetable[] = {
        '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
        '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
        '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
        '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
        /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
        '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
        /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
        '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
        /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
        '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
        /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
        '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
        /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
        '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
        /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
        '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
        /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
        '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
        /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
        '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
        /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
        '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
        /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
        '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
        /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
        '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
        /* 'x'     'y'     'z'     '{'     '|'     '}'     '~' */
        '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
        '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
        '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
        '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
        '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
        '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
        '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
        '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
        '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
        '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
        '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
        '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
        '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
        '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
        '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
        '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
        '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#else
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif

/*
 * Compares the first len bytes of p1 and p2, ignoring ASCII case.
 *
 * Returns 0 when all len byte pairs are equal after folding through
 * casetable; otherwise returns the difference between the folded values
 * of the first pair that differs.  A len of zero or less compares equal.
 *
 * The bytes are read as unsigned char before they index casetable.
 * Indexing with a plain char converted to unsigned would sign-extend
 * bytes >= 0x80 on platforms where char is signed and read far outside
 * the 256-entry table.
 */
int
rb_memcicmp(p1, p2, len)
    char *p1, *p2;
    long len;
{
    const unsigned char *a = (const unsigned char *)p1;
    const unsigned char *b = (const unsigned char *)p2;

    while (len-- > 0) {
        int tmp = casetable[*a++] - casetable[*b++];

        if (tmp != 0)
            return tmp;
    }
    return 0;
}
00085 
00086 int
00087 rb_memcmp(p1, p2, len)
00088     char *p1, *p2;
00089     long len;
00090 {
00091     if (!ruby_ignorecase) {
00092         return memcmp(p1, p2, len);
00093     }
00094     return rb_memcicmp(p1, p2, len);
00095 }
00096 
00097 long
00098 rb_memsearch(x0, m, y0, n)
00099     char *x0, *y0;
00100     long m, n;
00101 {
00102     unsigned char *x = (unsigned char *)x0, *y = (unsigned char *)y0;
00103     unsigned char *s, *e;
00104     long i;
00105     int d;
00106     unsigned long hx, hy;
00107 
00108 #define KR_REHASH(a, b, h) (((h) << 1) - (((unsigned long)(a))<<d) + (b))
00109 
00110     if (m > n) return -1;
00111     s = y; e = s + n - m;
00112 
00113     /* Preprocessing */
00114     /* computes d = 2^(m-1) with
00115        the left-shift operator */
00116     d = sizeof(hx) * CHAR_BIT - 1;
00117     if (d > m) d = m;
00118 
00119     if (ruby_ignorecase) {
00120         if (n == m) {
00121             return rb_memcicmp(x, s, m) == 0 ? 0 : -1;
00122         }
00123         /* Prepare hash value */
00124         for (hy = hx = i = 0; i < d; ++i) {
00125             hx = KR_REHASH(0, casetable[x[i]], hx);
00126             hy = KR_REHASH(0, casetable[s[i]], hy);
00127         }
00128         /* Searching */
00129         while (hx != hy || rb_memcicmp(x, s, m)) {
00130             if (s >= e) return -1;
00131             hy = KR_REHASH(casetable[*s], casetable[*(s+d)], hy);
00132             s++;
00133         }
00134     }
00135     else {
00136         if (n == m) {
00137             return memcmp(x, s, m) == 0 ? 0 : -1;
00138         }
00139         /* Prepare hash value */
00140         for (hy = hx = i = 0; i < d; ++i) {
00141             hx = KR_REHASH(0, x[i], hx);
00142             hy = KR_REHASH(0, s[i], hy);
00143         }
00144         /* Searching */
00145         while (hx != hy || memcmp(x, s, m)) {
00146             if (s >= e) return -1;
00147             hy = KR_REHASH(*s, *(s+d), hy);
00148             s++;
00149         }
00150     }
00151     return s-y;
00152 }
00153 
00154 #define REG_CASESTATE  FL_USER0
00155 #define KCODE_NONE  0
00156 #define KCODE_EUC   FL_USER1
00157 #define KCODE_SJIS  FL_USER2
00158 #define KCODE_UTF8  FL_USER3
00159 #define KCODE_FIXED FL_USER4
00160 #define KCODE_MASK (KCODE_EUC|KCODE_SJIS|KCODE_UTF8)
00161 
00162 static int reg_kcode = DEFAULT_KCODE;
00163 
00164 static void
00165 kcode_euc(re)
00166     struct RRegexp *re;
00167 {
00168     FL_UNSET(re, KCODE_MASK);
00169     FL_SET(re, KCODE_EUC);
00170     FL_SET(re, KCODE_FIXED);
00171 }
00172 
00173 static void
00174 kcode_sjis(re)
00175     struct RRegexp *re;
00176 {
00177     FL_UNSET(re, KCODE_MASK);
00178     FL_SET(re, KCODE_SJIS);
00179     FL_SET(re, KCODE_FIXED);
00180 }
00181 
00182 static void
00183 kcode_utf8(re)
00184     struct RRegexp *re;
00185 {
00186     FL_UNSET(re, KCODE_MASK);
00187     FL_SET(re, KCODE_UTF8);
00188     FL_SET(re, KCODE_FIXED);
00189 }
00190 
00191 static void
00192 kcode_none(re)
00193     struct RRegexp *re;
00194 {
00195     FL_UNSET(re, KCODE_MASK);
00196     FL_SET(re, KCODE_FIXED);
00197 }
00198 
00199 static int curr_kcode;
00200 
00201 static void
00202 kcode_set_option(re)
00203     VALUE re;
00204 {
00205     if (!FL_TEST(re, KCODE_FIXED)) return;
00206 
00207     curr_kcode = RBASIC(re)->flags & KCODE_MASK;
00208     if (reg_kcode == curr_kcode) return;
00209     switch (curr_kcode) {
00210       case KCODE_NONE:
00211         re_mbcinit(MBCTYPE_ASCII);
00212         break;
00213       case KCODE_EUC:
00214         re_mbcinit(MBCTYPE_EUC);
00215         break;
00216       case KCODE_SJIS:
00217         re_mbcinit(MBCTYPE_SJIS);
00218         break;
00219       case KCODE_UTF8:
00220         re_mbcinit(MBCTYPE_UTF8);
00221         break;
00222     }
00223 }         
00224 
00225 static void
00226 kcode_reset_option()
00227 {
00228     if (reg_kcode == curr_kcode) return;
00229     switch (reg_kcode) {
00230       case KCODE_NONE:
00231         re_mbcinit(MBCTYPE_ASCII);
00232         break;
00233       case KCODE_EUC:
00234         re_mbcinit(MBCTYPE_EUC);
00235         break;
00236       case KCODE_SJIS:
00237         re_mbcinit(MBCTYPE_SJIS);
00238         break;
00239       case KCODE_UTF8:
00240         re_mbcinit(MBCTYPE_UTF8);
00241         break;
00242     }
00243 }
00244 
00245 int
00246 rb_reg_mbclen2(c, re)
00247     unsigned int c;
00248     VALUE re;
00249 {
00250     int len;
00251 
00252     if (!FL_TEST(re, KCODE_FIXED))
00253         return mbclen(c);
00254     kcode_set_option(re);
00255     len = mbclen(c);
00256     kcode_reset_option();
00257     return len;
00258 }
00259 
00260 static void
00261 rb_reg_check(re)
00262     VALUE re;
00263 {
00264     if (!RREGEXP(re)->ptr || !RREGEXP(re)->str) {
00265         rb_raise(rb_eTypeError, "uninitialized Regexp");
00266     }
00267 }
00268 
00269 extern int ruby_in_compile;
00270 
00271 static void
00272 rb_reg_expr_str(str, s, len)
00273     VALUE str;
00274     const char *s;
00275     long len;
00276 {
00277     const char *p, *pend;
00278     int need_escape = 0;
00279 
00280     p = s; pend = p + len;
00281     while (p<pend) {
00282         if (*p == '/' || (!ISPRINT(*p) && !ismbchar(*p))) {
00283             need_escape = 1;
00284             break;
00285         }
00286         p += mbclen(*p);
00287     }
00288     if (!need_escape) {
00289         rb_str_buf_cat(str, s, len);
00290     }
00291     else {
00292         p = s; 
00293         while (p<pend) {
00294             if (*p == '\\') {
00295                 int n = mbclen(p[1]) + 1;
00296                 rb_str_buf_cat(str, p, n);
00297                 p += n;
00298                 continue;
00299             }
00300             else if (*p == '/') {
00301                 char c = '\\';
00302                 rb_str_buf_cat(str, &c, 1);
00303                 rb_str_buf_cat(str, p, 1);
00304             }
00305             else if (ismbchar(*p)) {
00306                 rb_str_buf_cat(str, p, mbclen(*p));
00307                 p += mbclen(*p);
00308                 continue;
00309             }
00310             else if (ISPRINT(*p)) {
00311                 rb_str_buf_cat(str, p, 1);
00312             }
00313             else if (!ISSPACE(*p)) {
00314                 char b[8];
00315 
00316                 sprintf(b, "\\%03o", *p & 0377);
00317                 rb_str_buf_cat(str, b, 4);
00318             }
00319             else {
00320                 rb_str_buf_cat(str, p, 1);
00321             }
00322             p++;
00323         }
00324     }
00325 }
00326 
00327 static VALUE
00328 rb_reg_desc(s, len, re)
00329     const char *s;
00330     long len;
00331     VALUE re;
00332 {
00333     VALUE str = rb_str_buf_new2("/");
00334 
00335     rb_reg_expr_str(str, s, len);
00336     rb_str_buf_cat2(str, "/");
00337     if (re) {
00338         rb_reg_check(re);
00339         if (RREGEXP(re)->ptr->options & RE_OPTION_MULTILINE)
00340             rb_str_buf_cat2(str, "m");
00341         if (RREGEXP(re)->ptr->options & RE_OPTION_IGNORECASE)
00342             rb_str_buf_cat2(str, "i");
00343         if (RREGEXP(re)->ptr->options & RE_OPTION_EXTENDED)
00344             rb_str_buf_cat2(str, "x");
00345         
00346         if (FL_TEST(re, KCODE_FIXED)) {
00347             switch ((RBASIC(re)->flags & KCODE_MASK)) {
00348               case KCODE_NONE:
00349                 rb_str_buf_cat2(str, "n");
00350                 break;
00351               case KCODE_EUC:
00352                 rb_str_buf_cat2(str, "e");
00353                 break;
00354               case KCODE_SJIS:
00355                 rb_str_buf_cat2(str, "s");
00356                 break;
00357               case KCODE_UTF8:
00358                 rb_str_buf_cat2(str, "u");
00359                 break;
00360             }
00361         }
00362     }
00363     OBJ_INFECT(str, re);
00364     return str;
00365 }
00366 
00367 
00368 /*
00369  *  call-seq:
00370  *     rxp.source   => str
00371  *  
00372  *  Returns the original string of the pattern.
00373  *     
00374  *     /ab+c/ix.source   #=> "ab+c"
00375  */
00376 
00377 static VALUE
00378 rb_reg_source(re)
00379     VALUE re;
00380 {
00381     VALUE str;
00382 
00383     rb_reg_check(re);
00384     str = rb_str_new(RREGEXP(re)->str,RREGEXP(re)->len);
00385     if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00386     return str;
00387 }
00388 
00389 /*
00390  * call-seq:
00391  *    rxp.inspect   => string
00392  *
00393  * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
00394  * <code>#inspect</code> actually produces the more natural version of
00395  * the string than <code>#to_s</code>.
00396  *
00397  *     /ab+c/ix.to_s         #=> /ab+c/ix
00398 */
00399 
00400 static VALUE
00401 rb_reg_inspect(re)
00402     VALUE re;
00403 {
00404     rb_reg_check(re);
00405     return rb_reg_desc(RREGEXP(re)->str, RREGEXP(re)->len, re);
00406 }
00407 
00408 
00409 /*
00410  *  call-seq:
00411  *     rxp.to_s   => str
00412  *  
00413  *  Returns a string containing the regular expression and its options (using the
00414  *  <code>(?xxx:yyy)</code> notation. This string can be fed back in to
00415  *  <code>Regexp::new</code> to a regular expression with the same semantics as
00416  *  the original. (However, <code>Regexp#==</code> may not return true when
00417  *  comparing the two, as the source of the regular expression itself may
00418  *  differ, as the example shows).  <code>Regexp#inspect</code> produces a
00419  *  generally more readable version of <i>rxp</i>.
00420  *     
00421  *     r1 = /ab+c/ix         #=> /ab+c/ix
00422  *     s1 = r1.to_s          #=> "(?ix-m:ab+c)"
00423  *     r2 = Regexp.new(s1)   #=> /(?ix-m:ab+c)/
00424  *     r1 == r2              #=> false
00425  *     r1.source             #=> "ab+c"
00426  *     r2.source             #=> "(?ix-m:ab+c)"
00427  */
00428 
00429 static VALUE
00430 rb_reg_to_s(re)
00431     VALUE re;
00432 {
00433     int options;
00434     const int embeddable = RE_OPTION_MULTILINE|RE_OPTION_IGNORECASE|RE_OPTION_EXTENDED;
00435     long len;
00436     const char* ptr;
00437     VALUE str = rb_str_buf_new2("(?");
00438 
00439     rb_reg_check(re);
00440 
00441     options = RREGEXP(re)->ptr->options;
00442     ptr = RREGEXP(re)->str;
00443     len = RREGEXP(re)->len;
00444   again:
00445     if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
00446         int err = 1;
00447         ptr += 2;
00448         if ((len -= 2) > 0) {
00449             do {
00450                 if (*ptr == 'm') {
00451                     options |= RE_OPTION_MULTILINE;
00452                 }
00453                 else if (*ptr == 'i') {
00454                     options |= RE_OPTION_IGNORECASE;
00455                 }
00456                 else if (*ptr == 'x') {
00457                     options |= RE_OPTION_EXTENDED;
00458                 }
00459                 else break;
00460                 ++ptr;
00461             } while (--len > 0);
00462         }
00463         if (len > 1 && *ptr == '-') {
00464             ++ptr;
00465             --len;
00466             do {
00467                 if (*ptr == 'm') {
00468                     options &= ~RE_OPTION_MULTILINE;
00469                 }
00470                 else if (*ptr == 'i') {
00471                     options &= ~RE_OPTION_IGNORECASE;
00472                 }
00473                 else if (*ptr == 'x') {
00474                     options &= ~RE_OPTION_EXTENDED;
00475                 }
00476                 else break;
00477                 ++ptr;
00478             } while (--len > 0);
00479         }
00480         if (*ptr == ')') {
00481             --len;
00482             ++ptr;
00483             goto again;
00484         }
00485         if (*ptr == ':' && ptr[len-1] == ')') {
00486             Regexp *rp;
00487             kcode_set_option(re);
00488             rp = ALLOC(Regexp);
00489             MEMZERO((char *)rp, Regexp, 1);
00490             err = re_compile_pattern(++ptr, len -= 2, rp) != 0;
00491             kcode_reset_option();
00492             re_free_pattern(rp);
00493         }
00494         if (err) {
00495             options = RREGEXP(re)->ptr->options;
00496             ptr = RREGEXP(re)->str;
00497             len = RREGEXP(re)->len;
00498         }
00499     }
00500 
00501     if (options & RE_OPTION_MULTILINE) rb_str_buf_cat2(str, "m");
00502     if (options & RE_OPTION_IGNORECASE) rb_str_buf_cat2(str, "i");
00503     if (options & RE_OPTION_EXTENDED) rb_str_buf_cat2(str, "x");
00504 
00505     if ((options & embeddable) != embeddable) {
00506         rb_str_buf_cat2(str, "-");
00507         if (!(options & RE_OPTION_MULTILINE)) rb_str_buf_cat2(str, "m");
00508         if (!(options & RE_OPTION_IGNORECASE)) rb_str_buf_cat2(str, "i");
00509         if (!(options & RE_OPTION_EXTENDED)) rb_str_buf_cat2(str, "x");
00510     }
00511 
00512     rb_str_buf_cat2(str, ":");
00513     rb_reg_expr_str(str, ptr, len);
00514     rb_str_buf_cat2(str, ")");
00515 
00516     OBJ_INFECT(str, re);
00517     return str;
00518 }
00519 
00520 static void
00521 rb_reg_raise(s, len, err, re)
00522     const char *s;
00523     long len;
00524     const char *err;
00525     VALUE re;
00526 {
00527     VALUE desc = rb_reg_desc(s, len, re);
00528 
00529     if (ruby_in_compile)
00530         rb_compile_error("%s: %s", err, RSTRING(desc)->ptr);
00531     else
00532         rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING(desc)->ptr);
00533 }
00534 
00535 
00536 /*
00537  *  call-seq:
00538  *     rxp.casefold?   => true or false
00539  *  
00540  *  Returns the value of the case-insensitive flag.
00541  */
00542 
00543 static VALUE
00544 rb_reg_casefold_p(re)
00545     VALUE re;
00546 {
00547     rb_reg_check(re);
00548     if (RREGEXP(re)->ptr->options & RE_OPTION_IGNORECASE) return Qtrue;
00549     return Qfalse;
00550 }
00551 
00552 
00553 /*
00554  *  call-seq:
00555  *     rxp.options   => fixnum
00556  *  
00557  *  Returns the set of bits corresponding to the options used when creating this
00558  *  Regexp (see <code>Regexp::new</code> for details. Note that additional bits
00559  *  may be set in the returned options: these are used internally by the regular
00560  *  expression code. These extra bits are ignored if the options are passed to
00561  *  <code>Regexp::new</code>.
00562  *     
00563  *     Regexp::IGNORECASE                  #=> 1
00564  *     Regexp::EXTENDED                    #=> 2
00565  *     Regexp::MULTILINE                   #=> 4
00566  *     
00567  *     /cat/.options                       #=> 128
00568  *     /cat/ix.options                     #=> 131
00569  *     Regexp.new('cat', true).options     #=> 129
00570  *     Regexp.new('cat', 0, 's').options   #=> 384
00571  *     
00572  *     r = /cat/ix
00573  *     Regexp.new(r.source, r.options)     #=> /cat/ix
00574  */
00575 
00576 static VALUE
00577 rb_reg_options_m(re)
00578     VALUE re;
00579 {
00580     int options = rb_reg_options(re);
00581     return INT2NUM(options);
00582 }
00583 
00584 
00585 /*
00586  *  call-seq:
00587  *     rxp.kcode   => str
00588  *  
00589  *  Returns the character set code for the regexp.
00590  */
00591 
00592 static VALUE
00593 rb_reg_kcode_m(re)
00594     VALUE re;
00595 {
00596     char *kcode;
00597 
00598     if (FL_TEST(re, KCODE_FIXED)) {
00599         switch (RBASIC(re)->flags & KCODE_MASK) {
00600           case KCODE_NONE:
00601             kcode = "none"; break;
00602           case KCODE_EUC:
00603             kcode = "euc"; break;
00604           case KCODE_SJIS:
00605             kcode = "sjis"; break;
00606           case KCODE_UTF8:
00607             kcode = "utf8"; break;
00608           default:
00609             rb_bug("unknown kcode - should not happen");
00610             break;
00611         }
00612         return rb_str_new2(kcode);
00613     }
00614     return Qnil;
00615 }
00616 
00617 static Regexp*
00618 make_regexp(s, len, flags)
00619     const char *s;
00620     long len;
00621     int flags;
00622 {
00623     Regexp *rp;
00624     char *err;
00625 
00626     /* Handle escaped characters first. */
00627 
00628     /* Build a copy of the string (in dest) with the
00629        escaped characters translated,  and generate the regex
00630        from that.
00631     */
00632 
00633     rp = ALLOC(Regexp);
00634     MEMZERO((char *)rp, Regexp, 1);
00635     rp->buffer = ALLOC_N(char, 16);
00636     rp->allocated = 16;
00637     rp->fastmap = ALLOC_N(char, 256);
00638     if (flags) {
00639         rp->options = flags;
00640     }
00641     err = re_compile_pattern(s, len, rp);
00642 
00643     if (err != NULL) {
00644         re_free_pattern(rp);
00645         rb_reg_raise(s, len, err, 0);
00646         return 0;
00647     }
00648     return rp;
00649 }
00650 
00651 
00652 /*
00653  *  Document-class: MatchData
00654  *
00655  *  <code>MatchData</code> is the type of the special variable <code>$~</code>,
00656  *  and is the type of the object returned by <code>Regexp#match</code> and
00657  *  <code>Regexp#last_match</code>. It encapsulates all the results of a pattern
00658  *  match, results normally accessed through the special variables
00659  *  <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
00660  *  <code>$2</code>, and so on. <code>MatchData</code> is also known as
00661  *  <code>MatchingData</code>.
00662  *
00663  */
00664 
00665 static VALUE rb_cMatch;
00666 
00667 static VALUE match_alloc (VALUE);
00668 static VALUE
00669 match_alloc(klass)
00670     VALUE klass;
00671 {
00672     NEWOBJ(match, struct RMatch);
00673     OBJSETUP(match, klass, T_MATCH);
00674 
00675     match->str = 0;
00676     match->regs = 0;
00677     match->regs = ALLOC(struct re_registers);
00678     MEMZERO(match->regs, struct re_registers, 1);
00679 
00680     return (VALUE)match;
00681 }
00682 
00683 /* :nodoc: */
00684 static VALUE
00685 match_init_copy(obj, orig)
00686     VALUE obj, orig;
00687 {
00688     if (obj == orig) return obj;
00689 
00690     if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
00691         rb_raise(rb_eTypeError, "wrong argument class");
00692     }
00693     RMATCH(obj)->str = RMATCH(orig)->str;
00694     re_free_registers(RMATCH(obj)->regs);
00695     RMATCH(obj)->regs->allocated = 0;
00696     re_copy_registers(RMATCH(obj)->regs, RMATCH(orig)->regs);
00697 
00698     return obj;
00699 }
00700 
00701 
00702 /*
00703  *  call-seq:
00704  *     mtch.length   => integer
00705  *     mtch.size     => integer
00706  *  
00707  *  Returns the number of elements in the match array.
00708  *     
00709  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
00710  *     m.length   #=> 5
00711  *     m.size     #=> 5
00712  */
00713 
00714 static VALUE
00715 match_size(match)
00716     VALUE match;
00717 {
00718     return INT2FIX(RMATCH(match)->regs->num_regs);
00719 }
00720 
00721 
00722 /*
00723  *  call-seq:
00724  *     mtch.offset(n)   => array
00725  *  
00726  *  Returns a two-element array containing the beginning and ending offsets of
00727  *  the <em>n</em>th match.
00728  *     
00729  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
00730  *     m.offset(0)   #=> [1, 7]
00731  *     m.offset(4)   #=> [6, 7]
00732  */
00733 
00734 static VALUE
00735 match_offset(match, n)
00736     VALUE match, n;
00737 {
00738     int i = NUM2INT(n);
00739 
00740     if (i < 0 || RMATCH(match)->regs->num_regs <= i)
00741         rb_raise(rb_eIndexError, "index %d out of matches", i);
00742 
00743     if (RMATCH(match)->regs->beg[i] < 0)
00744         return rb_assoc_new(Qnil, Qnil);
00745 
00746     return rb_assoc_new(INT2FIX(RMATCH(match)->regs->beg[i]),
00747                         INT2FIX(RMATCH(match)->regs->end[i]));
00748 }
00749 
00750 
00751 /*
00752  *  call-seq:
00753  *     mtch.begin(n)   => integer
00754  *  
00755  *  Returns the offset of the start of the <em>n</em>th element of the match
00756  *  array in the string.
00757  *     
00758  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
00759  *     m.begin(0)   #=> 1
00760  *     m.begin(2)   #=> 2
00761  */
00762 
00763 static VALUE
00764 match_begin(match, n)
00765     VALUE match, n;
00766 {
00767     int i = NUM2INT(n);
00768 
00769     if (i < 0 || RMATCH(match)->regs->num_regs <= i)
00770         rb_raise(rb_eIndexError, "index %d out of matches", i);
00771 
00772     if (RMATCH(match)->regs->beg[i] < 0)
00773         return Qnil;
00774 
00775     return INT2FIX(RMATCH(match)->regs->beg[i]);
00776 }
00777 
00778 
00779 /*
00780  *  call-seq:
00781  *     mtch.end(n)   => integer
00782  *  
00783  *  Returns the offset of the character immediately following the end of the
00784  *  <em>n</em>th element of the match array in the string.
00785  *     
00786  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
00787  *     m.end(0)   #=> 7
00788  *     m.end(2)   #=> 3
00789  */
00790 
00791 static VALUE
00792 match_end(match, n)
00793     VALUE match, n;
00794 {
00795     int i = NUM2INT(n);
00796 
00797     if (i < 0 || RMATCH(match)->regs->num_regs <= i)
00798         rb_raise(rb_eIndexError, "index %d out of matches", i);
00799 
00800     if (RMATCH(match)->regs->beg[i] < 0)
00801         return Qnil;
00802 
00803     return INT2FIX(RMATCH(match)->regs->end[i]);
00804 }
00805 
00806 #define MATCH_BUSY FL_USER2
00807 
00808 void
00809 rb_match_busy(match)
00810     VALUE match;
00811 {
00812     FL_SET(match, MATCH_BUSY);
00813 }
00814 
00815 int ruby_ignorecase;
00816 static int may_need_recompile;
00817 
00818 static void
00819 rb_reg_prepare_re(re)
00820     VALUE re;
00821 {
00822     int need_recompile = 0;
00823     int state;
00824 
00825     rb_reg_check(re);
00826     state = FL_TEST(re, REG_CASESTATE);
00827     /* ignorecase status */
00828     if (ruby_ignorecase && !state) {
00829         FL_SET(re, REG_CASESTATE);
00830         RREGEXP(re)->ptr->options |= RE_OPTION_IGNORECASE;
00831         need_recompile = 1;
00832     }
00833     if (!ruby_ignorecase && state) {
00834         FL_UNSET(re, REG_CASESTATE);
00835         RREGEXP(re)->ptr->options &= ~RE_OPTION_IGNORECASE;
00836         need_recompile = 1;
00837     }
00838 
00839     if (!FL_TEST(re, KCODE_FIXED) &&
00840         (RBASIC(re)->flags & KCODE_MASK) != reg_kcode) {
00841         need_recompile = 1;
00842         RBASIC(re)->flags &= ~KCODE_MASK;
00843         RBASIC(re)->flags |= reg_kcode;
00844     }
00845 
00846     if (need_recompile) {
00847         char *err;
00848 
00849         if (FL_TEST(re, KCODE_FIXED))
00850             kcode_set_option(re);
00851         rb_reg_check(re);
00852         RREGEXP(re)->ptr->fastmap_accurate = 0;
00853         err = re_compile_pattern(RREGEXP(re)->str, RREGEXP(re)->len, RREGEXP(re)->ptr);
00854         if (err != NULL) {
00855             rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len, err, re);
00856         }
00857     }
00858 }
00859 
00860 long
00861 rb_reg_adjust_startpos(re, str, pos, reverse)
00862     VALUE re, str;
00863     long pos, reverse;
00864 {
00865     long range;
00866 
00867     rb_reg_check(re);
00868     if (may_need_recompile) rb_reg_prepare_re(re);
00869 
00870     if (FL_TEST(re, KCODE_FIXED))
00871         kcode_set_option(re);
00872     else if (reg_kcode != curr_kcode)
00873         kcode_reset_option();
00874 
00875     if (reverse) {
00876         range = -pos;
00877     }
00878     else {
00879         range = RSTRING(str)->len - pos;
00880     }
00881     return re_adjust_startpos(RREGEXP(re)->ptr,
00882                               RSTRING(str)->ptr, RSTRING(str)->len,
00883                               pos, range);
00884 }
00885 
00886 long
00887 rb_reg_search(re, str, pos, reverse)
00888     VALUE re, str;
00889     long pos, reverse;
00890 {
00891     long result;
00892     VALUE match;
00893     static struct re_registers regs;
00894     long range;
00895 
00896     if (pos > RSTRING(str)->len || pos < 0) {
00897         rb_backref_set(Qnil);
00898         return -1;
00899     }
00900 
00901     rb_reg_check(re);
00902     if (may_need_recompile) rb_reg_prepare_re(re);
00903 
00904     if (FL_TEST(re, KCODE_FIXED))
00905         kcode_set_option(re);
00906     else if (reg_kcode != curr_kcode)
00907         kcode_reset_option();
00908 
00909     if (reverse) {
00910         range = -pos;
00911     }
00912     else {
00913         range = RSTRING(str)->len - pos;
00914     }
00915     result = re_search(RREGEXP(re)->ptr,RSTRING(str)->ptr,RSTRING(str)->len,
00916                        pos, range, &regs);
00917 
00918     if (FL_TEST(re, KCODE_FIXED))
00919         kcode_reset_option();
00920 
00921     if (result == -2) {
00922         rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len,
00923                      "Stack overflow in regexp matcher", re);
00924     }
00925 
00926     if (result < 0) {
00927         rb_backref_set(Qnil);
00928         return result;
00929     }
00930 
00931     match = rb_backref_get();
00932     if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
00933         match = match_alloc(rb_cMatch);
00934     }
00935     else {
00936         if (rb_safe_level() >= 3) 
00937             OBJ_TAINT(match);
00938         else
00939             FL_UNSET(match, FL_TAINT);
00940     }
00941 
00942     re_copy_registers(RMATCH(match)->regs, &regs);
00943     RMATCH(match)->str = rb_str_new4(str);
00944     rb_backref_set(match);
00945 
00946     OBJ_INFECT(match, re);
00947     OBJ_INFECT(match, str);
00948     return result;
00949 }
00950 
00951 VALUE
00952 rb_reg_nth_defined(nth, match)
00953     int nth;
00954     VALUE match;
00955 {
00956     if (NIL_P(match)) return Qnil;
00957     if (nth >= RMATCH(match)->regs->num_regs) {
00958         return Qnil;
00959     }
00960     if (nth < 0) {
00961         nth += RMATCH(match)->regs->num_regs;
00962         if (nth <= 0) return Qnil;
00963     }
00964     if (RMATCH(match)->BEG(nth) == -1) return Qfalse;
00965     return Qtrue;
00966 }
00967 
00968 VALUE
00969 rb_reg_nth_match(nth, match)
00970     int nth;
00971     VALUE match;
00972 {
00973     VALUE str;
00974     long start, end, len;
00975 
00976     if (NIL_P(match)) return Qnil;
00977     if (nth >= RMATCH(match)->regs->num_regs) {
00978         return Qnil;
00979     }
00980     if (nth < 0) {
00981         nth += RMATCH(match)->regs->num_regs;
00982         if (nth <= 0) return Qnil;
00983     }
00984     start = RMATCH(match)->BEG(nth);
00985     if (start == -1) return Qnil;
00986     end = RMATCH(match)->END(nth);
00987     len = end - start;
00988     str = rb_str_substr(RMATCH(match)->str, start, len);
00989     OBJ_INFECT(str, match);
00990     return str;
00991 }
00992 
00993 VALUE
00994 rb_reg_last_match(match)
00995     VALUE match;
00996 {
00997     return rb_reg_nth_match(0, match);
00998 }
00999 
01000 
01001 /*
01002  *  call-seq:
01003  *     mtch.pre_match   => str
01004  *  
01005  *  Returns the portion of the original string before the current match.
01006  *  Equivalent to the special variable <code>$`</code>.
01007  *     
01008  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01009  *     m.pre_match   #=> "T"
01010  */
01011 
01012 VALUE
01013 rb_reg_match_pre(match)
01014     VALUE match;
01015 {
01016     VALUE str;
01017 
01018     if (NIL_P(match)) return Qnil;
01019     if (RMATCH(match)->BEG(0) == -1) return Qnil;
01020     str = rb_str_substr(RMATCH(match)->str, 0, RMATCH(match)->BEG(0));
01021     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01022     return str;
01023 }
01024 
01025 
01026 /*
01027  *  call-seq:
01028  *     mtch.post_match   => str
01029  *  
01030  *  Returns the portion of the original string after the current match.
01031  *  Equivalent to the special variable <code>$'</code>.
01032  *     
01033  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
01034  *     m.post_match   #=> ": The Movie"
01035  */
01036 
01037 VALUE
01038 rb_reg_match_post(match)
01039     VALUE match;
01040 {
01041     VALUE str;
01042     long pos;
01043 
01044     if (NIL_P(match)) return Qnil;
01045     if (RMATCH(match)->BEG(0) == -1) return Qnil;
01046     str = RMATCH(match)->str;
01047     pos = RMATCH(match)->END(0);
01048     str = rb_str_substr(str, pos, RSTRING(str)->len - pos);
01049     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01050     return str;
01051 }
01052 
01053 VALUE
01054 rb_reg_match_last(match)
01055     VALUE match;
01056 {
01057     int i;
01058 
01059     if (NIL_P(match)) return Qnil;
01060     if (RMATCH(match)->BEG(0) == -1) return Qnil;
01061 
01062     for (i=RMATCH(match)->regs->num_regs-1; RMATCH(match)->BEG(i) == -1 && i > 0; i--)
01063         ;
01064     if (i == 0) return Qnil;
01065     return rb_reg_nth_match(i, match);
01066 }
01067 
01068 static VALUE
01069 last_match_getter()
01070 {
01071     return rb_reg_last_match(rb_backref_get());
01072 }
01073 
01074 static VALUE
01075 prematch_getter()
01076 {
01077     return rb_reg_match_pre(rb_backref_get());
01078 }
01079 
01080 static VALUE
01081 postmatch_getter()
01082 {
01083     return rb_reg_match_post(rb_backref_get());
01084 }
01085 
01086 static VALUE
01087 last_paren_match_getter()
01088 {
01089     return rb_reg_match_last(rb_backref_get());
01090 }
01091 
01092 static VALUE
01093 match_array(match, start)
01094     VALUE match;
01095     int start;
01096 {
01097     struct re_registers *regs = RMATCH(match)->regs;
01098     VALUE ary = rb_ary_new2(regs->num_regs);
01099     VALUE target = RMATCH(match)->str;
01100     int i;
01101     int taint = OBJ_TAINTED(match);
01102     
01103     for (i=start; i<regs->num_regs; i++) {
01104         if (regs->beg[i] == -1) {
01105             rb_ary_push(ary, Qnil);
01106         }
01107         else {
01108             VALUE str = rb_str_substr(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01109             if (taint) OBJ_TAINT(str);
01110             rb_ary_push(ary, str);
01111         }
01112     }
01113     return ary;
01114 }
01115 
01116 
01117 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
01118    second example to prevent the '*' followed by a '/' from ending the
01119    comment. */
01120 
01121 /*
01122  *  call-seq:
01123  *     mtch.to_a   => anArray
01124  *  
01125  *  Returns the array of matches.
01126  *     
01127  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01128  *     m.to_a   #=> ["HX1138", "H", "X", "113", "8"]
01129  *     
01130  *  Because <code>to_a</code> is called when expanding
01131  *  <code>*</code><em>variable</em>, there's a useful assignment
01132  *  shortcut for extracting matched fields. This is slightly slower than
01133  *  accessing the fields directly (as an intermediate array is
01134  *  generated).
01135  *     
01136  *     all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
01137  *     all   #=> "HX1138"
01138  *     f1    #=> "H"
01139  *     f2    #=> "X"
01140  *     f3    #=> "113"
01141  */
01142 
01143 static VALUE
01144 match_to_a(match)
01145     VALUE match;
01146 {
01147     return match_array(match, 0);
01148 }
01149 
01150 
01151 /*
01152  *  call-seq:
01153  *     mtch.captures   => array
01154  *
01155  *  Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
01156  *
01157  *     f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
01158  *     f1    #=> "H"
01159  *     f2    #=> "X"
01160  *     f3    #=> "113"
01161  *     f4    #=> "8"
01162  */
01163 static VALUE
01164 match_captures(match)
01165     VALUE match;
01166 {
01167     return match_array(match, 1);
01168 }
01169 
01170 
01171 /*
01172  *  call-seq:
01173  *     mtch[i]               => obj
01174  *     mtch[start, length]   => array
01175  *     mtch[range]           => array
01176  *  
01177  *  Match Reference---<code>MatchData</code> acts as an array, and may be
01178  *  accessed using the normal array indexing techniques.  <i>mtch</i>[0] is
01179  *  equivalent to the special variable <code>$&</code>, and returns the entire
01180  *  matched string.  <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
01181  *  of the matched backreferences (portions of the pattern between parentheses).
01182  *     
01183  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01184  *     m[0]       #=> "HX1138"
01185  *     m[1, 2]    #=> ["H", "X"]
01186  *     m[1..3]    #=> ["H", "X", "113"]
01187  *     m[-3, 2]   #=> ["X", "113"]
01188  */
01189 
01190 static VALUE
01191 match_aref(argc, argv, match)
01192     int argc;
01193     VALUE *argv;
01194     VALUE match;
01195 {
01196     VALUE idx, rest;
01197 
01198     rb_scan_args(argc, argv, "11", &idx, &rest);
01199 
01200     if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) {
01201         return rb_ary_aref(argc, argv, match_to_a(match));
01202     }
01203     return rb_reg_nth_match(FIX2INT(idx), match);
01204 }
01205 
01206 static VALUE match_entry (VALUE, long);
01207 static VALUE
01208 match_entry(match, n)
01209     VALUE match;
01210     long n;
01211 {
01212     return rb_reg_nth_match(n, match);
01213 }
01214 
01215 
/*
 *  call-seq:
 *     mtch.values_at([index]*)   => array
 *  
 *  Uses each <i>index</i> to access the matching values, returning an array of
 *  the corresponding matches.
 *     
 *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
 *     m.to_a                  #=> ["HX1138", "H", "X", "113", "8"]
 *     m.values_at(0, 2, -2)   #=> ["HX1138", "X", "113"]
 */
01227 
01228 static VALUE
01229 match_values_at(argc, argv, match)
01230     int argc;
01231     VALUE *argv;
01232     VALUE match;
01233 {
01234     return rb_values_at(match, RMATCH(match)->regs->num_regs, argc, argv, match_entry);
01235 }
01236 
01237 
/*
 *  call-seq:
 *     mtch.select {|str| block }   => array
 *  
 *  Passes each element of the match (the entire matched string first, then
 *  each group in order) to the block and returns an array of the elements
 *  for which the block returns a true value. Takes no arguments; passing
 *  any raises <code>ArgumentError</code>.
 *     
 *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
 *     m.to_a                          #=> ["HX1138", "H", "X", "113", "8"]
 *     m.select {|s| s.length == 1 }   #=> ["H", "X", "8"]
 */
01249 
01250 static VALUE
01251 match_select(argc, argv, match)
01252     int argc;
01253     VALUE *argv;
01254     VALUE match;
01255 {
01256     if (argc > 0) {
01257         rb_raise(rb_eArgError, "wrong number of arguments (%d for 0)", argc);
01258     }
01259     else {
01260         struct re_registers *regs = RMATCH(match)->regs;
01261         VALUE target = RMATCH(match)->str;
01262         VALUE result = rb_ary_new();
01263         int i;
01264         int taint = OBJ_TAINTED(match);
01265 
01266         for (i=0; i<regs->num_regs; i++) {
01267             VALUE str = rb_str_substr(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01268             if (taint) OBJ_TAINT(str);
01269             if (RTEST(rb_yield(str))) {
01270                 rb_ary_push(result, str);
01271             }
01272         }
01273         return result;
01274     }
01275 }
01276 
01277 
01278 /*
01279  *  call-seq:
01280  *     mtch.to_s   => str
01281  *  
01282  *  Returns the entire matched string.
01283  *     
01284  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01285  *     m.to_s   #=> "HX1138"
01286  */
01287 
01288 static VALUE
01289 match_to_s(match)
01290     VALUE match;
01291 {
01292     VALUE str = rb_reg_last_match(match);
01293 
01294     if (NIL_P(str)) str = rb_str_new(0,0);
01295     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01296     if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01297     return str;
01298 }
01299 
01300 
01301 /*
01302  *  call-seq:
01303  *     mtch.string   => str
01304  *  
01305  *  Returns a frozen copy of the string passed in to <code>match</code>.
01306  *     
01307  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01308  *     m.string   #=> "THX1138."
01309  */
01310 
01311 static VALUE
01312 match_string(match)
01313     VALUE match;
01314 {
01315     return RMATCH(match)->str;  /* str is frozen */
01316 }
01317 
01318 VALUE rb_cRegexp;
01319 
01320 static void
01321 rb_reg_initialize(obj, s, len, options)
01322     VALUE obj;
01323     const char *s;
01324     long len;
01325     int options;                /* CASEFOLD  = 1 */
01326                                 /* EXTENDED  = 2 */
01327                                 /* MULTILINE = 4 */
01328                                 /* CODE_NONE = 16 */
01329                                 /* CODE_EUC  = 32 */
01330                                 /* CODE_SJIS = 48 */
01331                                 /* CODE_UTF8 = 64 */
01332 {
01333     struct RRegexp *re = RREGEXP(obj);
01334 
01335     if (re->ptr) re_free_pattern(re->ptr);
01336     if (re->str) free(re->str);
01337     re->ptr = 0;
01338     re->str = 0;
01339 
01340     switch (options & ~0xf) {
01341       case 0:
01342       default:
01343         FL_SET(re, reg_kcode);
01344         break;
01345       case 16:
01346         kcode_none(re);
01347         break;
01348       case 32:
01349         kcode_euc(re);
01350         break;
01351       case 48:
01352         kcode_sjis(re);
01353         break;
01354       case 64:
01355         kcode_utf8(re);
01356         break;
01357     }
01358 
01359     if (options & ~0xf) {
01360         kcode_set_option((VALUE)re);
01361     }
01362     if (ruby_ignorecase) {
01363         options |= RE_OPTION_IGNORECASE;
01364         FL_SET(re, REG_CASESTATE);
01365     }
01366     re->ptr = make_regexp(s, len, options & 0xf);
01367     re->str = ALLOC_N(char, len+1);
01368     memcpy(re->str, s, len);
01369     re->str[len] = '\0';
01370     re->len = len;
01371     if (options & ~0xf) {
01372         kcode_reset_option();
01373     }
01374 }
01375 
01376 static VALUE rb_reg_s_alloc (VALUE);
01377 static VALUE
01378 rb_reg_s_alloc(klass)
01379     VALUE klass;
01380 {
01381     NEWOBJ(re, struct RRegexp);
01382     OBJSETUP(re, klass, T_REGEXP);
01383 
01384     re->ptr = 0;
01385     re->len = 0;
01386     re->str = 0;
01387 
01388     return (VALUE)re;
01389 }
01390 
01391 VALUE
01392 rb_reg_new(s, len, options)
01393     const char *s;
01394     long len;
01395     int options;
01396 {
01397     VALUE re = rb_reg_s_alloc(rb_cRegexp);
01398 
01399     rb_reg_initialize(re, s, len, options);
01400     return (VALUE)re;
01401 }
01402 
01403 static int case_cache;
01404 static int kcode_cache;
01405 static VALUE reg_cache;
01406 
01407 VALUE
01408 rb_reg_regcomp(str)
01409     VALUE str;
01410 {
01411     volatile VALUE save_str = str;
01412     if (reg_cache && RREGEXP(reg_cache)->len == RSTRING(str)->len
01413         && case_cache == ruby_ignorecase
01414         && kcode_cache == reg_kcode
01415         && memcmp(RREGEXP(reg_cache)->str, RSTRING(str)->ptr, RSTRING(str)->len) == 0)
01416         return reg_cache;
01417 
01418     case_cache = ruby_ignorecase;
01419     kcode_cache = reg_kcode;
01420     return reg_cache = rb_reg_new(RSTRING(str)->ptr, RSTRING(str)->len,
01421                                   ruby_ignorecase);
01422 }
01423 
01424 static int
01425 rb_reg_cur_kcode(re)
01426     VALUE re;
01427 {
01428     if (FL_TEST(re, KCODE_FIXED)) {
01429         return RBASIC(re)->flags & KCODE_MASK;
01430     }
01431     return 0;
01432 }
01433 
01434 /*
01435  * call-seq:
01436  *   rxp.hash   => fixnum
01437  *
01438  * Produce a hash based on the text and options of this regular expression.
01439  */
01440 
01441 static VALUE
01442 rb_reg_hash(re)
01443     VALUE re;
01444 {
01445     int hashval, len;
01446     char *p;
01447 
01448     rb_reg_check(re);
01449     hashval = RREGEXP(re)->ptr->options;
01450     len = RREGEXP(re)->len;
01451     p  = RREGEXP(re)->str;
01452     while (len--) {
01453         hashval = hashval * 33 + *p++;
01454     }
01455     hashval = hashval + (hashval>>5);
01456     
01457     return INT2FIX(hashval);
01458 }
01459 
01460 
01461 /*
01462  *  call-seq:
01463  *     rxp == other_rxp      => true or false
01464  *     rxp.eql?(other_rxp)   => true or false
01465  *  
01466  *  Equality---Two regexps are equal if their patterns are identical, they have
01467  *  the same character set code, and their <code>casefold?</code> values are the
01468  *  same.
01469  *     
01470  *     /abc/  == /abc/x   #=> false
01471  *     /abc/  == /abc/i   #=> false
01472  *     /abc/u == /abc/n   #=> false
01473  */
01474 
01475 static VALUE
01476 rb_reg_equal(re1, re2)
01477     VALUE re1, re2;
01478 {
01479     if (re1 == re2) return Qtrue;
01480     if (TYPE(re2) != T_REGEXP) return Qfalse;
01481     rb_reg_check(re1); rb_reg_check(re2);
01482     if (RREGEXP(re1)->len != RREGEXP(re2)->len) return Qfalse;
01483     if (memcmp(RREGEXP(re1)->str, RREGEXP(re2)->str, RREGEXP(re1)->len) == 0 &&
01484         rb_reg_cur_kcode(re1) == rb_reg_cur_kcode(re2) &&
01485         RREGEXP(re1)->ptr->options == RREGEXP(re2)->ptr->options) {
01486         return Qtrue;
01487     }
01488     return Qfalse;
01489 }
01490 
01491 
/*
 *  call-seq:
 *     rxp =~ str   => integer or nil
 *  
 *  Match---Matches <i>rxp</i> against <i>str</i>, returning the offset of the
 *  start of the match, or <code>nil</code> if the match failed or
 *  <i>str</i> is <code>nil</code>.
 *     
 *     /at/ =~ "input data"   #=> 7
 */
01502 
01503 VALUE
01504 rb_reg_match(re, str)
01505     VALUE re, str;
01506 {
01507     long start;
01508 
01509     if (NIL_P(str)) {
01510         rb_backref_set(Qnil);
01511         return Qnil;
01512     }
01513     StringValue(str);
01514     start = rb_reg_search(re, str, 0, 0);
01515     if (start < 0) {
01516         return Qnil;
01517     }
01518     return LONG2FIX(start);
01519 }
01520 
01521 
01522 /*
01523  *  call-seq:
01524  *     rxp === str   => true or false
01525  *  
01526  *  Case Equality---Synonym for <code>Regexp#=~</code> used in case statements.
01527  *     
01528  *     a = "HELLO"
01529  *     case a
01530  *     when /^[a-z]*$/; print "Lower case\n"
01531  *     when /^[A-Z]*$/; print "Upper case\n"
01532  *     else;            print "Mixed case\n"
01533  *     end
01534  *     
01535  *  <em>produces:</em>
01536  *     
01537  *     Upper case
01538  */
01539 
01540 VALUE
01541 rb_reg_eqq(re, str)
01542     VALUE re, str;
01543 {
01544     long start;
01545 
01546     if (TYPE(str) != T_STRING) {
01547         str = rb_check_string_type(str);
01548         if (NIL_P(str)) {
01549             rb_backref_set(Qnil);
01550             return Qfalse;
01551         }
01552     }
01553     StringValue(str);
01554     start = rb_reg_search(re, str, 0, 0);
01555     if (start < 0) {
01556         return Qfalse;
01557     }
01558     return Qtrue;
01559 }
01560 
01561 
01562 /*
01563  *  call-seq:
01564  *     ~ rxp   => integer or nil
01565  *  
01566  *  Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
01567  *  Equivalent to <code><i>rxp</i> =~ $_</code>.
01568  *     
01569  *     $_ = "input data"
01570  *     ~ /at/   #=> 7
01571  */
01572 
01573 VALUE
01574 rb_reg_match2(re)
01575     VALUE re;
01576 {
01577     long start;
01578     VALUE line = rb_lastline_get();
01579 
01580     if (TYPE(line) != T_STRING) {
01581         rb_backref_set(Qnil);
01582         return Qnil;
01583     }
01584 
01585     start = rb_reg_search(re, line, 0, 0);
01586     if (start < 0) {
01587         return Qnil;
01588     }
01589     return LONG2FIX(start);
01590 }
01591 
01592 
01593 /*
01594  *  call-seq:
01595  *     rxp.match(str)   => matchdata or nil
01596  *  
01597  *  Returns a <code>MatchData</code> object describing the match, or
01598  *  <code>nil</code> if there was no match. This is equivalent to retrieving the
01599  *  value of the special variable <code>$~</code> following a normal match.
01600  *     
01601  *     /(.)(.)(.)/.match("abc")[2]   #=> "b"
01602  */
01603 
01604 static VALUE
01605 rb_reg_match_m(re, str)
01606     VALUE re, str;
01607 {
01608     VALUE result = rb_reg_match(re, str);
01609 
01610     if (NIL_P(result)) return Qnil;
01611     result = rb_backref_get();
01612     rb_match_busy(result);
01613     return result;
01614 }
01615 
01616 /*
01617  * Document-method: compile
01618  *
01619  * Synonym for <code>Regexp.new</code>
01620  */
01621 
/*
 *  call-seq:
 *     Regexp.new(string [, options [, lang]])       => regexp
 *     Regexp.new(regexp)                            => regexp
 *     Regexp.compile(string [, options [, lang]])   => regexp
 *     Regexp.compile(regexp)                        => regexp
 *  
 *  Constructs a new regular expression from <i>pattern</i>, which can be either
 *  a <code>String</code> or a <code>Regexp</code> (in which case that regexp's
 *  options are propagated, and new options may not be specified; this is a
 *  change as of Ruby 1.8). If <i>options</i> is a <code>Fixnum</code>, it
 *  should be one or more of the constants <code>Regexp::EXTENDED</code>,
 *  <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>,
 *  <em>or</em>-ed together. Otherwise, if <i>options</i> is not
 *  <code>nil</code>, the regexp will be case insensitive. The <i>lang</i>
 *  parameter enables multibyte support for the regexp: `n', `N' = none, `e',
 *  `E' = EUC, `s', `S' = SJIS, `u', `U' = UTF-8.
 * 
 *     r1 = Regexp.new('^a-z+:\\s+\w+')           #=> /^a-z+:\s+\w+/
 *     r2 = Regexp.new('cat', true)               #=> /cat/i
 *     r3 = Regexp.new('dog', Regexp::EXTENDED)   #=> /dog/x
 *     r4 = Regexp.new(r2)                        #=> /cat/i
 */
01645 
01646 static VALUE
01647 rb_reg_initialize_m(argc, argv, self)
01648     int argc;
01649     VALUE *argv;
01650     VALUE self;
01651 {
01652     const char *s;
01653     long len;
01654     int flags = 0;
01655 
01656     rb_check_frozen(self);
01657     if (argc == 0 || argc > 3) {
01658         rb_raise(rb_eArgError, "wrong number of arguments");
01659     }
01660     if (TYPE(argv[0]) == T_REGEXP) {
01661         if (argc > 1) {
01662             rb_warn("flags%s ignored", (argc == 3) ? " and encoding": "");
01663         }
01664         rb_reg_check(argv[0]);
01665         flags = RREGEXP(argv[0])->ptr->options & 0xf;
01666         if (FL_TEST(argv[0], KCODE_FIXED)) {
01667             switch (RBASIC(argv[0])->flags & KCODE_MASK) {
01668               case KCODE_NONE:
01669                 flags |= 16;
01670                 break;
01671               case KCODE_EUC:
01672                 flags |= 32;
01673                 break;
01674               case KCODE_SJIS:
01675                 flags |= 48;
01676                 break;
01677               case KCODE_UTF8:
01678                 flags |= 64;
01679                 break;
01680               default:
01681                 break;
01682             }
01683         }