00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "config.h"
00025
00026 #ifdef HAVE_STRING_H
00027 # include <string.h>
00028 #else
00029 # include <strings.h>
00030 #endif
00031
00032
00033 #include <stdio.h>
00034
00035
00036 #include <ctype.h>
00037 #include <sys/types.h>
00038
00039 #ifndef PARAMS
00040 # if defined __GNUC__ || (defined __STDC__ && __STDC__)
00041 # define PARAMS(args) args
00042 # else
00043 # define PARAMS(args) ()
00044 # endif
00045 #endif
00046
00047 #if defined(STDC_HEADERS)
00048 # include <stddef.h>
00049 #else
00050
00051 # include <sys/types.h>
00052 #endif
00053
00054 #if !defined(__STDC__) && !defined(_MSC_VER)
00055 # define volatile
00056 #endif
00057
00058 #ifdef HAVE_PROTOTYPES
00059 # define args args
00060 #else
00061 # define args ()
00062 #endif
00063
00064 #ifdef RUBY_PLATFORM
00065 #include "defines.h"
00066
00067 # define RUBY
00068 extern int rb_prohibit_interrupt;
00069 extern int rb_trap_pending;
00070 void rb_trap_exec (void);
00071
00072 # define CHECK_INTS do {\
00073 if (!rb_prohibit_interrupt) {\
00074 if (rb_trap_pending) rb_trap_exec();\
00075 }\
00076 } while (0)
00077 #endif
00078
00079
00080 #ifdef __GNUC__
00081 # ifndef atarist
00082 # ifndef alloca
00083 # define alloca __builtin_alloca
00084 # endif
00085 # endif
00086 #else
00087 # if defined(HAVE_ALLOCA_H)
00088 # include <alloca.h>
00089 # elif !defined(alloca)
00090 char *alloca();
00091 # endif
00092 #endif
00093
00094 #ifdef _AIX
00095 #pragma alloca
00096 #endif
00097
00098 #ifdef HAVE_STRING_H
00099 # include <string.h>
00100 #else
00101 # include <strings.h>
00102 #endif
00103
00104 #ifdef C_ALLOCA
00105 #define FREE_VARIABLES() alloca(0)
00106 #else
00107 #define FREE_VARIABLES()
00108 #endif
00109
00110 #define FREE_AND_RETURN_VOID(stackb) do { \
00111 FREE_VARIABLES(); \
00112 if (stackb != stacka) xfree(stackb); \
00113 return; \
00114 } while(0)
00115
00116 #define FREE_AND_RETURN(stackb,val) do { \
00117 FREE_VARIABLES(); \
00118 if (stackb != stacka) xfree(stackb); \
00119 return(val); \
00120 } while(0)
00121
00122 #define DOUBLE_STACK(type) do { \
00123 type *stackx; \
00124 unsigned int xlen = stacke - stackb; \
00125 if (stackb == stacka) { \
00126 stackx = (type*)xmalloc(2 * xlen * sizeof(type)); \
00127 memcpy(stackx, stackb, xlen * sizeof (type)); \
00128 } \
00129 else { \
00130 stackx = (type*)xrealloc(stackb, 2 * xlen * sizeof(type)); \
00131 } \
00132 \
00133 stackp = stackx + (stackp - stackb); \
00134 stackb = stackx; \
00135 stacke = stackb + 2 * xlen; \
00136 } while (0)
00137
00138 #define RE_TALLOC(n,t) ((t*)alloca((n)*sizeof(t)))
00139 #define TMALLOC(n,t) ((t*)xmalloc((n)*sizeof(t)))
00140 #define TREALLOC(s,n,t) (s=((t*)xrealloc(s,(n)*sizeof(t))))
00141
00142 #define EXPAND_FAIL_STACK() DOUBLE_STACK(unsigned char*)
00143 #define ENSURE_FAIL_STACK(n) \
00144 do { \
00145 if (stacke - stackp <= (n)) { \
00146
00147
00148
00149 \
00150 \
00151 \
00152 EXPAND_FAIL_STACK(); \
00153 } \
00154 } while (0)
00155
00156
00157 #include "regex.h"
00158
00159
00160 static void store_jump (char*, int, char*);
00161 static void insert_jump (int, char*, char*, char*);
00162 static void store_jump_n (char*, int, char*, unsigned);
00163 static void insert_jump_n (int, char*, char*, char*, unsigned);
00164 static void insert_op (int, char*, char*);
00165 static void insert_op_2 (int, char*, char*, int, int);
00166 static int memcmp_translate (unsigned char*, unsigned char*, int);
00167
00168
00169
00170
00171
00172 #define Sword 1
00173 #define Sword2 2
00174
00175 #define SYNTAX(c) re_syntax_table[c]
00176
00177 static char re_syntax_table[256];
00178 static void init_syntax_once (void);
00179 static const unsigned char *translate = 0;
00180 static void init_regs (struct re_registers*, unsigned int);
00181 static void bm_init_skip (int *, unsigned char*, int, const unsigned char*);
00182 static int current_mbctype = MBCTYPE_ASCII;
00183
00184 #undef P
00185
00186 #ifdef RUBY
00187 #include "util.h"
00188 void rb_warn (const char*, ...);
00189 # define re_warning(x) rb_warn(x)
00190 #endif
00191
00192 #ifndef re_warning
00193 # define re_warning(x)
00194 #endif
00195
00196 static void
00197 init_syntax_once()
00198 {
00199 register int c;
00200 static int done = 0;
00201
00202 if (done)
00203 return;
00204
00205 memset(re_syntax_table, 0, sizeof re_syntax_table);
00206
00207 for (c=0; c<=0x7f; c++)
00208 if (isalnum(c))
00209 re_syntax_table[c] = Sword;
00210 re_syntax_table['_'] = Sword;
00211
00212 for (c=0x80; c<=0xff; c++)
00213 if (isalnum(c))
00214 re_syntax_table[c] = Sword2;
00215 done = 1;
00216 }
00217
00218 void
00219 re_set_casetable(table)
00220 const char *table;
00221 {
00222 translate = (const unsigned char*)table;
00223 }
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237 #undef ISASCII
00238 #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
00239 # define ISASCII(c) 1
00240 #else
00241 # define ISASCII(c) isascii(c)
00242 #endif
00243
00244 #ifdef isblank
00245 # define ISBLANK(c) (ISASCII(c) && isblank(c))
00246 #else
00247 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
00248 #endif
00249 #ifdef isgraph
00250 # define ISGRAPH(c) (ISASCII(c) && isgraph(c))
00251 #else
00252 # define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
00253 #endif
00254
00255 #undef ISPRINT
00256 #define ISPRINT(c) (ISASCII(c) && isprint(c))
00257 #define ISDIGIT(c) (ISASCII(c) && isdigit(c))
00258 #define ISALNUM(c) (ISASCII(c) && isalnum(c))
00259 #define ISALPHA(c) (ISASCII(c) && isalpha(c))
00260 #define ISCNTRL(c) (ISASCII(c) && iscntrl(c))
00261 #define ISLOWER(c) (ISASCII(c) && islower(c))
00262 #define ISPUNCT(c) (ISASCII(c) && ispunct(c))
00263 #define ISSPACE(c) (ISASCII(c) && isspace(c))
00264 #define ISUPPER(c) (ISASCII(c) && isupper(c))
00265 #define ISXDIGIT(c) (ISASCII(c) && isxdigit(c))
00266
00267 #ifndef NULL
00268 # define NULL (void *)0
00269 #endif
00270
00271
00272
00273
00274
00275 #undef SIGN_EXTEND_CHAR
00276 #if __STDC__
00277 # define SIGN_EXTEND_CHAR(c) ((signed char)(c))
00278 #else
00279
00280 # define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
00281 #endif
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293 enum regexpcode
00294 {
00295 unused=0,
00296 exactn=1,
00297 begline,
00298 endline,
00299 begbuf,
00300
00301 endbuf,
00302 endbuf2,
00303 begpos,
00304 jump,
00305 jump_past_alt,
00306 on_failure_jump,
00307
00308 finalize_jump,
00309
00310 maybe_finalize_jump,
00311
00312
00313
00314
00315
00316
00317
00318 dummy_failure_jump,
00319
00320
00321
00322
00323
00324 push_dummy_failure,
00325
00326 succeed_n,
00327
00328
00329
00330 jump_n,
00331
00332
00333 try_next,
00334
00335 finalize_push,
00336
00337 finalize_push_n,
00338 set_number_at,
00339
00340 anychar,
00341 anychar_repeat,
00342 charset,
00343
00344
00345
00346
00347
00348
00349 charset_not,
00350
00351 start_memory,
00352
00353
00354
00355 stop_memory,
00356
00357
00358
00359 start_paren,
00360 stop_paren,
00361 casefold_on,
00362 casefold_off,
00363 option_set,
00364 start_nowidth,
00365 stop_nowidth,
00366 pop_and_fail,
00367 stop_backtrack,
00368 duplicate,
00369
00370
00371 wordchar,
00372 notwordchar,
00373 wordbeg,
00374 wordend,
00375 wordbound,
00376 notwordbound
00377 };
00378
00379
00380
00381
00382
00383
00384 #ifndef NFAILURES
00385 #define NFAILURES 160
00386 #endif
00387
00388
00389 #define STORE_NUMBER(destination, number) \
00390 do { (destination)[0] = (number) & 0377; \
00391 (destination)[1] = (number) >> 8; } while (0)
00392
00393
00394
00395
00396 #define STORE_NUMBER_AND_INCR(destination, number) \
00397 do { STORE_NUMBER(destination, number); \
00398 (destination) += 2; } while (0)
00399
00400
00401
00402
00403 #define EXTRACT_NUMBER(destination, source) \
00404 do { (destination) = *(source) & 0377; \
00405 (destination) += SIGN_EXTEND_CHAR(*(char*)((source) + 1)) << 8; } while (0)
00406
00407
00408
00409
00410 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
00411 do { EXTRACT_NUMBER(destination, source); \
00412 (source) += 2; } while (0)
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422 long
00423 re_set_syntax(syntax)
00424 long syntax;
00425 {
00426
00427 return 0;
00428 }
00429
00430
00431
00432 #define TRANSLATE_P() ((options&RE_OPTION_IGNORECASE) && translate)
00433 #define MAY_TRANSLATE() ((bufp->options&(RE_OPTION_IGNORECASE|RE_MAY_IGNORECASE)) && translate)
00434
00435
00436
00437
00438 #define PATFETCH(c) \
00439 do {if (p == pend) goto end_of_pattern; \
00440 c = (unsigned char) *p++; \
00441 if (TRANSLATE_P()) c = (unsigned char)translate[c]; \
00442 } while (0)
00443
00444
00445
00446 #define PATFETCH_RAW(c) \
00447 do {if (p == pend) goto end_of_pattern; \
00448 c = (unsigned char)*p++; \
00449 } while (0)
00450
00451
00452 #define PATUNFETCH p--
00453
00454 #define MBC2WC(c, p) \
00455 do { \
00456 if (current_mbctype == MBCTYPE_UTF8) { \
00457 int n = mbclen(c) - 1; \
00458 c &= (1<<(BYTEWIDTH-2-n)) - 1; \
00459 while (n--) { \
00460 c = c << 6 | (*p++ & ((1<<6)-1)); \
00461 } \
00462 } \
00463 else { \
00464 c <<= 8; \
00465 c |= (unsigned char)*(p)++; \
00466 } \
00467 } while (0)
00468
00469 #define PATFETCH_MBC(c) \
00470 do { \
00471 if (p + mbclen(c) - 1 >= pend) goto end_of_pattern; \
00472 MBC2WC(c, p); \
00473 } while(0)
00474
00475 #define WC2MBC1ST(c) \
00476 ((current_mbctype != MBCTYPE_UTF8) ? ((c<0x100) ? (c) : (((c)>>8)&0xff)) : utf8_firstbyte(c))
00477
00478 typedef unsigned int (*mbc_startpos_func_t) (const char *string, unsigned int pos);
00479
00480 static unsigned int asc_startpos (const char *string, unsigned int pos);
00481 static unsigned int euc_startpos (const char *string, unsigned int pos);
00482 static unsigned int sjis_startpos (const char *string, unsigned int pos);
00483 static unsigned int utf8_startpos (const char *string, unsigned int pos);
00484
00485 static const mbc_startpos_func_t mbc_startpos_func[4] = {
00486 asc_startpos, euc_startpos, sjis_startpos, utf8_startpos
00487 };
00488
00489 #define mbc_startpos(start, pos) (*mbc_startpos_func[current_mbctype])((start), (pos))
00490
00491 static unsigned int
00492 utf8_firstbyte(c)
00493 unsigned long c;
00494 {
00495 if (c < 0x80) return c;
00496 if (c <= 0x7ff) return ((c>>6)&0xff)|0xc0;
00497 if (c <= 0xffff) return ((c>>12)&0xff)|0xe0;
00498 if (c <= 0x1fffff) return ((c>>18)&0xff)|0xf0;
00499 if (c <= 0x3ffffff) return ((c>>24)&0xff)|0xf8;
00500 if (c <= 0x7fffffff) return ((c>>30)&0xff)|0xfc;
00501 #if SIZEOF_INT > 4
00502 if (c <= 0xfffffffff) return 0xfe;
00503 #else
00504 return 0xfe;
00505 #endif
00506 }
00507
00508 static void
00509 print_mbc(c)
00510 unsigned int c;
00511 {
00512 if (current_mbctype == MBCTYPE_UTF8) {
00513 if (c < 0x80)
00514 printf("%c", (int)c);
00515 else if (c <= 0x7ff)
00516 printf("%c%c", (int)utf8_firstbyte(c), (int)(c & 0x3f));
00517 else if (c <= 0xffff)
00518 printf("%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 6) & 0x3f),
00519 (int)(c & 0x3f));
00520 else if (c <= 0x1fffff)
00521 printf("%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 12) & 0x3f),
00522 (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
00523 else if (c <= 0x3ffffff)
00524 printf("%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 18) & 0x3f),
00525 (int)((c >> 12) & 0x3f), (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
00526 else if (c <= 0x7fffffff)
00527 printf("%c%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 24) & 0x3f),
00528 (int)((c >> 18) & 0x3f), (int)((c >> 12) & 0x3f),
00529 (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
00530 }
00531 else if (c < 0xff) {
00532 printf("\\%o", (int)c);
00533 }
00534 else {
00535 printf("%c%c", (int)(c >> BYTEWIDTH), (int)(c &0xff));
00536 }
00537 }
00538
00539
00540 #define INIT_BUF_SIZE 28
00541
00542
00543 #define GET_BUFFER_SPACE(n) \
00544 do { \
00545 while (b - bufp->buffer + (n) >= bufp->allocated) \
00546 EXTEND_BUFFER; \
00547 } while (0)
00548
00549
00550 #define BUFPUSH(ch) \
00551 do { \
00552 GET_BUFFER_SPACE(1); \
00553 *b++ = (char)(ch); \
00554 } while (0)
00555
00556
00557
00558
00559
00560 #define EXTEND_BUFFER \
00561 do { char *old_buffer = bufp->buffer; \
00562 if (bufp->allocated == (1L<<16)) goto too_big; \
00563 bufp->allocated *= 2; \
00564 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
00565 bufp->buffer = (char*)xrealloc(bufp->buffer, bufp->allocated); \
00566 if (bufp->buffer == 0) \
00567 goto memory_exhausted; \
00568 b = (b - old_buffer) + bufp->buffer; \
00569 if (fixup_alt_jump) \
00570 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \
00571 if (laststart) \
00572 laststart = (laststart - old_buffer) + bufp->buffer; \
00573 begalt = (begalt - old_buffer) + bufp->buffer; \
00574 if (pending_exact) \
00575 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
00576 } while (0)
00577
00578
00579
00580 #define SET_LIST_BIT(c) \
00581 (b[(unsigned char)(c) / BYTEWIDTH] \
00582 |= 1 << ((unsigned char)(c) % BYTEWIDTH))
00583
00584
00585 #define GET_UNSIGNED_NUMBER(num) \
00586 do { if (p != pend) { \
00587 PATFETCH(c); \
00588 while (ISDIGIT(c)) { \
00589 if (num < 0) \
00590 num = 0; \
00591 num = num * 10 + c - '0'; \
00592 if (p == pend) \
00593 break; \
00594 PATFETCH(c); \
00595 } \
00596 } \
00597 } while (0)
00598
00599 #define STREQ(s1, s2) ((strcmp(s1, s2) == 0))
00600
00601 #define CHAR_CLASS_MAX_LENGTH 6
00602
00603 #define IS_CHAR_CLASS(string) \
00604 (STREQ(string, "alpha") || STREQ(string, "upper") \
00605 || STREQ(string, "lower") || STREQ(string, "digit") \
00606 || STREQ(string, "alnum") || STREQ(string, "xdigit") \
00607 || STREQ(string, "space") || STREQ(string, "print") \
00608 || STREQ(string, "punct") || STREQ(string, "graph") \
00609 || STREQ(string, "cntrl") || STREQ(string, "blank"))
00610
00611 #define STORE_MBC(p, c) \
00612 do { \
00613 (p)[0] = (unsigned char)(((c) >>24) & 0xff); \
00614 (p)[1] = (unsigned char)(((c) >>16) & 0xff); \
00615 (p)[2] = (unsigned char)(((c) >> 8) & 0xff); \
00616 (p)[3] = (unsigned char)(((c) >> 0) & 0xff); \
00617 } while (0)
00618
00619 #define STORE_MBC_AND_INCR(p, c) \
00620 do { \
00621 *(p)++ = (unsigned char)(((c) >>24) & 0xff); \
00622 *(p)++ = (unsigned char)(((c) >>16) & 0xff); \
00623 *(p)++ = (unsigned char)(((c) >> 8) & 0xff); \
00624 *(p)++ = (unsigned char)(((c) >> 0) & 0xff); \
00625 } while (0)
00626
00627 #define EXTRACT_MBC(p) \
00628 ((unsigned int)((unsigned char)(p)[0] << 24 | \
00629 (unsigned char)(p)[1] << 16 | \
00630 (unsigned char)(p)[2] << 8 | \
00631 (unsigned char)(p)[3]))
00632
00633 #define EXTRACT_MBC_AND_INCR(p) \
00634 ((unsigned int)((p) += 4, \
00635 (unsigned char)(p)[-4] << 24 | \
00636 (unsigned char)(p)[-3] << 16 | \
00637 (unsigned char)(p)[-2] << 8 | \
00638 (unsigned char)(p)[-1]))
00639
00640 #define EXTRACT_UNSIGNED(p) \
00641 ((unsigned char)(p)[0] | (unsigned char)(p)[1] << 8)
00642 #define EXTRACT_UNSIGNED_AND_INCR(p) \
00643 ((p) += 2, (unsigned char)(p)[-2] | (unsigned char)(p)[-1] << 8)
00644
00645
00646
00647
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660 static void
00661 set_list_bits(c1, c2, b)
00662 unsigned long c1, c2;
00663 unsigned char *b;
00664 {
00665 unsigned char sbc_size = b[-1];
00666 unsigned short mbc_size = EXTRACT_UNSIGNED(&b[sbc_size]);
00667 unsigned short beg, end, upb;
00668
00669 if (c1 > c2)
00670 return;
00671 b = &b[sbc_size + 2];
00672
00673 for (beg = 0, upb = mbc_size; beg < upb; ) {
00674 unsigned short mid = (unsigned short)(beg + upb) >> 1;
00675
00676 if ((int)c1 - 1 > (int)EXTRACT_MBC(&b[mid*8+4]))
00677 beg = mid + 1;
00678 else
00679 upb = mid;
00680 }
00681
00682 for (end = beg, upb = mbc_size; end < upb; ) {
00683 unsigned short mid = (unsigned short)(end + upb) >> 1;
00684
00685 if ((int)c2 >= (int)EXTRACT_MBC(&b[mid*8]) - 1)
00686 end = mid + 1;
00687 else
00688 upb = mid;
00689 }
00690
00691 if (beg != end) {
00692 if (c1 > EXTRACT_MBC(&b[beg*8]))
00693 c1 = EXTRACT_MBC(&b[beg*8]);
00694 if (c2 < EXTRACT_MBC(&b[(end - 1)*8+4]))
00695 c2 = EXTRACT_MBC(&b[(end - 1)*8+4]);
00696 }
00697 if (end < mbc_size && end != beg + 1)
00698
00699 memmove(&b[(beg + 1)*8], &b[end*8], (mbc_size - end)*8);
00700 STORE_MBC(&b[beg*8 + 0], c1);
00701 STORE_MBC(&b[beg*8 + 4], c2);
00702 mbc_size += beg - end + 1;
00703 STORE_NUMBER(&b[-2], mbc_size);
00704 }
00705
00706 static int
00707 is_in_list_sbc(c, b)
00708 unsigned long c;
00709 const unsigned char *b;
00710 {
00711 unsigned short size;
00712
00713 size = *b++;
00714 return ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH);
00715 }
00716
00717 static int
00718 is_in_list_mbc(c, b)
00719 unsigned long c;
00720 const unsigned char *b;
00721 {
00722 unsigned short size;
00723 unsigned short i, j;
00724
00725 size = *b++;
00726 b += size + 2;
00727 size = EXTRACT_UNSIGNED(&b[-2]);
00728 if (size == 0) return 0;
00729
00730 for (i = 0, j = size; i < j; ) {
00731 unsigned short k = (unsigned short)(i + j) >> 1;
00732
00733 if (c > EXTRACT_MBC(&b[k*8+4]))
00734 i = k + 1;
00735 else
00736 j = k;
00737 }
00738 if (i < size && EXTRACT_MBC(&b[i*8]) <= c)
00739 return 1;
00740
00741 return 0;
00742 }
00743
00744 static int
00745 is_in_list(c, b)
00746 unsigned long c;
00747 const unsigned char *b;
00748 {
00749 return is_in_list_sbc(c, b) || (current_mbctype ? is_in_list_mbc(c, b) : 0);
00750 }
00751
00752 static void
00753 print_partial_compiled_pattern(start, end)
00754 unsigned char *start;
00755 unsigned char *end;
00756 {
00757 int mcnt, mcnt2;
00758 unsigned char *p = start;
00759 unsigned char *pend = end;
00760
00761 if (start == NULL) {
00762 printf("(null)\n");
00763 return;
00764 }
00765
00766
00767 while (p < pend) {
00768 switch ((enum regexpcode)*p++) {
00769 case unused:
00770 printf("/unused");
00771 break;
00772
00773 case exactn:
00774 mcnt = *p++;
00775 printf("/exactn/%d", mcnt);
00776 do {
00777 putchar('/');
00778 printf("%c", *p++);
00779 }
00780 while (--mcnt);
00781 break;
00782
00783 case start_memory:
00784 mcnt = *p++;
00785 printf("/start_memory/%d/%d", mcnt, *p++);
00786 break;
00787
00788 case stop_memory:
00789 mcnt = *p++;
00790 printf("/stop_memory/%d/%d", mcnt, *p++);
00791 break;
00792
00793 case start_paren:
00794 printf("/start_paren");
00795 break;
00796
00797 case stop_paren:
00798 printf("/stop_paren");
00799 break;
00800
00801 case casefold_on:
00802 printf("/casefold_on");
00803 break;
00804
00805 case casefold_off:
00806 printf("/casefold_off");
00807 break;
00808
00809 case option_set:
00810 printf("/option_set/%d", *p++);
00811 break;
00812
00813 case start_nowidth:
00814 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00815 printf("/start_nowidth//%d", mcnt);
00816 break;
00817
00818 case stop_nowidth:
00819 printf("/stop_nowidth//");
00820 p += 2;
00821 break;
00822
00823 case pop_and_fail:
00824 printf("/pop_and_fail");
00825 break;
00826
00827 case stop_backtrack:
00828 printf("/stop_backtrack//");
00829 p += 2;
00830 break;
00831
00832 case duplicate:
00833 printf("/duplicate/%d", *p++);
00834 break;
00835
00836 case anychar:
00837 printf("/anychar");
00838 break;
00839
00840 case anychar_repeat:
00841 printf("/anychar_repeat");
00842 break;
00843
00844 case charset:
00845 case charset_not:
00846 {
00847 register int c;
00848
00849 printf("/charset%s",
00850 (enum regexpcode)*(p - 1) == charset_not ? "_not" : "");
00851
00852 mcnt = *p++;
00853 printf("/%d", mcnt);
00854 for (c = 0; c < mcnt; c++) {
00855 unsigned bit;
00856 unsigned char map_byte = p[c];
00857
00858 putchar('/');
00859
00860 for (bit = 0; bit < BYTEWIDTH; bit++)
00861 if (map_byte & (1 << bit))
00862 printf("%c", c * BYTEWIDTH + bit);
00863 }
00864 p += mcnt;
00865 mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
00866 putchar('/');
00867 while (mcnt--) {
00868 print_mbc(EXTRACT_MBC_AND_INCR(p));
00869 putchar('-');
00870 print_mbc(EXTRACT_MBC_AND_INCR(p));
00871 }
00872 break;
00873 }
00874
00875 case begline:
00876 printf("/begline");
00877 break;
00878
00879 case endline:
00880 printf("/endline");
00881 break;
00882
00883 case on_failure_jump:
00884 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00885 printf("/on_failure_jump//%d", mcnt);
00886 break;
00887
00888 case dummy_failure_jump:
00889 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00890 printf("/dummy_failure_jump//%d", mcnt);
00891 break;
00892
00893 case push_dummy_failure:
00894 printf("/push_dummy_failure");
00895 break;
00896
00897 case finalize_jump:
00898 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00899 printf("/finalize_jump//%d", mcnt);
00900 break;
00901
00902 case maybe_finalize_jump:
00903 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00904 printf("/maybe_finalize_jump//%d", mcnt);
00905 break;
00906
00907 case jump_past_alt:
00908 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00909 printf("/jump_past_alt//%d", mcnt);
00910 break;
00911
00912 case jump:
00913 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00914 printf("/jump//%d", mcnt);
00915 break;
00916
00917 case succeed_n:
00918 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00919 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
00920 printf("/succeed_n//%d//%d", mcnt, mcnt2);
00921 break;
00922
00923 case jump_n:
00924 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00925 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
00926 printf("/jump_n//%d//%d", mcnt, mcnt2);
00927 break;
00928
00929 case set_number_at:
00930 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00931 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
00932 printf("/set_number_at//%d//%d", mcnt, mcnt2);
00933 break;
00934
00935 case try_next:
00936 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00937 printf("/try_next//%d", mcnt);
00938 break;
00939
00940 case finalize_push:
00941 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00942 printf("/finalize_push//%d", mcnt);
00943 break;
00944
00945 case finalize_push_n:
00946 EXTRACT_NUMBER_AND_INCR(mcnt, p);
00947 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
00948 printf("/finalize_push_n//%d//%d", mcnt, mcnt2);
00949 break;
00950
00951 case wordbound:
00952 printf("/wordbound");
00953 break;
00954
00955 case notwordbound:
00956 printf("/notwordbound");
00957 break;
00958
00959 case wordbeg:
00960 printf("/wordbeg");
00961 break;
00962
00963 case wordend:
00964 printf("/wordend");
00965
00966 case wordchar:
00967 printf("/wordchar");
00968 break;
00969
00970 case notwordchar:
00971 printf("/notwordchar");
00972 break;
00973
00974 case begbuf:
00975 printf("/begbuf");
00976 break;
00977
00978 case endbuf:
00979 printf("/endbuf");
00980 break;
00981
00982 case endbuf2:
00983 printf("/endbuf2");
00984 break;
00985
00986 case begpos:
00987 printf("/begpos");
00988 break;
00989
00990 default:
00991 printf("?%d", *(p-1));
00992 }
00993 }
00994 printf("/\n");
00995 }
00996
00997
00998 static void
00999 print_compiled_pattern(bufp)
01000 struct re_pattern_buffer *bufp;
01001 {
01002 unsigned char *buffer = (unsigned char*)bufp->buffer;
01003
01004 print_partial_compiled_pattern(buffer, buffer + bufp->used);
01005 }
01006
01007 static char*
01008 calculate_must_string(start, end)
01009 char *start;
01010 char *end;
01011 {
01012 int mcnt;
01013 int max = 0;
01014 unsigned char *p = start;
01015 unsigned char *pend = end;
01016 char *must = 0;
01017
01018 if (start == NULL) return 0;
01019
01020
01021 while (p < pend) {
01022 switch ((enum regexpcode)*p++) {
01023 case unused:
01024 break;
01025
01026 case exactn:
01027 mcnt = *p;
01028 if (mcnt > max) {
01029 must = p;
01030 max = mcnt;
01031 }
01032 p += mcnt+1;
01033 break;
01034
01035 case start_memory:
01036 case stop_memory:
01037 p += 2;
01038 break;
01039
01040 case duplicate:
01041 case option_set:
01042 p++;
01043 break;
01044
01045 case casefold_on:
01046 case casefold_off:
01047 return 0;
01048
01049 case pop_and_fail:
01050 case anychar:
01051 case anychar_repeat:
01052 case begline:
01053 case endline:
01054 case wordbound:
01055 case notwordbound:
01056 case wordbeg:
01057 case wordend:
01058 case wordchar:
01059 case notwordchar:
01060 case begbuf:
01061 case endbuf:
01062 case endbuf2:
01063 case begpos:
01064 case push_dummy_failure:
01065 case start_paren:
01066 case stop_paren:
01067 break;
01068
01069 case charset:
01070 case charset_not:
01071 mcnt = *p++;
01072 p += mcnt;
01073 mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
01074 while (mcnt--) {
01075 p += 8;
01076 }
01077 break;
01078
01079 case on_failure_jump:
01080 EXTRACT_NUMBER_AND_INCR(mcnt, p);
01081 if (mcnt > 0) p += mcnt;
01082 if ((enum regexpcode)p[-3] == jump) {
01083 p -= 2;
01084 EXTRACT_NUMBER_AND_INCR(mcnt, p);
01085 if (mcnt > 0) p += mcnt;
01086 }
01087 break;
01088
01089 case dummy_failure_jump:
01090 case succeed_n:
01091 case try_next:
01092 case jump:
01093 EXTRACT_NUMBER_AND_INCR(mcnt, p);
01094 if (mcnt > 0) p += mcnt;
01095 break;
01096
01097 case start_nowidth:
01098 case stop_nowidth:
01099 case stop_backtrack:
01100 case finalize_jump:
01101 case maybe_finalize_jump:
01102 case finalize_push:
01103 p += 2;
01104 break;
01105
01106 case jump_n:
01107 case set_number_at:
01108 case finalize_push_n:
01109 p += 4;
01110 break;
01111
01112 default:
01113 break;
01114 }
01115 }
01116 return must;
01117 }
01118
01119 static unsigned int
01120 read_backslash(c)
01121 int c;
01122 {
01123 switch (c) {
01124 case 'n':
01125 return '\n';
01126
01127 case 't':
01128 return '\t';
01129
01130 case 'r':
01131 return '\r';
01132
01133 case 'f':
01134 return '\f';
01135
01136 case 'v':
01137 return '\v';
01138
01139 case 'a':
01140 return '\007';
01141
01142 case 'b':
01143 return '\010';
01144
01145 case 'e':
01146 return '\033';
01147 }
01148 return c;
01149 }
01150
01151 static unsigned int
01152 read_special(p, pend, pp)
01153 const char *p, *pend, **pp;
01154 {
01155 int c;
01156
01157 PATFETCH_RAW(c);
01158 switch (c) {
01159 case 'M':
01160 PATFETCH_RAW(c);
01161 if (c != '-') return -1;
01162 PATFETCH_RAW(c);
01163 *pp = p;
01164 if (c == '\\') {
01165 return read_special(--p, pend, pp) | 0x80;
01166 }
01167 else if (c == -1) return ~0;
01168 else {
01169 return ((c & 0xff) | 0x80);
01170 }
01171
01172 case 'C':
01173 PATFETCH_RAW(c);
01174 if (c != '-') return ~0;
01175 case 'c':
01176 PATFETCH_RAW(c);
01177 *pp = p;
01178 if (c == '\\') {
01179 c = read_special(--p, pend, pp);
01180 }
01181 else if (c == '?') return 0177;
01182 else if (c == -1) return ~0;
01183 return c & 0x9f;
01184 default:
01185 *pp = p + 1;
01186 return read_backslash(c);
01187 }
01188
01189 end_of_pattern:
01190 return ~0;
01191 }
01192
01193
01194
01195
01196
01197
01198
01199
01200
01201
01202
01203
01204
01205
01206
01207
01208 char *
01209 re_compile_pattern(pattern, size, bufp)
01210 const char *pattern;
01211 int size;
01212 struct re_pattern_buffer *bufp;
01213 {
01214 register char *b = bufp->buffer;
01215 register const char *p = pattern;
01216 const char *nextp;
01217 const char *pend = pattern + size;
01218 register unsigned int c, c1 = 0;
01219 const char *p0;
01220 int numlen;
01221 #define ERROR_MSG_MAX_SIZE 200
01222 static char error_msg[ERROR_MSG_MAX_SIZE+1];
01223
01224
01225
01226
01227
01228
01229 char *pending_exact = 0;
01230
01231
01232
01233
01234
01235 char *fixup_alt_jump = 0;
01236
01237
01238
01239
01240 char *laststart = 0;
01241
01242
01243
01244 char zero_times_ok;
01245
01246
01247
01248 char many_times_ok;
01249
01250
01251
01252 char greedy;
01253
01254
01255
01256 char *begalt = b;
01257
01258
01259
01260 const char *beg_interval;
01261
01262
01263 int lower_bound;
01264
01265
01266 int upper_bound;
01267
01268
01269
01270
01271
01272
01273
01274
01275
01276 int stacka[40];
01277 int *stackb = stacka;
01278 int *stackp = stackb;
01279 int *stacke = stackb + 40;
01280
01281
01282
01283
01284
01285 int regnum = 1;
01286
01287 int range = 0;
01288 int had_mbchar = 0;
01289 int had_num_literal = 0;
01290 int had_char_class = 0;
01291
01292 int options = bufp->options;
01293
01294 bufp->fastmap_accurate = 0;
01295 bufp->must = 0;
01296 bufp->must_skip = 0;
01297
01298
01299 init_syntax_once();
01300
01301 if (bufp->allocated == 0) {
01302 bufp->allocated = INIT_BUF_SIZE;
01303
01304 bufp->buffer = (char*)xrealloc(bufp->buffer, INIT_BUF_SIZE);
01305 if (!bufp->buffer) goto memory_exhausted;
01306 begalt = b = bufp->buffer;
01307 }
01308
01309 while (p != pend) {
01310 PATFETCH(c);
01311
01312 switch (c) {
01313 case '$':
01314 if (bufp->options & RE_OPTION_SINGLELINE) {
01315 BUFPUSH(endbuf);
01316 }
01317 else {
01318 p0 = p;
01319
01320
01321
01322 while (p0 != pend) {
01323 if (*p0 == '\\' && p0 + 1 != pend
01324 && (p0[1] == 'b' || p0[1] == 'B'))
01325 p0 += 2;
01326 else
01327 break;
01328 }
01329 BUFPUSH(endline);
01330 }
01331 break;
01332
01333 case '^':
01334 if (bufp->options & RE_OPTION_SINGLELINE)
01335 BUFPUSH(begbuf);
01336 else
01337 BUFPUSH(begline);
01338 break;
01339
01340 case '+':
01341 case '?':
01342 case '*':
01343
01344 if (!laststart) {
01345 snprintf(error_msg, ERROR_MSG_MAX_SIZE,
01346 "invalid regular expression; there's no previous pattern, to which '%c' would define cardinality at %d",
01347 c, p-pattern);
01348 FREE_AND_RETURN(stackb, error_msg);
01349 }
01350
01351
01352 zero_times_ok = c != '+';
01353 many_times_ok = c != '?';
01354 greedy = 1;
01355 if (p != pend) {
01356 PATFETCH(c);
01357 switch (c) {
01358 case '?':
01359 greedy = 0;
01360 break;
01361 case '*':
01362 case '+':
01363 goto nested_meta;
01364 default:
01365 PATUNFETCH;
01366 break;
01367 }
01368 }
01369
01370 repeat:
01371
01372
01373 if (!laststart)
01374 break;
01375
01376 if (greedy && many_times_ok && *laststart == anychar && b - laststart <= 2) {
01377 if (b[-1] == stop_paren)
01378 b--;
01379 if (zero_times_ok)
01380 *laststart = anychar_repeat;
01381 else {
01382 BUFPUSH(anychar_repeat);
01383 }
01384 break;
01385 }
01386
01387
01388 if (many_times_ok) {
01389
01390
01391
01392
01393 GET_BUFFER_SPACE(3);
01394 store_jump(b,greedy?maybe_finalize_jump:finalize_push,laststart-3);
01395 b += 3;
01396 }
01397
01398
01399
01400 GET_BUFFER_SPACE(3);
01401 insert_jump(on_failure_jump, laststart, b + 3, b);
01402 b += 3;
01403
01404 if (zero_times_ok) {
01405 if (greedy == 0) {
01406 GET_BUFFER_SPACE(3);
01407 insert_jump(try_next, laststart, b + 3, b);
01408 b += 3;
01409 }
01410 }
01411 else {
01412
01413
01414
01415
01416
01417 GET_BUFFER_SPACE(3);
01418 insert_jump(dummy_failure_jump, laststart, laststart + 6, b);
01419 b += 3;
01420 }
01421 break;
01422
01423 case '.':
01424 laststart = b;
01425 BUFPUSH(anychar);
01426 break;
01427
01428 case '[':
01429 if (p == pend)
01430 FREE_AND_RETURN(stackb, "invalid regular expression; '[' can't be the last character ie. can't start range at the end of pattern");
01431 while ((b - bufp->buffer + 9 + (1 << BYTEWIDTH) / BYTEWIDTH)
01432 > bufp->allocated)
01433 EXTEND_BUFFER;
01434
01435 laststart = b;
01436 if (*p == '^') {
01437 BUFPUSH(charset_not);
01438 p++;
01439 }
01440 else
01441 BUFPUSH(charset);
01442 p0 = p;
01443
01444 BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH);
01445
01446 memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2);
01447
01448 had_mbchar = 0;
01449 had_num_literal = 0;
01450 had_char_class = 0;
01451
01452
01453 for (;;) {
01454 int size;
01455 unsigned last = (unsigned)-1;
01456
01457 if ((size = EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])) || current_mbctype) {
01458
01459
01460 size = (1 << BYTEWIDTH) / BYTEWIDTH + 2 + size*8 + 8;
01461 while (b + size + 1 > bufp->buffer + bufp->allocated)
01462 EXTEND_BUFFER;
01463 }
01464 range_retry:
01465 if (range && had_char_class) {
01466 FREE_AND_RETURN(stackb, "invalid regular expression; can't use character class as an end value of range");
01467 }
01468 PATFETCH_RAW(c);
01469
01470 if (c == ']') {
01471 if (p == p0 + 1) {
01472 if (p == pend)
01473 FREE_AND_RETURN(stackb, "invalid regular expression; empty character class");
01474 re_warning("character class has `]' without escape");
01475 }
01476 else
01477
01478
01479
01480 break;
01481 }
01482
01483
01484 if (had_char_class && c == '-' && *p != ']')
01485 FREE_AND_RETURN(stackb, "invalid regular expression; can't use character class as a start value of range");
01486 if (ismbchar(c)) {
01487 PATFETCH_MBC(c);
01488 had_mbchar++;
01489 }
01490 had_char_class = 0;
01491
01492 if (c == '-' && ((p != p0 + 1 && *p != ']') ||
01493 (p[0] == '-' && p[1] != ']') ||
01494 range))
01495 re_warning("character class has `-' without escape");
01496 if (c == '[' && *p != ':')
01497 re_warning("character class has `[' without escape");
01498
01499
01500 if (c == '\\') {
01501 PATFETCH_RAW(c);
01502 switch (c) {
01503 case 'w':
01504 for (c = 0; c < (1 << BYTEWIDTH); c++) {
01505 if (SYNTAX(c) == Sword ||
01506 (!current_mbctype && SYNTAX(c) == Sword2))
01507 SET_LIST_BIT(c);
01508 }
01509 if (current_mbctype) {
01510 set_list_bits(0x80, 0xffffffff, b);
01511 }
01512 had_char_class = 1;
01513 last = -1;
01514 continue;
01515
01516 case 'W':
01517 for (c = 0; c < (1 << BYTEWIDTH); c++) {
01518 if (SYNTAX(c) != Sword &&
01519 ((current_mbctype && !re_mbctab[c]) ||
01520 (!current_mbctype && SYNTAX(c) != Sword2)))
01521 SET_LIST_BIT(c);
01522 }
01523 had_char_class = 1;
01524 last = -1;
01525 continue;
01526
01527 case 's':
01528 for (c = 0; c < 256; c++)
01529 if (ISSPACE(c))
01530 SET_LIST_BIT(c);
01531 had_char_class = 1;
01532 last = -1;
01533 continue;
01534
01535 case 'S':
01536 for (c = 0; c < 256; c++)
01537 if (!ISSPACE(c))
01538