Main Page | Modules | Alphabetical List | Data Structures | File List | Data Fields | Globals

regex.c

Go to the documentation of this file.
00001 /* Extended regular expression matching and search library.
00002    Copyright (C) 1993, 94, 95, 96, 97, 98 Free Software Foundation, Inc.
00003 
00004    The GNU C Library is free software; you can redistribute it and/or
00005    modify it under the terms of the GNU Library General Public License as
00006    published by the Free Software Foundation; either version 2 of the
00007    License, or (at your option) any later version.
00008 
00009    The GNU C Library is distributed in the hope that it will be useful,
00010    but WITHOUT ANY WARRANTY; without even the implied warranty of
00011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012    Library General Public License for more details.
00013 
00014    You should have received a copy of the GNU Library General Public
00015    License along with the GNU C Library; see the file LGPL.  If not,
00016    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00017    Boston, MA 02111-1307, USA.  */
00018 /* Multi-byte extension added May, 1993 by t^2 (Takahiro Tanimoto)
00019    Last change: May 21, 1993 by t^2  */
00020 /* removed gapped buffer support, multiple syntax support by matz <matz@nts.co.jp> */
00021 /* Perl5 extension added by matz <matz@caelum.co.jp> */
00022 /* UTF-8 extension added Jan 16 1999 by Yoshida Masato  <yoshidam@tau.bekkoame.ne.jp> */
00023 
00024 #include "config.h"
00025 
00026 #ifdef HAVE_STRING_H
00027 # include <string.h>
00028 #else
00029 # include <strings.h>
00030 #endif
00031 
00032 /* We write fatal error messages on standard error.  */
00033 #include <stdio.h>
00034 
00035 /* isalpha(3) etc. are used for the character classes.  */
00036 #include <ctype.h>
00037 #include <sys/types.h>
00038 
00039 #ifndef PARAMS
00040 # if defined __GNUC__ || (defined __STDC__ && __STDC__)
00041 #  define PARAMS(args) args
00042 # else
00043 #  define PARAMS(args) ()
00044 # endif  /* GCC.  */
00045 #endif  /* Not PARAMS.  */
00046 
00047 #if defined(STDC_HEADERS)
00048 # include <stddef.h>
00049 #else
00050 /* We need this for `regex.h', and perhaps for the Emacs include files.  */
00051 # include <sys/types.h>
00052 #endif
00053 
00054 #if !defined(__STDC__) && !defined(_MSC_VER)
00055 # define volatile
00056 #endif
00057 
00058 #ifdef HAVE_PROTOTYPES
00059 # define args args
00060 #else
00061 # define args ()
00062 #endif
00063 
00064 #ifdef RUBY_PLATFORM
00065 #include "defines.h"
00066 
00067 # define RUBY
00068 extern int rb_prohibit_interrupt;
00069 extern int rb_trap_pending;
00070 void rb_trap_exec (void);
00071 
00072 # define CHECK_INTS do {\
00073     if (!rb_prohibit_interrupt) {\
00074         if (rb_trap_pending) rb_trap_exec();\
00075     }\
00076 } while (0)
00077 #endif
00078 
00079 /* Make alloca work the best possible way.  */
00080 #ifdef __GNUC__
00081 # ifndef atarist
00082 #  ifndef alloca
00083 #   define alloca __builtin_alloca
00084 #  endif
00085 # endif /* atarist */
00086 #else
00087 # if defined(HAVE_ALLOCA_H)
00088 #  include <alloca.h>
00089 # elif !defined(alloca)
00090 char *alloca();
00091 # endif
00092 #endif /* __GNUC__ */
00093 
00094 #ifdef _AIX
00095 #pragma alloca
00096 #endif
00097 
00098 #ifdef HAVE_STRING_H
00099 # include <string.h>
00100 #else
00101 # include <strings.h>
00102 #endif
00103 
00104 #ifdef C_ALLOCA
00105 #define FREE_VARIABLES() alloca(0)
00106 #else
00107 #define FREE_VARIABLES()
00108 #endif
00109 
00110 #define FREE_AND_RETURN_VOID(stackb)   do {                             \
00111   FREE_VARIABLES();                                                     \
00112   if (stackb != stacka) xfree(stackb);                                  \
00113   return;                                                               \
00114 } while(0)
00115 
00116 #define FREE_AND_RETURN(stackb,val)    do {                             \
00117   FREE_VARIABLES();                                                     \
00118   if (stackb != stacka) xfree(stackb);                                  \
00119   return(val);                                                          \
00120 } while(0)
00121 
00122 #define DOUBLE_STACK(type) do {                                         \
00123   type *stackx;                                                         \
00124   unsigned int xlen = stacke - stackb;                                  \
00125   if (stackb == stacka) {                                               \
00126     stackx = (type*)xmalloc(2 * xlen * sizeof(type));                   \
00127     memcpy(stackx, stackb, xlen * sizeof (type));                       \
00128   }                                                                     \
00129   else {                                                                \
00130     stackx = (type*)xrealloc(stackb, 2 * xlen * sizeof(type));          \
00131   }                                                                     \
00132   /* Rearrange the pointers. */                                         \
00133   stackp = stackx + (stackp - stackb);                                  \
00134   stackb = stackx;                                                      \
00135   stacke = stackb + 2 * xlen;                                           \
00136 } while (0)
00137 
00138 #define RE_TALLOC(n,t)  ((t*)alloca((n)*sizeof(t)))
00139 #define TMALLOC(n,t)    ((t*)xmalloc((n)*sizeof(t)))
00140 #define TREALLOC(s,n,t) (s=((t*)xrealloc(s,(n)*sizeof(t))))
00141 
00142 #define EXPAND_FAIL_STACK() DOUBLE_STACK(unsigned char*)
00143 #define ENSURE_FAIL_STACK(n)                                            \
00144   do {                                                                  \
00145     if (stacke - stackp <= (n)) {                                       \
00146         /* if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS)           \
00147            {                                                            \
00148            FREE_AND_RETURN(stackb,(-2));                                \
00149            }*/                                                          \
00150                                                                         \
00151         /* Roughly double the size of the stack.  */                    \
00152         EXPAND_FAIL_STACK();                                            \
00153       }                                                                 \
00154   } while (0)
00155 
00156 /* Get the interface, including the syntax bits.  */
00157 #include "regex.h"
00158 
00159 /* Subroutines for re_compile_pattern.  */
00160 static void store_jump (char*, int, char*);
00161 static void insert_jump (int, char*, char*, char*);
00162 static void store_jump_n (char*, int, char*, unsigned);
00163 static void insert_jump_n (int, char*, char*, char*, unsigned);
00164 static void insert_op (int, char*, char*);
00165 static void insert_op_2 (int, char*, char*, int, int);
00166 static int memcmp_translate (unsigned char*, unsigned char*, int);
00167 
00168 /* Define the syntax stuff, so we can do the <, >, etc.  */
00169 
00170 /* This must be nonzero for the wordchar and notwordchar pattern
00171    commands in re_match.  */
00172 #define Sword  1
00173 #define Sword2 2
00174 
00175 #define SYNTAX(c) re_syntax_table[c]
00176 
00177 static char re_syntax_table[256];
00178 static void init_syntax_once (void);
00179 static const unsigned char *translate = 0;
00180 static void init_regs (struct re_registers*, unsigned int);
00181 static void bm_init_skip (int *, unsigned char*, int, const unsigned char*);
00182 static int current_mbctype = MBCTYPE_ASCII;
00183 
00184 #undef P
00185 
00186 #ifdef RUBY
00187 #include "util.h"
00188 void rb_warn (const char*, ...);
00189 # define re_warning(x) rb_warn(x)
00190 #endif
00191 
00192 #ifndef re_warning
00193 # define re_warning(x)
00194 #endif
00195 
00196 static void
00197 init_syntax_once()
00198 {
00199    register int c;
00200    static int done = 0;
00201 
00202    if (done)
00203      return;
00204 
00205    memset(re_syntax_table, 0, sizeof re_syntax_table);
00206 
00207    for (c=0; c<=0x7f; c++)
00208      if (isalnum(c)) 
00209        re_syntax_table[c] = Sword;
00210    re_syntax_table['_'] = Sword;
00211 
00212    for (c=0x80; c<=0xff; c++)
00213      if (isalnum(c)) 
00214        re_syntax_table[c] = Sword2;
00215    done = 1;
00216 }
00217 
00218 void
00219 re_set_casetable(table)
00220      const char *table;
00221 {
00222   translate = (const unsigned char*)table;
00223 }
00224 
00225 /* Jim Meyering writes:
00226 
00227    "... Some ctype macros are valid only for character codes that
00228    isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
00229    using /bin/cc or gcc but without giving an ansi option).  So, all
00230    ctype uses should be through macros like ISPRINT...  If
00231    STDC_HEADERS is defined, then autoconf has verified that the ctype
00232    macros don't need to be guarded with references to isascii. ...
00233    Defining isascii to 1 should let any compiler worth its salt
00234    eliminate the && through constant folding."
00235    Solaris defines some of these symbols so we must undefine them first.  */
00236 
00237 #undef ISASCII
00238 #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
00239 # define ISASCII(c) 1
00240 #else
00241 # define ISASCII(c) isascii(c)
00242 #endif
00243 
00244 #ifdef isblank
00245 # define ISBLANK(c) (ISASCII(c) && isblank(c))
00246 #else
00247 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
00248 #endif
00249 #ifdef isgraph
00250 # define ISGRAPH(c) (ISASCII(c) && isgraph(c))
00251 #else
00252 # define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
00253 #endif
00254 
00255 #undef ISPRINT
00256 #define ISPRINT(c) (ISASCII(c) && isprint(c))
00257 #define ISDIGIT(c) (ISASCII(c) && isdigit(c))
00258 #define ISALNUM(c) (ISASCII(c) && isalnum(c))
00259 #define ISALPHA(c) (ISASCII(c) && isalpha(c))
00260 #define ISCNTRL(c) (ISASCII(c) && iscntrl(c))
00261 #define ISLOWER(c) (ISASCII(c) && islower(c))
00262 #define ISPUNCT(c) (ISASCII(c) && ispunct(c))
00263 #define ISSPACE(c) (ISASCII(c) && isspace(c))
00264 #define ISUPPER(c) (ISASCII(c) && isupper(c))
00265 #define ISXDIGIT(c) (ISASCII(c) && isxdigit(c))
00266 
00267 #ifndef NULL
00268 # define NULL (void *)0
00269 #endif
00270 
00271 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
00272    since ours (we hope) works properly with all combinations of
00273    machines, compilers, `char' and `unsigned char' argument types.
00274    (Per Bothner suggested the basic approach.)  */
00275 #undef SIGN_EXTEND_CHAR
00276 #if __STDC__
00277 # define SIGN_EXTEND_CHAR(c) ((signed char)(c))
00278 #else  /* not __STDC__ */
00279 /* As in Harbison and Steele.  */
00280 # define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
00281 #endif
00282 
00283 /* These are the command codes that appear in compiled regular
00284    expressions, one per byte.  Some command codes are followed by
00285    argument bytes.  A command code can specify any interpretation
00286    whatsoever for its arguments.  Zero-bytes may appear in the compiled
00287    regular expression.
00288 
00289    The value of `exactn' is needed in search.c (search_buffer) in emacs.
00290    So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
00291    `exactn' we use here must also be 1.  */
00292 
00293 enum regexpcode
00294   {
00295     unused=0,
00296     exactn=1, /* Followed by one byte giving n, then by n literal bytes.  */
00297     begline,  /* Fail unless at beginning of line.  */
00298     endline,  /* Fail unless at end of line.  */
00299     begbuf,   /* Succeeds if at beginning of buffer (if emacs) or at beginning
00300                  of string to be matched (if not).  */
00301     endbuf,   /* Analogously, for end of buffer/string.  */
00302     endbuf2,  /* End of buffer/string, or newline just before it.  */
00303     begpos,   /* Matches where last scan//gsub left off.  */
00304     jump,     /* Followed by two bytes giving relative address to jump to.  */
00305     jump_past_alt,/* Same as jump, but marks the end of an alternative.  */
00306     on_failure_jump,     /* Followed by two bytes giving relative address of 
00307                             place to resume at in case of failure.  */
00308     finalize_jump,       /* Throw away latest failure point and then jump to 
00309                             address.  */
00310     maybe_finalize_jump, /* Like jump but finalize if safe to do so.
00311                             This is used to jump back to the beginning
00312                             of a repeat.  If the command that follows
00313                             this jump is clearly incompatible with the
00314                             one at the beginning of the repeat, such that
00315                             we can be sure that there is no use backtracking
00316                             out of repetitions already completed,
00317                             then we finalize.  */
00318     dummy_failure_jump,  /* Jump, and push a dummy failure point. This 
00319                             failure point will be thrown away if an attempt 
00320                             is made to use it for a failure. A + construct 
00321                             makes this before the first repeat.  Also
00322                             use it as an intermediary kind of jump when
00323                             compiling an or construct.  */
00324     push_dummy_failure, /* Push a dummy failure point and continue.  Used at the end of
00325                            alternatives.  */
00326     succeed_n,   /* Used like on_failure_jump except has to succeed n times;
00327                     then gets turned into an on_failure_jump. The relative
00328                     address following it is useless until then.  The
00329                     address is followed by two bytes containing n.  */
00330     jump_n,      /* Similar to jump, but jump n times only; also the relative
00331                     address following is in turn followed by yet two more bytes
00332                     containing n.  */
00333     try_next,    /* Jump to next pattern for the first time,
00334                     leaving this pattern on the failure stack. */
00335     finalize_push,      /* Finalize stack and push the beginning of the pattern
00336                            on the stack to retry (used for non-greedy match) */
00337     finalize_push_n,    /* Similar to finalize_push, buf finalize n time only */
00338     set_number_at,      /* Set the following relative location to the
00339                            subsequent number.  */
00340     anychar,     /* Matches any (more or less) one character excluding newlines.  */
00341     anychar_repeat,      /* Matches sequence of characters excluding newlines.  */
00342     charset,     /* Matches any one char belonging to specified set.
00343                     First following byte is number of bitmap bytes.
00344                     Then come bytes for a bitmap saying which chars are in.
00345                     Bits in each byte are ordered low-bit-first.
00346                     A character is in the set if its bit is 1.
00347                     A character too large to have a bit in the map
00348                     is automatically not in the set.  */
00349     charset_not, /* Same parameters as charset, but match any character
00350                     that is not one of those specified.  */
00351     start_memory, /* Start remembering the text that is matched, for
00352                     storing in a memory register.  Followed by one
00353                     byte containing the register number.  Register numbers
00354                     must be in the range 0 through RE_NREGS.  */
00355     stop_memory, /* Stop remembering the text that is matched
00356                     and store it in a memory register.  Followed by
00357                     one byte containing the register number. Register
00358                     numbers must be in the range 0 through RE_NREGS.  */
00359     start_paren,    /* Place holder at the start of (?:..). */
00360     stop_paren,    /* Place holder at the end of (?:..). */
00361     casefold_on,   /* Turn on casefold flag. */
00362     casefold_off,  /* Turn off casefold flag. */
00363     option_set,    /* Turn on multi line match (match with newlines). */
00364     start_nowidth, /* Save string point to the stack. */
00365     stop_nowidth,  /* Restore string place at the point start_nowidth. */
00366     pop_and_fail,  /* Fail after popping nowidth entry from stack. */
00367     stop_backtrack,  /* Restore backtrack stack at the point start_nowidth. */
00368     duplicate,   /* Match a duplicate of something remembered.
00369                     Followed by one byte containing the index of the memory 
00370                     register.  */
00371     wordchar,    /* Matches any word-constituent character.  */
00372     notwordchar, /* Matches any char that is not a word-constituent.  */
00373     wordbeg,     /* Succeeds if at word beginning.  */
00374     wordend,     /* Succeeds if at word end.  */
00375     wordbound,   /* Succeeds if at a word boundary.  */
00376     notwordbound /* Succeeds if not at a word boundary.  */
00377   };
00378 
00379 
00380 /* Number of failure points to allocate space for initially,
00381    when matching.  If this number is exceeded, more space is allocated,
00382    so it is not a hard limit.  */
00383 
00384 #ifndef NFAILURES
00385 #define NFAILURES 160
00386 #endif
00387 
00388 /* Store NUMBER in two contiguous bytes starting at DESTINATION.  */
00389 #define STORE_NUMBER(destination, number)                               \
00390   do { (destination)[0] = (number) & 0377;                              \
00391     (destination)[1] = (number) >> 8; } while (0)
00392 
00393 /* Same as STORE_NUMBER, except increment the destination pointer to
00394    the byte after where the number is stored.  Watch out that values for
00395    DESTINATION such as p + 1 won't work, whereas p will.  */
00396 #define STORE_NUMBER_AND_INCR(destination, number)                      \
00397   do { STORE_NUMBER(destination, number);                               \
00398     (destination) += 2; } while (0)
00399 
00400 
00401 /* Put into DESTINATION a number stored in two contingous bytes starting
00402    at SOURCE.  */
00403 #define EXTRACT_NUMBER(destination, source)                             \
00404   do { (destination) = *(source) & 0377;                                \
00405     (destination) += SIGN_EXTEND_CHAR(*(char*)((source) + 1)) << 8; } while (0)
00406 
00407 /* Same as EXTRACT_NUMBER, except increment the pointer for source to
00408    point to second byte of SOURCE.  Note that SOURCE has to be a value
00409    such as p, not, e.g., p + 1. */
00410 #define EXTRACT_NUMBER_AND_INCR(destination, source)                    \
00411   do { EXTRACT_NUMBER(destination, source);                             \
00412        (source) += 2; } while (0)
00413 
00414 
00415 /* Specify the precise syntax of regexps for compilation.  This provides
00416    for compatibility for various utilities which historically have
00417    different, incompatible syntaxes.
00418 
00419    The argument SYNTAX is a bit-mask comprised of the various bits
00420    defined in regex.h.  */
00421 
00422 long
00423 re_set_syntax(syntax)
00424   long syntax;
00425 {
00426     /* obsolete */
00427     return 0;
00428 }
00429 
00430 /* Macros for re_compile_pattern, which is found below these definitions.  */
00431 
00432 #define TRANSLATE_P() ((options&RE_OPTION_IGNORECASE) && translate)
00433 #define MAY_TRANSLATE() ((bufp->options&(RE_OPTION_IGNORECASE|RE_MAY_IGNORECASE)) && translate)
00434 /* Fetch the next character in the uncompiled pattern---translating it 
00435    if necessary.  Also cast from a signed character in the constant
00436    string passed to us by the user to an unsigned char that we can use
00437    as an array index (in, e.g., `translate').  */
00438 #define PATFETCH(c)                                                     \
00439   do {if (p == pend) goto end_of_pattern;                               \
00440     c = (unsigned char) *p++;                                           \
00441     if (TRANSLATE_P()) c = (unsigned char)translate[c]; \
00442   } while (0)
00443 
00444 /* Fetch the next character in the uncompiled pattern, with no
00445    translation.  */
00446 #define PATFETCH_RAW(c)                                                 \
00447   do {if (p == pend) goto end_of_pattern;                               \
00448     c = (unsigned char)*p++;                                            \
00449   } while (0)
00450 
00451 /* Go backwards one character in the pattern.  */
00452 #define PATUNFETCH p--
00453 
00454 #define MBC2WC(c, p)                                                    \
00455   do {                                                                  \
00456     if (current_mbctype == MBCTYPE_UTF8) {                              \
00457       int n = mbclen(c) - 1;                                            \
00458       c &= (1<<(BYTEWIDTH-2-n)) - 1;                                    \
00459       while (n--) {                                                     \
00460         c = c << 6 | (*p++ & ((1<<6)-1));                               \
00461       }                                                                 \
00462     }                                                                   \
00463     else {                                                              \
00464       c <<= 8;                                                          \
00465       c |= (unsigned char)*(p)++;                                       \
00466     }                                                                   \
00467   } while (0)
00468 
00469 #define PATFETCH_MBC(c)                                                 \
00470   do {                                                                  \
00471     if (p + mbclen(c) - 1 >= pend) goto end_of_pattern;                 \
00472     MBC2WC(c, p);                                                       \
00473   } while(0)
00474 
00475 #define WC2MBC1ST(c)                                                    \
00476  ((current_mbctype != MBCTYPE_UTF8) ? ((c<0x100) ? (c) : (((c)>>8)&0xff)) : utf8_firstbyte(c))
00477 
00478 typedef unsigned int (*mbc_startpos_func_t) (const char *string, unsigned int pos);
00479 
00480 static unsigned int asc_startpos (const char *string, unsigned int pos);
00481 static unsigned int euc_startpos (const char *string, unsigned int pos);
00482 static unsigned int sjis_startpos (const char *string, unsigned int pos);
00483 static unsigned int utf8_startpos (const char *string, unsigned int pos);
00484 
00485 static const mbc_startpos_func_t mbc_startpos_func[4] = {
00486   asc_startpos, euc_startpos, sjis_startpos, utf8_startpos
00487 };
00488 
00489 #define mbc_startpos(start, pos) (*mbc_startpos_func[current_mbctype])((start), (pos))
00490 
00491 static unsigned int
00492 utf8_firstbyte(c)
00493      unsigned long c;
00494 {
00495   if (c < 0x80) return c;
00496   if (c <= 0x7ff) return ((c>>6)&0xff)|0xc0;
00497   if (c <= 0xffff) return ((c>>12)&0xff)|0xe0;
00498   if (c <= 0x1fffff) return ((c>>18)&0xff)|0xf0;
00499   if (c <= 0x3ffffff) return ((c>>24)&0xff)|0xf8;
00500   if (c <= 0x7fffffff) return ((c>>30)&0xff)|0xfc;
00501 #if SIZEOF_INT > 4
00502   if (c <= 0xfffffffff) return 0xfe;
00503 #else
00504   return 0xfe;
00505 #endif
00506 }
00507 
00508 static void
00509 print_mbc(c)
00510      unsigned int c;
00511 {
00512   if (current_mbctype == MBCTYPE_UTF8) {
00513     if (c < 0x80)
00514       printf("%c", (int)c);
00515     else if (c <= 0x7ff)
00516       printf("%c%c", (int)utf8_firstbyte(c), (int)(c & 0x3f));
00517     else if (c <= 0xffff)
00518       printf("%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 6) & 0x3f),
00519              (int)(c & 0x3f));
00520     else if (c <= 0x1fffff) 
00521       printf("%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 12) & 0x3f),
00522              (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
00523     else if (c <= 0x3ffffff)
00524       printf("%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 18) & 0x3f),
00525              (int)((c >> 12) & 0x3f), (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
00526     else if (c <= 0x7fffffff)
00527       printf("%c%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 24) & 0x3f),
00528              (int)((c >> 18) & 0x3f), (int)((c >> 12) & 0x3f),
00529              (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
00530   }
00531   else if (c < 0xff) {
00532     printf("\\%o", (int)c);
00533   }
00534   else {
00535     printf("%c%c", (int)(c >> BYTEWIDTH), (int)(c &0xff));
00536   }
00537 }
00538 
00539 /* If the buffer isn't allocated when it comes in, use this.  */
00540 #define INIT_BUF_SIZE  28
00541 
00542 /* Make sure we have at least N more bytes of space in buffer.  */
00543 #define GET_BUFFER_SPACE(n)                                             \
00544   do {                                                                  \
00545     while (b - bufp->buffer + (n) >= bufp->allocated)                   \
00546       EXTEND_BUFFER;                                                    \
00547   } while (0)
00548 
00549 /* Make sure we have one more byte of buffer space and then add CH to it.  */
00550 #define BUFPUSH(ch)                                                     \
00551   do {                                                                  \
00552     GET_BUFFER_SPACE(1);                                                \
00553     *b++ = (char)(ch);                                                  \
00554   } while (0)
00555 
00556 /* Extend the buffer by twice its current size via reallociation and
00557    reset the pointers that pointed into the old allocation to point to
00558    the correct places in the new allocation.  If extending the buffer
00559    results in it being larger than 1 << 16, then flag memory exhausted.  */
00560 #define EXTEND_BUFFER                                           \
00561   do { char *old_buffer = bufp->buffer;                                 \
00562     if (bufp->allocated == (1L<<16)) goto too_big;                      \
00563     bufp->allocated *= 2;                                               \
00564     if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16);         \
00565     bufp->buffer = (char*)xrealloc(bufp->buffer, bufp->allocated);      \
00566     if (bufp->buffer == 0)                                              \
00567       goto memory_exhausted;                                            \
00568     b = (b - old_buffer) + bufp->buffer;                                \
00569     if (fixup_alt_jump)                                                 \
00570       fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;    \
00571     if (laststart)                                                      \
00572       laststart = (laststart - old_buffer) + bufp->buffer;              \
00573     begalt = (begalt - old_buffer) + bufp->buffer;                      \
00574     if (pending_exact)                                                  \
00575       pending_exact = (pending_exact - old_buffer) + bufp->buffer;      \
00576   } while (0)
00577 
00578 
00579 /* Set the bit for character C in a character set list.  */
00580 #define SET_LIST_BIT(c)                                                 \
00581   (b[(unsigned char)(c) / BYTEWIDTH]                                    \
00582    |= 1 << ((unsigned char)(c) % BYTEWIDTH))
00583 
00584 /* Get the next unsigned number in the uncompiled pattern.  */
00585 #define GET_UNSIGNED_NUMBER(num)                                        \
00586   do { if (p != pend) {                                                 \
00587         PATFETCH(c);                                                    \
00588         while (ISDIGIT(c)) {                                            \
00589           if (num < 0)                                                  \
00590              num = 0;                                                   \
00591           num = num * 10 + c - '0';                                     \
00592           if (p == pend)                                                \
00593              break;                                                     \
00594           PATFETCH(c);                                                  \
00595         }                                                               \
00596      }                                                                  \
00597   } while (0)
00598 
00599 #define STREQ(s1, s2) ((strcmp(s1, s2) == 0))
00600 
00601 #define CHAR_CLASS_MAX_LENGTH  6 /* Namely, `xdigit'.  */
00602 
00603 #define IS_CHAR_CLASS(string)                                           \
00604    (STREQ(string, "alpha") || STREQ(string, "upper")                    \
00605     || STREQ(string, "lower") || STREQ(string, "digit")                 \
00606     || STREQ(string, "alnum") || STREQ(string, "xdigit")                \
00607     || STREQ(string, "space") || STREQ(string, "print")                 \
00608     || STREQ(string, "punct") || STREQ(string, "graph")                 \
00609     || STREQ(string, "cntrl") || STREQ(string, "blank"))
00610 
00611 #define STORE_MBC(p, c)                                                 \
00612   do {                                                                  \
00613     (p)[0] = (unsigned char)(((c) >>24) & 0xff);                        \
00614     (p)[1] = (unsigned char)(((c) >>16) & 0xff);                        \
00615     (p)[2] = (unsigned char)(((c) >> 8) & 0xff);                        \
00616     (p)[3] = (unsigned char)(((c) >> 0) & 0xff);                        \
00617   } while (0)
00618 
00619 #define STORE_MBC_AND_INCR(p, c)                                        \
00620   do {                                                                  \
00621     *(p)++ = (unsigned char)(((c) >>24) & 0xff);                        \
00622     *(p)++ = (unsigned char)(((c) >>16) & 0xff);                        \
00623     *(p)++ = (unsigned char)(((c) >> 8) & 0xff);                        \
00624     *(p)++ = (unsigned char)(((c) >> 0) & 0xff);                        \
00625   } while (0)
00626 
00627 #define EXTRACT_MBC(p)                                                  \
00628   ((unsigned int)((unsigned char)(p)[0] << 24 |                         \
00629                     (unsigned char)(p)[1] << 16 |                       \
00630                     (unsigned char)(p)[2] <<  8 |                       \
00631                     (unsigned char)(p)[3]))
00632 
00633 #define EXTRACT_MBC_AND_INCR(p)                                         \
00634   ((unsigned int)((p) += 4,                                             \
00635                     (unsigned char)(p)[-4] << 24 |                      \
00636                     (unsigned char)(p)[-3] << 16 |                      \
00637                     (unsigned char)(p)[-2] <<  8 |                      \
00638                     (unsigned char)(p)[-1]))
00639 
00640 #define EXTRACT_UNSIGNED(p) \
00641   ((unsigned char)(p)[0] | (unsigned char)(p)[1] << 8)
00642 #define EXTRACT_UNSIGNED_AND_INCR(p) \
00643   ((p) += 2, (unsigned char)(p)[-2] | (unsigned char)(p)[-1] << 8)
00644 
00645 /* Handle (mb)?charset(_not)?.
00646 
00647    Structure of mbcharset(_not)? in compiled pattern.
00648 
00649      struct {
00650        unsinged char id;                mbcharset(_not)?
00651        unsigned char sbc_size;
00652        unsigned char sbc_map[sbc_size]; same as charset(_not)? up to here.
00653        unsigned short mbc_size;         number of intervals.
00654        struct {
00655          unsigned long beg;             beginning of interval.
00656          unsigned long end;             end of interval.
00657        } intervals[mbc_size];
00658      }; */
00659 
00660 static void
00661 set_list_bits(c1, c2, b)
00662     unsigned long c1, c2;
00663     unsigned char *b;
00664 {
00665   unsigned char sbc_size = b[-1];
00666   unsigned short mbc_size = EXTRACT_UNSIGNED(&b[sbc_size]);
00667   unsigned short beg, end, upb;
00668 
00669   if (c1 > c2)
00670     return;
00671   b = &b[sbc_size + 2];
00672 
00673   for (beg = 0, upb = mbc_size; beg < upb; ) {
00674     unsigned short mid = (unsigned short)(beg + upb) >> 1;
00675 
00676     if ((int)c1 - 1 > (int)EXTRACT_MBC(&b[mid*8+4]))
00677       beg = mid + 1;
00678     else
00679       upb = mid;
00680   }
00681 
00682   for (end = beg, upb = mbc_size; end < upb; ) {
00683     unsigned short mid = (unsigned short)(end + upb) >> 1;
00684 
00685     if ((int)c2 >= (int)EXTRACT_MBC(&b[mid*8]) - 1)
00686       end = mid + 1;
00687     else
00688       upb = mid;
00689   }
00690 
00691   if (beg != end) {
00692     if (c1 > EXTRACT_MBC(&b[beg*8]))
00693       c1 = EXTRACT_MBC(&b[beg*8]);
00694     if (c2 < EXTRACT_MBC(&b[(end - 1)*8+4]))
00695       c2 = EXTRACT_MBC(&b[(end - 1)*8+4]);
00696   }
00697   if (end < mbc_size && end != beg + 1)
00698     /* NOTE: memcpy() would not work here.  */
00699     memmove(&b[(beg + 1)*8], &b[end*8], (mbc_size - end)*8);
00700   STORE_MBC(&b[beg*8 + 0], c1);
00701   STORE_MBC(&b[beg*8 + 4], c2);
00702   mbc_size += beg - end + 1;
00703   STORE_NUMBER(&b[-2], mbc_size);
00704 }
00705 
00706 static int
00707 is_in_list_sbc(c, b)
00708     unsigned long c;
00709     const unsigned char *b;
00710 {
00711   unsigned short size;
00712 
00713   size = *b++;
00714   return ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH);
00715 }
00716   
00717 static int
00718 is_in_list_mbc(c, b)
00719     unsigned long c;
00720     const unsigned char *b;
00721 {
00722   unsigned short size;
00723   unsigned short i, j;
00724 
00725   size = *b++;
00726   b += size + 2;
00727   size = EXTRACT_UNSIGNED(&b[-2]);
00728   if (size == 0) return 0;
00729 
00730   for (i = 0, j = size; i < j; ) {
00731     unsigned short k = (unsigned short)(i + j) >> 1;
00732 
00733     if (c > EXTRACT_MBC(&b[k*8+4]))
00734       i = k + 1;
00735     else
00736       j = k;
00737   }
00738   if (i < size && EXTRACT_MBC(&b[i*8]) <= c)
00739     return 1;
00740 
00741   return 0;
00742 }
00743 
00744 static int
00745 is_in_list(c, b)
00746     unsigned long c;
00747     const unsigned char *b;
00748 {
00749   return is_in_list_sbc(c, b) || (current_mbctype ? is_in_list_mbc(c, b) : 0);
00750 }
00751 
00752 static void
00753 print_partial_compiled_pattern(start, end)
00754     unsigned char *start;
00755     unsigned char *end;
00756 {
00757   int mcnt, mcnt2;
00758   unsigned char *p = start;
00759   unsigned char *pend = end;
00760 
00761   if (start == NULL) {
00762     printf("(null)\n");
00763     return;
00764   }
00765 
00766   /* Loop over pattern commands.  */
00767   while (p < pend) {
00768     switch ((enum regexpcode)*p++) {
00769     case unused:
00770       printf("/unused");
00771       break;
00772 
00773     case exactn:
00774       mcnt = *p++;
00775       printf("/exactn/%d", mcnt);
00776       do {
00777         putchar('/');
00778         printf("%c", *p++);
00779       }
00780       while (--mcnt);
00781       break;
00782 
00783     case start_memory:
00784       mcnt = *p++;
00785       printf("/start_memory/%d/%d", mcnt, *p++);
00786       break;
00787 
00788     case stop_memory:
00789       mcnt = *p++;
00790       printf("/stop_memory/%d/%d", mcnt, *p++);
00791       break;
00792 
00793     case start_paren:
00794       printf("/start_paren");
00795       break;
00796 
00797     case stop_paren:
00798       printf("/stop_paren");
00799       break;
00800 
00801     case casefold_on:
00802       printf("/casefold_on");
00803       break;
00804 
00805     case casefold_off:
00806       printf("/casefold_off");
00807       break;
00808 
00809     case option_set:
00810       printf("/option_set/%d", *p++);
00811       break;
00812 
00813     case start_nowidth:
00814       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00815       printf("/start_nowidth//%d", mcnt);
00816       break;
00817 
00818     case stop_nowidth:
00819       printf("/stop_nowidth//");
00820       p += 2;
00821       break;
00822 
00823     case pop_and_fail:
00824       printf("/pop_and_fail");
00825       break;
00826 
00827     case stop_backtrack:
00828       printf("/stop_backtrack//");
00829       p += 2;
00830       break;
00831 
00832     case duplicate:
00833       printf("/duplicate/%d", *p++);
00834       break;
00835 
00836     case anychar:
00837       printf("/anychar");
00838       break;
00839 
00840     case anychar_repeat:
00841       printf("/anychar_repeat");
00842       break;
00843 
00844     case charset:
00845     case charset_not:
00846       {
00847         register int c;
00848 
00849         printf("/charset%s",
00850                (enum regexpcode)*(p - 1) == charset_not ? "_not" : "");
00851 
00852         mcnt = *p++;
00853         printf("/%d", mcnt);
00854         for (c = 0; c < mcnt; c++) {
00855           unsigned bit;
00856           unsigned char map_byte = p[c];
00857 
00858           putchar('/');
00859 
00860           for (bit = 0; bit < BYTEWIDTH; bit++)
00861             if (map_byte & (1 << bit))
00862               printf("%c", c * BYTEWIDTH + bit);
00863         }
00864         p += mcnt;
00865         mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
00866         putchar('/');
00867         while (mcnt--) {
00868           print_mbc(EXTRACT_MBC_AND_INCR(p));
00869           putchar('-');
00870           print_mbc(EXTRACT_MBC_AND_INCR(p));
00871         }
00872         break;
00873       }
00874 
00875     case begline:
00876       printf("/begline");
00877       break;
00878 
00879     case endline:
00880       printf("/endline");
00881       break;
00882 
00883     case on_failure_jump:
00884       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00885       printf("/on_failure_jump//%d", mcnt);
00886       break;
00887 
00888     case dummy_failure_jump:
00889       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00890       printf("/dummy_failure_jump//%d", mcnt);
00891       break;
00892 
00893     case push_dummy_failure:
00894       printf("/push_dummy_failure");
00895       break;
00896 
00897     case finalize_jump:
00898       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00899       printf("/finalize_jump//%d", mcnt);
00900       break;
00901 
00902     case maybe_finalize_jump:
00903       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00904       printf("/maybe_finalize_jump//%d", mcnt);
00905       break;
00906 
00907     case jump_past_alt:
00908       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00909       printf("/jump_past_alt//%d", mcnt);
00910       break;
00911 
00912     case jump:
00913       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00914       printf("/jump//%d", mcnt);
00915       break;
00916 
00917     case succeed_n: 
00918       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00919       EXTRACT_NUMBER_AND_INCR(mcnt2, p);
00920       printf("/succeed_n//%d//%d", mcnt, mcnt2);
00921       break;
00922 
00923     case jump_n: 
00924       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00925       EXTRACT_NUMBER_AND_INCR(mcnt2, p);
00926       printf("/jump_n//%d//%d", mcnt, mcnt2);
00927       break;
00928 
00929     case set_number_at: 
00930       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00931       EXTRACT_NUMBER_AND_INCR(mcnt2, p);
00932       printf("/set_number_at//%d//%d", mcnt, mcnt2);
00933       break;
00934 
00935     case try_next:
00936       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00937       printf("/try_next//%d", mcnt);
00938       break;
00939 
00940     case finalize_push:
00941       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00942       printf("/finalize_push//%d", mcnt);
00943       break;
00944 
00945     case finalize_push_n:
00946       EXTRACT_NUMBER_AND_INCR(mcnt, p);
00947       EXTRACT_NUMBER_AND_INCR(mcnt2, p);
00948       printf("/finalize_push_n//%d//%d", mcnt, mcnt2);
00949       break;
00950 
00951     case wordbound:
00952       printf("/wordbound");
00953       break;
00954 
00955     case notwordbound:
00956       printf("/notwordbound");
00957       break;
00958 
00959     case wordbeg:
00960       printf("/wordbeg");
00961       break;
00962 
00963     case wordend:
00964       printf("/wordend");
00965 
00966     case wordchar:
00967       printf("/wordchar");
00968       break;
00969           
00970     case notwordchar:
00971       printf("/notwordchar");
00972       break;
00973 
00974     case begbuf:
00975       printf("/begbuf");
00976       break;
00977 
00978     case endbuf:
00979       printf("/endbuf");
00980       break;
00981 
00982     case endbuf2:
00983       printf("/endbuf2");
00984       break;
00985 
00986     case begpos:
00987       printf("/begpos");
00988       break;
00989 
00990     default:
00991       printf("?%d", *(p-1));
00992     }
00993   }
00994   printf("/\n");
00995 }
00996 
00997 
00998 static void
00999 print_compiled_pattern(bufp)
01000      struct re_pattern_buffer *bufp;
01001 {
01002   unsigned char *buffer = (unsigned char*)bufp->buffer;
01003 
01004   print_partial_compiled_pattern(buffer, buffer + bufp->used);
01005 }
01006 
01007 static char*
01008 calculate_must_string(start, end)
01009      char *start;
01010      char *end;
01011 {
01012   int mcnt;
01013   int max = 0;
01014   unsigned char *p = start;
01015   unsigned char *pend = end;
01016   char *must = 0;
01017 
01018   if (start == NULL) return 0;
01019 
01020   /* Loop over pattern commands.  */
01021   while (p < pend) {
01022     switch ((enum regexpcode)*p++) {
01023     case unused:
01024       break;
01025 
01026     case exactn:
01027       mcnt = *p;
01028       if (mcnt > max) {
01029         must = p;
01030         max = mcnt;
01031       }
01032       p += mcnt+1;
01033       break;
01034 
01035     case start_memory:
01036     case stop_memory:
01037       p += 2;
01038       break;
01039 
01040     case duplicate:
01041     case option_set:
01042       p++;
01043       break;
01044 
01045     case casefold_on:
01046     case casefold_off:
01047       return 0;         /* should not check must_string */
01048 
01049     case pop_and_fail:
01050     case anychar:
01051     case anychar_repeat:
01052     case begline:
01053     case endline:
01054     case wordbound:
01055     case notwordbound:
01056     case wordbeg:
01057     case wordend:
01058     case wordchar:
01059     case notwordchar:
01060     case begbuf:
01061     case endbuf:
01062     case endbuf2:
01063     case begpos:
01064     case push_dummy_failure:
01065     case start_paren:
01066     case stop_paren:
01067       break;
01068 
01069     case charset:
01070     case charset_not:
01071       mcnt = *p++;
01072       p += mcnt;
01073       mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
01074       while (mcnt--) {
01075         p += 8;
01076       }
01077       break;
01078 
01079     case on_failure_jump:
01080       EXTRACT_NUMBER_AND_INCR(mcnt, p);
01081       if (mcnt > 0) p += mcnt;
01082       if ((enum regexpcode)p[-3] == jump) {
01083         p -= 2;
01084         EXTRACT_NUMBER_AND_INCR(mcnt, p);
01085         if (mcnt > 0) p += mcnt;
01086       }
01087       break;
01088 
01089     case dummy_failure_jump:
01090     case succeed_n: 
01091     case try_next:
01092     case jump:
01093       EXTRACT_NUMBER_AND_INCR(mcnt, p);
01094       if (mcnt > 0) p += mcnt;
01095       break;
01096 
01097     case start_nowidth:
01098     case stop_nowidth:
01099     case stop_backtrack:
01100     case finalize_jump:
01101     case maybe_finalize_jump:
01102     case finalize_push:
01103       p += 2;
01104       break;
01105 
01106     case jump_n: 
01107     case set_number_at: 
01108     case finalize_push_n:
01109       p += 4;
01110       break;
01111 
01112     default:
01113       break;
01114     }
01115   }
01116   return must;
01117 }
01118 
01119 static unsigned int
01120 read_backslash(c)
01121      int c;
01122 {
01123   switch (c) {
01124   case 'n':
01125     return '\n';
01126 
01127   case 't':
01128     return '\t';
01129 
01130   case 'r':
01131     return '\r';
01132 
01133   case 'f':
01134     return '\f';
01135 
01136   case 'v':
01137     return '\v';
01138 
01139   case 'a':
01140     return '\007';
01141 
01142   case 'b':
01143     return '\010';
01144 
01145   case 'e':
01146     return '\033';
01147   }
01148   return c;
01149 }
01150 
01151 static unsigned int
01152 read_special(p, pend, pp)
01153      const char *p, *pend, **pp;
01154 {
01155   int c;
01156 
01157   PATFETCH_RAW(c);
01158   switch (c) {
01159   case 'M':
01160     PATFETCH_RAW(c);
01161     if (c != '-') return -1;
01162     PATFETCH_RAW(c);
01163     *pp = p;
01164     if (c == '\\') {
01165       return read_special(--p, pend, pp) | 0x80;
01166     }
01167     else if (c == -1) return ~0;
01168     else {
01169       return ((c & 0xff) | 0x80);
01170     }
01171 
01172   case 'C':
01173     PATFETCH_RAW(c);
01174     if (c != '-') return ~0;
01175   case 'c':
01176     PATFETCH_RAW(c);
01177     *pp = p;
01178     if (c == '\\') {
01179       c = read_special(--p, pend, pp);
01180     }
01181     else if (c == '?') return 0177;
01182     else if (c == -1) return ~0;
01183     return c & 0x9f;
01184   default:
01185     *pp = p + 1;
01186     return read_backslash(c);
01187   }
01188 
01189  end_of_pattern:
01190   return ~0;
01191 }
01192 
01193 /* re_compile_pattern takes a regular-expression string
01194    and converts it into a buffer full of byte commands for matching.
01195 
01196    PATTERN   is the address of the pattern string
01197    SIZE      is the length of it.
01198    BUFP     is a  struct re_pattern_buffer *  which points to the info
01199              on where to store the byte commands.
01200              This structure contains a  char *  which points to the
01201              actual space, which should have been obtained with malloc.
01202              re_compile_pattern may use realloc to grow the buffer space.
01203 
01204    The number of bytes of commands can be found out by looking in
01205    the `struct re_pattern_buffer' that bufp pointed to, after
01206    re_compile_pattern returns. */
01207 
01208 char *
01209 re_compile_pattern(pattern, size, bufp)
01210      const char *pattern;
01211      int size;
01212      struct re_pattern_buffer *bufp;
01213 {
01214   register char *b = bufp->buffer;
01215   register const char *p = pattern;
01216   const char *nextp;
01217   const char *pend = pattern + size;
01218   register unsigned int c, c1 = 0;
01219   const char *p0;
01220   int numlen;
01221 #define ERROR_MSG_MAX_SIZE 200
01222   static char error_msg[ERROR_MSG_MAX_SIZE+1];
01223 
01224   /* Address of the count-byte of the most recently inserted `exactn'
01225      command.  This makes it possible to tell whether a new exact-match
01226      character can be added to that command or requires a new `exactn'
01227      command.  */
01228 
01229   char *pending_exact = 0;
01230 
01231   /* Address of the place where a forward-jump should go to the end of
01232      the containing expression.  Each alternative of an `or', except the
01233      last, ends with a forward-jump of this sort.  */
01234 
01235   char *fixup_alt_jump = 0;
01236 
01237   /* Address of start of the most recently finished expression.
01238      This tells postfix * where to find the start of its operand.  */
01239 
01240   char *laststart = 0;
01241 
01242   /* In processing a repeat, 1 means zero matches is allowed.  */
01243 
01244   char zero_times_ok;
01245 
01246   /* In processing a repeat, 1 means many matches is allowed.  */
01247 
01248   char many_times_ok;
01249 
01250   /* In processing a repeat, 1 means non-greedy matches.  */
01251 
01252   char greedy;
01253 
01254   /* Address of beginning of regexp, or inside of last (.  */
01255 
01256   char *begalt = b;
01257 
01258   /* Place in the uncompiled pattern (i.e., the {) to
01259      which to go back if the interval is invalid.  */
01260   const char *beg_interval;
01261 
01262   /* In processing an interval, at least this many matches must be made.  */
01263   int lower_bound;
01264 
01265   /* In processing an interval, at most this many matches can be made.  */
01266   int upper_bound;
01267 
01268   /* Stack of information saved by ( and restored by ).
01269      Five stack elements are pushed by each (:
01270      First, the value of b.
01271      Second, the value of fixup_alt_jump.
01272      Third, the value of begalt.
01273      Fourth, the value of regnum.
01274      Fifth, the type of the paren. */
01275 
01276   int stacka[40];
01277   int *stackb = stacka;
01278   int *stackp = stackb;
01279   int *stacke = stackb + 40;
01280 
01281   /* Counts ('s as they are encountered.  Remembered for the matching ),
01282      where it becomes the register number to put in the stop_memory
01283      command.  */
01284 
01285   int regnum = 1;
01286 
01287   int range = 0;
01288   int had_mbchar = 0;
01289   int had_num_literal = 0;
01290   int had_char_class = 0;
01291 
01292   int options = bufp->options;
01293 
01294   bufp->fastmap_accurate = 0;
01295   bufp->must = 0;
01296   bufp->must_skip = 0;
01297 
01298   /* Initialize the syntax table.  */
01299   init_syntax_once();
01300 
01301   if (bufp->allocated == 0) {
01302     bufp->allocated = INIT_BUF_SIZE;
01303     /* EXTEND_BUFFER loses when bufp->allocated is 0.  */
01304     bufp->buffer = (char*)xrealloc(bufp->buffer, INIT_BUF_SIZE);
01305     if (!bufp->buffer) goto memory_exhausted; /* this not happen */
01306     begalt = b = bufp->buffer;
01307   }
01308 
01309   while (p != pend) {
01310     PATFETCH(c);
01311 
01312     switch (c) {
01313     case '$':
01314       if (bufp->options & RE_OPTION_SINGLELINE) {
01315         BUFPUSH(endbuf);
01316       }
01317       else {
01318         p0 = p;
01319         /* When testing what follows the $,
01320            look past the \-constructs that don't consume anything.  */
01321 
01322         while (p0 != pend) {
01323           if (*p0 == '\\' && p0 + 1 != pend
01324               && (p0[1] == 'b' || p0[1] == 'B'))
01325             p0 += 2;
01326           else
01327             break;
01328         }
01329         BUFPUSH(endline);
01330       }
01331       break;
01332 
01333     case '^':
01334       if (bufp->options & RE_OPTION_SINGLELINE)
01335         BUFPUSH(begbuf);
01336       else
01337         BUFPUSH(begline);
01338       break;
01339 
01340     case '+':
01341     case '?':
01342     case '*':
01343       /* If there is no previous pattern, char not special. */
01344       if (!laststart) {
01345         snprintf(error_msg, ERROR_MSG_MAX_SIZE, 
01346                  "invalid regular expression; there's no previous pattern, to which '%c' would define cardinality at %d", 
01347                  c, p-pattern);
01348         FREE_AND_RETURN(stackb, error_msg);
01349       }
01350       /* If there is a sequence of repetition chars,
01351          collapse it down to just one.  */
01352       zero_times_ok = c != '+';
01353       many_times_ok = c != '?';
01354       greedy = 1;
01355       if (p != pend) {
01356         PATFETCH(c);
01357         switch (c) {
01358         case '?':
01359           greedy = 0;
01360           break;
01361         case '*':
01362         case '+':
01363           goto nested_meta;
01364         default:
01365           PATUNFETCH;
01366           break;
01367         }
01368       }
01369 
01370     repeat:
01371       /* Star, etc. applied to an empty pattern is equivalent
01372          to an empty pattern.  */
01373       if (!laststart)  
01374         break;
01375 
01376       if (greedy && many_times_ok && *laststart == anychar && b - laststart <= 2) {
01377         if (b[-1] == stop_paren)
01378           b--;
01379         if (zero_times_ok)
01380           *laststart = anychar_repeat;
01381         else {
01382           BUFPUSH(anychar_repeat);
01383         }
01384         break;
01385       }
01386       /* Now we know whether or not zero matches is allowed
01387          and also whether or not two or more matches is allowed.  */
01388       if (many_times_ok) {
01389         /* If more than one repetition is allowed, put in at the
01390            end a backward relative jump from b to before the next
01391            jump we're going to put in below (which jumps from
01392            laststart to after this jump).  */
01393         GET_BUFFER_SPACE(3);
01394         store_jump(b,greedy?maybe_finalize_jump:finalize_push,laststart-3);
01395         b += 3;         /* Because store_jump put stuff here.  */
01396       }
01397 
01398       /* On failure, jump from laststart to next pattern, which will be the
01399          end of the buffer after this jump is inserted.  */
01400       GET_BUFFER_SPACE(3);
01401       insert_jump(on_failure_jump, laststart, b + 3, b);
01402       b += 3;
01403 
01404       if (zero_times_ok) {
01405         if (greedy == 0) {
01406           GET_BUFFER_SPACE(3);
01407           insert_jump(try_next, laststart, b + 3, b);
01408           b += 3;
01409         }
01410       }
01411       else {
01412         /* At least one repetition is required, so insert a
01413            `dummy_failure_jump' before the initial
01414            `on_failure_jump' instruction of the loop. This
01415            effects a skip over that instruction the first time
01416            we hit that loop.  */
01417         GET_BUFFER_SPACE(3);
01418         insert_jump(dummy_failure_jump, laststart, laststart + 6, b);
01419         b += 3;
01420       }
01421       break;
01422 
01423     case '.':
01424       laststart = b;
01425       BUFPUSH(anychar);
01426       break;
01427 
01428     case '[':
01429       if (p == pend)
01430         FREE_AND_RETURN(stackb, "invalid regular expression; '[' can't be the last character ie. can't start range at the end of pattern");
01431       while ((b - bufp->buffer + 9 + (1 << BYTEWIDTH) / BYTEWIDTH)
01432              > bufp->allocated)
01433         EXTEND_BUFFER;
01434 
01435       laststart = b;
01436       if (*p == '^') {
01437         BUFPUSH(charset_not); 
01438         p++;
01439       }
01440       else
01441         BUFPUSH(charset);
01442       p0 = p;
01443 
01444       BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH);
01445       /* Clear the whole map */
01446       memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2);
01447 
01448       had_mbchar = 0;
01449       had_num_literal = 0;
01450       had_char_class = 0;
01451 
01452       /* Read in characters and ranges, setting map bits.  */
01453       for (;;) {
01454         int size;
01455         unsigned last = (unsigned)-1;
01456 
01457         if ((size = EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])) || current_mbctype) {
01458           /* Ensure the space is enough to hold another interval
01459              of multi-byte chars in charset(_not)?.  */
01460           size = (1 << BYTEWIDTH) / BYTEWIDTH + 2 + size*8 + 8;
01461           while (b + size + 1 > bufp->buffer + bufp->allocated)
01462             EXTEND_BUFFER;
01463         }
01464       range_retry:
01465         if (range && had_char_class) {
01466           FREE_AND_RETURN(stackb, "invalid regular expression; can't use character class as an end value of range");
01467         }
01468         PATFETCH_RAW(c);
01469 
01470         if (c == ']') {
01471           if (p == p0 + 1) {
01472             if (p == pend)
01473               FREE_AND_RETURN(stackb, "invalid regular expression; empty character class");
01474             re_warning("character class has `]' without escape");
01475           }
01476           else 
01477             /* Stop if this isn't merely a ] inside a bracket
01478                expression, but rather the end of a bracket
01479                expression.  */
01480             break;
01481         }
01482         /* Look ahead to see if it's a range when the last thing
01483            was a character class.  */
01484         if (had_char_class && c == '-' && *p != ']')
01485           FREE_AND_RETURN(stackb, "invalid regular expression; can't use character class as a start value of range");
01486         if (ismbchar(c)) {
01487           PATFETCH_MBC(c);
01488           had_mbchar++;
01489         }
01490         had_char_class = 0;
01491 
01492         if (c == '-' && ((p != p0 + 1 && *p != ']') ||
01493                          (p[0] == '-' && p[1] != ']') ||
01494                          range))
01495           re_warning("character class has `-' without escape");
01496         if (c == '[' && *p != ':')
01497           re_warning("character class has `[' without escape");
01498 
01499         /* \ escapes characters when inside [...].  */
01500         if (c == '\\') {
01501           PATFETCH_RAW(c);
01502           switch (c) {
01503           case 'w':
01504             for (c = 0; c < (1 << BYTEWIDTH); c++) {
01505               if (SYNTAX(c) == Sword ||
01506                   (!current_mbctype && SYNTAX(c) == Sword2))
01507                 SET_LIST_BIT(c);
01508             }
01509             if (current_mbctype) {
01510               set_list_bits(0x80, 0xffffffff, b);
01511             }
01512             had_char_class = 1;
01513             last = -1;
01514             continue;
01515 
01516           case 'W':
01517             for (c = 0; c < (1 << BYTEWIDTH); c++) {
01518               if (SYNTAX(c) != Sword &&
01519                   ((current_mbctype && !re_mbctab[c]) ||
01520                   (!current_mbctype && SYNTAX(c) != Sword2)))
01521                 SET_LIST_BIT(c);
01522             }
01523             had_char_class = 1;
01524             last = -1;
01525             continue;
01526 
01527           case 's':
01528             for (c = 0; c < 256; c++)
01529               if (ISSPACE(c))
01530                 SET_LIST_BIT(c);
01531             had_char_class = 1;
01532             last = -1;
01533             continue;
01534 
01535           case 'S':
01536             for (c = 0; c < 256; c++)
01537               if (!ISSPACE(c))
01538