contrib/libgnuregex/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          int node, int root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static int fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         int *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       int *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        int non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 const char *
 213 re_compile_pattern (pattern, length, bufp)
 214     const char *pattern;
 215     size_t length;
 216     struct re_pattern_buffer *bufp;
 217 {
 218   reg_errcode_t ret;
 219
 220   /* And GNU code determines whether or not to get register information
 221      by passing null for the REGS argument to re_match, etc., not by
 222      setting no_sub, unless RE_NO_SUB is set.  */
 223   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 224
 225   /* Match anchors at newline.  */
 226   bufp->newline_anchor = 1;
 227
 228   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 229
 230   if (!ret)
 231     return NULL;
 232   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 233 }
 234 #ifdef _LIBC
 235 weak_alias (__re_compile_pattern, re_compile_pattern)
 236 #endif
 237
 238 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 239    also be assigned to arbitrarily: each pattern buffer stores its own
 240    syntax, so it can be changed between regex compilations.  */
 241 /* This has no initializer because initialized variables in Emacs
 242    become read-only after dumping.  */
 243 reg_syntax_t re_syntax_options;
 244
 245
 246 /* Specify the precise syntax of regexps for compilation.  This provides
 247    for compatibility for various utilities which historically have
 248    different, incompatible syntaxes.
 249
 250    The argument SYNTAX is a bit mask comprised of the various bits
 251    defined in regex.h.  We return the old syntax.  */
 252
 253 reg_syntax_t
 254 re_set_syntax (syntax)
 255     reg_syntax_t syntax;
 256 {
 257   reg_syntax_t ret = re_syntax_options;
 258
 259   re_syntax_options = syntax;
 260   return ret;
 261 }
 262 #ifdef _LIBC
 263 weak_alias (__re_set_syntax, re_set_syntax)
 264 #endif
 265
 266 int
 267 re_compile_fastmap (bufp)
 268     struct re_pattern_buffer *bufp;
 269 {
 270   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 271   char *fastmap = bufp->fastmap;
 272
 273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 275   if (dfa->init_state != dfa->init_state_word)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 277   if (dfa->init_state != dfa->init_state_nl)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 279   if (dfa->init_state != dfa->init_state_begbuf)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 281   bufp->fastmap_accurate = 1;
 282   return 0;
 283 }
 284 #ifdef _LIBC
 285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 286 #endif
 287
 288 static inline void
 289 __attribute ((always_inline))
 290 re_set_fastmap (char *fastmap, int icase, int ch)
 291 {
 292   fastmap[ch] = 1;
 293   if (icase)
 294     fastmap[tolower (ch)] = 1;
 295 }
 296
 297 /* Helper function for re_compile_fastmap.
 298    Compile fastmap for the initial_state INIT_STATE.  */
 299
 300 static void
 301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 302                          char *fastmap)
 303 {
 304   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 305   int node_cnt;
 306   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 308     {
 309       int node = init_state->nodes.elems[node_cnt];
 310       re_token_type_t type = dfa->nodes[node].type;
 311
 312       if (type == CHARACTER)
 313         {
 314           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 315 #ifdef RE_ENABLE_I18N
 316           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 317             {
 318               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 319               wchar_t wc;
 320               mbstate_t state;
 321
 322               p = buf;
 323               *p++ = dfa->nodes[node].opr.c;
 324               while (++node < dfa->nodes_len
 325                      && dfa->nodes[node].type == CHARACTER
 326                      && dfa->nodes[node].mb_partial)
 327                 *p++ = dfa->nodes[node].opr.c;
 328               memset (&state, '\0', sizeof (state));
 329               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 330                              &state) == p - buf
 331                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 332                       != (size_t) -1))
 333                 re_set_fastmap (fastmap, 0, buf[0]);
 334             }
 335 #endif
 336         }
 337       else if (type == SIMPLE_BRACKET)
 338         {
 339           int i, ch;
 340           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 341             {
 342               int j;
 343               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 344               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 345                 if (w & ((bitset_word_t) 1 << j))
 346                   re_set_fastmap (fastmap, icase, ch);
 347             }
 348         }
 349 #ifdef RE_ENABLE_I18N
 350       else if (type == COMPLEX_BRACKET)
 351         {
 352           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 353           int i;
 354
 355 # ifdef _LIBC
 356           /* See if we have to try all bytes which start multiple collation
 357              elements.
 358              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 359                   collation element, and don't catch 'b' since 'b' is
 360                   the only collation element which starts from 'b' (and
 361                   it is caught by SIMPLE_BRACKET).  */
 362               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 363                   && (cset->ncoll_syms || cset->nranges))
 364                 {
 365                   const int32_t *table = (const int32_t *)
 366                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 367                   for (i = 0; i < SBC_MAX; ++i)
 368                     if (table[i] < 0)
 369                       re_set_fastmap (fastmap, icase, i);
 370                 }
 371 # endif /* _LIBC */
 372
 373           /* See if we have to start the match at all multibyte characters,
 374              i.e. where we would not find an invalid sequence.  This only
 375              applies to multibyte character sets; for single byte character
 376              sets, the SIMPLE_BRACKET again suffices.  */
 377           if (dfa->mb_cur_max > 1
 378               && (cset->nchar_classes || cset->non_match || cset->nranges
 379 # ifdef _LIBC
 380                   || cset->nequiv_classes
 381 # endif /* _LIBC */
 382                  ))
 383             {
 384               unsigned char c = 0;
 385               do
 386                 {
 387                   mbstate_t mbs;
 388                   memset (&mbs, 0, sizeof (mbs));
 389                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 390                     re_set_fastmap (fastmap, false, (int) c);
 391                 }
 392               while (++c != 0);
 393             }
 394
 395           else
 396             {
 397               /* ... Else catch all bytes which can start the mbchars.  */
 398               for (i = 0; i < cset->nmbchars; ++i)
 399                 {
 400                   char buf[256];
 401                   mbstate_t state;
 402                   memset (&state, '\0', sizeof (state));
 403                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 404                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 405                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 406                     {
 407                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 408                           != (size_t) -1)
 409                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 410                     }
 411                 }
 412             }
 413         }
 414 #endif /* RE_ENABLE_I18N */
 415       else if (type == OP_PERIOD
 416 #ifdef RE_ENABLE_I18N
 417                || type == OP_UTF8_PERIOD
 418 #endif /* RE_ENABLE_I18N */
 419                || type == END_OF_RE)
 420         {
 421           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 422           if (type == END_OF_RE)
 423             bufp->can_be_null = 1;
 424           return;
 425         }
 426     }
 427 }
 428 \f
 429 /* Entry point for POSIX code.  */
 430 /* regcomp takes a regular expression as a string and compiles it.
 431
 432    PREG is a regex_t *.  We do not expect any fields to be initialized,
 433    since POSIX says we shouldn't.  Thus, we set
 434
 435      `buffer' to the compiled pattern;
 436      `used' to the length of the compiled pattern;
 437      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 438        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 439        RE_SYNTAX_POSIX_BASIC;
 440      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 441      `fastmap' to an allocated space for the fastmap;
 442      `fastmap_accurate' to zero;
 443      `re_nsub' to the number of subexpressions in PATTERN.
 444
 445    PATTERN is the address of the pattern string.
 446
 447    CFLAGS is a series of bits which affect compilation.
 448
 449      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 450      use POSIX basic syntax.
 451
 452      If REG_NEWLINE is set, then . and [^...] don't match newline.
 453      Also, regexec will try a match beginning after every newline.
 454
 455      If REG_ICASE is set, then we considers upper- and lowercase
 456      versions of letters to be equivalent when matching.
 457
 458      If REG_NOSUB is set, then when PREG is passed to regexec, that
 459      routine will report only success or failure, and nothing about the
 460      registers.
 461
 462    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 463    the return codes and their meanings.)  */
 464
 465 int
 466 regcomp (preg, pattern, cflags)
 467     regex_t *__restrict preg;
 468     const char *__restrict pattern;
 469     int cflags;
 470 {
 471   reg_errcode_t ret;
 472   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 473                          : RE_SYNTAX_POSIX_BASIC);
 474
 475   preg->buffer = NULL;
 476   preg->allocated = 0;
 477   preg->used = 0;
 478
 479   /* Try to allocate space for the fastmap.  */
 480   preg->fastmap = re_malloc (char, SBC_MAX);
 481   if (BE (preg->fastmap == NULL, 0))
 482     return REG_ESPACE;
 483
 484   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 485
 486   /* If REG_NEWLINE is set, newlines are treated differently.  */
 487   if (cflags & REG_NEWLINE)
 488     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 489       syntax &= ~RE_DOT_NEWLINE;
 490       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 491       /* It also changes the matching behavior.  */
 492       preg->newline_anchor = 1;
 493     }
 494   else
 495     preg->newline_anchor = 0;
 496   preg->no_sub = !!(cflags & REG_NOSUB);
 497   preg->translate = NULL;
 498
 499   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 500
 501   /* POSIX doesn't distinguish between an unmatched open-group and an
 502      unmatched close-group: both are REG_EPAREN.  */
 503   if (ret == REG_ERPAREN)
 504     ret = REG_EPAREN;
 505
 506   /* We have already checked preg->fastmap != NULL.  */
 507   if (BE (ret == REG_NOERROR, 1))
 508     /* Compute the fastmap now, since regexec cannot modify the pattern
 509        buffer.  This function never fails in this implementation.  */
 510     (void) re_compile_fastmap (preg);
 511   else
 512     {
 513       /* Some error occurred while compiling the expression.  */
 514       re_free (preg->fastmap);
 515       preg->fastmap = NULL;
 516     }
 517
 518   return (int) ret;
 519 }
 520 #ifdef _LIBC
 521 weak_alias (__regcomp, regcomp)
 522 #endif
 523
 524 /* Returns a message corresponding to an error code, ERRCODE, returned
 525    from either regcomp or regexec.   We don't use PREG here.  */
 526
 527 size_t
 528 regerror (errcode, preg, errbuf, errbuf_size)
 529     int errcode;
 530     const regex_t *__restrict preg;
 531     char *__restrict errbuf;
 532     size_t errbuf_size;
 533 {
 534   const char *msg;
 535   size_t msg_size;
 536
 537   if (BE (errcode < 0
 538           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 539                                / sizeof (__re_error_msgid_idx[0])), 0))
 540     /* Only error codes returned by the rest of the code should be passed
 541        to this routine.  If we are given anything else, or if other regex
 542        code generates an invalid error code, then the program has a bug.
 543        Dump core so we can fix it.  */
 544     abort ();
 545
 546   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 547
 548   msg_size = strlen (msg) + 1; /* Includes the null.  */
 549
 550   if (BE (errbuf_size != 0, 1))
 551     {
 552       if (BE (msg_size > errbuf_size, 0))
 553         {
 554 #if defined HAVE_MEMPCPY || defined _LIBC
 555           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 556 #else
 557           memcpy (errbuf, msg, errbuf_size - 1);
 558           errbuf[errbuf_size - 1] = 0;
 559 #endif
 560         }
 561       else
 562         memcpy (errbuf, msg, msg_size);
 563     }
 564
 565   return msg_size;
 566 }
 567 #ifdef _LIBC
 568 weak_alias (__regerror, regerror)
 569 #endif
 570
 571
 572 #ifdef RE_ENABLE_I18N
 573 /* This static array is used for the map to single-byte characters when
 574    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 575    it the same all the time.  UTF-8 is the preferred encoding so this is
 576    a worthwhile optimization.  */
 577 static const bitset_t utf8_sb_map =
 578 {
 579   /* Set the first 128 bits.  */
 580   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 581 };
 582 #endif
 583
 584
 585 static void
 586 free_dfa_content (re_dfa_t *dfa)
 587 {
 588   int i, j;
 589
 590   if (dfa->nodes)
 591     for (i = 0; i < dfa->nodes_len; ++i)
 592       free_token (dfa->nodes + i);
 593   re_free (dfa->nexts);
 594   for (i = 0; i < dfa->nodes_len; ++i)
 595     {
 596       if (dfa->eclosures != NULL)
 597         re_node_set_free (dfa->eclosures + i);
 598       if (dfa->inveclosures != NULL)
 599         re_node_set_free (dfa->inveclosures + i);
 600       if (dfa->edests != NULL)
 601         re_node_set_free (dfa->edests + i);
 602     }
 603   re_free (dfa->edests);
 604   re_free (dfa->eclosures);
 605   re_free (dfa->inveclosures);
 606   re_free (dfa->nodes);
 607
 608   if (dfa->state_table)
 609     for (i = 0; i <= dfa->state_hash_mask; ++i)
 610       {
 611         struct re_state_table_entry *entry = dfa->state_table + i;
 612         for (j = 0; j < entry->num; ++j)
 613           {
 614             re_dfastate_t *state = entry->array[j];
 615             free_state (state);
 616           }
 617         re_free (entry->array);
 618       }
 619   re_free (dfa->state_table);
 620 #ifdef RE_ENABLE_I18N
 621   if (dfa->sb_char != utf8_sb_map)
 622     re_free (dfa->sb_char);
 623 #endif
 624   re_free (dfa->subexp_map);
 625 #ifdef DEBUG
 626   re_free (dfa->re_str);
 627 #endif
 628
 629   re_free (dfa);
 630 }
 631
 632
 633 /* Free dynamically allocated space used by PREG.  */
 634
 635 void
 636 regfree (preg)
 637     regex_t *preg;
 638 {
 639   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 640   if (BE (dfa != NULL, 1))
 641     free_dfa_content (dfa);
 642   preg->buffer = NULL;
 643   preg->allocated = 0;
 644
 645   re_free (preg->fastmap);
 646   preg->fastmap = NULL;
 647
 648   re_free (preg->translate);
 649   preg->translate = NULL;
 650 }
 651 #ifdef _LIBC
 652 weak_alias (__regfree, regfree)
 653 #endif
 654 \f
 655 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 656    them unless specifically requested.  */
 657
 658 #if defined _REGEX_RE_COMP || defined _LIBC
 659
 660 /* BSD has one and only one pattern buffer.  */
 661 static struct re_pattern_buffer re_comp_buf;
 662
 663 char *
 664 # ifdef _LIBC
 665 /* Make these definitions weak in libc, so POSIX programs can redefine
 666    these names if they don't use our functions, and still use
 667    regcomp/regexec above without link errors.  */
 668 weak_function
 669 # endif
 670 re_comp (s)
 671      const char *s;
 672 {
 673   reg_errcode_t ret;
 674   char *fastmap;
 675
 676   if (!s)
 677     {
 678       if (!re_comp_buf.buffer)
 679         return gettext ("No previous regular expression");
 680       return 0;
 681     }
 682
 683   if (re_comp_buf.buffer)
 684     {
 685       fastmap = re_comp_buf.fastmap;
 686       re_comp_buf.fastmap = NULL;
 687       __regfree (&re_comp_buf);
 688       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 689       re_comp_buf.fastmap = fastmap;
 690     }
 691
 692   if (re_comp_buf.fastmap == NULL)
 693     {
 694       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 695       if (re_comp_buf.fastmap == NULL)
 696         return (char *) gettext (__re_error_msgid
 697                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 698     }
 699
 700   /* Since `re_exec' always passes NULL for the `regs' argument, we
 701      don't need to initialize the pattern buffer fields which affect it.  */
 702
 703   /* Match anchors at newlines.  */
 704   re_comp_buf.newline_anchor = 1;
 705
 706   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 707
 708   if (!ret)
 709     return NULL;
 710
 711   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 712   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 713 }
 714
 715 #ifdef _LIBC
 716 libc_freeres_fn (free_mem)
 717 {
 718   __regfree (&re_comp_buf);
 719 }
 720 #endif
 721
 722 #endif /* _REGEX_RE_COMP */
 723 \f
 724 /* Internal entry point.
 725    Compile the regular expression PATTERN, whose length is LENGTH.
 726    SYNTAX indicate regular expression's syntax.  */
 727
 728 static reg_errcode_t
 729 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 730                      reg_syntax_t syntax)
 731 {
 732   reg_errcode_t err = REG_NOERROR;
 733   re_dfa_t *dfa;
 734   re_string_t regexp;
 735
 736   /* Initialize the pattern buffer.  */
 737   preg->fastmap_accurate = 0;
 738   preg->syntax = syntax;
 739   preg->not_bol = preg->not_eol = 0;
 740   preg->used = 0;
 741   preg->re_nsub = 0;
 742   preg->can_be_null = 0;
 743   preg->regs_allocated = REGS_UNALLOCATED;
 744
 745   /* Initialize the dfa.  */
 746   dfa = (re_dfa_t *) preg->buffer;
 747   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 748     {
 749       /* If zero allocated, but buffer is non-null, try to realloc
 750          enough space.  This loses if buffer's address is bogus, but
 751          that is the user's responsibility.  If ->buffer is NULL this
 752          is a simple allocation.  */
 753       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 754       if (dfa == NULL)
 755         return REG_ESPACE;
 756       preg->allocated = sizeof (re_dfa_t);
 757       preg->buffer = (unsigned char *) dfa;
 758     }
 759   preg->used = sizeof (re_dfa_t);
 760
 761   err = init_dfa (dfa, length);
 762   if (BE (err != REG_NOERROR, 0))
 763     {
 764       free_dfa_content (dfa);
 765       preg->buffer = NULL;
 766       preg->allocated = 0;
 767       return err;
 768     }
 769 #ifdef DEBUG
 770   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 771   dfa->re_str = re_malloc (char, length + 1);
 772   strncpy (dfa->re_str, pattern, length + 1);
 773 #endif
 774
 775   __libc_lock_init (dfa->lock);
 776
 777   err = re_string_construct (&regexp, pattern, length, preg->translate,
 778                              syntax & RE_ICASE, dfa);
 779   if (BE (err != REG_NOERROR, 0))
 780     {
 781     re_compile_internal_free_return:
 782       free_workarea_compile (preg);
 783       re_string_destruct (&regexp);
 784       free_dfa_content (dfa);
 785       preg->buffer = NULL;
 786       preg->allocated = 0;
 787       return err;
 788     }
 789
 790   /* Parse the regular expression, and build a structure tree.  */
 791   preg->re_nsub = 0;
 792   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 793   if (BE (dfa->str_tree == NULL, 0))
 794     goto re_compile_internal_free_return;
 795
 796   /* Analyze the tree and create the nfa.  */
 797   err = analyze (preg);
 798   if (BE (err != REG_NOERROR, 0))
 799     goto re_compile_internal_free_return;
 800
 801 #ifdef RE_ENABLE_I18N
 802   /* If possible, do searching in single byte encoding to speed things up.  */
 803   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 804     optimize_utf8 (dfa);
 805 #endif
 806
 807   /* Then create the initial state of the dfa.  */
 808   err = create_initial_state (dfa);
 809
 810   /* Release work areas.  */
 811   free_workarea_compile (preg);
 812   re_string_destruct (&regexp);
 813
 814   if (BE (err != REG_NOERROR, 0))
 815     {
 816       free_dfa_content (dfa);
 817       preg->buffer = NULL;
 818       preg->allocated = 0;
 819     }
 820
 821   return err;
 822 }
 823
 824 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 825    as the initial length of some arrays.  */
 826
 827 static reg_errcode_t
 828 init_dfa (re_dfa_t *dfa, size_t pat_len)
 829 {
 830   unsigned int table_size;
 831 #ifndef _LIBC
 832   char *codeset_name;
 833 #endif
 834
 835   memset (dfa, '\0', sizeof (re_dfa_t));
 836
 837   /* Force allocation of str_tree_storage the first time.  */
 838   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 839
 840   /* Avoid overflows.  */
 841   if (pat_len == SIZE_MAX)
 842     return REG_ESPACE;
 843
 844   dfa->nodes_alloc = pat_len + 1;
 845   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 846
 847   /*  table_size = 2 ^ ceil(log pat_len) */
 848   for (table_size = 1; ; table_size <<= 1)
 849     if (table_size > pat_len)
 850       break;
 851
 852   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 853   dfa->state_hash_mask = table_size - 1;
 854
 855   dfa->mb_cur_max = MB_CUR_MAX;
 856 #ifdef _LIBC
 857   if (dfa->mb_cur_max == 6
 858       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 859     dfa->is_utf8 = 1;
 860   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 861                        != 0);
 862 #else
 863 # ifdef HAVE_LANGINFO_CODESET
 864   codeset_name = nl_langinfo (CODESET);
 865 # else
 866   codeset_name = getenv ("LC_ALL");
 867   if (codeset_name == NULL || codeset_name[0] == '\0')
 868     codeset_name = getenv ("LC_CTYPE");
 869   if (codeset_name == NULL || codeset_name[0] == '\0')
 870     codeset_name = getenv ("LANG");
 871   if (codeset_name == NULL)
 872     codeset_name = "";
 873   else if (strchr (codeset_name, '.') !=  NULL)
 874     codeset_name = strchr (codeset_name, '.') + 1;
 875 # endif
 876
 877   if (strcasecmp (codeset_name, "UTF-8") == 0
 878       || strcasecmp (codeset_name, "UTF8") == 0)
 879     dfa->is_utf8 = 1;
 880
 881   /* We check exhaustively in the loop below if this charset is a
 882      superset of ASCII.  */
 883   dfa->map_notascii = 0;
 884 #endif
 885
 886 #ifdef RE_ENABLE_I18N
 887   if (dfa->mb_cur_max > 1)
 888     {
 889       if (dfa->is_utf8)
 890         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 891       else
 892         {
 893           int i, j, ch;
 894
 895           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 896           if (BE (dfa->sb_char == NULL, 0))
 897             return REG_ESPACE;
 898
 899           /* Set the bits corresponding to single byte chars.  */
 900           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 901             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 902               {
 903                 wint_t wch = __btowc (ch);
 904                 if (wch != WEOF)
 905                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 906 # ifndef _LIBC
 907                 if (isascii (ch) && wch != ch)
 908                   dfa->map_notascii = 1;
 909 # endif
 910               }
 911         }
 912     }
 913 #endif
 914
 915   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 916     return REG_ESPACE;
 917   return REG_NOERROR;
 918 }
 919
 920 /* Initialize WORD_CHAR table, which indicate which character is
 921    "word".  In this case "word" means that it is the word construction
 922    character used by some operators like "\<", "\>", etc.  */
 923
 924 static void
 925 internal_function
 926 init_word_char (re_dfa_t *dfa)
 927 {
 928   dfa->word_ops_used = 1;
 929   int i = 0;
 930   int ch = 0;
 931   if (BE (dfa->map_notascii == 0, 1))
 932     {
 933       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
 934          them, an issue when this code is used in Gnulib.  */
 935       bitset_word_t bits0 = 0x00000000;
 936       bitset_word_t bits1 = 0x03ff0000;
 937       bitset_word_t bits2 = 0x87fffffe;
 938       bitset_word_t bits3 = 0x07fffffe;
 939
 940       if (BITSET_WORD_BITS == 64)
 941         {
 942           /* Pacify gcc -Woverflow on 32-bit platformns.  */
 943           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 944           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 945           i = 2;
 946         }
 947       else if (BITSET_WORD_BITS == 32)
 948         {
 949           dfa->word_char[0] = bits0;
 950           dfa->word_char[1] = bits1;
 951           dfa->word_char[2] = bits2;
 952           dfa->word_char[3] = bits3;
 953           i = 4;
 954         }
 955       else
 956         goto general_case;
 957       ch = 128;
 958
 959       if (BE (dfa->is_utf8, 1))
 960         {
 961           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 962           return;
 963         }
 964     }
 965
 966 general_case:
 967   for (; i < BITSET_WORDS; ++i)
 968     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 969       if (isalnum (ch) || ch == '_')
 970         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 971 }
 972
 973 /* Free the work area which are only used while compiling.  */
 974
 975 static void
 976 free_workarea_compile (regex_t *preg)
 977 {
 978   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 979   bin_tree_storage_t *storage, *next;
 980   for (storage = dfa->str_tree_storage; storage; storage = next)
 981     {
 982       next = storage->next;
 983       re_free (storage);
 984     }
 985   dfa->str_tree_storage = NULL;
 986   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 987   dfa->str_tree = NULL;
 988   re_free (dfa->org_indices);
 989   dfa->org_indices = NULL;
 990 }
 991
 992 /* Create initial states for all contexts.  */
 993
 994 static reg_errcode_t
 995 create_initial_state (re_dfa_t *dfa)
 996 {
 997   int first, i;
 998   reg_errcode_t err;
 999   re_node_set init_nodes;
1000
1001   /* Initial states have the epsilon closure of the node which is
1002      the first node of the regular expression.  */
1003   first = dfa->str_tree->first->node_idx;
1004   dfa->init_node = first;
1005   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1006   if (BE (err != REG_NOERROR, 0))
1007     return err;
1008
1009   /* The back-references which are in initial states can epsilon transit,
1010      since in this case all of the subexpressions can be null.
1011      Then we add epsilon closures of the nodes which are the next nodes of
1012      the back-references.  */
1013   if (dfa->nbackref > 0)
1014     for (i = 0; i < init_nodes.nelem; ++i)
1015       {
1016         int node_idx = init_nodes.elems[i];
1017         re_token_type_t type = dfa->nodes[node_idx].type;
1018
1019         int clexp_idx;
1020         if (type != OP_BACK_REF)
1021           continue;
1022         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1023           {
1024             re_token_t *clexp_node;
1025             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1026             if (clexp_node->type == OP_CLOSE_SUBEXP
1027                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1028               break;
1029           }
1030         if (clexp_idx == init_nodes.nelem)
1031           continue;
1032
1033         if (type == OP_BACK_REF)
1034           {
1035             int dest_idx = dfa->edests[node_idx].elems[0];
1036             if (!re_node_set_contains (&init_nodes, dest_idx))
1037               {
1038                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1039                                                        dfa->eclosures
1040                                                        + dest_idx);
1041                 if (err != REG_NOERROR)
1042                   return err;
1043                 i = 0;
1044               }
1045           }
1046       }
1047
1048   /* It must be the first time to invoke acquire_state.  */
1049   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1050   /* We don't check ERR here, since the initial state must not be NULL.  */
1051   if (BE (dfa->init_state == NULL, 0))
1052     return err;
1053   if (dfa->init_state->has_constraint)
1054     {
1055       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1056                                                        CONTEXT_WORD);
1057       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1058                                                      CONTEXT_NEWLINE);
1059       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1060                                                          &init_nodes,
1061                                                          CONTEXT_NEWLINE
1062                                                          | CONTEXT_BEGBUF);
1063       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1064               || dfa->init_state_begbuf == NULL, 0))
1065         return err;
1066     }
1067   else
1068     dfa->init_state_word = dfa->init_state_nl
1069       = dfa->init_state_begbuf = dfa->init_state;
1070
1071   re_node_set_free (&init_nodes);
1072   return REG_NOERROR;
1073 }
1074 \f
1075 #ifdef RE_ENABLE_I18N
1076 /* If it is possible to do searching in single byte encoding instead of UTF-8
1077    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1078    DFA nodes where needed.  */
1079
1080 static void
1081 optimize_utf8 (re_dfa_t *dfa)
1082 {
1083   int node, i, mb_chars = 0, has_period = 0;
1084
1085   for (node = 0; node < dfa->nodes_len; ++node)
1086     switch (dfa->nodes[node].type)
1087       {
1088       case CHARACTER:
1089         if (dfa->nodes[node].opr.c >= 0x80)
1090           mb_chars = 1;
1091         break;
1092       case ANCHOR:
1093         switch (dfa->nodes[node].opr.ctx_type)
1094           {
1095           case LINE_FIRST:
1096           case LINE_LAST:
1097           case BUF_FIRST:
1098           case BUF_LAST:
1099             break;
1100           default:
1101             /* Word anchors etc. cannot be handled.  It's okay to test
1102                opr.ctx_type since constraints (for all DFA nodes) are
1103                created by ORing one or more opr.ctx_type values.  */
1104             return;
1105           }
1106         break;
1107       case OP_PERIOD:
1108         has_period = 1;
1109         break;
1110       case OP_BACK_REF:
1111       case OP_ALT:
1112       case END_OF_RE:
1113       case OP_DUP_ASTERISK:
1114       case OP_OPEN_SUBEXP:
1115       case OP_CLOSE_SUBEXP:
1116         break;
1117       case COMPLEX_BRACKET:
1118         return;
1119       case SIMPLE_BRACKET:
1120         /* Just double check.  The non-ASCII range starts at 0x80.  */
1121         assert (0x80 % BITSET_WORD_BITS == 0);
1122         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1123           if (dfa->nodes[node].opr.sbcset[i])
1124             return;
1125         break;
1126       default:
1127         abort ();
1128       }
1129
1130   if (mb_chars || has_period)
1131     for (node = 0; node < dfa->nodes_len; ++node)
1132       {
1133         if (dfa->nodes[node].type == CHARACTER
1134             && dfa->nodes[node].opr.c >= 0x80)
1135           dfa->nodes[node].mb_partial = 0;
1136         else if (dfa->nodes[node].type == OP_PERIOD)
1137           dfa->nodes[node].type = OP_UTF8_PERIOD;
1138       }
1139
1140   /* The search can be in single byte locale.  */
1141   dfa->mb_cur_max = 1;
1142   dfa->is_utf8 = 0;
1143   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1144 }
1145 #endif
1146 \f
1147 /* Analyze the structure tree, and calculate "first", "next", "edest",
1148    "eclosure", and "inveclosure".  */
1149
1150 static reg_errcode_t
1151 analyze (regex_t *preg)
1152 {
1153   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1154   reg_errcode_t ret;
1155
1156   /* Allocate arrays.  */
1157   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1158   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1159   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1160   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1161   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1162           || dfa->eclosures == NULL, 0))
1163     return REG_ESPACE;
1164
1165   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1166   if (dfa->subexp_map != NULL)
1167     {
1168       int i;
1169       for (i = 0; i < preg->re_nsub; i++)
1170         dfa->subexp_map[i] = i;
1171       preorder (dfa->str_tree, optimize_subexps, dfa);
1172       for (i = 0; i < preg->re_nsub; i++)
1173         if (dfa->subexp_map[i] != i)
1174           break;
1175       if (i == preg->re_nsub)
1176         {
1177           free (dfa->subexp_map);
1178           dfa->subexp_map = NULL;
1179         }
1180     }
1181
1182   ret = postorder (dfa->str_tree, lower_subexps, preg);
1183   if (BE (ret != REG_NOERROR, 0))
1184     return ret;
1185   ret = postorder (dfa->str_tree, calc_first, dfa);
1186   if (BE (ret != REG_NOERROR, 0))
1187     return ret;
1188   preorder (dfa->str_tree, calc_next, dfa);
1189   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1190   if (BE (ret != REG_NOERROR, 0))
1191     return ret;
1192   ret = calc_eclosure (dfa);
1193   if (BE (ret != REG_NOERROR, 0))
1194     return ret;
1195
1196   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1197      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1198   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1199       || dfa->nbackref)
1200     {
1201       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1202       if (BE (dfa->inveclosures == NULL, 0))
1203         return REG_ESPACE;
1204       ret = calc_inveclosure (dfa);
1205     }
1206
1207   return ret;
1208 }
1209
1210 /* Our parse trees are very unbalanced, so we cannot use a stack to
1211    implement parse tree visits.  Instead, we use parent pointers and
1212    some hairy code in these two functions.  */
1213 static reg_errcode_t
1214 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1215            void *extra)
1216 {
1217   bin_tree_t *node, *prev;
1218
1219   for (node = root; ; )
1220     {
1221       /* Descend down the tree, preferably to the left (or to the right
1222          if that's the only child).  */
1223       while (node->left || node->right)
1224         if (node->left)
1225           node = node->left;
1226         else
1227           node = node->right;
1228
1229       do
1230         {
1231           reg_errcode_t err = fn (extra, node);
1232           if (BE (err != REG_NOERROR, 0))
1233             return err;
1234           if (node->parent == NULL)
1235             return REG_NOERROR;
1236           prev = node;
1237           node = node->parent;
1238         }
1239       /* Go up while we have a node that is reached from the right.  */
1240       while (node->right == prev || node->right == NULL);
1241       node = node->right;
1242     }
1243 }
1244
1245 static reg_errcode_t
1246 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1247           void *extra)
1248 {
1249   bin_tree_t *node;
1250
1251   for (node = root; ; )
1252     {
1253       reg_errcode_t err = fn (extra, node);
1254       if (BE (err != REG_NOERROR, 0))
1255         return err;
1256
1257       /* Go to the left node, or up and to the right.  */
1258       if (node->left)
1259         node = node->left;
1260       else
1261         {
1262           bin_tree_t *prev = NULL;
1263           while (node->right == prev || node->right == NULL)
1264             {
1265               prev = node;
1266               node = node->parent;
1267               if (!node)
1268                 return REG_NOERROR;
1269             }
1270           node = node->right;
1271         }
1272     }
1273 }
1274
1275 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1276    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1277    backreferences as well.  Requires a preorder visit.  */
1278 static reg_errcode_t
1279 optimize_subexps (void *extra, bin_tree_t *node)
1280 {
1281   re_dfa_t *dfa = (re_dfa_t *) extra;
1282
1283   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1284     {
1285       int idx = node->token.opr.idx;
1286       node->token.opr.idx = dfa->subexp_map[idx];
1287       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1288     }
1289
1290   else if (node->token.type == SUBEXP
1291            && node->left && node->left->token.type == SUBEXP)
1292     {
1293       int other_idx = node->left->token.opr.idx;
1294
1295       node->left = node->left->left;
1296       if (node->left)
1297         node->left->parent = node;
1298
1299       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1300       if (other_idx < BITSET_WORD_BITS)
1301           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1302     }
1303
1304   return REG_NOERROR;
1305 }
1306
1307 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1308    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1309 static reg_errcode_t
1310 lower_subexps (void *extra, bin_tree_t *node)
1311 {
1312   regex_t *preg = (regex_t *) extra;
1313   reg_errcode_t err = REG_NOERROR;
1314
1315   if (node->left && node->left->token.type == SUBEXP)
1316     {
1317       node->left = lower_subexp (&err, preg, node->left);
1318       if (node->left)
1319         node->left->parent = node;
1320     }
1321   if (node->right && node->right->token.type == SUBEXP)
1322     {
1323       node->right = lower_subexp (&err, preg, node->right);
1324       if (node->right)
1325         node->right->parent = node;
1326     }
1327
1328   return err;
1329 }
1330
1331 static bin_tree_t *
1332 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1333 {
1334   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1335   bin_tree_t *body = node->left;
1336   bin_tree_t *op, *cls, *tree1, *tree;
1337
1338   if (preg->no_sub
1339       /* We do not optimize empty subexpressions, because otherwise we may
1340          have bad CONCAT nodes with NULL children.  This is obviously not
1341          very common, so we do not lose much.  An example that triggers
1342          this case is the sed "script" /\(\)/x.  */
1343       && node->left != NULL
1344       && (node->token.opr.idx >= BITSET_WORD_BITS
1345           || !(dfa->used_bkref_map
1346                & ((bitset_word_t) 1 << node->token.opr.idx))))
1347     return node->left;
1348
1349   /* Convert the SUBEXP node to the concatenation of an
1350      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1351   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1352   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1353   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1354   tree = create_tree (dfa, op, tree1, CONCAT);
1355   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1356     {
1357       *err = REG_ESPACE;
1358       return NULL;
1359     }
1360
1361   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1362   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1363   return tree;
1364 }
1365
1366 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1367    nodes.  Requires a postorder visit.  */
1368 static reg_errcode_t
1369 calc_first (void *extra, bin_tree_t *node)
1370 {
1371   re_dfa_t *dfa = (re_dfa_t *) extra;
1372   if (node->token.type == CONCAT)
1373     {
1374       node->first = node->left->first;
1375       node->node_idx = node->left->node_idx;
1376     }
1377   else
1378     {
1379       node->first = node;
1380       node->node_idx = re_dfa_add_node (dfa, node->token);
1381       if (BE (node->node_idx == -1, 0))
1382         return REG_ESPACE;
1383       if (node->token.type == ANCHOR)
1384         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1385     }
1386   return REG_NOERROR;
1387 }
1388
1389 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1390 static reg_errcode_t
1391 calc_next (void *extra, bin_tree_t *node)
1392 {
1393   switch (node->token.type)
1394     {
1395     case OP_DUP_ASTERISK:
1396       node->left->next = node;
1397       break;
1398     case CONCAT:
1399       node->left->next = node->right->first;
1400       node->right->next = node->next;
1401       break;
1402     default:
1403       if (node->left)
1404         node->left->next = node->next;
1405       if (node->right)
1406         node->right->next = node->next;
1407       break;
1408     }
1409   return REG_NOERROR;
1410 }
1411
1412 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1413 static reg_errcode_t
1414 link_nfa_nodes (void *extra, bin_tree_t *node)
1415 {
1416   re_dfa_t *dfa = (re_dfa_t *) extra;
1417   int idx = node->node_idx;
1418   reg_errcode_t err = REG_NOERROR;
1419
1420   switch (node->token.type)
1421     {
1422     case CONCAT:
1423       break;
1424
1425     case END_OF_RE:
1426       assert (node->next == NULL);
1427       break;
1428
1429     case OP_DUP_ASTERISK:
1430     case OP_ALT:
1431       {
1432         int left, right;
1433         dfa->has_plural_match = 1;
1434         if (node->left != NULL)
1435           left = node->left->first->node_idx;
1436         else
1437           left = node->next->node_idx;
1438         if (node->right != NULL)
1439           right = node->right->first->node_idx;
1440         else
1441           right = node->next->node_idx;
1442         assert (left > -1);
1443         assert (right > -1);
1444         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1445       }
1446       break;
1447
1448     case ANCHOR:
1449     case OP_OPEN_SUBEXP:
1450     case OP_CLOSE_SUBEXP:
1451       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1452       break;
1453
1454     case OP_BACK_REF:
1455       dfa->nexts[idx] = node->next->node_idx;
1456       if (node->token.type == OP_BACK_REF)
1457         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1458       break;
1459
1460     default:
1461       assert (!IS_EPSILON_NODE (node->token.type));
1462       dfa->nexts[idx] = node->next->node_idx;
1463       break;
1464     }
1465
1466   return err;
1467 }
1468
1469 /* Duplicate the epsilon closure of the node ROOT_NODE.
1470    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1471    to their own constraint.  */
1472
1473 static reg_errcode_t
1474 internal_function
1475 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1476                         int root_node, unsigned int init_constraint)
1477 {
1478   int org_node, clone_node, ret;
1479   unsigned int constraint = init_constraint;
1480   for (org_node = top_org_node, clone_node = top_clone_node;;)
1481     {
1482       int org_dest, clone_dest;
1483       if (dfa->nodes[org_node].type == OP_BACK_REF)
1484         {
1485           /* If the back reference epsilon-transit, its destination must
1486              also have the constraint.  Then duplicate the epsilon closure
1487              of the destination of the back reference, and store it in
1488              edests of the back reference.  */
1489           org_dest = dfa->nexts[org_node];
1490           re_node_set_empty (dfa->edests + clone_node);
1491           clone_dest = duplicate_node (dfa, org_dest, constraint);
1492           if (BE (clone_dest == -1, 0))
1493             return REG_ESPACE;
1494           dfa->nexts[clone_node] = dfa->nexts[org_node];
1495           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496           if (BE (ret < 0, 0))
1497             return REG_ESPACE;
1498         }
1499       else if (dfa->edests[org_node].nelem == 0)
1500         {
1501           /* In case of the node can't epsilon-transit, don't duplicate the
1502              destination and store the original destination as the
1503              destination of the node.  */
1504           dfa->nexts[clone_node] = dfa->nexts[org_node];
1505           break;
1506         }
1507       else if (dfa->edests[org_node].nelem == 1)
1508         {
1509           /* In case of the node can epsilon-transit, and it has only one
1510              destination.  */
1511           org_dest = dfa->edests[org_node].elems[0];
1512           re_node_set_empty (dfa->edests + clone_node);
1513           /* If the node is root_node itself, it means the epsilon clsoure
1514              has a loop.   Then tie it to the destination of the root_node.  */
1515           if (org_node == root_node && clone_node != org_node)
1516             {
1517               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1518               if (BE (ret < 0, 0))
1519                 return REG_ESPACE;
1520               break;
1521             }
1522           /* In case of the node has another constraint, add it.  */
1523           constraint |= dfa->nodes[org_node].constraint;
1524           clone_dest = duplicate_node (dfa, org_dest, constraint);
1525           if (BE (clone_dest == -1, 0))
1526             return REG_ESPACE;
1527           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528           if (BE (ret < 0, 0))
1529             return REG_ESPACE;
1530         }
1531       else /* dfa->edests[org_node].nelem == 2 */
1532         {
1533           /* In case of the node can epsilon-transit, and it has two
1534              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1535           org_dest = dfa->edests[org_node].elems[0];
1536           re_node_set_empty (dfa->edests + clone_node);
1537           /* Search for a duplicated node which satisfies the constraint.  */
1538           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1539           if (clone_dest == -1)
1540             {
1541               /* There is no such duplicated node, create a new one.  */
1542               reg_errcode_t err;
1543               clone_dest = duplicate_node (dfa, org_dest, constraint);
1544               if (BE (clone_dest == -1, 0))
1545                 return REG_ESPACE;
1546               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547               if (BE (ret < 0, 0))
1548                 return REG_ESPACE;
1549               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1550                                             root_node, constraint);
1551               if (BE (err != REG_NOERROR, 0))
1552                 return err;
1553             }
1554           else
1555             {
1556               /* There is a duplicated node which satisfies the constraint,
1557                  use it to avoid infinite loop.  */
1558               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559               if (BE (ret < 0, 0))
1560                 return REG_ESPACE;
1561             }
1562
1563           org_dest = dfa->edests[org_node].elems[1];
1564           clone_dest = duplicate_node (dfa, org_dest, constraint);
1565           if (BE (clone_dest == -1, 0))
1566             return REG_ESPACE;
1567           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568           if (BE (ret < 0, 0))
1569             return REG_ESPACE;
1570         }
1571       org_node = org_dest;
1572       clone_node = clone_dest;
1573     }
1574   return REG_NOERROR;
1575 }
1576
1577 /* Search for a node which is duplicated from the node ORG_NODE, and
1578    satisfies the constraint CONSTRAINT.  */
1579
1580 static int
1581 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1582                         unsigned int constraint)
1583 {
1584   int idx;
1585   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1586     {
1587       if (org_node == dfa->org_indices[idx]
1588           && constraint == dfa->nodes[idx].constraint)
1589         return idx; /* Found.  */
1590     }
1591   return -1; /* Not found.  */
1592 }
1593
1594 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1595    Return the index of the new node, or -1 if insufficient storage is
1596    available.  */
1597
1598 static int
1599 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1600 {
1601   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1602   if (BE (dup_idx != -1, 1))
1603     {
1604       dfa->nodes[dup_idx].constraint = constraint;
1605       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1606       dfa->nodes[dup_idx].duplicated = 1;
1607
1608       /* Store the index of the original node.  */
1609       dfa->org_indices[dup_idx] = org_idx;
1610     }
1611   return dup_idx;
1612 }
1613
1614 static reg_errcode_t
1615 calc_inveclosure (re_dfa_t *dfa)
1616 {
1617   int src, idx, ret;
1618   for (idx = 0; idx < dfa->nodes_len; ++idx)
1619     re_node_set_init_empty (dfa->inveclosures + idx);
1620
1621   for (src = 0; src < dfa->nodes_len; ++src)
1622     {
1623       int *elems = dfa->eclosures[src].elems;
1624       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1625         {
1626           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1627           if (BE (ret == -1, 0))
1628             return REG_ESPACE;
1629         }
1630     }
1631
1632   return REG_NOERROR;
1633 }
1634
1635 /* Calculate "eclosure" for all the node in DFA.  */
1636
1637 static reg_errcode_t
1638 calc_eclosure (re_dfa_t *dfa)
1639 {
1640   int node_idx, incomplete;
1641 #ifdef DEBUG
1642   assert (dfa->nodes_len > 0);
1643 #endif
1644   incomplete = 0;
1645   /* For each nodes, calculate epsilon closure.  */
1646   for (node_idx = 0; ; ++node_idx)
1647     {
1648       reg_errcode_t err;
1649       re_node_set eclosure_elem;
1650       if (node_idx == dfa->nodes_len)
1651         {
1652           if (!incomplete)
1653             break;
1654           incomplete = 0;
1655           node_idx = 0;
1656         }
1657
1658 #ifdef DEBUG
1659       assert (dfa->eclosures[node_idx].nelem != -1);
1660 #endif
1661
1662       /* If we have already calculated, skip it.  */
1663       if (dfa->eclosures[node_idx].nelem != 0)
1664         continue;
1665       /* Calculate epsilon closure of `node_idx'.  */
1666       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1667       if (BE (err != REG_NOERROR, 0))
1668         return err;
1669
1670       if (dfa->eclosures[node_idx].nelem == 0)
1671         {
1672           incomplete = 1;
1673           re_node_set_free (&eclosure_elem);
1674         }
1675     }
1676   return REG_NOERROR;
1677 }
1678
1679 /* Calculate epsilon closure of NODE.  */
1680
1681 static reg_errcode_t
1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1683 {
1684   reg_errcode_t err;
1685   int i;
1686   re_node_set eclosure;
1687   int ret;
1688   int incomplete = 0;
1689   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690   if (BE (err != REG_NOERROR, 0))
1691     return err;
1692
1693   /* This indicates that we are calculating this node now.
1694      We reference this value to avoid infinite loop.  */
1695   dfa->eclosures[node].nelem = -1;
1696
1697   /* If the current node has constraints, duplicate all nodes
1698      since they must inherit the constraints.  */
1699   if (dfa->nodes[node].constraint
1700       && dfa->edests[node].nelem
1701       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1702     {
1703       err = duplicate_node_closure (dfa, node, node, node,
1704                                     dfa->nodes[node].constraint);
1705       if (BE (err != REG_NOERROR, 0))
1706         return err;
1707     }
1708
1709   /* Expand each epsilon destination nodes.  */
1710   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711     for (i = 0; i < dfa->edests[node].nelem; ++i)
1712       {
1713         re_node_set eclosure_elem;
1714         int edest = dfa->edests[node].elems[i];
1715         /* If calculating the epsilon closure of `edest' is in progress,
1716            return intermediate result.  */
1717         if (dfa->eclosures[edest].nelem == -1)
1718           {
1719             incomplete = 1;
1720             continue;
1721           }
1722         /* If we haven't calculated the epsilon closure of `edest' yet,
1723            calculate now. Otherwise use calculated epsilon closure.  */
1724         if (dfa->eclosures[edest].nelem == 0)
1725           {
1726             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1727             if (BE (err != REG_NOERROR, 0))
1728               return err;
1729           }
1730         else
1731           eclosure_elem = dfa->eclosures[edest];
1732         /* Merge the epsilon closure of `edest'.  */
1733         err = re_node_set_merge (&eclosure, &eclosure_elem);
1734         if (BE (err != REG_NOERROR, 0))
1735           return err;
1736         /* If the epsilon closure of `edest' is incomplete,
1737            the epsilon closure of this node is also incomplete.  */
1738         if (dfa->eclosures[edest].nelem == 0)
1739           {
1740             incomplete = 1;
1741             re_node_set_free (&eclosure_elem);
1742           }
1743       }
1744
1745   /* An epsilon closure includes itself.  */
1746   ret = re_node_set_insert (&eclosure, node);
1747   if (BE (ret < 0, 0))
1748     return REG_ESPACE;
1749   if (incomplete && !root)
1750     dfa->eclosures[node].nelem = 0;
1751   else
1752     dfa->eclosures[node] = eclosure;
1753   *new_set = eclosure;
1754   return REG_NOERROR;
1755 }
1756 \f
1757 /* Functions for token which are used in the parser.  */
1758
1759 /* Fetch a token from INPUT.
1760    We must not use this function inside bracket expressions.  */
1761
1762 static void
1763 internal_function
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1765 {
1766   re_string_skip_bytes (input, peek_token (result, input, syntax));
1767 }
1768
1769 /* Peek a token from INPUT, and return the length of the token.
1770    We must not use this function inside bracket expressions.  */
1771
1772 static int
1773 internal_function
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1775 {
1776   unsigned char c;
1777
1778   if (re_string_eoi (input))
1779     {
1780       token->type = END_OF_RE;
1781       return 0;
1782     }
1783
1784   c = re_string_peek_byte (input, 0);
1785   token->opr.c = c;
1786
1787   token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789   token->mb_partial = 0;
1790   if (input->mb_cur_max > 1 &&
1791       !re_string_first_byte (input, re_string_cur_idx (input)))
1792     {
1793       token->type = CHARACTER;
1794       token->mb_partial = 1;
1795       return 1;
1796     }
1797 #endif
1798   if (c == '\\')
1799     {
1800       unsigned char c2;
1801       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1802         {
1803           token->type = BACK_SLASH;
1804           return 1;
1805         }
1806
1807       c2 = re_string_peek_byte_case (input, 1);
1808       token->opr.c = c2;
1809       token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811       if (input->mb_cur_max > 1)
1812         {
1813           wint_t wc = re_string_wchar_at (input,
1814                                           re_string_cur_idx (input) + 1);
1815           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1816         }
1817       else
1818 #endif
1819         token->word_char = IS_WORD_CHAR (c2) != 0;
1820
1821       switch (c2)
1822         {
1823         case '|':
1824           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825             token->type = OP_ALT;
1826           break;
1827         case '1': case '2': case '3': case '4': case '5':
1828         case '6': case '7': case '8': case '9':
1829           if (!(syntax & RE_NO_BK_REFS))
1830             {
1831               token->type = OP_BACK_REF;
1832               token->opr.idx = c2 - '1';
1833             }
1834           break;
1835         case '<':
1836           if (!(syntax & RE_NO_GNU_OPS))
1837             {
1838               token->type = ANCHOR;
1839               token->opr.ctx_type = WORD_FIRST;
1840             }
1841           break;
1842         case '>':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             {
1845               token->type = ANCHOR;
1846               token->opr.ctx_type = WORD_LAST;
1847             }
1848           break;
1849         case 'b':
1850           if (!(syntax & RE_NO_GNU_OPS))
1851             {
1852               token->type = ANCHOR;
1853               token->opr.ctx_type = WORD_DELIM;
1854             }
1855           break;
1856         case 'B':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             {
1859               token->type = ANCHOR;
1860               token->opr.ctx_type = NOT_WORD_DELIM;
1861             }
1862           break;
1863         case 'w':
1864           if (!(syntax & RE_NO_GNU_OPS))
1865             token->type = OP_WORD;
1866           break;
1867         case 'W':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             token->type = OP_NOTWORD;
1870           break;
1871         case 's':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             token->type = OP_SPACE;
1874           break;
1875         case 'S':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_NOTSPACE;
1878           break;
1879         case '`':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             {
1882               token->type = ANCHOR;
1883               token->opr.ctx_type = BUF_FIRST;
1884             }
1885           break;
1886         case '\'':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             {
1889               token->type = ANCHOR;
1890               token->opr.ctx_type = BUF_LAST;
1891             }
1892           break;
1893         case '(':
1894           if (!(syntax & RE_NO_BK_PARENS))
1895             token->type = OP_OPEN_SUBEXP;
1896           break;
1897         case ')':
1898           if (!(syntax & RE_NO_BK_PARENS))
1899             token->type = OP_CLOSE_SUBEXP;
1900           break;
1901         case '+':
1902           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903             token->type = OP_DUP_PLUS;
1904           break;
1905         case '?':
1906           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907             token->type = OP_DUP_QUESTION;
1908           break;
1909         case '{':
1910           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911             token->type = OP_OPEN_DUP_NUM;
1912           break;
1913         case '}':
1914           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915             token->type = OP_CLOSE_DUP_NUM;
1916           break;
1917         default:
1918           break;
1919         }
1920       return 2;
1921     }
1922
1923   token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925   if (input->mb_cur_max > 1)
1926     {
1927       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1929     }
1930   else
1931 #endif
1932     token->word_char = IS_WORD_CHAR (token->opr.c);
1933
1934   switch (c)
1935     {
1936     case '\n':
1937       if (syntax & RE_NEWLINE_ALT)
1938         token->type = OP_ALT;
1939       break;
1940     case '|':
1941       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942         token->type = OP_ALT;
1943       break;
1944     case '*':
1945       token->type = OP_DUP_ASTERISK;
1946       break;
1947     case '+':
1948       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949         token->type = OP_DUP_PLUS;
1950       break;
1951     case '?':
1952       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953         token->type = OP_DUP_QUESTION;
1954       break;
1955     case '{':
1956       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957         token->type = OP_OPEN_DUP_NUM;
1958       break;
1959     case '}':
1960       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961         token->type = OP_CLOSE_DUP_NUM;
1962       break;
1963     case '(':
1964       if (syntax & RE_NO_BK_PARENS)
1965         token->type = OP_OPEN_SUBEXP;
1966       break;
1967     case ')':
1968       if (syntax & RE_NO_BK_PARENS)
1969         token->type = OP_CLOSE_SUBEXP;
1970       break;
1971     case '[':
1972       token->type = OP_OPEN_BRACKET;
1973       break;
1974     case '.':
1975       token->type = OP_PERIOD;
1976       break;
1977     case '^':
1978       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979           re_string_cur_idx (input) != 0)
1980         {
1981           char prev = re_string_peek_byte (input, -1);
1982           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983             break;
1984         }
1985       token->type = ANCHOR;
1986       token->opr.ctx_type = LINE_FIRST;
1987       break;
1988     case '$':
1989       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990           re_string_cur_idx (input) + 1 != re_string_length (input))
1991         {
1992           re_token_t next;
1993           re_string_skip_bytes (input, 1);
1994           peek_token (&next, input, syntax);
1995           re_string_skip_bytes (input, -1);
1996           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997             break;
1998         }
1999       token->type = ANCHOR;
2000       token->opr.ctx_type = LINE_LAST;
2001       break;
2002     default:
2003       break;
2004     }
2005   return 1;
2006 }
2007
2008 /* Peek a token from INPUT, and return the length of the token.
2009    We must not use this function out of bracket expressions.  */
2010
2011 static int
2012 internal_function
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2014 {
2015   unsigned char c;
2016   if (re_string_eoi (input))
2017     {
2018       token->type = END_OF_RE;
2019       return 0;
2020     }
2021   c = re_string_peek_byte (input, 0);
2022   token->opr.c = c;
2023
2024 #ifdef RE_ENABLE_I18N
2025   if (input->mb_cur_max > 1 &&
2026       !re_string_first_byte (input, re_string_cur_idx (input)))
2027     {
2028       token->type = CHARACTER;
2029       return 1;
2030     }
2031 #endif /* RE_ENABLE_I18N */
2032
2033   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034       && re_string_cur_idx (input) + 1 < re_string_length (input))
2035     {
2036       /* In this case, '\' escape a character.  */
2037       unsigned char c2;
2038       re_string_skip_bytes (input, 1);
2039       c2 = re_string_peek_byte (input, 0);
2040       token->opr.c = c2;
2041       token->type = CHARACTER;
2042       return 1;
2043     }
2044   if (c == '[') /* '[' is a special char in a bracket exps.  */
2045     {
2046       unsigned char c2;
2047       int token_len;
2048       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049         c2 = re_string_peek_byte (input, 1);
2050       else
2051         c2 = 0;
2052       token->opr.c = c2;
2053       token_len = 2;
2054       switch (c2)
2055         {
2056         case '.':
2057           token->type = OP_OPEN_COLL_ELEM;
2058           break;
2059         case '=':
2060           token->type = OP_OPEN_EQUIV_CLASS;
2061           break;
2062         case ':':
2063           if (syntax & RE_CHAR_CLASSES)
2064             {
2065               token->type = OP_OPEN_CHAR_CLASS;
2066               break;
2067             }
2068           /* else fall through.  */
2069         default:
2070           token->type = CHARACTER;
2071           token->opr.c = c;
2072           token_len = 1;
2073           break;
2074         }
2075       return token_len;
2076     }
2077   switch (c)
2078     {
2079     case '-':
2080       token->type = OP_CHARSET_RANGE;
2081       break;
2082     case ']':
2083       token->type = OP_CLOSE_BRACKET;
2084       break;
2085     case '^':
2086       token->type = OP_NON_MATCH_LIST;
2087       break;
2088     default:
2089       token->type = CHARACTER;
2090     }
2091   return 1;
2092 }
2093 \f
2094 /* Functions for parser.  */
2095
2096 /* Entry point of the parser.
2097    Parse the regular expression REGEXP and return the structure tree.
2098    If an error is occured, ERR is set by error code, and return NULL.
2099    This function build the following tree, from regular expression <reg_exp>:
2100            CAT
2101            / \
2102           /   \
2103    <reg_exp>  EOR
2104
2105    CAT means concatenation.
2106    EOR means end of regular expression.  */
2107
2108 static bin_tree_t *
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110        reg_errcode_t *err)
2111 {
2112   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113   bin_tree_t *tree, *eor, *root;
2114   re_token_t current_token;
2115   dfa->syntax = syntax;
2116   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2118   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119     return NULL;
2120   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2121   if (tree != NULL)
2122     root = create_tree (dfa, tree, eor, CONCAT);
2123   else
2124     root = eor;
2125   if (BE (eor == NULL || root == NULL, 0))
2126     {
2127       *err = REG_ESPACE;
2128       return NULL;
2129     }
2130   return root;
2131 }
2132
2133 /* This function build the following tree, from regular expression
2134    <branch1>|<branch2>:
2135            ALT
2136            / \
2137           /   \
2138    <branch1> <branch2>
2139
2140    ALT means alternative, which represents the operator `|'.  */
2141
2142 static bin_tree_t *
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2145 {
2146   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147   bin_tree_t *tree, *branch = NULL;
2148   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150     return NULL;
2151
2152   while (token->type == OP_ALT)
2153     {
2154       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155       if (token->type != OP_ALT && token->type != END_OF_RE
2156           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2157         {
2158           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2160             return NULL;
2161         }
2162       else
2163         branch = NULL;
2164       tree = create_tree (dfa, tree, branch, OP_ALT);
2165       if (BE (tree == NULL, 0))
2166         {
2167           *err = REG_ESPACE;
2168           return NULL;
2169         }
2170     }
2171   return tree;
2172 }
2173
2174 /* This function build the following tree, from regular expression
2175    <exp1><exp2>:
2176         CAT
2177         / \
2178        /   \
2179    <exp1> <exp2>
2180
2181    CAT means concatenation.  */
2182
2183 static bin_tree_t *
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2186 {
2187   bin_tree_t *tree, *exp;
2188   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191     return NULL;
2192
2193   while (token->type != OP_ALT && token->type != END_OF_RE
2194          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195     {
2196       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2197       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2198         {
2199           if (tree != NULL)
2200             postorder (tree, free_tree, NULL);
2201           return NULL;
2202         }
2203       if (tree != NULL && exp != NULL)
2204         {
2205           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2206           if (newtree == NULL)
2207             {
2208               postorder (exp, free_tree, NULL);
2209               postorder (tree, free_tree, NULL);
2210               *err = REG_ESPACE;
2211               return NULL;
2212             }
2213           tree = newtree;
2214         }
2215       else if (tree == NULL)
2216         tree = exp;
2217       /* Otherwise exp == NULL, we don't need to create new tree.  */
2218     }
2219   return tree;
2220 }
2221
2222 /* This function build the following tree, from regular expression a*:
2223          *
2224          |
2225          a
2226 */
2227
2228 static bin_tree_t *
2229 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2230                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2231 {
2232   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2233   bin_tree_t *tree;
2234   switch (token->type)
2235     {
2236     case CHARACTER:
2237       tree = create_token_tree (dfa, NULL, NULL, token);
2238       if (BE (tree == NULL, 0))
2239         {
2240           *err = REG_ESPACE;
2241           return NULL;
2242         }
2243 #ifdef RE_ENABLE_I18N
2244       if (dfa->mb_cur_max > 1)
2245         {
2246           while (!re_string_eoi (regexp)
2247                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2248             {
2249               bin_tree_t *mbc_remain;
2250               fetch_token (token, regexp, syntax);
2251               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2252               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2253               if (BE (mbc_remain == NULL || tree == NULL, 0))
2254                 {
2255                   *err = REG_ESPACE;
2256                   return NULL;
2257                 }
2258             }
2259         }
2260 #endif
2261       break;
2262     case OP_OPEN_SUBEXP:
2263       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2264       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265         return NULL;
2266       break;
2267     case OP_OPEN_BRACKET:
2268       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2269       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2270         return NULL;
2271       break;
2272     case OP_BACK_REF:
2273       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2274         {
2275           *err = REG_ESUBREG;
2276           return NULL;
2277         }
2278       dfa->used_bkref_map |= 1 << token->opr.idx;
2279       tree = create_token_tree (dfa, NULL, NULL, token);
2280       if (BE (tree == NULL, 0))
2281         {
2282           *err = REG_ESPACE;
2283           return NULL;
2284         }
2285       ++dfa->nbackref;
2286       dfa->has_mb_node = 1;
2287       break;
2288     case OP_OPEN_DUP_NUM:
2289       if (syntax & RE_CONTEXT_INVALID_DUP)
2290         {
2291           *err = REG_BADRPT;
2292           return NULL;
2293         }
2294       /* FALLTHROUGH */
2295     case OP_DUP_ASTERISK:
2296     case OP_DUP_PLUS:
2297     case OP_DUP_QUESTION:
2298       if (syntax & RE_CONTEXT_INVALID_OPS)
2299         {
2300           *err = REG_BADRPT;
2301           return NULL;
2302         }
2303       else if (syntax & RE_CONTEXT_INDEP_OPS)
2304         {
2305           fetch_token (token, regexp, syntax);
2306           return parse_expression (regexp, preg, token, syntax, nest, err);
2307         }
2308       /* else fall through  */
2309     case OP_CLOSE_SUBEXP:
2310       if ((token->type == OP_CLOSE_SUBEXP) &&
2311           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2312         {
2313           *err = REG_ERPAREN;
2314           return NULL;
2315         }
2316       /* else fall through  */
2317     case OP_CLOSE_DUP_NUM:
2318       /* We treat it as a normal character.  */
2319
2320       /* Then we can these characters as normal characters.  */
2321       token->type = CHARACTER;
2322       /* mb_partial and word_char bits should be initialized already
2323          by peek_token.  */
2324       tree = create_token_tree (dfa, NULL, NULL, token);
2325       if (BE (tree == NULL, 0))
2326         {
2327           *err = REG_ESPACE;
2328           return NULL;
2329         }
2330       break;
2331     case ANCHOR:
2332       if ((token->opr.ctx_type
2333            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2334           && dfa->word_ops_used == 0)
2335         init_word_char (dfa);
2336       if (token->opr.ctx_type == WORD_DELIM
2337           || token->opr.ctx_type == NOT_WORD_DELIM)
2338         {
2339           bin_tree_t *tree_first, *tree_last;
2340           if (token->opr.ctx_type == WORD_DELIM)
2341             {
2342               token->opr.ctx_type = WORD_FIRST;
2343               tree_first = create_token_tree (dfa, NULL, NULL, token);
2344               token->opr.ctx_type = WORD_LAST;
2345             }
2346           else
2347             {
2348               token->opr.ctx_type = INSIDE_WORD;
2349               tree_first = create_token_tree (dfa, NULL, NULL, token);
2350               token->opr.ctx_type = INSIDE_NOTWORD;
2351             }
2352           tree_last = create_token_tree (dfa, NULL, NULL, token);
2353           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2354           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2355             {
2356               *err = REG_ESPACE;
2357               return NULL;
2358             }
2359         }
2360       else
2361         {
2362           tree = create_token_tree (dfa, NULL, NULL, token);
2363           if (BE (tree == NULL, 0))
2364             {
2365               *err = REG_ESPACE;
2366               return NULL;
2367             }
2368         }
2369       /* We must return here, since ANCHORs can't be followed
2370          by repetition operators.
2371          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2372              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2373       fetch_token (token, regexp, syntax);
2374       return tree;
2375     case OP_PERIOD:
2376       tree = create_token_tree (dfa, NULL, NULL, token);
2377       if (BE (tree == NULL, 0))
2378         {
2379           *err = REG_ESPACE;
2380           return NULL;
2381         }
2382       if (dfa->mb_cur_max > 1)
2383         dfa->has_mb_node = 1;
2384       break;
2385     case OP_WORD:
2386     case OP_NOTWORD:
2387       tree = build_charclass_op (dfa, regexp->trans,
2388                                  (const unsigned char *) "alnum",
2389                                  (const unsigned char *) "_",
2390                                  token->type == OP_NOTWORD, err);
2391       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392         return NULL;
2393       break;
2394     case OP_SPACE:
2395     case OP_NOTSPACE:
2396       tree = build_charclass_op (dfa, regexp->trans,
2397                                  (const unsigned char *) "space",
2398                                  (const unsigned char *) "",
2399                                  token->type == OP_NOTSPACE, err);
2400       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2401         return NULL;
2402       break;
2403     case OP_ALT:
2404     case END_OF_RE:
2405       return NULL;
2406     case BACK_SLASH:
2407       *err = REG_EESCAPE;
2408       return NULL;
2409     default:
2410       /* Must not happen?  */
2411 #ifdef DEBUG
2412       assert (0);
2413 #endif
2414       return NULL;
2415     }
2416   fetch_token (token, regexp, syntax);
2417
2418   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2419          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2420     {
2421       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2422       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2423         return NULL;
2424       /* In BRE consecutive duplications are not allowed.  */
2425       if ((syntax & RE_CONTEXT_INVALID_DUP)
2426           && (token->type == OP_DUP_ASTERISK
2427               || token->type == OP_OPEN_DUP_NUM))
2428         {
2429           *err = REG_BADRPT;
2430           return NULL;
2431         }
2432     }
2433
2434   return tree;
2435 }
2436
2437 /* This function build the following tree, from regular expression
2438    (<reg_exp>):
2439          SUBEXP
2440             |
2441         <reg_exp>
2442 */
2443
2444 static bin_tree_t *
2445 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2446                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2447 {
2448   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2449   bin_tree_t *tree;
2450   size_t cur_nsub;
2451   cur_nsub = preg->re_nsub++;
2452
2453   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2454
2455   /* The subexpression may be a null string.  */
2456   if (token->type == OP_CLOSE_SUBEXP)
2457     tree = NULL;
2458   else
2459     {
2460       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2461       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2462         {
2463           if (tree != NULL)
2464             postorder (tree, free_tree, NULL);
2465           *err = REG_EPAREN;
2466         }
2467       if (BE (*err != REG_NOERROR, 0))
2468         return NULL;
2469     }
2470
2471   if (cur_nsub <= '9' - '1')
2472     dfa->completed_bkref_map |= 1 << cur_nsub;
2473
2474   tree = create_tree (dfa, tree, NULL, SUBEXP);
2475   if (BE (tree == NULL, 0))
2476     {
2477       *err = REG_ESPACE;
2478       return NULL;
2479     }
2480   tree->token.opr.idx = cur_nsub;
2481   return tree;
2482 }
2483
2484 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2485
2486 static bin_tree_t *
2487 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2488               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2489 {
2490   bin_tree_t *tree = NULL, *old_tree = NULL;
2491   int i, start, end, start_idx = re_string_cur_idx (regexp);
2492   re_token_t start_token = *token;
2493
2494   if (token->type == OP_OPEN_DUP_NUM)
2495     {
2496       end = 0;
2497       start = fetch_number (regexp, token, syntax);
2498       if (start == -1)
2499         {
2500           if (token->type == CHARACTER && token->opr.c == ',')
2501             start = 0; /* We treat "{,m}" as "{0,m}".  */
2502           else
2503             {
2504               *err = REG_BADBR; /* <re>{} is invalid.  */
2505               return NULL;
2506             }
2507         }
2508       if (BE (start != -2, 1))
2509         {
2510           /* We treat "{n}" as "{n,n}".  */
2511           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2512                  : ((token->type == CHARACTER && token->opr.c == ',')
2513                     ? fetch_number (regexp, token, syntax) : -2));
2514         }
2515       if (BE (start == -2 || end == -2, 0))
2516         {
2517           /* Invalid sequence.  */
2518           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2519             {
2520               if (token->type == END_OF_RE)
2521                 *err = REG_EBRACE;
2522               else
2523                 *err = REG_BADBR;
2524
2525               return NULL;
2526             }
2527
2528           /* If the syntax bit is set, rollback.  */
2529           re_string_set_index (regexp, start_idx);
2530           *token = start_token;
2531           token->type = CHARACTER;
2532           /* mb_partial and word_char bits should be already initialized by
2533              peek_token.  */
2534           return elem;
2535         }
2536
2537       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2538         {
2539           /* First number greater than second.  */
2540           *err = REG_BADBR;
2541           return NULL;
2542         }
2543     }
2544   else
2545     {
2546       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2547       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2548     }
2549
2550   fetch_token (token, regexp, syntax);
2551
2552   if (BE (elem == NULL, 0))
2553     return NULL;
2554   if (BE (start == 0 && end == 0, 0))
2555     {
2556       postorder (elem, free_tree, NULL);
2557       return NULL;
2558     }
2559
2560   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2561   if (BE (start > 0, 0))
2562     {
2563       tree = elem;
2564       for (i = 2; i <= start; ++i)
2565         {
2566           elem = duplicate_tree (elem, dfa);
2567           tree = create_tree (dfa, tree, elem, CONCAT);
2568           if (BE (elem == NULL || tree == NULL, 0))
2569             goto parse_dup_op_espace;
2570         }
2571
2572       if (start == end)
2573         return tree;
2574
2575       /* Duplicate ELEM before it is marked optional.  */
2576       elem = duplicate_tree (elem, dfa);
2577       old_tree = tree;
2578     }
2579   else
2580     old_tree = NULL;
2581
2582   if (elem->token.type == SUBEXP)
2583     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2584
2585   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2586   if (BE (tree == NULL, 0))
2587     goto parse_dup_op_espace;
2588
2589   /* This loop is actually executed only when end != -1,
2590      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2591      already created the start+1-th copy.  */
2592   for (i = start + 2; i <= end; ++i)
2593     {
2594       elem = duplicate_tree (elem, dfa);
2595       tree = create_tree (dfa, tree, elem, CONCAT);
2596       if (BE (elem == NULL || tree == NULL, 0))
2597         goto parse_dup_op_espace;
2598
2599       tree = create_tree (dfa, tree, NULL, OP_ALT);
2600       if (BE (tree == NULL, 0))
2601         goto parse_dup_op_espace;
2602     }
2603
2604   if (old_tree)
2605     tree = create_tree (dfa, old_tree, tree, CONCAT);
2606
2607   return tree;
2608
2609  parse_dup_op_espace:
2610   *err = REG_ESPACE;
2611   return NULL;
2612 }
2613
2614 /* Size of the names for collating symbol/equivalence_class/character_class.
2615    I'm not sure, but maybe enough.  */
2616 #define BRACKET_NAME_BUF_SIZE 32
2617
2618 #ifndef _LIBC
2619   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2620      Build the range expression which starts from START_ELEM, and ends
2621      at END_ELEM.  The result are written to MBCSET and SBCSET.
2622      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2623      mbcset->range_ends, is a pointer argument sinse we may
2624      update it.  */
2625
2626 static reg_errcode_t
2627 internal_function
2628 # ifdef RE_ENABLE_I18N
2629 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2630                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2631 # else /* not RE_ENABLE_I18N */
2632 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2633                  bracket_elem_t *end_elem)
2634 # endif /* not RE_ENABLE_I18N */
2635 {
2636   unsigned int start_ch, end_ch;
2637   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2638   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2639           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2640           0))
2641     return REG_ERANGE;
2642
2643   /* We can handle no multi character collating elements without libc
2644      support.  */
2645   if (BE ((start_elem->type == COLL_SYM
2646            && strlen ((char *) start_elem->opr.name) > 1)
2647           || (end_elem->type == COLL_SYM
2648               && strlen ((char *) end_elem->opr.name) > 1), 0))
2649     return REG_ECOLLATE;
2650
2651 # ifdef RE_ENABLE_I18N
2652   {
2653     wchar_t wc;
2654     wint_t start_wc;
2655     wint_t end_wc;
2656     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2657
2658     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2659                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2660                    : 0));
2661     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2662               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2663                  : 0));
2664     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2665                 ? __btowc (start_ch) : start_elem->opr.wch);
2666     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2667               ? __btowc (end_ch) : end_elem->opr.wch);
2668     if (start_wc == WEOF || end_wc == WEOF)
2669       return REG_ECOLLATE;
2670     cmp_buf[0] = start_wc;
2671     cmp_buf[4] = end_wc;
2672     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2673       return REG_ERANGE;
2674
2675     /* Got valid collation sequence values, add them as a new entry.
2676        However, for !_LIBC we have no collation elements: if the
2677        character set is single byte, the single byte character set
2678        that we build below suffices.  parse_bracket_exp passes
2679        no MBCSET if dfa->mb_cur_max == 1.  */
2680     if (mbcset)
2681       {
2682         /* Check the space of the arrays.  */
2683         if (BE (*range_alloc == mbcset->nranges, 0))
2684           {
2685             /* There is not enough space, need realloc.  */
2686             wchar_t *new_array_start, *new_array_end;
2687             int new_nranges;
2688
2689             /* +1 in case of mbcset->nranges is 0.  */
2690             new_nranges = 2 * mbcset->nranges + 1;
2691             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2692                are NULL if *range_alloc == 0.  */
2693             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2694                                           new_nranges);
2695             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2696                                         new_nranges);
2697
2698             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2699               return REG_ESPACE;
2700
2701             mbcset->range_starts = new_array_start;
2702             mbcset->range_ends = new_array_end;
2703             *range_alloc = new_nranges;
2704           }
2705
2706         mbcset->range_starts[mbcset->nranges] = start_wc;
2707         mbcset->range_ends[mbcset->nranges++] = end_wc;
2708       }
2709
2710     /* Build the table for single byte characters.  */
2711     for (wc = 0; wc < SBC_MAX; ++wc)
2712       {
2713         cmp_buf[2] = wc;
2714         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2715             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2716           bitset_set (sbcset, wc);
2717       }
2718   }
2719 # else /* not RE_ENABLE_I18N */
2720   {
2721     unsigned int ch;
2722     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2723                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2724                    : 0));
2725     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2726               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2727                  : 0));
2728     if (start_ch > end_ch)
2729       return REG_ERANGE;
2730     /* Build the table for single byte characters.  */
2731     for (ch = 0; ch < SBC_MAX; ++ch)
2732       if (start_ch <= ch  && ch <= end_ch)
2733         bitset_set (sbcset, ch);
2734   }
2735 # endif /* not RE_ENABLE_I18N */
2736   return REG_NOERROR;
2737 }
2738 #endif /* not _LIBC */
2739
2740 #ifndef _LIBC
2741 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2742    Build the collating element which is represented by NAME.
2743    The result are written to MBCSET and SBCSET.
2744    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2745    pointer argument since we may update it.  */
2746
2747 static reg_errcode_t
2748 internal_function
2749 # ifdef RE_ENABLE_I18N
2750 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2751                         int *coll_sym_alloc, const unsigned char *name)
2752 # else /* not RE_ENABLE_I18N */
2753 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2754 # endif /* not RE_ENABLE_I18N */
2755 {
2756   size_t name_len = strlen ((const char *) name);
2757   if (BE (name_len != 1, 0))
2758     return REG_ECOLLATE;
2759   else
2760     {
2761       bitset_set (sbcset, name[0]);
2762       return REG_NOERROR;
2763     }
2764 }
2765 #endif /* not _LIBC */
2766
2767 /* This function parse bracket expression like "[abc]", "[a-c]",
2768    "[[.a-a.]]" etc.  */
2769
2770 static bin_tree_t *
2771 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2772                    reg_syntax_t syntax, reg_errcode_t *err)
2773 {
2774 #ifdef _LIBC
2775   const unsigned char *collseqmb;
2776   const char *collseqwc;
2777   uint32_t nrules;
2778   int32_t table_size;
2779   const int32_t *symb_table;
2780   const unsigned char *extra;
2781
2782   /* Local function for parse_bracket_exp used in _LIBC environement.
2783      Seek the collating symbol entry correspondings to NAME.
2784      Return the index of the symbol in the SYMB_TABLE,
2785      or -1 if not found.  */
2786
2787   auto inline int32_t
2788   __attribute ((always_inline))
2789   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2790     {
2791       int32_t elem;
2792
2793       for (elem = 0; elem < table_size; elem++)
2794         if (symb_table[2 * elem] != 0)
2795           {
2796             int32_t idx = symb_table[2 * elem + 1];
2797             /* Skip the name of collating element name.  */
2798             idx += 1 + extra[idx];
2799             if (/* Compare the length of the name.  */
2800                 name_len == extra[idx]
2801                 /* Compare the name.  */
2802                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2803               /* Yep, this is the entry.  */
2804               return elem;
2805           }
2806       return -1;
2807     }
2808
2809   /* Local function for parse_bracket_exp used in _LIBC environment.
2810      Look up the collation sequence value of BR_ELEM.
2811      Return the value if succeeded, UINT_MAX otherwise.  */
2812
2813   auto inline unsigned int
2814   __attribute ((always_inline))
2815   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2816     {
2817       if (br_elem->type == SB_CHAR)
2818         {
2819           /*
2820           if (MB_CUR_MAX == 1)
2821           */
2822           if (nrules == 0)
2823             return collseqmb[br_elem->opr.ch];
2824           else
2825             {
2826               wint_t wc = __btowc (br_elem->opr.ch);
2827               return __collseq_table_lookup (collseqwc, wc);
2828             }
2829         }
2830       else if (br_elem->type == MB_CHAR)
2831         {
2832           if (nrules != 0)
2833             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2834         }
2835       else if (br_elem->type == COLL_SYM)
2836         {
2837           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2838           if (nrules != 0)
2839             {
2840               int32_t elem, idx;
2841               elem = seek_collating_symbol_entry (br_elem->opr.name,
2842                                                   sym_name_len);
2843               if (elem != -1)
2844                 {
2845                   /* We found the entry.  */
2846                   idx = symb_table[2 * elem + 1];
2847                   /* Skip the name of collating element name.  */
2848                   idx += 1 + extra[idx];
2849                   /* Skip the byte sequence of the collating element.  */
2850                   idx += 1 + extra[idx];
2851                   /* Adjust for the alignment.  */
2852                   idx = (idx + 3) & ~3;
2853                   /* Skip the multibyte collation sequence value.  */
2854                   idx += sizeof (unsigned int);
2855                   /* Skip the wide char sequence of the collating element.  */
2856                   idx += sizeof (unsigned int) *
2857                     (1 + *(unsigned int *) (extra + idx));
2858                   /* Return the collation sequence value.  */
2859                   return *(unsigned int *) (extra + idx);
2860                 }
2861               else if (sym_name_len == 1)
2862                 {
2863                   /* No valid character.  Match it as a single byte
2864                      character.  */
2865                   return collseqmb[br_elem->opr.name[0]];
2866                 }
2867             }
2868           else if (sym_name_len == 1)
2869             return collseqmb[br_elem->opr.name[0]];
2870         }
2871       return UINT_MAX;
2872     }
2873
2874   /* Local function for parse_bracket_exp used in _LIBC environement.
2875      Build the range expression which starts from START_ELEM, and ends
2876      at END_ELEM.  The result are written to MBCSET and SBCSET.
2877      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2878      mbcset->range_ends, is a pointer argument sinse we may
2879      update it.  */
2880
2881   auto inline reg_errcode_t
2882   __attribute ((always_inline))
2883   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2884                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2885     {
2886       unsigned int ch;
2887       uint32_t start_collseq;
2888       uint32_t end_collseq;
2889
2890       /* Equivalence Classes and Character Classes can't be a range
2891          start/end.  */
2892       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2893               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2894               0))
2895         return REG_ERANGE;
2896
2897       start_collseq = lookup_collation_sequence_value (start_elem);
2898       end_collseq = lookup_collation_sequence_value (end_elem);
2899       /* Check start/end collation sequence values.  */
2900       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2901         return REG_ECOLLATE;
2902       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2903         return REG_ERANGE;
2904
2905       /* Got valid collation sequence values, add them as a new entry.
2906          However, if we have no collation elements, and the character set
2907          is single byte, the single byte character set that we
2908          build below suffices. */
2909       if (nrules > 0 || dfa->mb_cur_max > 1)
2910         {
2911           /* Check the space of the arrays.  */
2912           if (BE (*range_alloc == mbcset->nranges, 0))
2913             {
2914               /* There is not enough space, need realloc.  */
2915               uint32_t *new_array_start;
2916               uint32_t *new_array_end;
2917               int new_nranges;
2918
2919               /* +1 in case of mbcset->nranges is 0.  */
2920               new_nranges = 2 * mbcset->nranges + 1;
2921               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2922                                             new_nranges);
2923               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2924                                           new_nranges);
2925
2926               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2927                 return REG_ESPACE;
2928
2929               mbcset->range_starts = new_array_start;
2930               mbcset->range_ends = new_array_end;
2931               *range_alloc = new_nranges;
2932             }
2933
2934           mbcset->range_starts[mbcset->nranges] = start_collseq;
2935           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2936         }
2937
2938       /* Build the table for single byte characters.  */
2939       for (ch = 0; ch < SBC_MAX; ch++)
2940         {
2941           uint32_t ch_collseq;
2942           /*
2943           if (MB_CUR_MAX == 1)
2944           */
2945           if (nrules == 0)
2946             ch_collseq = collseqmb[ch];
2947           else
2948             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2949           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2950             bitset_set (sbcset, ch);
2951         }
2952       return REG_NOERROR;
2953     }
2954
2955   /* Local function for parse_bracket_exp used in _LIBC environement.
2956      Build the collating element which is represented by NAME.
2957      The result are written to MBCSET and SBCSET.
2958      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2959      pointer argument sinse we may update it.  */
2960
2961   auto inline reg_errcode_t
2962   __attribute ((always_inline))
2963   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2964                           int *coll_sym_alloc, const unsigned char *name)
2965     {
2966       int32_t elem, idx;
2967       size_t name_len = strlen ((const char *) name);
2968       if (nrules != 0)
2969         {
2970           elem = seek_collating_symbol_entry (name, name_len);
2971           if (elem != -1)
2972             {
2973               /* We found the entry.  */
2974               idx = symb_table[2 * elem + 1];
2975               /* Skip the name of collating element name.  */
2976               idx += 1 + extra[idx];
2977             }
2978           else if (name_len == 1)
2979             {
2980               /* No valid character, treat it as a normal
2981                  character.  */
2982               bitset_set (sbcset, name[0]);
2983               return REG_NOERROR;
2984             }
2985           else
2986             return REG_ECOLLATE;
2987
2988           /* Got valid collation sequence, add it as a new entry.  */
2989           /* Check the space of the arrays.  */
2990           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2991             {
2992               /* Not enough, realloc it.  */
2993               /* +1 in case of mbcset->ncoll_syms is 0.  */
2994               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2995               /* Use realloc since mbcset->coll_syms is NULL
2996                  if *alloc == 0.  */
2997               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2998                                                    new_coll_sym_alloc);
2999               if (BE (new_coll_syms == NULL, 0))
3000                 return REG_ESPACE;
3001               mbcset->coll_syms = new_coll_syms;
3002               *coll_sym_alloc = new_coll_sym_alloc;
3003             }
3004           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3005           return REG_NOERROR;
3006         }
3007       else
3008         {
3009           if (BE (name_len != 1, 0))
3010             return REG_ECOLLATE;
3011           else
3012             {
3013               bitset_set (sbcset, name[0]);
3014               return REG_NOERROR;
3015             }
3016         }
3017     }
3018 #endif
3019
3020   re_token_t br_token;
3021   re_bitset_ptr_t sbcset;
3022 #ifdef RE_ENABLE_I18N
3023   re_charset_t *mbcset;
3024   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3025   int equiv_class_alloc = 0, char_class_alloc = 0;
3026 #endif /* not RE_ENABLE_I18N */
3027   int non_match = 0;
3028   bin_tree_t *work_tree;
3029   int token_len;
3030   int first_round = 1;
3031 #ifdef _LIBC
3032   collseqmb = (const unsigned char *)
3033     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3034   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3035   if (nrules)
3036     {
3037       /*
3038       if (MB_CUR_MAX > 1)
3039       */
3040       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3041       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3042       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3043                                                   _NL_COLLATE_SYMB_TABLEMB);
3044       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3045                                                    _NL_COLLATE_SYMB_EXTRAMB);
3046     }
3047 #endif
3048   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3049 #ifdef RE_ENABLE_I18N
3050   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3051 #endif /* RE_ENABLE_I18N */
3052 #ifdef RE_ENABLE_I18N
3053   if (BE (sbcset == NULL || mbcset == NULL, 0))
3054 #else
3055   if (BE (sbcset == NULL, 0))
3056 #endif /* RE_ENABLE_I18N */
3057     {
3058       re_free (sbcset);
3059 #ifdef RE_ENABLE_I18N
3060       re_free (mbcset);
3061 #endif
3062       *err = REG_ESPACE;
3063       return NULL;
3064     }
3065
3066   token_len = peek_token_bracket (token, regexp, syntax);
3067   if (BE (token->type == END_OF_RE, 0))
3068     {
3069       *err = REG_BADPAT;
3070       goto parse_bracket_exp_free_return;
3071     }
3072   if (token->type == OP_NON_MATCH_LIST)
3073     {
3074 #ifdef RE_ENABLE_I18N
3075       mbcset->non_match = 1;
3076 #endif /* not RE_ENABLE_I18N */
3077       non_match = 1;
3078       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3079         bitset_set (sbcset, '\n');
3080       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3081       token_len = peek_token_bracket (token, regexp, syntax);
3082       if (BE (token->type == END_OF_RE, 0))
3083         {
3084           *err = REG_BADPAT;
3085           goto parse_bracket_exp_free_return;
3086         }
3087     }
3088
3089   /* We treat the first ']' as a normal character.  */
3090   if (token->type == OP_CLOSE_BRACKET)
3091     token->type = CHARACTER;
3092
3093   while (1)
3094     {
3095       bracket_elem_t start_elem, end_elem;
3096       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3097       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3098       reg_errcode_t ret;
3099       int token_len2 = 0, is_range_exp = 0;
3100       re_token_t token2;
3101
3102       start_elem.opr.name = start_name_buf;
3103       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3104                                    syntax, first_round);
3105       if (BE (ret != REG_NOERROR, 0))
3106         {
3107           *err = ret;
3108           goto parse_bracket_exp_free_return;
3109         }
3110       first_round = 0;
3111
3112       /* Get information about the next token.  We need it in any case.  */
3113       token_len = peek_token_bracket (token, regexp, syntax);
3114
3115       /* Do not check for ranges if we know they are not allowed.  */
3116       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3117         {
3118           if (BE (token->type == END_OF_RE, 0))
3119             {
3120               *err = REG_EBRACK;
3121               goto parse_bracket_exp_free_return;
3122             }
3123           if (token->type == OP_CHARSET_RANGE)
3124             {
3125               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3126               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3127               if (BE (token2.type == END_OF_RE, 0))
3128                 {
3129                   *err = REG_EBRACK;
3130                   goto parse_bracket_exp_free_return;
3131                 }
3132               if (token2.type == OP_CLOSE_BRACKET)
3133                 {
3134                   /* We treat the last '-' as a normal character.  */
3135                   re_string_skip_bytes (regexp, -token_len);
3136                   token->type = CHARACTER;
3137                 }
3138               else
3139                 is_range_exp = 1;
3140             }
3141         }
3142
3143       if (is_range_exp == 1)
3144         {
3145           end_elem.opr.name = end_name_buf;
3146           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3147                                        dfa, syntax, 1);
3148           if (BE (ret != REG_NOERROR, 0))
3149             {
3150               *err = ret;
3151               goto parse_bracket_exp_free_return;
3152             }
3153
3154           token_len = peek_token_bracket (token, regexp, syntax);
3155
3156 #ifdef _LIBC
3157           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3158                                   &start_elem, &end_elem);
3159 #else
3160 # ifdef RE_ENABLE_I18N
3161           *err = build_range_exp (sbcset,
3162                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3163                                   &range_alloc, &start_elem, &end_elem);
3164 # else
3165           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3166 # endif
3167 #endif /* RE_ENABLE_I18N */
3168           if (BE (*err != REG_NOERROR, 0))
3169             goto parse_bracket_exp_free_return;
3170         }
3171       else
3172         {
3173           switch (start_elem.type)
3174             {
3175             case SB_CHAR:
3176               bitset_set (sbcset, start_elem.opr.ch);
3177               break;
3178 #ifdef RE_ENABLE_I18N
3179             case MB_CHAR:
3180               /* Check whether the array has enough space.  */
3181               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3182                 {
3183                   wchar_t *new_mbchars;
3184                   /* Not enough, realloc it.  */
3185                   /* +1 in case of mbcset->nmbchars is 0.  */
3186                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3187                   /* Use realloc since array is NULL if *alloc == 0.  */
3188                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3189                                             mbchar_alloc);
3190                   if (BE (new_mbchars == NULL, 0))
3191                     goto parse_bracket_exp_espace;
3192                   mbcset->mbchars = new_mbchars;
3193                 }
3194               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3195               break;
3196 #endif /* RE_ENABLE_I18N */
3197             case EQUIV_CLASS:
3198               *err = build_equiv_class (sbcset,
3199 #ifdef RE_ENABLE_I18N
3200                                         mbcset, &equiv_class_alloc,
3201 #endif /* RE_ENABLE_I18N */
3202                                         start_elem.opr.name);
3203               if (BE (*err != REG_NOERROR, 0))
3204                 goto parse_bracket_exp_free_return;
3205               break;
3206             case COLL_SYM:
3207               *err = build_collating_symbol (sbcset,
3208 #ifdef RE_ENABLE_I18N
3209                                              mbcset, &coll_sym_alloc,
3210 #endif /* RE_ENABLE_I18N */
3211                                              start_elem.opr.name);
3212               if (BE (*err != REG_NOERROR, 0))
3213                 goto parse_bracket_exp_free_return;
3214               break;
3215             case CHAR_CLASS:
3216               *err = build_charclass (regexp->trans, sbcset,
3217 #ifdef RE_ENABLE_I18N
3218                                       mbcset, &char_class_alloc,
3219 #endif /* RE_ENABLE_I18N */
3220                                       start_elem.opr.name, syntax);
3221               if (BE (*err != REG_NOERROR, 0))
3222                goto parse_bracket_exp_free_return;
3223               break;
3224             default:
3225               assert (0);
3226               break;
3227             }
3228         }
3229       if (BE (token->type == END_OF_RE, 0))
3230         {
3231           *err = REG_EBRACK;
3232           goto parse_bracket_exp_free_return;
3233         }
3234       if (token->type == OP_CLOSE_BRACKET)
3235         break;
3236     }
3237
3238   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3239
3240   /* If it is non-matching list.  */
3241   if (non_match)
3242     bitset_not (sbcset);
3243
3244 #ifdef RE_ENABLE_I18N
3245   /* Ensure only single byte characters are set.  */
3246   if (dfa->mb_cur_max > 1)
3247     bitset_mask (sbcset, dfa->sb_char);
3248
3249   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3250       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3251                                                      || mbcset->non_match)))
3252     {
3253       bin_tree_t *mbc_tree;
3254       int sbc_idx;
3255       /* Build a tree for complex bracket.  */
3256       dfa->has_mb_node = 1;
3257       br_token.type = COMPLEX_BRACKET;
3258       br_token.opr.mbcset = mbcset;
3259       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3260       if (BE (mbc_tree == NULL, 0))
3261         goto parse_bracket_exp_espace;
3262       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3263         if (sbcset[sbc_idx])
3264           break;
3265       /* If there are no bits set in sbcset, there is no point
3266          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3267       if (sbc_idx < BITSET_WORDS)
3268         {
3269           /* Build a tree for simple bracket.  */
3270           br_token.type = SIMPLE_BRACKET;
3271           br_token.opr.sbcset = sbcset;
3272           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3273           if (BE (work_tree == NULL, 0))
3274             goto parse_bracket_exp_espace;
3275
3276           /* Then join them by ALT node.  */
3277           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3278           if (BE (work_tree == NULL, 0))
3279             goto parse_bracket_exp_espace;
3280         }
3281       else
3282         {
3283           re_free (sbcset);
3284           work_tree = mbc_tree;
3285         }
3286     }
3287   else
3288 #endif /* not RE_ENABLE_I18N */
3289     {
3290 #ifdef RE_ENABLE_I18N
3291       free_charset (mbcset);
3292 #endif
3293       /* Build a tree for simple bracket.  */
3294       br_token.type = SIMPLE_BRACKET;
3295       br_token.opr.sbcset = sbcset;
3296       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3297       if (BE (work_tree == NULL, 0))
3298         goto parse_bracket_exp_espace;
3299     }
3300   return work_tree;
3301
3302  parse_bracket_exp_espace:
3303   *err = REG_ESPACE;
3304  parse_bracket_exp_free_return:
3305   re_free (sbcset);
3306 #ifdef RE_ENABLE_I18N
3307   free_charset (mbcset);
3308 #endif /* RE_ENABLE_I18N */
3309   return NULL;
3310 }
3311
3312 /* Parse an element in the bracket expression.  */
3313
3314 static reg_errcode_t
3315 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3316                        re_token_t *token, int token_len, re_dfa_t *dfa,
3317                        reg_syntax_t syntax, int accept_hyphen)
3318 {
3319 #ifdef RE_ENABLE_I18N
3320   int cur_char_size;
3321   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3322   if (cur_char_size > 1)
3323     {
3324       elem->type = MB_CHAR;
3325       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3326       re_string_skip_bytes (regexp, cur_char_size);
3327       return REG_NOERROR;
3328     }
3329 #endif /* RE_ENABLE_I18N */
3330   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3331   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3332       || token->type == OP_OPEN_EQUIV_CLASS)
3333     return parse_bracket_symbol (elem, regexp, token);
3334   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3335     {
3336       /* A '-' must only appear as anything but a range indicator before
3337          the closing bracket.  Everything else is an error.  */
3338       re_token_t token2;
3339       (void) peek_token_bracket (&token2, regexp, syntax);
3340       if (token2.type != OP_CLOSE_BRACKET)
3341         /* The actual error value is not standardized since this whole
3342            case is undefined.  But ERANGE makes good sense.  */
3343         return REG_ERANGE;
3344     }
3345   elem->type = SB_CHAR;
3346   elem->opr.ch = token->opr.c;
3347   return REG_NOERROR;
3348 }
3349
3350 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3351    such as [:<character_class>:], [.<collating_element>.], and
3352    [=<equivalent_class>=].  */
3353
3354 static reg_errcode_t
3355 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3356                       re_token_t *token)
3357 {
3358   unsigned char ch, delim = token->opr.c;
3359   int i = 0;
3360   if (re_string_eoi(regexp))
3361     return REG_EBRACK;
3362   for (;; ++i)
3363     {
3364       if (i >= BRACKET_NAME_BUF_SIZE)
3365         return REG_EBRACK;
3366       if (token->type == OP_OPEN_CHAR_CLASS)
3367         ch = re_string_fetch_byte_case (regexp);
3368       else
3369         ch = re_string_fetch_byte (regexp);
3370       if (re_string_eoi(regexp))
3371         return REG_EBRACK;
3372       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3373         break;
3374       elem->opr.name[i] = ch;
3375     }
3376   re_string_skip_bytes (regexp, 1);
3377   elem->opr.name[i] = '\0';
3378   switch (token->type)
3379     {
3380     case OP_OPEN_COLL_ELEM:
3381       elem->type = COLL_SYM;
3382       break;
3383     case OP_OPEN_EQUIV_CLASS:
3384       elem->type = EQUIV_CLASS;
3385       break;
3386     case OP_OPEN_CHAR_CLASS:
3387       elem->type = CHAR_CLASS;
3388       break;
3389     default:
3390       break;
3391     }
3392   return REG_NOERROR;
3393 }
3394
3395   /* Helper function for parse_bracket_exp.
3396      Build the equivalence class which is represented by NAME.
3397      The result are written to MBCSET and SBCSET.
3398      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3399      is a pointer argument sinse we may update it.  */
3400
3401 static reg_errcode_t
3402 #ifdef RE_ENABLE_I18N
3403 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3404                    int *equiv_class_alloc, const unsigned char *name)
3405 #else /* not RE_ENABLE_I18N */
3406 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3407 #endif /* not RE_ENABLE_I18N */
3408 {
3409 #ifdef _LIBC
3410   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3411   if (nrules != 0)
3412     {
3413       const int32_t *table, *indirect;
3414       const unsigned char *weights, *extra, *cp;
3415       unsigned char char_buf[2];
3416       int32_t idx1, idx2;
3417       unsigned int ch;
3418       size_t len;
3419       /* This #include defines a local function!  */
3420 # include <locale/weight.h>
3421       /* Calculate the index for equivalence class.  */
3422       cp = name;
3423       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3424       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3425                                                _NL_COLLATE_WEIGHTMB);
3426       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3427                                                    _NL_COLLATE_EXTRAMB);
3428       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3429                                                 _NL_COLLATE_INDIRECTMB);
3430       idx1 = findidx (&cp, -1);
3431       if (BE (idx1 == 0 || *cp != '\0', 0))
3432         /* This isn't a valid character.  */
3433         return REG_ECOLLATE;
3434
3435       /* Build single byte matcing table for this equivalence class.  */
3436       len = weights[idx1 & 0xffffff];
3437       for (ch = 0; ch < SBC_MAX; ++ch)
3438         {
3439           char_buf[0] = ch;
3440           cp = char_buf;
3441           idx2 = findidx (&cp, 1);
3442 /*
3443           idx2 = table[ch];
3444 */
3445           if (idx2 == 0)
3446             /* This isn't a valid character.  */
3447             continue;
3448           /* Compare only if the length matches and the collation rule
3449              index is the same.  */
3450           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3451             {
3452               int cnt = 0;
3453
3454               while (cnt <= len &&
3455                      weights[(idx1 & 0xffffff) + 1 + cnt]
3456                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3457                 ++cnt;
3458
3459               if (cnt > len)
3460                 bitset_set (sbcset, ch);
3461             }
3462         }
3463       /* Check whether the array has enough space.  */
3464       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3465         {
3466           /* Not enough, realloc it.  */
3467           /* +1 in case of mbcset->nequiv_classes is 0.  */
3468           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3469           /* Use realloc since the array is NULL if *alloc == 0.  */
3470           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3471                                                    int32_t,
3472                                                    new_equiv_class_alloc);
3473           if (BE (new_equiv_classes == NULL, 0))
3474             return REG_ESPACE;
3475           mbcset->equiv_classes = new_equiv_classes;
3476           *equiv_class_alloc = new_equiv_class_alloc;
3477         }
3478       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3479     }
3480   else
3481 #endif /* _LIBC */
3482     {
3483       if (BE (strlen ((const char *) name) != 1, 0))
3484         return REG_ECOLLATE;
3485       bitset_set (sbcset, *name);
3486     }
3487   return REG_NOERROR;
3488 }
3489
3490   /* Helper function for parse_bracket_exp.
3491      Build the character class which is represented by NAME.
3492      The result are written to MBCSET and SBCSET.
3493      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3494      is a pointer argument sinse we may update it.  */
3495
3496 static reg_errcode_t
3497 #ifdef RE_ENABLE_I18N
3498 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3499                  re_charset_t *mbcset, int *char_class_alloc,
3500                  const unsigned char *class_name, reg_syntax_t syntax)
3501 #else /* not RE_ENABLE_I18N */
3502 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3503                  const unsigned char *class_name, reg_syntax_t syntax)
3504 #endif /* not RE_ENABLE_I18N */
3505 {
3506   int i;
3507   const char *name = (const char *) class_name;
3508
3509   /* In case of REG_ICASE "upper" and "lower" match the both of
3510      upper and lower cases.  */
3511   if ((syntax & RE_ICASE)
3512       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3513     name = "alpha";
3514
3515 #ifdef RE_ENABLE_I18N
3516   /* Check the space of the arrays.  */
3517   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3518     {
3519       /* Not enough, realloc it.  */
3520       /* +1 in case of mbcset->nchar_classes is 0.  */
3521       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3522       /* Use realloc since array is NULL if *alloc == 0.  */
3523       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3524                                                new_char_class_alloc);
3525       if (BE (new_char_classes == NULL, 0))
3526         return REG_ESPACE;
3527       mbcset->char_classes = new_char_classes;
3528       *char_class_alloc = new_char_class_alloc;
3529     }
3530   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3531 #endif /* RE_ENABLE_I18N */
3532
3533 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3534   do {                                          \
3535     if (BE (trans != NULL, 0))                  \
3536       {                                         \
3537         for (i = 0; i < SBC_MAX; ++i)           \
3538           if (ctype_func (i))                   \
3539             bitset_set (sbcset, trans[i]);      \
3540       }                                         \
3541     else                                        \
3542       {                                         \
3543         for (i = 0; i < SBC_MAX; ++i)           \
3544           if (ctype_func (i))                   \
3545             bitset_set (sbcset, i);             \
3546       }                                         \
3547   } while (0)
3548
3549   if (strcmp (name, "alnum") == 0)
3550     BUILD_CHARCLASS_LOOP (isalnum);
3551   else if (strcmp (name, "cntrl") == 0)
3552     BUILD_CHARCLASS_LOOP (iscntrl);
3553   else if (strcmp (name, "lower") == 0)
3554     BUILD_CHARCLASS_LOOP (islower);
3555   else if (strcmp (name, "space") == 0)
3556     BUILD_CHARCLASS_LOOP (isspace);
3557   else if (strcmp (name, "alpha") == 0)
3558     BUILD_CHARCLASS_LOOP (isalpha);
3559   else if (strcmp (name, "digit") == 0)
3560     BUILD_CHARCLASS_LOOP (isdigit);
3561   else if (strcmp (name, "print") == 0)
3562     BUILD_CHARCLASS_LOOP (isprint);
3563   else if (strcmp (name, "upper") == 0)
3564     BUILD_CHARCLASS_LOOP (isupper);
3565   else if (strcmp (name, "blank") == 0)
3566     BUILD_CHARCLASS_LOOP (isblank);
3567   else if (strcmp (name, "graph") == 0)
3568     BUILD_CHARCLASS_LOOP (isgraph);
3569   else if (strcmp (name, "punct") == 0)
3570     BUILD_CHARCLASS_LOOP (ispunct);
3571   else if (strcmp (name, "xdigit") == 0)
3572     BUILD_CHARCLASS_LOOP (isxdigit);
3573   else
3574     return REG_ECTYPE;
3575
3576   return REG_NOERROR;
3577 }
3578
3579 static bin_tree_t *
3580 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3581                     const unsigned char *class_name,
3582                     const unsigned char *extra, int non_match,
3583                     reg_errcode_t *err)
3584 {
3585   re_bitset_ptr_t sbcset;
3586 #ifdef RE_ENABLE_I18N
3587   re_charset_t *mbcset;
3588   int alloc = 0;
3589 #endif /* not RE_ENABLE_I18N */
3590   reg_errcode_t ret;
3591   re_token_t br_token;
3592   bin_tree_t *tree;
3593
3594   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3595 #ifdef RE_ENABLE_I18N
3596   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3597 #endif /* RE_ENABLE_I18N */
3598
3599 #ifdef RE_ENABLE_I18N
3600   if (BE (sbcset == NULL || mbcset == NULL, 0))
3601 #else /* not RE_ENABLE_I18N */
3602   if (BE (sbcset == NULL, 0))
3603 #endif /* not RE_ENABLE_I18N */
3604     {
3605       *err = REG_ESPACE;
3606       return NULL;
3607     }
3608
3609   if (non_match)
3610     {
3611 #ifdef RE_ENABLE_I18N
3612       mbcset->non_match = 1;
3613 #endif /* not RE_ENABLE_I18N */
3614     }
3615
3616   /* We don't care the syntax in this case.  */
3617   ret = build_charclass (trans, sbcset,
3618 #ifdef RE_ENABLE_I18N
3619                          mbcset, &alloc,
3620 #endif /* RE_ENABLE_I18N */
3621                          class_name, 0);
3622
3623   if (BE (ret != REG_NOERROR, 0))
3624     {
3625       re_free (sbcset);
3626 #ifdef RE_ENABLE_I18N
3627       free_charset (mbcset);
3628 #endif /* RE_ENABLE_I18N */
3629       *err = ret;
3630       return NULL;
3631     }
3632   /* \w match '_' also.  */
3633   for (; *extra; extra++)
3634     bitset_set (sbcset, *extra);
3635
3636   /* If it is non-matching list.  */
3637   if (non_match)
3638     bitset_not (sbcset);
3639
3640 #ifdef RE_ENABLE_I18N
3641   /* Ensure only single byte characters are set.  */
3642   if (dfa->mb_cur_max > 1)
3643     bitset_mask (sbcset, dfa->sb_char);
3644 #endif
3645
3646   /* Build a tree for simple bracket.  */
3647   br_token.type = SIMPLE_BRACKET;
3648   br_token.opr.sbcset = sbcset;
3649   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3650   if (BE (tree == NULL, 0))
3651     goto build_word_op_espace;
3652
3653 #ifdef RE_ENABLE_I18N
3654   if (dfa->mb_cur_max > 1)
3655     {
3656       bin_tree_t *mbc_tree;
3657       /* Build a tree for complex bracket.  */
3658       br_token.type = COMPLEX_BRACKET;
3659       br_token.opr.mbcset = mbcset;
3660       dfa->has_mb_node = 1;
3661       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3662       if (BE (mbc_tree == NULL, 0))
3663         goto build_word_op_espace;
3664       /* Then join them by ALT node.  */
3665       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3666       if (BE (mbc_tree != NULL, 1))
3667         return tree;
3668     }
3669   else
3670     {
3671       free_charset (mbcset);
3672       return tree;
3673     }
3674 #else /* not RE_ENABLE_I18N */
3675   return tree;
3676 #endif /* not RE_ENABLE_I18N */
3677
3678  build_word_op_espace:
3679   re_free (sbcset);
3680 #ifdef RE_ENABLE_I18N
3681   free_charset (mbcset);
3682 #endif /* RE_ENABLE_I18N */
3683   *err = REG_ESPACE;
3684   return NULL;
3685 }
3686
3687 /* This is intended for the expressions like "a{1,3}".
3688    Fetch a number from `input', and return the number.
3689    Return -1, if the number field is empty like "{,1}".
3690    Return -2, If an error is occured.  */
3691
3692 static int
3693 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3694 {
3695   int num = -1;
3696   unsigned char c;
3697   while (1)
3698     {
3699       fetch_token (token, input, syntax);
3700       c = token->opr.c;
3701       if (BE (token->type == END_OF_RE, 0))
3702         return -2;
3703       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3704         break;
3705       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3706              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3707       num = (num > RE_DUP_MAX) ? -2 : num;
3708     }
3709   return num;
3710 }
3711 \f
3712 #ifdef RE_ENABLE_I18N
3713 static void
3714 free_charset (re_charset_t *cset)
3715 {
3716   re_free (cset->mbchars);
3717 # ifdef _LIBC
3718   re_free (cset->coll_syms);
3719   re_free (cset->equiv_classes);
3720   re_free (cset->range_starts);
3721   re_free (cset->range_ends);
3722 # endif
3723   re_free (cset->char_classes);
3724   re_free (cset);
3725 }
3726 #endif /* RE_ENABLE_I18N */
3727 \f
3728 /* Functions for binary tree operation.  */
3729
3730 /* Create a tree node.  */
3731
3732 static bin_tree_t *
3733 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3734              re_token_type_t type)
3735 {
3736   re_token_t t;
3737   t.type = type;
3738   return create_token_tree (dfa, left, right, &t);
3739 }
3740
3741 static bin_tree_t *
3742 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3743                    const re_token_t *token)
3744 {
3745   bin_tree_t *tree;
3746   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3747     {
3748       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3749
3750       if (storage == NULL)
3751         return NULL;
3752       storage->next = dfa->str_tree_storage;
3753       dfa->str_tree_storage = storage;
3754       dfa->str_tree_storage_idx = 0;
3755     }
3756   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3757
3758   tree->parent = NULL;
3759   tree->left = left;
3760   tree->right = right;
3761   tree->token = *token;
3762   tree->token.duplicated = 0;
3763   tree->token.opt_subexp = 0;
3764   tree->first = NULL;
3765   tree->next = NULL;
3766   tree->node_idx = -1;
3767
3768   if (left != NULL)
3769     left->parent = tree;
3770   if (right != NULL)
3771     right->parent = tree;
3772   return tree;
3773 }
3774
3775 /* Mark the tree SRC as an optional subexpression.
3776    To be called from preorder or postorder.  */
3777
3778 static reg_errcode_t
3779 mark_opt_subexp (void *extra, bin_tree_t *node)
3780 {
3781   int idx = (int) (long) extra;
3782   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3783     node->token.opt_subexp = 1;
3784
3785   return REG_NOERROR;
3786 }
3787
3788 /* Free the allocated memory inside NODE. */
3789
3790 static void
3791 free_token (re_token_t *node)
3792 {
3793 #ifdef RE_ENABLE_I18N
3794   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3795     free_charset (node->opr.mbcset);
3796   else
3797 #endif /* RE_ENABLE_I18N */
3798     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3799       re_free (node->opr.sbcset);
3800 }
3801
3802 /* Worker function for tree walking.  Free the allocated memory inside NODE
3803    and its children. */
3804
3805 static reg_errcode_t
3806 free_tree (void *extra, bin_tree_t *node)
3807 {
3808   free_token (&node->token);
3809   return REG_NOERROR;
3810 }
3811
3812
3813 /* Duplicate the node SRC, and return new node.  This is a preorder
3814    visit similar to the one implemented by the generic visitor, but
3815    we need more infrastructure to maintain two parallel trees --- so,
3816    it's easier to duplicate.  */
3817
3818 static bin_tree_t *
3819 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3820 {
3821   const bin_tree_t *node;
3822   bin_tree_t *dup_root;
3823   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3824
3825   for (node = root; ; )
3826     {
3827       /* Create a new tree and link it back to the current parent.  */
3828       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3829       if (*p_new == NULL)
3830         return NULL;
3831       (*p_new)->parent = dup_node;
3832       (*p_new)->token.duplicated = 1;
3833       dup_node = *p_new;
3834
3835       /* Go to the left node, or up and to the right.  */
3836       if (node->left)
3837         {
3838           node = node->left;
3839           p_new = &dup_node->left;
3840         }
3841       else
3842         {
3843           const bin_tree_t *prev = NULL;
3844           while (node->right == prev || node->right == NULL)
3845             {
3846               prev = node;
3847               node = node->parent;
3848               dup_node = dup_node->parent;
3849               if (!node)
3850                 return dup_root;
3851             }
3852           node = node->right;
3853           p_new = &dup_node->right;
3854         }
3855     }
3856 }