gnu/lib/libregex/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static inline re_dfastate_t *acquire_init_state_context
  56      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  57      __attribute ((always_inline)) internal_function;
  58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  59      internal_function;
  60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  61                            int *p_match_first)
  62      internal_function;
  63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  64                                     unsigned int context) internal_function;
  65 static int check_halt_state_context (const re_match_context_t *mctx,
  66                                      const re_dfastate_t *state, int idx)
  67      internal_function;
  68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  69                          regmatch_t *prev_idx_match, int cur_node,
  70                          int cur_idx, int nmatch) internal_function;
  71 static int proceed_next_node (const re_match_context_t *mctx,
  72                               int nregs, regmatch_t *regs,
  73                               int *pidx, int node, re_node_set *eps_via_nodes,
  74                               struct re_fail_stack_t *fs) internal_function;
  75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  76                                       int str_idx, int dest_node, int nregs,
  77                                       regmatch_t *regs,
  78                                       re_node_set *eps_via_nodes) internal_function;
  79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  80                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  81 static reg_errcode_t set_regs (const regex_t *preg,
  82                                const re_match_context_t *mctx,
  83                                size_t nmatch, regmatch_t *pmatch,
  84                                int fl_backtrack) internal_function;
  85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  86
  87 #ifdef RE_ENABLE_I18N
  88 static int sift_states_iter_mb (const re_match_context_t *mctx,
  89                                 re_sift_context_t *sctx,
  90                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  91 #endif /* RE_ENABLE_I18N */
  92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  93                                            re_sift_context_t *sctx) internal_function;
  94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  95                                           re_sift_context_t *sctx, int str_idx,
  96                                           re_node_set *cur_dest) internal_function;
  97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  98                                               re_sift_context_t *sctx,
  99                                               int str_idx,
 100                                               re_node_set *dest_nodes) internal_function;
 101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 102                                             re_node_set *dest_nodes,
 103                                             const re_node_set *candidates) internal_function;
 104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 105                                             re_node_set *dest_nodes,
 106                                             const re_node_set *and_nodes) internal_function;
 107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 108                              int dst_node, int dst_idx, int src_node,
 109                              int src_idx) internal_function;
 110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
 111                                         int boundaries, int subexp_idx,
 112                                         int from_node, int bkref_idx) internal_function;
 113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 114                                       int limit, int subexp_idx,
 115                                       int node, int str_idx,
 116                                       int bkref_idx) internal_function;
 117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 118                                           re_node_set *dest_nodes,
 119                                           const re_node_set *candidates,
 120                                           re_node_set *limits,
 121                                           struct re_backref_cache_entry *bkref_ents,
 122                                           int str_idx) internal_function;
 123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 124                                         re_sift_context_t *sctx,
 125                                         int str_idx, const re_node_set *candidates) internal_function;
 126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 127                                                 int next_state_log_idx) internal_function;
 128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 129                                         re_dfastate_t **src, int num) internal_function;
 130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 131                                          re_match_context_t *mctx) internal_function;
 132 static re_dfastate_t *transit_state (reg_errcode_t *err,
 133                                      re_match_context_t *mctx,
 134                                      re_dfastate_t *state) internal_function;
 135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 136                                             re_match_context_t *mctx,
 137                                             re_dfastate_t *next_state) internal_function;
 138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 139                                                 re_node_set *cur_nodes,
 140                                                 int str_idx) internal_function;
 141 #if 0
 142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 143                                         re_match_context_t *mctx,
 144                                         re_dfastate_t *pstate) internal_function;
 145 #endif
 146 #ifdef RE_ENABLE_I18N
 147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 148                                        re_dfastate_t *pstate) internal_function;
 149 #endif /* RE_ENABLE_I18N */
 150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 151                                           const re_node_set *nodes) internal_function;
 152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 153                                  int bkref_node, int bkref_str_idx) internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str) internal_function;
 158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                              int subexp_idx, int type) internal_function;
 160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                     state_array_t *path, int top_node,
 162                                     int top_str, int last_node, int last_str,
 163                                     int type) internal_function;
 164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                    int str_idx,
 166                                                    re_node_set *cur_nodes,
 167                                                    re_node_set *next_nodes) internal_function;
 168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 169                                                re_node_set *cur_nodes,
 170                                                int ex_subexp, int type) internal_function;
 171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 172                                                    re_node_set *dst_nodes,
 173                                                    int target, int ex_subexp,
 174                                                    int type) internal_function;
 175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 176                                          re_node_set *cur_nodes, int cur_str,
 177                                          int subexp_num, int type) internal_function;
 178 static int build_trtable (re_dfa_t *dfa,
 179                           re_dfastate_t *state) internal_function;
 180 #ifdef RE_ENABLE_I18N
 181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 182                                     const re_string_t *input, int idx) internal_function;
 183 # ifdef _LIBC
 184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 185                                                    size_t name_len) internal_function;
 186 # endif /* _LIBC */
 187 #endif /* RE_ENABLE_I18N */
 188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 189                                        const re_dfastate_t *state,
 190                                        re_node_set *states_node,
 191                                        bitset *states_ch) internal_function;
 192 static int check_node_accept (const re_match_context_t *mctx,
 193                               const re_token_t *node, int idx) internal_function;
 194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 195 \f
 196 /* Entry point for POSIX code.  */
 197
 198 /* regexec searches for a given pattern, specified by PREG, in the
 199    string STRING.
 200
 201    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 202    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 203    least NMATCH elements, and we set them to the offsets of the
 204    corresponding matched substrings.
 205
 206    EFLAGS specifies `execution flags' which affect matching: if
 207    REG_NOTBOL is set, then ^ does not match at the beginning of the
 208    string; if REG_NOTEOL is set, then $ does not match at the end.
 209
 210    We return 0 if we find a match and REG_NOMATCH if not.  */
 211
 212 int
 213 regexec (preg, string, nmatch, pmatch, eflags)
 214     const regex_t *__restrict preg;
 215     const char *__restrict string;
 216     size_t nmatch;
 217     regmatch_t pmatch[];
 218     int eflags;
 219 {
 220   reg_errcode_t err;
 221   int start, length;
 222
 223   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 224     return REG_BADPAT;
 225
 226   if (eflags & REG_STARTEND)
 227     {
 228       start = pmatch[0].rm_so;
 229       length = pmatch[0].rm_eo;
 230     }
 231   else
 232     {
 233       start = 0;
 234       length = strlen (string);
 235     }
 236   if (preg->no_sub)
 237     err = re_search_internal (preg, string, length, start, length - start,
 238                               length, 0, NULL, eflags);
 239   else
 240     err = re_search_internal (preg, string, length, start, length - start,
 241                               length, nmatch, pmatch, eflags);
 242   return err != REG_NOERROR;
 243 }
 244
 245 #ifdef _LIBC
 246 # include <shlib-compat.h>
 247 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 248
 249 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 250 __typeof__ (__regexec) __compat_regexec;
 251
 252 int
 253 attribute_compat_text_section
 254 __compat_regexec (const regex_t *__restrict preg,
 255                   const char *__restrict string, size_t nmatch,
 256                   regmatch_t pmatch[], int eflags)
 257 {
 258   return regexec (preg, string, nmatch, pmatch,
 259                   eflags & (REG_NOTBOL | REG_NOTEOL));
 260 }
 261 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 262 # endif
 263 #endif
 264
 265 /* Entry points for GNU code.  */
 266
 267 /* re_match, re_search, re_match_2, re_search_2
 268
 269    The former two functions operate on STRING with length LENGTH,
 270    while the later two operate on concatenation of STRING1 and STRING2
 271    with lengths LENGTH1 and LENGTH2, respectively.
 272
 273    re_match() matches the compiled pattern in BUFP against the string,
 274    starting at index START.
 275
 276    re_search() first tries matching at index START, then it tries to match
 277    starting from index START + 1, and so on.  The last start position tried
 278    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 279    way as re_match().)
 280
 281    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 282    the first STOP characters of the concatenation of the strings should be
 283    concerned.
 284
 285    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 286    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 287    computed relative to the concatenation, not relative to the individual
 288    strings.)
 289
 290    On success, re_match* functions return the length of the match, re_search*
 291    return the position of the start of the match.  Return value -1 means no
 292    match was found and -2 indicates an internal error.  */
 293
 294 int
 295 re_match (bufp, string, length, start, regs)
 296     struct re_pattern_buffer *bufp;
 297     const char *string;
 298     int length, start;
 299     struct re_registers *regs;
 300 {
 301   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 302 }
 303 #ifdef _LIBC
 304 weak_alias (__re_match, re_match)
 305 #endif
 306
 307 int
 308 re_search (bufp, string, length, start, range, regs)
 309     struct re_pattern_buffer *bufp;
 310     const char *string;
 311     int length, start, range;
 312     struct re_registers *regs;
 313 {
 314   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 315 }
 316 #ifdef _LIBC
 317 weak_alias (__re_search, re_search)
 318 #endif
 319
 320 int
 321 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 322     struct re_pattern_buffer *bufp;
 323     const char *string1, *string2;
 324     int length1, length2, start, stop;
 325     struct re_registers *regs;
 326 {
 327   return re_search_2_stub (bufp, string1, length1, string2, length2,
 328                            start, 0, regs, stop, 1);
 329 }
 330 #ifdef _LIBC
 331 weak_alias (__re_match_2, re_match_2)
 332 #endif
 333
 334 int
 335 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 336     struct re_pattern_buffer *bufp;
 337     const char *string1, *string2;
 338     int length1, length2, start, range, stop;
 339     struct re_registers *regs;
 340 {
 341   return re_search_2_stub (bufp, string1, length1, string2, length2,
 342                            start, range, regs, stop, 0);
 343 }
 344 #ifdef _LIBC
 345 weak_alias (__re_search_2, re_search_2)
 346 #endif
 347
 348 static int
 349 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 350                   stop, ret_len)
 351     struct re_pattern_buffer *bufp;
 352     const char *string1, *string2;
 353     int length1, length2, start, range, stop, ret_len;
 354     struct re_registers *regs;
 355 {
 356   const char *str;
 357   int rval;
 358   int len = length1 + length2;
 359   int free_str = 0;
 360
 361   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 362     return -2;
 363
 364   /* Concatenate the strings.  */
 365   if (length2 > 0)
 366     if (length1 > 0)
 367       {
 368         char *s = re_malloc (char, len);
 369
 370         if (BE (s == NULL, 0))
 371           return -2;
 372         memcpy (s, string1, length1);
 373         memcpy (s + length1, string2, length2);
 374         str = s;
 375         free_str = 1;
 376       }
 377     else
 378       str = string2;
 379   else
 380     str = string1;
 381
 382   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 383                          ret_len);
 384   if (free_str)
 385     re_free ((char *) str);
 386   return rval;
 387 }
 388
 389 /* The parameters have the same meaning as those of re_search.
 390    Additional parameters:
 391    If RET_LEN is nonzero the length of the match is returned (re_match style);
 392    otherwise the position of the match is returned.  */
 393
 394 static int
 395 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 396     struct re_pattern_buffer *bufp;
 397     const char *string;
 398     int length, start, range, stop, ret_len;
 399     struct re_registers *regs;
 400 {
 401   reg_errcode_t result;
 402   regmatch_t *pmatch;
 403   int nregs, rval;
 404   int eflags = 0;
 405
 406   /* Check for out-of-range.  */
 407   if (BE (start < 0 || start > length, 0))
 408     return -1;
 409   if (BE (start + range > length, 0))
 410     range = length - start;
 411   else if (BE (start + range < 0, 0))
 412     range = -start;
 413
 414   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 415   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 416
 417   /* Compile fastmap if we haven't yet.  */
 418   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 419     re_compile_fastmap (bufp);
 420
 421   if (BE (bufp->no_sub, 0))
 422     regs = NULL;
 423
 424   /* We need at least 1 register.  */
 425   if (regs == NULL)
 426     nregs = 1;
 427   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 428                regs->num_regs < bufp->re_nsub + 1, 0))
 429     {
 430       nregs = regs->num_regs;
 431       if (BE (nregs < 1, 0))
 432         {
 433           /* Nothing can be copied to regs.  */
 434           regs = NULL;
 435           nregs = 1;
 436         }
 437     }
 438   else
 439     nregs = bufp->re_nsub + 1;
 440   pmatch = re_malloc (regmatch_t, nregs);
 441   if (BE (pmatch == NULL, 0))
 442     return -2;
 443
 444   result = re_search_internal (bufp, string, length, start, range, stop,
 445                                nregs, pmatch, eflags);
 446
 447   rval = 0;
 448
 449   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 450   if (result != REG_NOERROR)
 451     rval = -1;
 452   else if (regs != NULL)
 453     {
 454       /* If caller wants register contents data back, copy them.  */
 455       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 456                                            bufp->regs_allocated);
 457       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 458         rval = -2;
 459     }
 460
 461   if (BE (rval == 0, 1))
 462     {
 463       if (ret_len)
 464         {
 465           assert (pmatch[0].rm_so == start);
 466           rval = pmatch[0].rm_eo - start;
 467         }
 468       else
 469         rval = pmatch[0].rm_so;
 470     }
 471   re_free (pmatch);
 472   return rval;
 473 }
 474
 475 static unsigned
 476 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 477     struct re_registers *regs;
 478     regmatch_t *pmatch;
 479     int nregs, regs_allocated;
 480 {
 481   int rval = REGS_REALLOCATE;
 482   int i;
 483   int need_regs = nregs + 1;
 484   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 485      uses.  */
 486
 487   /* Have the register data arrays been allocated?  */
 488   if (regs_allocated == REGS_UNALLOCATED)
 489     { /* No.  So allocate them with malloc.  */
 490       regs->start = re_malloc (regoff_t, need_regs);
 491       regs->end = re_malloc (regoff_t, need_regs);
 492       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 493         return REGS_UNALLOCATED;
 494       regs->num_regs = need_regs;
 495     }
 496   else if (regs_allocated == REGS_REALLOCATE)
 497     { /* Yes.  If we need more elements than were already
 498          allocated, reallocate them.  If we need fewer, just
 499          leave it alone.  */
 500       if (BE (need_regs > regs->num_regs, 0))
 501         {
 502           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 503           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 504           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 505             return REGS_UNALLOCATED;
 506           regs->start = new_start;
 507           regs->end = new_end;
 508           regs->num_regs = need_regs;
 509         }
 510     }
 511   else
 512     {
 513       assert (regs_allocated == REGS_FIXED);
 514       /* This function may not be called with REGS_FIXED and nregs too big.  */
 515       assert (regs->num_regs >= nregs);
 516       rval = REGS_FIXED;
 517     }
 518
 519   /* Copy the regs.  */
 520   for (i = 0; i < nregs; ++i)
 521     {
 522       regs->start[i] = pmatch[i].rm_so;
 523       regs->end[i] = pmatch[i].rm_eo;
 524     }
 525   for ( ; i < regs->num_regs; ++i)
 526     regs->start[i] = regs->end[i] = -1;
 527
 528   return rval;
 529 }
 530
 531 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 532    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 533    this memory for recording register information.  STARTS and ENDS
 534    must be allocated using the malloc library routine, and must each
 535    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 536
 537    If NUM_REGS == 0, then subsequent matches should allocate their own
 538    register data.
 539
 540    Unless this function is called, the first search or match using
 541    PATTERN_BUFFER will allocate its own register data, without
 542    freeing the old data.  */
 543
 544 void
 545 re_set_registers (bufp, regs, num_regs, starts, ends)
 546     struct re_pattern_buffer *bufp;
 547     struct re_registers *regs;
 548     unsigned num_regs;
 549     regoff_t *starts, *ends;
 550 {
 551   if (num_regs)
 552     {
 553       bufp->regs_allocated = REGS_REALLOCATE;
 554       regs->num_regs = num_regs;
 555       regs->start = starts;
 556       regs->end = ends;
 557     }
 558   else
 559     {
 560       bufp->regs_allocated = REGS_UNALLOCATED;
 561       regs->num_regs = 0;
 562       regs->start = regs->end = (regoff_t *) 0;
 563     }
 564 }
 565 #ifdef _LIBC
 566 weak_alias (__re_set_registers, re_set_registers)
 567 #endif
 568 \f
 569 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 570    them unless specifically requested.  */
 571
 572 #if defined _REGEX_RE_COMP || defined _LIBC
 573 int
 574 # ifdef _LIBC
 575 weak_function
 576 # endif
 577 re_exec (s)
 578      const char *s;
 579 {
 580   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 581 }
 582 #endif /* _REGEX_RE_COMP */
 583 \f
 584 /* Internal entry point.  */
 585
 586 /* Searches for a compiled pattern PREG in the string STRING, whose
 587    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 588    mingings with regexec.  START, and RANGE have the same meanings
 589    with re_search.
 590    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 591    otherwise return the error code.
 592    Note: We assume front end functions already check ranges.
 593    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 594
 595 static reg_errcode_t
 596 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 597                     eflags)
 598     const regex_t *preg;
 599     const char *string;
 600     int length, start, range, stop, eflags;
 601     size_t nmatch;
 602     regmatch_t pmatch[];
 603 {
 604   reg_errcode_t err;
 605   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 606   int left_lim, right_lim, incr;
 607   int fl_longest_match, match_first, match_kind, match_last = -1;
 608   int extra_nmatch;
 609   int sb, ch;
 610 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 611   re_match_context_t mctx = { .dfa = dfa };
 612 #else
 613   re_match_context_t mctx;
 614 #endif
 615   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 616                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 617   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 618
 619 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 620   memset (&mctx, '\0', sizeof (re_match_context_t));
 621   mctx.dfa = dfa;
 622 #endif
 623
 624   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 625   nmatch -= extra_nmatch;
 626
 627   /* Check if the DFA haven't been compiled.  */
 628   if (BE (preg->used == 0 || dfa->init_state == NULL
 629           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 630           || dfa->init_state_begbuf == NULL, 0))
 631     return REG_NOMATCH;
 632
 633 #ifdef DEBUG
 634   /* We assume front-end functions already check them.  */
 635   assert (start + range >= 0 && start + range <= length);
 636 #endif
 637
 638   /* If initial states with non-begbuf contexts have no elements,
 639      the regex must be anchored.  If preg->newline_anchor is set,
 640      we'll never use init_state_nl, so do not check it.  */
 641   if (dfa->init_state->nodes.nelem == 0
 642       && dfa->init_state_word->nodes.nelem == 0
 643       && (dfa->init_state_nl->nodes.nelem == 0
 644           || !preg->newline_anchor))
 645     {
 646       if (start != 0 && start + range != 0)
 647         return REG_NOMATCH;
 648       start = range = 0;
 649     }
 650
 651   /* We must check the longest matching, if nmatch > 0.  */
 652   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 653
 654   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 655                             preg->translate, preg->syntax & RE_ICASE, dfa);
 656   if (BE (err != REG_NOERROR, 0))
 657     goto free_return;
 658   mctx.input.stop = stop;
 659   mctx.input.raw_stop = stop;
 660   mctx.input.newline_anchor = preg->newline_anchor;
 661
 662   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 663   if (BE (err != REG_NOERROR, 0))
 664     goto free_return;
 665
 666   /* We will log all the DFA states through which the dfa pass,
 667      if nmatch > 1, or this dfa has "multibyte node", which is a
 668      back-reference or a node which can accept multibyte character or
 669      multi character collating element.  */
 670   if (nmatch > 1 || dfa->has_mb_node)
 671     {
 672       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 673       if (BE (mctx.state_log == NULL, 0))
 674         {
 675           err = REG_ESPACE;
 676           goto free_return;
 677         }
 678     }
 679   else
 680     mctx.state_log = NULL;
 681
 682   match_first = start;
 683   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 684                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 685
 686   /* Check incrementally whether of not the input string match.  */
 687   incr = (range < 0) ? -1 : 1;
 688   left_lim = (range < 0) ? start + range : start;
 689   right_lim = (range < 0) ? start : start + range;
 690   sb = dfa->mb_cur_max == 1;
 691   match_kind =
 692     (fastmap
 693      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 694         | (range >= 0 ? 2 : 0)
 695         | (t != NULL ? 1 : 0))
 696      : 8);
 697
 698   for (;; match_first += incr)
 699     {
 700       err = REG_NOMATCH;
 701       if (match_first < left_lim || right_lim < match_first)
 702         goto free_return;
 703
 704       /* Advance as rapidly as possible through the string, until we
 705          find a plausible place to start matching.  This may be done
 706          with varying efficiency, so there are various possibilities:
 707          only the most common of them are specialized, in order to
 708          save on code size.  We use a switch statement for speed.  */
 709       switch (match_kind)
 710         {
 711         case 8:
 712           /* No fastmap.  */
 713           break;
 714
 715         case 7:
 716           /* Fastmap with single-byte translation, match forward.  */
 717           while (BE (match_first < right_lim, 1)
 718                  && !fastmap[t[(unsigned char) string[match_first]]])
 719             ++match_first;
 720           goto forward_match_found_start_or_reached_end;
 721
 722         case 6:
 723           /* Fastmap without translation, match forward.  */
 724           while (BE (match_first < right_lim, 1)
 725                  && !fastmap[(unsigned char) string[match_first]])
 726             ++match_first;
 727
 728         forward_match_found_start_or_reached_end:
 729           if (BE (match_first == right_lim, 0))
 730             {
 731               ch = match_first >= length
 732                        ? 0 : (unsigned char) string[match_first];
 733               if (!fastmap[t ? t[ch] : ch])
 734                 goto free_return;
 735             }
 736           break;
 737
 738         case 4:
 739         case 5:
 740           /* Fastmap without multi-byte translation, match backwards.  */
 741           while (match_first >= left_lim)
 742             {
 743               ch = match_first >= length
 744                        ? 0 : (unsigned char) string[match_first];
 745               if (fastmap[t ? t[ch] : ch])
 746                 break;
 747               --match_first;
 748             }
 749           if (match_first < left_lim)
 750             goto free_return;
 751           break;
 752
 753         default:
 754           /* In this case, we can't determine easily the current byte,
 755              since it might be a component byte of a multibyte
 756              character.  Then we use the constructed buffer instead.  */
 757           for (;;)
 758             {
 759               /* If MATCH_FIRST is out of the valid range, reconstruct the
 760                  buffers.  */
 761               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 762               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 763                 {
 764                   err = re_string_reconstruct (&mctx.input, match_first,
 765                                                eflags);
 766                   if (BE (err != REG_NOERROR, 0))
 767                     goto free_return;
 768
 769                   offset = match_first - mctx.input.raw_mbs_idx;
 770                 }
 771               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 772                  Note that MATCH_FIRST must not be smaller than 0.  */
 773               ch = (match_first >= length
 774                     ? 0 : re_string_byte_at (&mctx.input, offset));
 775               if (fastmap[ch])
 776                 break;
 777               match_first += incr;
 778               if (match_first < left_lim || match_first > right_lim)
 779                 {
 780                   err = REG_NOMATCH;
 781                   goto free_return;
 782                 }
 783             }
 784           break;
 785         }
 786
 787       /* Reconstruct the buffers so that the matcher can assume that
 788          the matching starts from the beginning of the buffer.  */
 789       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 790       if (BE (err != REG_NOERROR, 0))
 791         goto free_return;
 792
 793 #ifdef RE_ENABLE_I18N
 794      /* Don't consider this char as a possible match start if it part,
 795         yet isn't the head, of a multibyte character.  */
 796       if (!sb && !re_string_first_byte (&mctx.input, 0))
 797         continue;
 798 #endif
 799
 800       /* It seems to be appropriate one, then use the matcher.  */
 801       /* We assume that the matching starts from 0.  */
 802       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 803       match_last = check_matching (&mctx, fl_longest_match,
 804                                    range >= 0 ? &match_first : NULL);
 805       if (match_last != -1)
 806         {
 807           if (BE (match_last == -2, 0))
 808             {
 809               err = REG_ESPACE;
 810               goto free_return;
 811             }
 812           else
 813             {
 814               mctx.match_last = match_last;
 815               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 816                 {
 817                   re_dfastate_t *pstate = mctx.state_log[match_last];
 818                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 819                                                              match_last);
 820                 }
 821               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 822                   || dfa->nbackref)
 823                 {
 824                   err = prune_impossible_nodes (&mctx);
 825                   if (err == REG_NOERROR)
 826                     break;
 827                   if (BE (err != REG_NOMATCH, 0))
 828                     goto free_return;
 829                   match_last = -1;
 830                 }
 831               else
 832                 break; /* We found a match.  */
 833             }
 834         }
 835
 836       match_ctx_clean (&mctx);
 837     }
 838
 839 #ifdef DEBUG
 840   assert (match_last != -1);
 841   assert (err == REG_NOERROR);
 842 #endif
 843
 844   /* Set pmatch[] if we need.  */
 845   if (nmatch > 0)
 846     {
 847       int reg_idx;
 848
 849       /* Initialize registers.  */
 850       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 851         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 852
 853       /* Set the points where matching start/end.  */
 854       pmatch[0].rm_so = 0;
 855       pmatch[0].rm_eo = mctx.match_last;
 856
 857       if (!preg->no_sub && nmatch > 1)
 858         {
 859           err = set_regs (preg, &mctx, nmatch, pmatch,
 860                           dfa->has_plural_match && dfa->nbackref > 0);
 861           if (BE (err != REG_NOERROR, 0))
 862             goto free_return;
 863         }
 864
 865       /* At last, add the offset to the each registers, since we slided
 866          the buffers so that we could assume that the matching starts
 867          from 0.  */
 868       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 869         if (pmatch[reg_idx].rm_so != -1)
 870           {
 871 #ifdef RE_ENABLE_I18N
 872             if (BE (mctx.input.offsets_needed != 0, 0))
 873               {
 874                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 875                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 876                 else
 877                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 878                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 879                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 880                 else
 881                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 882               }
 883 #else
 884             assert (mctx.input.offsets_needed == 0);
 885 #endif
 886             pmatch[reg_idx].rm_so += match_first;
 887             pmatch[reg_idx].rm_eo += match_first;
 888           }
 889       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 890         {
 891           pmatch[nmatch + reg_idx].rm_so = -1;
 892           pmatch[nmatch + reg_idx].rm_eo = -1;
 893         }
 894
 895       if (dfa->subexp_map)
 896         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 897           if (dfa->subexp_map[reg_idx] != reg_idx)
 898             {
 899               pmatch[reg_idx + 1].rm_so
 900                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 901               pmatch[reg_idx + 1].rm_eo
 902                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 903             }
 904     }
 905
 906  free_return:
 907   re_free (mctx.state_log);
 908   if (dfa->nbackref)
 909     match_ctx_free (&mctx);
 910   re_string_destruct (&mctx.input);
 911   return err;
 912 }
 913
 914 static reg_errcode_t
 915 prune_impossible_nodes (mctx)
 916      re_match_context_t *mctx;
 917 {
 918   re_dfa_t *const dfa = mctx->dfa;
 919   int halt_node, match_last;
 920   reg_errcode_t ret;
 921   re_dfastate_t **sifted_states;
 922   re_dfastate_t **lim_states = NULL;
 923   re_sift_context_t sctx;
 924 #ifdef DEBUG
 925   assert (mctx->state_log != NULL);
 926 #endif
 927   match_last = mctx->match_last;
 928   halt_node = mctx->last_node;
 929   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 930   if (BE (sifted_states == NULL, 0))
 931     {
 932       ret = REG_ESPACE;
 933       goto free_return;
 934     }
 935   if (dfa->nbackref)
 936     {
 937       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 938       if (BE (lim_states == NULL, 0))
 939         {
 940           ret = REG_ESPACE;
 941           goto free_return;
 942         }
 943       while (1)
 944         {
 945           memset (lim_states, '\0',
 946                   sizeof (re_dfastate_t *) * (match_last + 1));
 947           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 948                          match_last);
 949           ret = sift_states_backward (mctx, &sctx);
 950           re_node_set_free (&sctx.limits);
 951           if (BE (ret != REG_NOERROR, 0))
 952               goto free_return;
 953           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 954             break;
 955           do
 956             {
 957               --match_last;
 958               if (match_last < 0)
 959                 {
 960                   ret = REG_NOMATCH;
 961                   goto free_return;
 962                 }
 963             } while (mctx->state_log[match_last] == NULL
 964                      || !mctx->state_log[match_last]->halt);
 965           halt_node = check_halt_state_context (mctx,
 966                                                 mctx->state_log[match_last],
 967                                                 match_last);
 968         }
 969       ret = merge_state_array (dfa, sifted_states, lim_states,
 970                                match_last + 1);
 971       re_free (lim_states);
 972       lim_states = NULL;
 973       if (BE (ret != REG_NOERROR, 0))
 974         goto free_return;
 975     }
 976   else
 977     {
 978       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 979       ret = sift_states_backward (mctx, &sctx);
 980       re_node_set_free (&sctx.limits);
 981       if (BE (ret != REG_NOERROR, 0))
 982         goto free_return;
 983     }
 984   re_free (mctx->state_log);
 985   mctx->state_log = sifted_states;
 986   sifted_states = NULL;
 987   mctx->last_node = halt_node;
 988   mctx->match_last = match_last;
 989   ret = REG_NOERROR;
 990  free_return:
 991   re_free (sifted_states);
 992   re_free (lim_states);
 993   return ret;
 994 }
 995
 996 /* Acquire an initial state and return it.
 997    We must select appropriate initial state depending on the context,
 998    since initial states may have constraints like "\<", "^", etc..  */
 999
1000 static inline re_dfastate_t *
1001 acquire_init_state_context (err, mctx, idx)
1002      reg_errcode_t *err;
1003      const re_match_context_t *mctx;
1004      int idx;
1005 {
1006   re_dfa_t *const dfa = mctx->dfa;
1007   if (dfa->init_state->has_constraint)
1008     {
1009       unsigned int context;
1010       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1011       if (IS_WORD_CONTEXT (context))
1012         return dfa->init_state_word;
1013       else if (IS_ORDINARY_CONTEXT (context))
1014         return dfa->init_state;
1015       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1016         return dfa->init_state_begbuf;
1017       else if (IS_NEWLINE_CONTEXT (context))
1018         return dfa->init_state_nl;
1019       else if (IS_BEGBUF_CONTEXT (context))
1020         {
1021           /* It is relatively rare case, then calculate on demand.  */
1022           return re_acquire_state_context (err, dfa,
1023                                            dfa->init_state->entrance_nodes,
1024                                            context);
1025         }
1026       else
1027         /* Must not happen?  */
1028         return dfa->init_state;
1029     }
1030   else
1031     return dfa->init_state;
1032 }
1033
1034 /* Check whether the regular expression match input string INPUT or not,
1035    and return the index where the matching end, return -1 if not match,
1036    or return -2 in case of an error.
1037    FL_LONGEST_MATCH means we want the POSIX longest matching.
1038    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1039    next place where we may want to try matching.
1040    Note that the matcher assume that the maching starts from the current
1041    index of the buffer.  */
1042
1043 static int
1044 check_matching (mctx, fl_longest_match, p_match_first)
1045     re_match_context_t *mctx;
1046     int fl_longest_match;
1047     int *p_match_first;
1048 {
1049   re_dfa_t *const dfa = mctx->dfa;
1050   reg_errcode_t err;
1051   int match = 0;
1052   int match_last = -1;
1053   int cur_str_idx = re_string_cur_idx (&mctx->input);
1054   re_dfastate_t *cur_state;
1055   int at_init_state = p_match_first != NULL;
1056   int next_start_idx = cur_str_idx;
1057
1058   err = REG_NOERROR;
1059   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1060   /* An initial state must not be NULL (invalid).  */
1061   if (BE (cur_state == NULL, 0))
1062     {
1063       assert (err == REG_ESPACE);
1064       return -2;
1065     }
1066
1067   if (mctx->state_log != NULL)
1068     {
1069       mctx->state_log[cur_str_idx] = cur_state;
1070
1071       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1072          later.  E.g. Processing back references.  */
1073       if (BE (dfa->nbackref, 0))
1074         {
1075           at_init_state = 0;
1076           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1077           if (BE (err != REG_NOERROR, 0))
1078             return err;
1079
1080           if (cur_state->has_backref)
1081             {
1082               err = transit_state_bkref (mctx, &cur_state->nodes);
1083               if (BE (err != REG_NOERROR, 0))
1084                 return err;
1085             }
1086         }
1087     }
1088
1089   /* If the RE accepts NULL string.  */
1090   if (BE (cur_state->halt, 0))
1091     {
1092       if (!cur_state->has_constraint
1093           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1094         {
1095           if (!fl_longest_match)
1096             return cur_str_idx;
1097           else
1098             {
1099               match_last = cur_str_idx;
1100               match = 1;
1101             }
1102         }
1103     }
1104
1105   while (!re_string_eoi (&mctx->input))
1106     {
1107       re_dfastate_t *old_state = cur_state;
1108       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1109
1110       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1111           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1112               && mctx->input.valid_len < mctx->input.len))
1113         {
1114           err = extend_buffers (mctx);
1115           if (BE (err != REG_NOERROR, 0))
1116             {
1117               assert (err == REG_ESPACE);
1118               return -2;
1119             }
1120         }
1121
1122       cur_state = transit_state (&err, mctx, cur_state);
1123       if (mctx->state_log != NULL)
1124         cur_state = merge_state_with_log (&err, mctx, cur_state);
1125
1126       if (cur_state == NULL)
1127         {
1128           /* Reached the invalid state or an error.  Try to recover a valid
1129              state using the state log, if available and if we have not
1130              already found a valid (even if not the longest) match.  */
1131           if (BE (err != REG_NOERROR, 0))
1132             return -2;
1133
1134           if (mctx->state_log == NULL
1135               || (match && !fl_longest_match)
1136               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1137             break;
1138         }
1139
1140       if (BE (at_init_state, 0))
1141         {
1142           if (old_state == cur_state)
1143             next_start_idx = next_char_idx;
1144           else
1145             at_init_state = 0;
1146         }
1147
1148       if (cur_state->halt)
1149         {
1150           /* Reached a halt state.
1151              Check the halt state can satisfy the current context.  */
1152           if (!cur_state->has_constraint
1153               || check_halt_state_context (mctx, cur_state,
1154                                            re_string_cur_idx (&mctx->input)))
1155             {
1156               /* We found an appropriate halt state.  */
1157               match_last = re_string_cur_idx (&mctx->input);
1158               match = 1;
1159
1160               /* We found a match, do not modify match_first below.  */
1161               p_match_first = NULL;
1162               if (!fl_longest_match)
1163                 break;
1164             }
1165         }
1166     }
1167
1168   if (p_match_first)
1169     *p_match_first += next_start_idx;
1170
1171   return match_last;
1172 }
1173
1174 /* Check NODE match the current context.  */
1175
1176 static int check_halt_node_context (dfa, node, context)
1177     const re_dfa_t *dfa;
1178     int node;
1179     unsigned int context;
1180 {
1181   re_token_type_t type = dfa->nodes[node].type;
1182   unsigned int constraint = dfa->nodes[node].constraint;
1183   if (type != END_OF_RE)
1184     return 0;
1185   if (!constraint)
1186     return 1;
1187   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1188     return 0;
1189   return 1;
1190 }
1191
1192 /* Check the halt state STATE match the current context.
1193    Return 0 if not match, if the node, STATE has, is a halt node and
1194    match the context, return the node.  */
1195
1196 static int
1197 check_halt_state_context (mctx, state, idx)
1198     const re_match_context_t *mctx;
1199     const re_dfastate_t *state;
1200     int idx;
1201 {
1202   int i;
1203   unsigned int context;
1204 #ifdef DEBUG
1205   assert (state->halt);
1206 #endif
1207   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1208   for (i = 0; i < state->nodes.nelem; ++i)
1209     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1210       return state->nodes.elems[i];
1211   return 0;
1212 }
1213
1214 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1215    corresponding to the DFA).
1216    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1217    of errors.  */
1218
1219 static int
1220 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1221     const re_match_context_t *mctx;
1222     regmatch_t *regs;
1223     int nregs, *pidx, node;
1224     re_node_set *eps_via_nodes;
1225     struct re_fail_stack_t *fs;
1226 {
1227   re_dfa_t *const dfa = mctx->dfa;
1228   int i, err, dest_node;
1229   dest_node = -1;
1230   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1231     {
1232       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1233       re_node_set *edests = &dfa->edests[node];
1234       int dest_node;
1235       err = re_node_set_insert (eps_via_nodes, node);
1236       if (BE (err < 0, 0))
1237         return -2;
1238       /* Pick up a valid destination, or return -1 if none is found.  */
1239       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1240         {
1241           int candidate = edests->elems[i];
1242           if (!re_node_set_contains (cur_nodes, candidate))
1243             continue;
1244           if (dest_node == -1)
1245             dest_node = candidate;
1246
1247           else
1248             {
1249               /* In order to avoid infinite loop like "(a*)*", return the second
1250                  epsilon-transition if the first was already considered.  */
1251               if (re_node_set_contains (eps_via_nodes, dest_node))
1252                 return candidate;
1253
1254               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1255               else if (fs != NULL
1256                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1257                                            eps_via_nodes))
1258                 return -2;
1259
1260               /* We know we are going to exit.  */
1261               break;
1262             }
1263         }
1264       return dest_node;
1265     }
1266   else
1267     {
1268       int naccepted = 0;
1269       re_token_type_t type = dfa->nodes[node].type;
1270
1271 #ifdef RE_ENABLE_I18N
1272       if (dfa->nodes[node].accept_mb)
1273         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1274       else
1275 #endif /* RE_ENABLE_I18N */
1276       if (type == OP_BACK_REF)
1277         {
1278           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1279           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1280           if (fs != NULL)
1281             {
1282               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1283                 return -1;
1284               else if (naccepted)
1285                 {
1286                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1287                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1288                               naccepted) != 0)
1289                     return -1;
1290                 }
1291             }
1292
1293           if (naccepted == 0)
1294             {
1295               err = re_node_set_insert (eps_via_nodes, node);
1296               if (BE (err < 0, 0))
1297                 return -2;
1298               dest_node = dfa->edests[node].elems[0];
1299               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1300                                         dest_node))
1301                 return dest_node;
1302             }
1303         }
1304
1305       if (naccepted != 0
1306           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1307         {
1308           dest_node = dfa->nexts[node];
1309           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1310           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1311                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1312                                                dest_node)))
1313             return -1;
1314           re_node_set_empty (eps_via_nodes);
1315           return dest_node;
1316         }
1317     }
1318   return -1;
1319 }
1320
1321 static reg_errcode_t
1322 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1323      struct re_fail_stack_t *fs;
1324      int str_idx, dest_node, nregs;
1325      regmatch_t *regs;
1326      re_node_set *eps_via_nodes;
1327 {
1328   reg_errcode_t err;
1329   int num = fs->num++;
1330   if (fs->num == fs->alloc)
1331     {
1332       struct re_fail_stack_ent_t *new_array;
1333       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1334                                        * fs->alloc * 2));
1335       if (new_array == NULL)
1336         return REG_ESPACE;
1337       fs->alloc *= 2;
1338       fs->stack = new_array;
1339     }
1340   fs->stack[num].idx = str_idx;
1341   fs->stack[num].node = dest_node;
1342   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1343   if (fs->stack[num].regs == NULL)
1344     return REG_ESPACE;
1345   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1346   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1347   return err;
1348 }
1349
1350 static int
1351 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1352      struct re_fail_stack_t *fs;
1353      int *pidx, nregs;
1354      regmatch_t *regs;
1355      re_node_set *eps_via_nodes;
1356 {
1357   int num = --fs->num;
1358   assert (num >= 0);
1359   *pidx = fs->stack[num].idx;
1360   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1361   re_node_set_free (eps_via_nodes);
1362   re_free (fs->stack[num].regs);
1363   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1364   return fs->stack[num].node;
1365 }
1366
1367 /* Set the positions where the subexpressions are starts/ends to registers
1368    PMATCH.
1369    Note: We assume that pmatch[0] is already set, and
1370    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1371
1372 static reg_errcode_t
1373 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1374      const regex_t *preg;
1375      const re_match_context_t *mctx;
1376      size_t nmatch;
1377      regmatch_t *pmatch;
1378      int fl_backtrack;
1379 {
1380   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1381   int idx, cur_node;
1382   re_node_set eps_via_nodes;
1383   struct re_fail_stack_t *fs;
1384   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1385   regmatch_t *prev_idx_match;
1386
1387 #ifdef DEBUG
1388   assert (nmatch > 1);
1389   assert (mctx->state_log != NULL);
1390 #endif
1391   if (fl_backtrack)
1392     {
1393       fs = &fs_body;
1394       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1395       if (fs->stack == NULL)
1396         return REG_ESPACE;
1397     }
1398   else
1399     fs = NULL;
1400
1401   cur_node = dfa->init_node;
1402   re_node_set_init_empty (&eps_via_nodes);
1403
1404   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1405   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1406
1407   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1408     {
1409       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1410
1411       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1412         {
1413           int reg_idx;
1414           if (fs)
1415             {
1416               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1417                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1418                   break;
1419               if (reg_idx == nmatch)
1420                 {
1421                   re_node_set_free (&eps_via_nodes);
1422                   return free_fail_stack_return (fs);
1423                 }
1424               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1425                                          &eps_via_nodes);
1426             }
1427           else
1428             {
1429               re_node_set_free (&eps_via_nodes);
1430               return REG_NOERROR;
1431             }
1432         }
1433
1434       /* Proceed to next node.  */
1435       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1436                                     &eps_via_nodes, fs);
1437
1438       if (BE (cur_node < 0, 0))
1439         {
1440           if (BE (cur_node == -2, 0))
1441             {
1442               re_node_set_free (&eps_via_nodes);
1443               free_fail_stack_return (fs);
1444               return REG_ESPACE;
1445             }
1446           if (fs)
1447             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1448                                        &eps_via_nodes);
1449           else
1450             {
1451               re_node_set_free (&eps_via_nodes);
1452               return REG_NOMATCH;
1453             }
1454         }
1455     }
1456   re_node_set_free (&eps_via_nodes);
1457   return free_fail_stack_return (fs);
1458 }
1459
1460 static reg_errcode_t
1461 free_fail_stack_return (fs)
1462      struct re_fail_stack_t *fs;
1463 {
1464   if (fs)
1465     {
1466       int fs_idx;
1467       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1468         {
1469           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1470           re_free (fs->stack[fs_idx].regs);
1471         }
1472       re_free (fs->stack);
1473     }
1474   return REG_NOERROR;
1475 }
1476
1477 static void
1478 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1479      re_dfa_t *dfa;
1480      regmatch_t *pmatch, *prev_idx_match;
1481      int cur_node, cur_idx, nmatch;
1482 {
1483   int type = dfa->nodes[cur_node].type;
1484   if (type == OP_OPEN_SUBEXP)
1485     {
1486       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1487
1488       /* We are at the first node of this sub expression.  */
1489       if (reg_num < nmatch)
1490         {
1491           pmatch[reg_num].rm_so = cur_idx;
1492           pmatch[reg_num].rm_eo = -1;
1493         }
1494     }
1495   else if (type == OP_CLOSE_SUBEXP)
1496     {
1497       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1498       if (reg_num < nmatch)
1499         {
1500           /* We are at the last node of this sub expression.  */
1501           if (pmatch[reg_num].rm_so < cur_idx)
1502             {
1503               pmatch[reg_num].rm_eo = cur_idx;
1504               /* This is a non-empty match or we are not inside an optional
1505                  subexpression.  Accept this right away.  */
1506               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1507             }
1508           else
1509             {
1510               if (dfa->nodes[cur_node].opt_subexp
1511                   && prev_idx_match[reg_num].rm_so != -1)
1512                 /* We transited through an empty match for an optional
1513                    subexpression, like (a?)*, and this is not the subexp's
1514                    first match.  Copy back the old content of the registers
1515                    so that matches of an inner subexpression are undone as
1516                    well, like in ((a?))*.  */
1517                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1518               else
1519                 /* We completed a subexpression, but it may be part of
1520                    an optional one, so do not update PREV_IDX_MATCH.  */
1521                 pmatch[reg_num].rm_eo = cur_idx;
1522             }
1523         }
1524     }
1525 }
1526
1527 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1528    and sift the nodes in each states according to the following rules.
1529    Updated state_log will be wrote to STATE_LOG.
1530
1531    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1532      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1533         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1534         the LAST_NODE, we throw away the node `a'.
1535      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1536         string `s' and transit to `b':
1537         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1538            away the node `a'.
1539         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1540             thrown away, we throw away the node `a'.
1541      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1542         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1543            node `a'.
1544         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1545             we throw away the node `a'.  */
1546
1547 #define STATE_NODE_CONTAINS(state,node) \
1548   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1549
1550 static reg_errcode_t
1551 sift_states_backward (mctx, sctx)
1552      re_match_context_t *mctx;
1553      re_sift_context_t *sctx;
1554 {
1555   reg_errcode_t err;
1556   int null_cnt = 0;
1557   int str_idx = sctx->last_str_idx;
1558   re_node_set cur_dest;
1559
1560 #ifdef DEBUG
1561   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1562 #endif
1563
1564   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1565      transit to the last_node and the last_node itself.  */
1566   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1567   if (BE (err != REG_NOERROR, 0))
1568     return err;
1569   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1570   if (BE (err != REG_NOERROR, 0))
1571     goto free_return;
1572
1573   /* Then check each states in the state_log.  */
1574   while (str_idx > 0)
1575     {
1576       /* Update counters.  */
1577       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1578       if (null_cnt > mctx->max_mb_elem_len)
1579         {
1580           memset (sctx->sifted_states, '\0',
1581                   sizeof (re_dfastate_t *) * str_idx);
1582           re_node_set_free (&cur_dest);
1583           return REG_NOERROR;
1584         }
1585       re_node_set_empty (&cur_dest);
1586       --str_idx;
1587
1588       if (mctx->state_log[str_idx])
1589         {
1590           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1591           if (BE (err != REG_NOERROR, 0))
1592             goto free_return;
1593         }
1594
1595       /* Add all the nodes which satisfy the following conditions:
1596          - It can epsilon transit to a node in CUR_DEST.
1597          - It is in CUR_SRC.
1598          And update state_log.  */
1599       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1600       if (BE (err != REG_NOERROR, 0))
1601         goto free_return;
1602     }
1603   err = REG_NOERROR;
1604  free_return:
1605   re_node_set_free (&cur_dest);
1606   return err;
1607 }
1608
1609 static reg_errcode_t
1610 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1611      re_match_context_t *mctx;
1612      re_sift_context_t *sctx;
1613      int str_idx;
1614      re_node_set *cur_dest;
1615 {
1616   re_dfa_t *const dfa = mctx->dfa;
1617   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1618   int i;
1619
1620   /* Then build the next sifted state.
1621      We build the next sifted state on `cur_dest', and update
1622      `sifted_states[str_idx]' with `cur_dest'.
1623      Note:
1624      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1625      `cur_src' points the node_set of the old `state_log[str_idx]'
1626      (with the epsilon nodes pre-filtered out).  */
1627   for (i = 0; i < cur_src->nelem; i++)
1628     {
1629       int prev_node = cur_src->elems[i];
1630       int naccepted = 0;
1631       int ret;
1632
1633 #ifdef DEBUG
1634       re_token_type_t type = dfa->nodes[prev_node].type;
1635       assert (!IS_EPSILON_NODE (type));
1636 #endif
1637 #ifdef RE_ENABLE_I18N
1638       /* If the node may accept `multi byte'.  */
1639       if (dfa->nodes[prev_node].accept_mb)
1640         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1641                                          str_idx, sctx->last_str_idx);
1642 #endif /* RE_ENABLE_I18N */
1643
1644       /* We don't check backreferences here.
1645          See update_cur_sifted_state().  */
1646       if (!naccepted
1647           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1648           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1649                                   dfa->nexts[prev_node]))
1650         naccepted = 1;
1651
1652       if (naccepted == 0)
1653         continue;
1654
1655       if (sctx->limits.nelem)
1656         {
1657           int to_idx = str_idx + naccepted;
1658           if (check_dst_limits (mctx, &sctx->limits,
1659                                 dfa->nexts[prev_node], to_idx,
1660                                 prev_node, str_idx))
1661             continue;
1662         }
1663       ret = re_node_set_insert (cur_dest, prev_node);
1664       if (BE (ret == -1, 0))
1665         return REG_ESPACE;
1666     }
1667
1668   return REG_NOERROR;
1669 }
1670
1671 /* Helper functions.  */
1672
1673 static reg_errcode_t
1674 clean_state_log_if_needed (mctx, next_state_log_idx)
1675     re_match_context_t *mctx;
1676     int next_state_log_idx;
1677 {
1678   int top = mctx->state_log_top;
1679
1680   if (next_state_log_idx >= mctx->input.bufs_len
1681       || (next_state_log_idx >= mctx->input.valid_len
1682           && mctx->input.valid_len < mctx->input.len))
1683     {
1684       reg_errcode_t err;
1685       err = extend_buffers (mctx);
1686       if (BE (err != REG_NOERROR, 0))
1687         return err;
1688     }
1689
1690   if (top < next_state_log_idx)
1691     {
1692       memset (mctx->state_log + top + 1, '\0',
1693               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1694       mctx->state_log_top = next_state_log_idx;
1695     }
1696   return REG_NOERROR;
1697 }
1698
1699 static reg_errcode_t
1700 merge_state_array (dfa, dst, src, num)
1701      re_dfa_t *dfa;
1702      re_dfastate_t **dst;
1703      re_dfastate_t **src;
1704      int num;
1705 {
1706   int st_idx;
1707   reg_errcode_t err;
1708   for (st_idx = 0; st_idx < num; ++st_idx)
1709     {
1710       if (dst[st_idx] == NULL)
1711         dst[st_idx] = src[st_idx];
1712       else if (src[st_idx] != NULL)
1713         {
1714           re_node_set merged_set;
1715           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1716                                         &src[st_idx]->nodes);
1717           if (BE (err != REG_NOERROR, 0))
1718             return err;
1719           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1720           re_node_set_free (&merged_set);
1721           if (BE (err != REG_NOERROR, 0))
1722             return err;
1723         }
1724     }
1725   return REG_NOERROR;
1726 }
1727
1728 static reg_errcode_t
1729 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1730      re_match_context_t *mctx;
1731      re_sift_context_t *sctx;
1732      int str_idx;
1733      re_node_set *dest_nodes;
1734 {
1735   re_dfa_t *const dfa = mctx->dfa;
1736   reg_errcode_t err;
1737   const re_node_set *candidates;
1738   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1739                 : &mctx->state_log[str_idx]->nodes);
1740
1741   if (dest_nodes->nelem == 0)
1742     sctx->sifted_states[str_idx] = NULL;
1743   else
1744     {
1745       if (candidates)
1746         {
1747           /* At first, add the nodes which can epsilon transit to a node in
1748              DEST_NODE.  */
1749           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1750           if (BE (err != REG_NOERROR, 0))
1751             return err;
1752
1753           /* Then, check the limitations in the current sift_context.  */
1754           if (sctx->limits.nelem)
1755             {
1756               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1757                                          mctx->bkref_ents, str_idx);
1758               if (BE (err != REG_NOERROR, 0))
1759                 return err;
1760             }
1761         }
1762
1763       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1764       if (BE (err != REG_NOERROR, 0))
1765         return err;
1766     }
1767
1768   if (candidates && mctx->state_log[str_idx]->has_backref)
1769     {
1770       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1771       if (BE (err != REG_NOERROR, 0))
1772         return err;
1773     }
1774   return REG_NOERROR;
1775 }
1776
1777 static reg_errcode_t
1778 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1779      re_dfa_t *dfa;
1780      re_node_set *dest_nodes;
1781      const re_node_set *candidates;
1782 {
1783   reg_errcode_t err = REG_NOERROR;
1784   int i;
1785
1786   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1787   if (BE (err != REG_NOERROR, 0))
1788     return err;
1789
1790   if (!state->inveclosure.alloc)
1791     {
1792       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1793       if (BE (err != REG_NOERROR, 0))
1794         return REG_ESPACE;
1795       for (i = 0; i < dest_nodes->nelem; i++)
1796         re_node_set_merge (&state->inveclosure,
1797                            dfa->inveclosures + dest_nodes->elems[i]);
1798     }
1799   return re_node_set_add_intersect (dest_nodes, candidates,
1800                                     &state->inveclosure);
1801 }
1802
1803 static reg_errcode_t
1804 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1805      re_dfa_t *dfa;
1806      int node;
1807      re_node_set *dest_nodes;
1808      const re_node_set *candidates;
1809 {
1810     int ecl_idx;
1811     reg_errcode_t err;
1812     re_node_set *inv_eclosure = dfa->inveclosures + node;
1813     re_node_set except_nodes;
1814     re_node_set_init_empty (&except_nodes);
1815     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1816       {
1817         int cur_node = inv_eclosure->elems[ecl_idx];
1818         if (cur_node == node)
1819           continue;
1820         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1821           {
1822             int edst1 = dfa->edests[cur_node].elems[0];
1823             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1824                          ? dfa->edests[cur_node].elems[1] : -1);
1825             if ((!re_node_set_contains (inv_eclosure, edst1)
1826                  && re_node_set_contains (dest_nodes, edst1))
1827                 || (edst2 > 0
1828                     && !re_node_set_contains (inv_eclosure, edst2)
1829                     && re_node_set_contains (dest_nodes, edst2)))
1830               {
1831                 err = re_node_set_add_intersect (&except_nodes, candidates,
1832                                                  dfa->inveclosures + cur_node);
1833                 if (BE (err != REG_NOERROR, 0))
1834                   {
1835                     re_node_set_free (&except_nodes);
1836                     return err;
1837                   }
1838               }
1839           }
1840       }
1841     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1842       {
1843         int cur_node = inv_eclosure->elems[ecl_idx];
1844         if (!re_node_set_contains (&except_nodes, cur_node))
1845           {
1846             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1847             re_node_set_remove_at (dest_nodes, idx);
1848           }
1849       }
1850     re_node_set_free (&except_nodes);
1851     return REG_NOERROR;
1852 }
1853
1854 static int
1855 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1856      re_match_context_t *mctx;
1857      re_node_set *limits;
1858      int dst_node, dst_idx, src_node, src_idx;
1859 {
1860   re_dfa_t *const dfa = mctx->dfa;
1861   int lim_idx, src_pos, dst_pos;
1862
1863   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1864   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1865   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1866     {
1867       int subexp_idx;
1868       struct re_backref_cache_entry *ent;
1869       ent = mctx->bkref_ents + limits->elems[lim_idx];
1870       subexp_idx = dfa->nodes[ent->node].opr.idx;
1871
1872       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1873                                            subexp_idx, dst_node, dst_idx,
1874                                            dst_bkref_idx);
1875       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1876                                            subexp_idx, src_node, src_idx,
1877                                            src_bkref_idx);
1878
1879       /* In case of:
1880          <src> <dst> ( <subexp> )
1881          ( <subexp> ) <src> <dst>
1882          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1883       if (src_pos == dst_pos)
1884         continue; /* This is unrelated limitation.  */
1885       else
1886         return 1;
1887     }
1888   return 0;
1889 }
1890
1891 static int
1892 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1893      re_match_context_t *mctx;
1894      int boundaries, subexp_idx, from_node, bkref_idx;
1895 {
1896   re_dfa_t *const dfa = mctx->dfa;
1897   re_node_set *eclosures = dfa->eclosures + from_node;
1898   int node_idx;
1899
1900   /* Else, we are on the boundary: examine the nodes on the epsilon
1901      closure.  */
1902   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1903     {
1904       int node = eclosures->elems[node_idx];
1905       switch (dfa->nodes[node].type)
1906         {
1907         case OP_BACK_REF:
1908           if (bkref_idx != -1)
1909             {
1910               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1911               do
1912                 {
1913                   int dst, cpos;
1914
1915                   if (ent->node != node)
1916                     continue;
1917
1918                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1919                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1920                     continue;
1921
1922                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1923                      OP_CLOSE_SUBEXP cases below.  But, if the
1924                      destination node is the same node as the source
1925                      node, don't recurse because it would cause an
1926                      infinite loop: a regex that exhibits this behavior
1927                      is ()\1*\1*  */
1928                   dst = dfa->edests[node].elems[0];
1929                   if (dst == from_node)
1930                     {
1931                       if (boundaries & 1)
1932                         return -1;
1933                       else /* if (boundaries & 2) */
1934                         return 0;
1935                     }
1936
1937                   cpos =
1938                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1939                                                  dst, bkref_idx);
1940                   if (cpos == -1 /* && (boundaries & 1) */)
1941                     return -1;
1942                   if (cpos == 0 && (boundaries & 2))
1943                     return 0;
1944
1945                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1946                 }
1947               while (ent++->more);
1948             }
1949           break;
1950
1951         case OP_OPEN_SUBEXP:
1952           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1953             return -1;
1954           break;
1955
1956         case OP_CLOSE_SUBEXP:
1957           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1958             return 0;
1959           break;
1960
1961         default:
1962             break;
1963         }
1964     }
1965
1966   return (boundaries & 2) ? 1 : 0;
1967 }
1968
1969 static int
1970 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1971      re_match_context_t *mctx;
1972      int limit, subexp_idx, from_node, str_idx, bkref_idx;
1973 {
1974   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1975   int boundaries;
1976
1977   /* If we are outside the range of the subexpression, return -1 or 1.  */
1978   if (str_idx < lim->subexp_from)
1979     return -1;
1980
1981   if (lim->subexp_to < str_idx)
1982     return 1;
1983
1984   /* If we are within the subexpression, return 0.  */
1985   boundaries = (str_idx == lim->subexp_from);
1986   boundaries |= (str_idx == lim->subexp_to) << 1;
1987   if (boundaries == 0)
1988     return 0;
1989
1990   /* Else, examine epsilon closure.  */
1991   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1992                                       from_node, bkref_idx);
1993 }
1994
1995 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1996    which are against limitations from DEST_NODES. */
1997
1998 static reg_errcode_t
1999 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2000      re_dfa_t *dfa;
2001      re_node_set *dest_nodes;
2002      const re_node_set *candidates;
2003      re_node_set *limits;
2004      struct re_backref_cache_entry *bkref_ents;
2005      int str_idx;
2006 {
2007   reg_errcode_t err;
2008   int node_idx, lim_idx;
2009
2010   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2011     {
2012       int subexp_idx;
2013       struct re_backref_cache_entry *ent;
2014       ent = bkref_ents + limits->elems[lim_idx];
2015
2016       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2017         continue; /* This is unrelated limitation.  */
2018
2019       subexp_idx = dfa->nodes[ent->node].opr.idx;
2020       if (ent->subexp_to == str_idx)
2021         {
2022           int ops_node = -1;
2023           int cls_node = -1;
2024           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2025             {
2026               int node = dest_nodes->elems[node_idx];
2027               re_token_type_t type = dfa->nodes[node].type;
2028               if (type == OP_OPEN_SUBEXP
2029                   && subexp_idx == dfa->nodes[node].opr.idx)
2030                 ops_node = node;
2031               else if (type == OP_CLOSE_SUBEXP
2032                        && subexp_idx == dfa->nodes[node].opr.idx)
2033                 cls_node = node;
2034             }
2035
2036           /* Check the limitation of the open subexpression.  */
2037           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2038           if (ops_node >= 0)
2039             {
2040               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2041                                            candidates);
2042               if (BE (err != REG_NOERROR, 0))
2043                 return err;
2044             }
2045
2046           /* Check the limitation of the close subexpression.  */
2047           if (cls_node >= 0)
2048             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2049               {
2050                 int node = dest_nodes->elems[node_idx];
2051                 if (!re_node_set_contains (dfa->inveclosures + node,
2052                                            cls_node)
2053                     && !re_node_set_contains (dfa->eclosures + node,
2054                                               cls_node))
2055                   {
2056                     /* It is against this limitation.
2057                        Remove it form the current sifted state.  */
2058                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2059                                                  candidates);
2060                     if (BE (err != REG_NOERROR, 0))
2061                       return err;
2062                     --node_idx;
2063                   }
2064               }
2065         }
2066       else /* (ent->subexp_to != str_idx)  */
2067         {
2068           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2069             {
2070               int node = dest_nodes->elems[node_idx];
2071               re_token_type_t type = dfa->nodes[node].type;
2072               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2073                 {
2074                   if (subexp_idx != dfa->nodes[node].opr.idx)
2075                     continue;
2076                   /* It is against this limitation.
2077                      Remove it form the current sifted state.  */
2078                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2079                                                candidates);
2080                   if (BE (err != REG_NOERROR, 0))
2081                     return err;
2082                 }
2083             }
2084         }
2085     }
2086   return REG_NOERROR;
2087 }
2088
2089 static reg_errcode_t
2090 sift_states_bkref (mctx, sctx, str_idx, candidates)
2091      re_match_context_t *mctx;
2092      re_sift_context_t *sctx;
2093      int str_idx;
2094      const re_node_set *candidates;
2095 {
2096   re_dfa_t *const dfa = mctx->dfa;
2097   reg_errcode_t err;
2098   int node_idx, node;
2099   re_sift_context_t local_sctx;
2100   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2101
2102   if (first_idx == -1)
2103     return REG_NOERROR;
2104
2105   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2106
2107   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2108     {
2109       int enabled_idx;
2110       re_token_type_t type;
2111       struct re_backref_cache_entry *entry;
2112       node = candidates->elems[node_idx];
2113       type = dfa->nodes[node].type;
2114       /* Avoid infinite loop for the REs like "()\1+".  */
2115       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2116         continue;
2117       if (type != OP_BACK_REF)
2118         continue;
2119
2120       entry = mctx->bkref_ents + first_idx;
2121       enabled_idx = first_idx;
2122       do
2123         {
2124           int subexp_len, to_idx, dst_node;
2125           re_dfastate_t *cur_state;
2126
2127           if (entry->node != node)
2128             continue;
2129           subexp_len = entry->subexp_to - entry->subexp_from;
2130           to_idx = str_idx + subexp_len;
2131           dst_node = (subexp_len ? dfa->nexts[node]
2132                       : dfa->edests[node].elems[0]);
2133
2134           if (to_idx > sctx->last_str_idx
2135               || sctx->sifted_states[to_idx] == NULL
2136               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2137               || check_dst_limits (mctx, &sctx->limits, node,
2138                                    str_idx, dst_node, to_idx))
2139             continue;
2140
2141           if (local_sctx.sifted_states == NULL)
2142             {
2143               local_sctx = *sctx;
2144               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2145               if (BE (err != REG_NOERROR, 0))
2146                 goto free_return;
2147             }
2148           local_sctx.last_node = node;
2149           local_sctx.last_str_idx = str_idx;
2150           err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2151           if (BE (err < 0, 0))
2152             {
2153               err = REG_ESPACE;
2154               goto free_return;
2155             }
2156           cur_state = local_sctx.sifted_states[str_idx];
2157           err = sift_states_backward (mctx, &local_sctx);
2158           if (BE (err != REG_NOERROR, 0))
2159             goto free_return;
2160           if (sctx->limited_states != NULL)
2161             {
2162               err = merge_state_array (dfa, sctx->limited_states,
2163                                        local_sctx.sifted_states,
2164                                        str_idx + 1);
2165               if (BE (err != REG_NOERROR, 0))
2166                 goto free_return;
2167             }
2168           local_sctx.sifted_states[str_idx] = cur_state;
2169           re_node_set_remove (&local_sctx.limits, enabled_idx);
2170
2171           /* mctx->bkref_ents may have changed, reload the pointer.  */
2172           entry = mctx->bkref_ents + enabled_idx;
2173         }
2174       while (enabled_idx++, entry++->more);
2175     }
2176   err = REG_NOERROR;
2177  free_return:
2178   if (local_sctx.sifted_states != NULL)
2179     {
2180       re_node_set_free (&local_sctx.limits);
2181     }
2182
2183   return err;
2184 }
2185
2186
2187 #ifdef RE_ENABLE_I18N
2188 static int
2189 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2190     const re_match_context_t *mctx;
2191     re_sift_context_t *sctx;
2192     int node_idx, str_idx, max_str_idx;
2193 {
2194   re_dfa_t *const dfa = mctx->dfa;
2195   int naccepted;
2196   /* Check the node can accept `multi byte'.  */
2197   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2198   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2199       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2200                             dfa->nexts[node_idx]))
2201     /* The node can't accept the `multi byte', or the
2202        destination was already thrown away, then the node
2203        could't accept the current input `multi byte'.   */
2204     naccepted = 0;
2205   /* Otherwise, it is sure that the node could accept
2206      `naccepted' bytes input.  */
2207   return naccepted;
2208 }
2209 #endif /* RE_ENABLE_I18N */
2210
2211 \f
2212 /* Functions for state transition.  */
2213
2214 /* Return the next state to which the current state STATE will transit by
2215    accepting the current input byte, and update STATE_LOG if necessary.
2216    If STATE can accept a multibyte char/collating element/back reference
2217    update the destination of STATE_LOG.  */
2218
2219 static re_dfastate_t *
2220 transit_state (err, mctx, state)
2221      reg_errcode_t *err;
2222      re_match_context_t *mctx;
2223      re_dfastate_t *state;
2224 {
2225   re_dfastate_t **trtable;
2226   unsigned char ch;
2227
2228 #ifdef RE_ENABLE_I18N
2229   /* If the current state can accept multibyte.  */
2230   if (BE (state->accept_mb, 0))
2231     {
2232       *err = transit_state_mb (mctx, state);
2233       if (BE (*err != REG_NOERROR, 0))
2234         return NULL;
2235     }
2236 #endif /* RE_ENABLE_I18N */
2237
2238   /* Then decide the next state with the single byte.  */
2239 #if 0
2240   if (0)
2241     /* don't use transition table  */
2242     return transit_state_sb (err, mctx, state);
2243 #endif
2244
2245   /* Use transition table  */
2246   ch = re_string_fetch_byte (&mctx->input);
2247   for (;;)
2248     {
2249       trtable = state->trtable;
2250       if (BE (trtable != NULL, 1))
2251         return trtable[ch];
2252
2253       trtable = state->word_trtable;
2254       if (BE (trtable != NULL, 1))
2255         {
2256           unsigned int context;
2257           context
2258             = re_string_context_at (&mctx->input,
2259                                     re_string_cur_idx (&mctx->input) - 1,
2260                                     mctx->eflags);
2261           if (IS_WORD_CONTEXT (context))
2262             return trtable[ch + SBC_MAX];
2263           else
2264             return trtable[ch];
2265         }
2266
2267       if (!build_trtable (mctx->dfa, state))
2268         {
2269           *err = REG_ESPACE;
2270           return NULL;
2271         }
2272
2273       /* Retry, we now have a transition table.  */
2274     }
2275 }
2276
2277 /* Update the state_log if we need */
2278 re_dfastate_t *
2279 merge_state_with_log (err, mctx, next_state)
2280      reg_errcode_t *err;
2281      re_match_context_t *mctx;
2282      re_dfastate_t *next_state;
2283 {
2284   re_dfa_t *const dfa = mctx->dfa;
2285   int cur_idx = re_string_cur_idx (&mctx->input);
2286
2287   if (cur_idx > mctx->state_log_top)
2288     {
2289       mctx->state_log[cur_idx] = next_state;
2290       mctx->state_log_top = cur_idx;
2291     }
2292   else if (mctx->state_log[cur_idx] == 0)
2293     {
2294       mctx->state_log[cur_idx] = next_state;
2295     }
2296   else
2297     {
2298       re_dfastate_t *pstate;
2299       unsigned int context;
2300       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2301       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2302          the destination of a multibyte char/collating element/
2303          back reference.  Then the next state is the union set of
2304          these destinations and the results of the transition table.  */
2305       pstate = mctx->state_log[cur_idx];
2306       log_nodes = pstate->entrance_nodes;
2307       if (next_state != NULL)
2308         {
2309           table_nodes = next_state->entrance_nodes;
2310           *err = re_node_set_init_union (&next_nodes, table_nodes,
2311                                              log_nodes);
2312           if (BE (*err != REG_NOERROR, 0))
2313             return NULL;
2314         }
2315       else
2316         next_nodes = *log_nodes;
2317       /* Note: We already add the nodes of the initial state,
2318          then we don't need to add them here.  */
2319
2320       context = re_string_context_at (&mctx->input,
2321                                       re_string_cur_idx (&mctx->input) - 1,
2322                                       mctx->eflags);
2323       next_state = mctx->state_log[cur_idx]
2324         = re_acquire_state_context (err, dfa, &next_nodes, context);
2325       /* We don't need to check errors here, since the return value of
2326          this function is next_state and ERR is already set.  */
2327
2328       if (table_nodes != NULL)
2329         re_node_set_free (&next_nodes);
2330     }
2331
2332   if (BE (dfa->nbackref, 0) && next_state != NULL)
2333     {
2334       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2335          later.  We must check them here, since the back references in the
2336          next state might use them.  */
2337       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2338                                         cur_idx);
2339       if (BE (*err != REG_NOERROR, 0))
2340         return NULL;
2341
2342       /* If the next state has back references.  */
2343       if (next_state->has_backref)
2344         {
2345           *err = transit_state_bkref (mctx, &next_state->nodes);
2346           if (BE (*err != REG_NOERROR, 0))
2347             return NULL;
2348           next_state = mctx->state_log[cur_idx];
2349         }
2350     }
2351
2352   return next_state;
2353 }
2354
2355 /* Skip bytes in the input that correspond to part of a
2356    multi-byte match, then look in the log for a state
2357    from which to restart matching.  */
2358 re_dfastate_t *
2359 find_recover_state (err, mctx)
2360      reg_errcode_t *err;
2361      re_match_context_t *mctx;
2362 {
2363   re_dfastate_t *cur_state = NULL;
2364   do
2365     {
2366       int max = mctx->state_log_top;
2367       int cur_str_idx = re_string_cur_idx (&mctx->input);
2368
2369       do
2370         {
2371           if (++cur_str_idx > max)
2372             return NULL;
2373           re_string_skip_bytes (&mctx->input, 1);
2374         }
2375       while (mctx->state_log[cur_str_idx] == NULL);
2376
2377       cur_state = merge_state_with_log (err, mctx, NULL);
2378     }
2379   while (err == REG_NOERROR && cur_state == NULL);
2380   return cur_state;
2381 }
2382
2383 /* Helper functions for transit_state.  */
2384
2385 /* From the node set CUR_NODES, pick up the nodes whose types are
2386    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2387    expression. And register them to use them later for evaluating the
2388    correspoding back references.  */
2389
2390 static reg_errcode_t
2391 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2392      re_match_context_t *mctx;
2393      re_node_set *cur_nodes;
2394      int str_idx;
2395 {
2396   re_dfa_t *const dfa = mctx->dfa;
2397   int node_idx;
2398   reg_errcode_t err;
2399
2400   /* TODO: This isn't efficient.
2401            Because there might be more than one nodes whose types are
2402            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2403            nodes.
2404            E.g. RE: (a){2}  */
2405   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2406     {
2407       int node = cur_nodes->elems[node_idx];
2408       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2409           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2410           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2411         {
2412           err = match_ctx_add_subtop (mctx, node, str_idx);
2413           if (BE (err != REG_NOERROR, 0))
2414             return err;
2415         }
2416     }
2417   return REG_NOERROR;
2418 }
2419
2420 #if 0
2421 /* Return the next state to which the current state STATE will transit by
2422    accepting the current input byte.  */
2423
2424 static re_dfastate_t *
2425 transit_state_sb (err, mctx, state)
2426      reg_errcode_t *err;
2427      re_match_context_t *mctx;
2428      re_dfastate_t *state;
2429 {
2430   re_dfa_t *const dfa = mctx->dfa;
2431   re_node_set next_nodes;
2432   re_dfastate_t *next_state;
2433   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2434   unsigned int context;
2435
2436   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2437   if (BE (*err != REG_NOERROR, 0))
2438     return NULL;
2439   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2440     {
2441       int cur_node = state->nodes.elems[node_cnt];
2442       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2443         {
2444           *err = re_node_set_merge (&next_nodes,
2445                                     dfa->eclosures + dfa->nexts[cur_node]);
2446           if (BE (*err != REG_NOERROR, 0))
2447             {
2448               re_node_set_free (&next_nodes);
2449               return NULL;
2450             }
2451         }
2452     }
2453   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2454   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2455   /* We don't need to check errors here, since the return value of
2456      this function is next_state and ERR is already set.  */
2457
2458   re_node_set_free (&next_nodes);
2459   re_string_skip_bytes (&mctx->input, 1);
2460   return next_state;
2461 }
2462 #endif
2463
2464 #ifdef RE_ENABLE_I18N
2465 static reg_errcode_t
2466 transit_state_mb (mctx, pstate)
2467     re_match_context_t *mctx;
2468     re_dfastate_t *pstate;
2469 {
2470   re_dfa_t *const dfa = mctx->dfa;
2471   reg_errcode_t err;
2472   int i;
2473
2474   for (i = 0; i < pstate->nodes.nelem; ++i)
2475     {
2476       re_node_set dest_nodes, *new_nodes;
2477       int cur_node_idx = pstate->nodes.elems[i];
2478       int naccepted, dest_idx;
2479       unsigned int context;
2480       re_dfastate_t *dest_state;
2481
2482       if (!dfa->nodes[cur_node_idx].accept_mb)
2483         continue;
2484
2485       if (dfa->nodes[cur_node_idx].constraint)
2486         {
2487           context = re_string_context_at (&mctx->input,
2488                                           re_string_cur_idx (&mctx->input),
2489                                           mctx->eflags);
2490           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2491                                            context))
2492             continue;
2493         }
2494
2495       /* How many bytes the node can accept?  */
2496       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2497                                            re_string_cur_idx (&mctx->input));
2498       if (naccepted == 0)
2499         continue;
2500
2501       /* The node can accepts `naccepted' bytes.  */
2502       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2503       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2504                                : mctx->max_mb_elem_len);
2505       err = clean_state_log_if_needed (mctx, dest_idx);
2506       if (BE (err != REG_NOERROR, 0))
2507         return err;
2508 #ifdef DEBUG
2509       assert (dfa->nexts[cur_node_idx] != -1);
2510 #endif
2511       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2512
2513       dest_state = mctx->state_log[dest_idx];
2514       if (dest_state == NULL)
2515         dest_nodes = *new_nodes;
2516       else
2517         {
2518           err = re_node_set_init_union (&dest_nodes,
2519                                         dest_state->entrance_nodes, new_nodes);
2520           if (BE (err != REG_NOERROR, 0))
2521             return err;
2522         }
2523       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2524       mctx->state_log[dest_idx]
2525         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2526       if (dest_state != NULL)
2527         re_node_set_free (&dest_nodes);
2528       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2529         return err;
2530     }
2531   return REG_NOERROR;
2532 }
2533 #endif /* RE_ENABLE_I18N */
2534
2535 static reg_errcode_t
2536 transit_state_bkref (mctx, nodes)
2537     re_match_context_t *mctx;
2538     const re_node_set *nodes;
2539 {
2540   re_dfa_t *const dfa = mctx->dfa;
2541   reg_errcode_t err;
2542   int i;
2543   int cur_str_idx = re_string_cur_idx (&mctx->input);
2544
2545   for (i = 0; i < nodes->nelem; ++i)
2546     {
2547       int dest_str_idx, prev_nelem, bkc_idx;
2548       int node_idx = nodes->elems[i];
2549       unsigned int context;
2550       const re_token_t *node = dfa->nodes + node_idx;
2551       re_node_set *new_dest_nodes;
2552
2553       /* Check whether `node' is a backreference or not.  */
2554       if (node->type != OP_BACK_REF)
2555         continue;
2556
2557       if (node->constraint)
2558         {
2559           context = re_string_context_at (&mctx->input, cur_str_idx,
2560                                           mctx->eflags);
2561           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2562             continue;
2563         }
2564
2565       /* `node' is a backreference.
2566          Check the substring which the substring matched.  */
2567       bkc_idx = mctx->nbkref_ents;
2568       err = get_subexp (mctx, node_idx, cur_str_idx);
2569       if (BE (err != REG_NOERROR, 0))
2570         goto free_return;
2571
2572       /* And add the epsilon closures (which is `new_dest_nodes') of
2573          the backreference to appropriate state_log.  */
2574 #ifdef DEBUG
2575       assert (dfa->nexts[node_idx] != -1);
2576 #endif
2577       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2578         {
2579           int subexp_len;
2580           re_dfastate_t *dest_state;
2581           struct re_backref_cache_entry *bkref_ent;
2582           bkref_ent = mctx->bkref_ents + bkc_idx;
2583           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2584             continue;
2585           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2586           new_dest_nodes = (subexp_len == 0
2587                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2588                             : dfa->eclosures + dfa->nexts[node_idx]);
2589           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2590                           - bkref_ent->subexp_from);
2591           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2592                                           mctx->eflags);
2593           dest_state = mctx->state_log[dest_str_idx];
2594           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2595                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2596           /* Add `new_dest_node' to state_log.  */
2597           if (dest_state == NULL)
2598             {
2599               mctx->state_log[dest_str_idx]
2600                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2601                                             context);
2602               if (BE (mctx->state_log[dest_str_idx] == NULL
2603                       && err != REG_NOERROR, 0))
2604                 goto free_return;
2605             }
2606           else
2607             {
2608               re_node_set dest_nodes;
2609               err = re_node_set_init_union (&dest_nodes,
2610                                             dest_state->entrance_nodes,
2611                                             new_dest_nodes);
2612               if (BE (err != REG_NOERROR, 0))
2613                 {
2614                   re_node_set_free (&dest_nodes);
2615                   goto free_return;
2616                 }
2617               mctx->state_log[dest_str_idx]
2618                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2619               re_node_set_free (&dest_nodes);
2620               if (BE (mctx->state_log[dest_str_idx] == NULL
2621                       && err != REG_NOERROR, 0))
2622                 goto free_return;
2623             }
2624           /* We need to check recursively if the backreference can epsilon
2625              transit.  */
2626           if (subexp_len == 0
2627               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2628             {
2629               err = check_subexp_matching_top (mctx, new_dest_nodes,
2630                                                cur_str_idx);
2631               if (BE (err != REG_NOERROR, 0))
2632                 goto free_return;
2633               err = transit_state_bkref (mctx, new_dest_nodes);
2634               if (BE (err != REG_NOERROR, 0))
2635                 goto free_return;
2636             }
2637         }
2638     }
2639   err = REG_NOERROR;
2640  free_return:
2641   return err;
2642 }
2643
2644 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2645    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2646    Note that we might collect inappropriate candidates here.
2647    However, the cost of checking them strictly here is too high, then we
2648    delay these checking for prune_impossible_nodes().  */
2649
2650 static reg_errcode_t
2651 get_subexp (mctx, bkref_node, bkref_str_idx)
2652      re_match_context_t *mctx;
2653      int bkref_node, bkref_str_idx;
2654 {
2655   re_dfa_t *const dfa = mctx->dfa;
2656   int subexp_num, sub_top_idx;
2657   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2658   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2659   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2660   if (cache_idx != -1)
2661     {
2662       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2663       do
2664         if (entry->node == bkref_node)
2665           return REG_NOERROR; /* We already checked it.  */
2666       while (entry++->more);
2667     }
2668
2669   subexp_num = dfa->nodes[bkref_node].opr.idx;
2670
2671   /* For each sub expression  */
2672   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2673     {
2674       reg_errcode_t err;
2675       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2676       re_sub_match_last_t *sub_last;
2677       int sub_last_idx, sl_str, bkref_str_off;
2678
2679       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2680         continue; /* It isn't related.  */
2681
2682       sl_str = sub_top->str_idx;
2683       bkref_str_off = bkref_str_idx;
2684       /* At first, check the last node of sub expressions we already
2685          evaluated.  */
2686       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2687         {
2688           int sl_str_diff;
2689           sub_last = sub_top->lasts[sub_last_idx];
2690           sl_str_diff = sub_last->str_idx - sl_str;
2691           /* The matched string by the sub expression match with the substring
2692              at the back reference?  */
2693           if (sl_str_diff > 0)
2694             {
2695               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2696                 {
2697                   /* Not enough chars for a successful match.  */
2698                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2699                     break;
2700
2701                   err = clean_state_log_if_needed (mctx,
2702                                                    bkref_str_off
2703                                                    + sl_str_diff);
2704                   if (BE (err != REG_NOERROR, 0))
2705                     return err;
2706                   buf = (const char *) re_string_get_buffer (&mctx->input);
2707                 }
2708               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2709                 break; /* We don't need to search this sub expression any more.  */
2710             }
2711           bkref_str_off += sl_str_diff;
2712           sl_str += sl_str_diff;
2713           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2714                                 bkref_str_idx);
2715
2716           /* Reload buf, since the preceding call might have reallocated
2717              the buffer.  */
2718           buf = (const char *) re_string_get_buffer (&mctx->input);
2719
2720           if (err == REG_NOMATCH)
2721             continue;
2722           if (BE (err != REG_NOERROR, 0))
2723             return err;
2724         }
2725
2726       if (sub_last_idx < sub_top->nlasts)
2727         continue;
2728       if (sub_last_idx > 0)
2729         ++sl_str;
2730       /* Then, search for the other last nodes of the sub expression.  */
2731       for (; sl_str <= bkref_str_idx; ++sl_str)
2732         {
2733           int cls_node, sl_str_off;
2734           const re_node_set *nodes;
2735           sl_str_off = sl_str - sub_top->str_idx;
2736           /* The matched string by the sub expression match with the substring
2737              at the back reference?  */
2738           if (sl_str_off > 0)
2739             {
2740               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2741                 {
2742                   /* If we are at the end of the input, we cannot match.  */
2743                   if (bkref_str_off >= mctx->input.len)
2744                     break;
2745
2746                   err = extend_buffers (mctx);
2747                   if (BE (err != REG_NOERROR, 0))
2748                     return err;
2749
2750                   buf = (const char *) re_string_get_buffer (&mctx->input);
2751                 }
2752               if (buf [bkref_str_off++] != buf[sl_str - 1])
2753                 break; /* We don't need to search this sub expression
2754                           any more.  */
2755             }
2756           if (mctx->state_log[sl_str] == NULL)
2757             continue;
2758           /* Does this state have a ')' of the sub expression?  */
2759           nodes = &mctx->state_log[sl_str]->nodes;
2760           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2761           if (cls_node == -1)
2762             continue; /* No.  */
2763           if (sub_top->path == NULL)
2764             {
2765               sub_top->path = calloc (sizeof (state_array_t),
2766                                       sl_str - sub_top->str_idx + 1);
2767               if (sub_top->path == NULL)
2768                 return REG_ESPACE;
2769             }
2770           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2771              in the current context?  */
2772           err = check_arrival (mctx, sub_top->path, sub_top->node,
2773                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2774           if (err == REG_NOMATCH)
2775               continue;
2776           if (BE (err != REG_NOERROR, 0))
2777               return err;
2778           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2779           if (BE (sub_last == NULL, 0))
2780             return REG_ESPACE;
2781           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2782                                 bkref_str_idx);
2783           if (err == REG_NOMATCH)
2784             continue;
2785         }
2786     }
2787   return REG_NOERROR;
2788 }
2789
2790 /* Helper functions for get_subexp().  */
2791
2792 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2793    If it can arrive, register the sub expression expressed with SUB_TOP
2794    and SUB_LAST.  */
2795
2796 static reg_errcode_t
2797 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2798      re_match_context_t *mctx;
2799      const re_sub_match_top_t *sub_top;
2800      re_sub_match_last_t *sub_last;
2801      int bkref_node, bkref_str;
2802 {
2803   reg_errcode_t err;
2804   int to_idx;
2805   /* Can the subexpression arrive the back reference?  */
2806   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2807                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2808   if (err != REG_NOERROR)
2809     return err;
2810   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2811                              sub_last->str_idx);
2812   if (BE (err != REG_NOERROR, 0))
2813     return err;
2814   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2815   return clean_state_log_if_needed (mctx, to_idx);
2816 }
2817
2818 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2819    Search '(' if FL_OPEN, or search ')' otherwise.
2820    TODO: This function isn't efficient...
2821          Because there might be more than one nodes whose types are
2822          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2823          nodes.
2824          E.g. RE: (a){2}  */
2825
2826 static int
2827 find_subexp_node (dfa, nodes, subexp_idx, type)
2828      const re_dfa_t *dfa;
2829      const re_node_set *nodes;
2830      int subexp_idx, type;
2831 {
2832   int cls_idx;
2833   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2834     {
2835       int cls_node = nodes->elems[cls_idx];
2836       const re_token_t *node = dfa->nodes + cls_node;
2837       if (node->type == type
2838           && node->opr.idx == subexp_idx)
2839         return cls_node;
2840     }
2841   return -1;
2842 }
2843
2844 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2845    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2846    heavily reused.
2847    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2848
2849 static reg_errcode_t
2850 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2851                type)
2852      re_match_context_t *mctx;
2853      state_array_t *path;
2854      int top_node, top_str, last_node, last_str, type;
2855 {
2856   re_dfa_t *const dfa = mctx->dfa;
2857   reg_errcode_t err;
2858   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2859   re_dfastate_t *cur_state = NULL;
2860   re_node_set *cur_nodes, next_nodes;
2861   re_dfastate_t **backup_state_log;
2862   unsigned int context;
2863
2864   subexp_num = dfa->nodes[top_node].opr.idx;
2865   /* Extend the buffer if we need.  */
2866   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2867     {
2868       re_dfastate_t **new_array;
2869       int old_alloc = path->alloc;
2870       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2871       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2872       if (new_array == NULL)
2873         {
2874           path->alloc = old_alloc;
2875           return REG_ESPACE;
2876         }
2877       path->array = new_array;
2878       memset (new_array + old_alloc, '\0',
2879               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2880     }
2881
2882   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2883
2884   /* Temporary modify MCTX.  */
2885   backup_state_log = mctx->state_log;
2886   backup_cur_idx = mctx->input.cur_idx;
2887   mctx->state_log = path->array;
2888   mctx->input.cur_idx = str_idx;
2889
2890   /* Setup initial node set.  */
2891   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2892   if (str_idx == top_str)
2893     {
2894       err = re_node_set_init_1 (&next_nodes, top_node);
2895       if (BE (err != REG_NOERROR, 0))
2896         return err;
2897       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2898       if (BE (err != REG_NOERROR, 0))
2899         {
2900           re_node_set_free (&next_nodes);
2901           return err;
2902         }
2903     }
2904   else
2905     {
2906       cur_state = mctx->state_log[str_idx];
2907       if (cur_state && cur_state->has_backref)
2908         {
2909           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2910           if (BE ( err != REG_NOERROR, 0))
2911             return err;
2912         }
2913       else
2914         re_node_set_init_empty (&next_nodes);
2915     }
2916   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2917     {
2918       if (next_nodes.nelem)
2919         {
2920           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2921                                     subexp_num, type);
2922           if (BE ( err != REG_NOERROR, 0))
2923             {
2924               re_node_set_free (&next_nodes);
2925               return err;
2926             }
2927         }
2928       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2929       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2930         {
2931           re_node_set_free (&next_nodes);
2932           return err;
2933         }
2934       mctx->state_log[str_idx] = cur_state;
2935     }
2936
2937   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2938     {
2939       re_node_set_empty (&next_nodes);
2940       if (mctx->state_log[str_idx + 1])
2941         {
2942           err = re_node_set_merge (&next_nodes,
2943                                    &mctx->state_log[str_idx + 1]->nodes);
2944           if (BE (err != REG_NOERROR, 0))
2945             {
2946               re_node_set_free (&next_nodes);
2947               return err;
2948             }
2949         }
2950       if (cur_state)
2951         {
2952           err = check_arrival_add_next_nodes (mctx, str_idx,
2953                                               &cur_state->non_eps_nodes, &next_nodes);
2954           if (BE (err != REG_NOERROR, 0))
2955             {
2956               re_node_set_free (&next_nodes);
2957               return err;
2958             }
2959         }
2960       ++str_idx;
2961       if (next_nodes.nelem)
2962         {
2963           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2964           if (BE (err != REG_NOERROR, 0))
2965             {
2966               re_node_set_free (&next_nodes);
2967               return err;
2968             }
2969           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2970                                     subexp_num, type);
2971           if (BE ( err != REG_NOERROR, 0))
2972             {
2973               re_node_set_free (&next_nodes);
2974               return err;
2975             }
2976         }
2977       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2978       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2979       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2980         {
2981           re_node_set_free (&next_nodes);
2982           return err;
2983         }
2984       mctx->state_log[str_idx] = cur_state;
2985       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2986     }
2987   re_node_set_free (&next_nodes);
2988   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2989                : &mctx->state_log[last_str]->nodes);
2990   path->next_idx = str_idx;
2991
2992   /* Fix MCTX.  */
2993   mctx->state_log = backup_state_log;
2994   mctx->input.cur_idx = backup_cur_idx;
2995
2996   /* Then check the current node set has the node LAST_NODE.  */
2997   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2998     return REG_NOERROR;
2999
3000   return REG_NOMATCH;
3001 }
3002
3003 /* Helper functions for check_arrival.  */
3004
3005 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3006    to NEXT_NODES.
3007    TODO: This function is similar to the functions transit_state*(),
3008          however this function has many additional works.
3009          Can't we unify them?  */
3010
3011 static reg_errcode_t
3012 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3013      re_match_context_t *mctx;
3014      int str_idx;
3015      re_node_set *cur_nodes, *next_nodes;
3016 {
3017   re_dfa_t *const dfa = mctx->dfa;
3018   int result;
3019   int cur_idx;
3020   reg_errcode_t err;
3021   re_node_set union_set;
3022   re_node_set_init_empty (&union_set);
3023   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3024     {
3025       int naccepted = 0;
3026       int cur_node = cur_nodes->elems[cur_idx];
3027 #ifdef DEBUG
3028       re_token_type_t type = dfa->nodes[cur_node].type;
3029       assert (!IS_EPSILON_NODE (type));
3030 #endif
3031 #ifdef RE_ENABLE_I18N
3032       /* If the node may accept `multi byte'.  */
3033       if (dfa->nodes[cur_node].accept_mb)
3034         {
3035           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3036                                                str_idx);
3037           if (naccepted > 1)
3038             {
3039               re_dfastate_t *dest_state;
3040               int next_node = dfa->nexts[cur_node];
3041               int next_idx = str_idx + naccepted;
3042               dest_state = mctx->state_log[next_idx];
3043               re_node_set_empty (&union_set);
3044               if (dest_state)
3045                 {
3046                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3047                   if (BE (err != REG_NOERROR, 0))
3048                     {
3049                       re_node_set_free (&union_set);
3050                       return err;
3051                     }
3052                 }
3053               result = re_node_set_insert (&union_set, next_node);
3054               if (BE (result < 0, 0))
3055                 {
3056                   re_node_set_free (&union_set);
3057                   return REG_ESPACE;
3058                 }
3059               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3060                                                             &union_set);
3061               if (BE (mctx->state_log[next_idx] == NULL
3062                       && err != REG_NOERROR, 0))
3063                 {
3064                   re_node_set_free (&union_set);
3065                   return err;
3066                 }
3067             }
3068         }
3069 #endif /* RE_ENABLE_I18N */
3070       if (naccepted
3071           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3072         {
3073           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3074           if (BE (result < 0, 0))
3075             {
3076               re_node_set_free (&union_set);
3077               return REG_ESPACE;
3078             }
3079         }
3080     }
3081   re_node_set_free (&union_set);
3082   return REG_NOERROR;
3083 }
3084
3085 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3086    CUR_NODES, however exclude the nodes which are:
3087     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3088     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3089 */
3090
3091 static reg_errcode_t
3092 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3093      re_dfa_t *dfa;
3094      re_node_set *cur_nodes;
3095      int ex_subexp, type;
3096 {
3097   reg_errcode_t err;
3098   int idx, outside_node;
3099   re_node_set new_nodes;
3100 #ifdef DEBUG
3101   assert (cur_nodes->nelem);
3102 #endif
3103   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3104   if (BE (err != REG_NOERROR, 0))
3105     return err;
3106   /* Create a new node set NEW_NODES with the nodes which are epsilon
3107      closures of the node in CUR_NODES.  */
3108
3109   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3110     {
3111       int cur_node = cur_nodes->elems[idx];
3112       re_node_set *eclosure = dfa->eclosures + cur_node;
3113       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3114       if (outside_node == -1)
3115         {
3116           /* There are no problematic nodes, just merge them.  */
3117           err = re_node_set_merge (&new_nodes, eclosure);
3118           if (BE (err != REG_NOERROR, 0))
3119             {
3120               re_node_set_free (&new_nodes);
3121               return err;
3122             }
3123         }
3124       else
3125         {
3126           /* There are problematic nodes, re-calculate incrementally.  */
3127           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3128                                               ex_subexp, type);
3129           if (BE (err != REG_NOERROR, 0))
3130             {
3131               re_node_set_free (&new_nodes);
3132               return err;
3133             }
3134         }
3135     }
3136   re_node_set_free (cur_nodes);
3137   *cur_nodes = new_nodes;
3138   return REG_NOERROR;
3139 }
3140
3141 /* Helper function for check_arrival_expand_ecl.
3142    Check incrementally the epsilon closure of TARGET, and if it isn't
3143    problematic append it to DST_NODES.  */
3144
3145 static reg_errcode_t
3146 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3147      re_dfa_t *dfa;
3148      int target, ex_subexp, type;
3149      re_node_set *dst_nodes;
3150 {
3151   int cur_node;
3152   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3153     {
3154       int err;
3155
3156       if (dfa->nodes[cur_node].type == type
3157           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3158         {
3159           if (type == OP_CLOSE_SUBEXP)
3160             {
3161               err = re_node_set_insert (dst_nodes, cur_node);
3162               if (BE (err == -1, 0))
3163                 return REG_ESPACE;
3164             }
3165           break;
3166         }
3167       err = re_node_set_insert (dst_nodes, cur_node);
3168       if (BE (err == -1, 0))
3169         return REG_ESPACE;
3170       if (dfa->edests[cur_node].nelem == 0)
3171         break;
3172       if (dfa->edests[cur_node].nelem == 2)
3173         {
3174           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3175                                               dfa->edests[cur_node].elems[1],
3176                                               ex_subexp, type);
3177           if (BE (err != REG_NOERROR, 0))
3178             return err;
3179         }
3180       cur_node = dfa->edests[cur_node].elems[0];
3181     }
3182   return REG_NOERROR;
3183 }
3184
3185
3186 /* For all the back references in the current state, calculate the
3187    destination of the back references by the appropriate entry
3188    in MCTX->BKREF_ENTS.  */
3189
3190 static reg_errcode_t
3191 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3192                     type)
3193      re_match_context_t *mctx;
3194      int cur_str, subexp_num, type;
3195      re_node_set *cur_nodes;
3196 {
3197   re_dfa_t *const dfa = mctx->dfa;
3198   reg_errcode_t err;
3199   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3200   struct re_backref_cache_entry *ent;
3201
3202   if (cache_idx_start == -1)
3203     return REG_NOERROR;
3204
3205  restart:
3206   ent = mctx->bkref_ents + cache_idx_start;
3207   do
3208     {
3209       int to_idx, next_node;
3210
3211       /* Is this entry ENT is appropriate?  */
3212       if (!re_node_set_contains (cur_nodes, ent->node))
3213         continue; /* No.  */
3214
3215       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3216       /* Calculate the destination of the back reference, and append it
3217          to MCTX->STATE_LOG.  */
3218       if (to_idx == cur_str)
3219         {
3220           /* The backreference did epsilon transit, we must re-check all the
3221              node in the current state.  */
3222           re_node_set new_dests;
3223           reg_errcode_t err2, err3;
3224           next_node = dfa->edests[ent->node].elems[0];
3225           if (re_node_set_contains (cur_nodes, next_node))
3226             continue;
3227           err = re_node_set_init_1 (&new_dests, next_node);
3228           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3229           err3 = re_node_set_merge (cur_nodes, &new_dests);
3230           re_node_set_free (&new_dests);
3231           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3232                   || err3 != REG_NOERROR, 0))
3233             {
3234               err = (err != REG_NOERROR ? err
3235                      : (err2 != REG_NOERROR ? err2 : err3));
3236               return err;
3237             }
3238           /* TODO: It is still inefficient...  */
3239           goto restart;
3240         }
3241       else
3242         {
3243           re_node_set union_set;
3244           next_node = dfa->nexts[ent->node];
3245           if (mctx->state_log[to_idx])
3246             {
3247               int ret;
3248               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3249                                         next_node))
3250                 continue;
3251               err = re_node_set_init_copy (&union_set,
3252                                            &mctx->state_log[to_idx]->nodes);
3253               ret = re_node_set_insert (&union_set, next_node);
3254               if (BE (err != REG_NOERROR || ret < 0, 0))
3255                 {
3256                   re_node_set_free (&union_set);
3257                   err = err != REG_NOERROR ? err : REG_ESPACE;
3258                   return err;
3259                 }
3260             }
3261           else
3262             {
3263               err = re_node_set_init_1 (&union_set, next_node);
3264               if (BE (err != REG_NOERROR, 0))
3265                 return err;
3266             }
3267           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3268           re_node_set_free (&union_set);
3269           if (BE (mctx->state_log[to_idx] == NULL
3270                   && err != REG_NOERROR, 0))
3271             return err;
3272         }
3273     }
3274   while (ent++->more);
3275   return REG_NOERROR;
3276 }
3277
3278 /* Build transition table for the state.
3279    Return 1 if succeeded, otherwise return NULL.  */
3280
3281 static int
3282 build_trtable (dfa, state)
3283     re_dfa_t *dfa;
3284     re_dfastate_t *state;
3285 {
3286   reg_errcode_t err;
3287   int i, j, ch, need_word_trtable = 0;
3288   unsigned int elem, mask;
3289   int dests_node_malloced = 0, dest_states_malloced = 0;
3290   int ndests; /* Number of the destination states from `state'.  */
3291   re_dfastate_t **trtable;
3292   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3293   re_node_set follows, *dests_node;
3294   bitset *dests_ch;
3295   bitset acceptable;
3296
3297   /* We build DFA states which corresponds to the destination nodes
3298      from `state'.  `dests_node[i]' represents the nodes which i-th
3299      destination state contains, and `dests_ch[i]' represents the
3300      characters which i-th destination state accepts.  */
3301 #ifdef _LIBC
3302   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3303     dests_node = (re_node_set *)
3304       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3305   else
3306 #endif
3307     {
3308       dests_node = (re_node_set *)
3309         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3310       if (BE (dests_node == NULL, 0))
3311         return 0;
3312       dests_node_malloced = 1;
3313     }
3314   dests_ch = (bitset *) (dests_node + SBC_MAX);
3315
3316   /* Initialize transiton table.  */
3317   state->word_trtable = state->trtable = NULL;
3318
3319   /* At first, group all nodes belonging to `state' into several
3320      destinations.  */
3321   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3322   if (BE (ndests <= 0, 0))
3323     {
3324       if (dests_node_malloced)
3325         free (dests_node);
3326       /* Return 0 in case of an error, 1 otherwise.  */
3327       if (ndests == 0)
3328         {
3329           state->trtable = (re_dfastate_t **)
3330             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3331           return 1;
3332         }
3333       return 0;
3334     }
3335
3336   err = re_node_set_alloc (&follows, ndests + 1);
3337   if (BE (err != REG_NOERROR, 0))
3338     goto out_free;
3339
3340 #ifdef _LIBC
3341   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3342                          + ndests * 3 * sizeof (re_dfastate_t *)))
3343     dest_states = (re_dfastate_t **)
3344       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3345   else
3346 #endif
3347     {
3348       dest_states = (re_dfastate_t **)
3349         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3350       if (BE (dest_states == NULL, 0))
3351         {
3352 out_free:
3353           if (dest_states_malloced)
3354             free (dest_states);
3355           re_node_set_free (&follows);
3356           for (i = 0; i < ndests; ++i)
3357             re_node_set_free (dests_node + i);
3358           if (dests_node_malloced)
3359             free (dests_node);
3360           return 0;
3361         }
3362       dest_states_malloced = 1;
3363     }
3364   dest_states_word = dest_states + ndests;
3365   dest_states_nl = dest_states_word + ndests;
3366   bitset_empty (acceptable);
3367
3368   /* Then build the states for all destinations.  */
3369   for (i = 0; i < ndests; ++i)
3370     {
3371       int next_node;
3372       re_node_set_empty (&follows);
3373       /* Merge the follows of this destination states.  */
3374       for (j = 0; j < dests_node[i].nelem; ++j)
3375         {
3376           next_node = dfa->nexts[dests_node[i].elems[j]];
3377           if (next_node != -1)
3378             {
3379               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3380               if (BE (err != REG_NOERROR, 0))
3381                 goto out_free;
3382             }
3383         }
3384       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3385       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3386         goto out_free;
3387       /* If the new state has context constraint,
3388          build appropriate states for these contexts.  */
3389       if (dest_states[i]->has_constraint)
3390         {
3391           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3392                                                           CONTEXT_WORD);
3393           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3394             goto out_free;
3395
3396           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3397             need_word_trtable = 1;
3398
3399           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3400                                                         CONTEXT_NEWLINE);
3401           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3402             goto out_free;
3403         }
3404       else
3405         {
3406           dest_states_word[i] = dest_states[i];
3407           dest_states_nl[i] = dest_states[i];
3408         }
3409       bitset_merge (acceptable, dests_ch[i]);
3410     }
3411
3412   if (!BE (need_word_trtable, 0))
3413     {
3414       /* We don't care about whether the following character is a word
3415          character, or we are in a single-byte character set so we can
3416          discern by looking at the character code: allocate a
3417          256-entry transition table.  */
3418       trtable = state->trtable =
3419         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3420       if (BE (trtable == NULL, 0))
3421         goto out_free;
3422
3423       /* For all characters ch...:  */
3424       for (i = 0; i < BITSET_UINTS; ++i)
3425         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3426              elem;
3427              mask <<= 1, elem >>= 1, ++ch)
3428           if (BE (elem & 1, 0))
3429             {
3430               /* There must be exactly one destination which accepts
3431                  character ch.  See group_nodes_into_DFAstates.  */
3432               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3433                 ;
3434
3435               /* j-th destination accepts the word character ch.  */
3436               if (dfa->word_char[i] & mask)
3437                 trtable[ch] = dest_states_word[j];
3438               else
3439                 trtable[ch] = dest_states[j];
3440             }
3441     }
3442   else
3443     {
3444       /* We care about whether the following character is a word
3445          character, and we are in a multi-byte character set: discern
3446          by looking at the character code: build two 256-entry
3447          transition tables, one starting at trtable[0] and one
3448          starting at trtable[SBC_MAX].  */
3449       trtable = state->word_trtable =
3450         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3451       if (BE (trtable == NULL, 0))
3452         goto out_free;
3453
3454       /* For all characters ch...:  */
3455       for (i = 0; i < BITSET_UINTS; ++i)
3456         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3457              elem;
3458              mask <<= 1, elem >>= 1, ++ch)
3459           if (BE (elem & 1, 0))
3460             {
3461               /* There must be exactly one destination which accepts
3462                  character ch.  See group_nodes_into_DFAstates.  */
3463               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3464                 ;
3465
3466               /* j-th destination accepts the word character ch.  */
3467               trtable[ch] = dest_states[j];
3468               trtable[ch + SBC_MAX] = dest_states_word[j];
3469             }
3470     }
3471
3472   /* new line */
3473   if (bitset_contain (acceptable, NEWLINE_CHAR))
3474     {
3475       /* The current state accepts newline character.  */
3476       for (j = 0; j < ndests; ++j)
3477         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3478           {
3479             /* k-th destination accepts newline character.  */
3480             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3481             if (need_word_trtable)
3482               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3483             /* There must be only one destination which accepts
3484                newline.  See group_nodes_into_DFAstates.  */
3485             break;
3486           }
3487     }
3488
3489   if (dest_states_malloced)
3490     free (dest_states);
3491
3492   re_node_set_free (&follows);
3493   for (i = 0; i < ndests; ++i)
3494     re_node_set_free (dests_node + i);
3495
3496   if (dests_node_malloced)
3497     free (dests_node);
3498
3499   return 1;
3500 }
3501
3502 /* Group all nodes belonging to STATE into several destinations.
3503    Then for all destinations, set the nodes belonging to the destination
3504    to DESTS_NODE[i] and set the characters accepted by the destination
3505    to DEST_CH[i].  This function return the number of destinations.  */
3506
3507 static int
3508 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3509     re_dfa_t *dfa;
3510     const re_dfastate_t *state;
3511     re_node_set *dests_node;
3512     bitset *dests_ch;
3513 {
3514   reg_errcode_t err;
3515   int result;
3516   int i, j, k;
3517   int ndests; /* Number of the destinations from `state'.  */
3518   bitset accepts; /* Characters a node can accept.  */
3519   const re_node_set *cur_nodes = &state->nodes;
3520   bitset_empty (accepts);
3521   ndests = 0;
3522
3523   /* For all the nodes belonging to `state',  */
3524   for (i = 0; i < cur_nodes->nelem; ++i)
3525     {
3526       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3527       re_token_type_t type = node->type;
3528       unsigned int constraint = node->constraint;
3529
3530       /* Enumerate all single byte character this node can accept.  */
3531       if (type == CHARACTER)
3532         bitset_set (accepts, node->opr.c);
3533       else if (type == SIMPLE_BRACKET)
3534         {
3535           bitset_merge (accepts, node->opr.sbcset);
3536         }
3537       else if (type == OP_PERIOD)
3538         {
3539 #ifdef RE_ENABLE_I18N
3540           if (dfa->mb_cur_max > 1)
3541             bitset_merge (accepts, dfa->sb_char);
3542           else
3543 #endif
3544             bitset_set_all (accepts);
3545           if (!(dfa->syntax & RE_DOT_NEWLINE))
3546             bitset_clear (accepts, '\n');
3547           if (dfa->syntax & RE_DOT_NOT_NULL)
3548             bitset_clear (accepts, '\0');
3549         }
3550 #ifdef RE_ENABLE_I18N
3551       else if (type == OP_UTF8_PERIOD)
3552         {
3553           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3554           if (!(dfa->syntax & RE_DOT_NEWLINE))
3555             bitset_clear (accepts, '\n');
3556           if (dfa->syntax & RE_DOT_NOT_NULL)
3557             bitset_clear (accepts, '\0');
3558         }
3559 #endif
3560       else
3561         continue;
3562
3563       /* Check the `accepts' and sift the characters which are not
3564          match it the context.  */
3565       if (constraint)
3566         {
3567           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3568             {
3569               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3570               bitset_empty (accepts);
3571               if (accepts_newline)
3572                 bitset_set (accepts, NEWLINE_CHAR);
3573               else
3574                 continue;
3575             }
3576           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3577             {
3578               bitset_empty (accepts);
3579               continue;
3580             }
3581
3582           if (constraint & NEXT_WORD_CONSTRAINT)
3583             {
3584               unsigned int any_set = 0;
3585               if (type == CHARACTER && !node->word_char)
3586                 {
3587                   bitset_empty (accepts);
3588                   continue;
3589                 }
3590 #ifdef RE_ENABLE_I18N
3591               if (dfa->mb_cur_max > 1)
3592                 for (j = 0; j < BITSET_UINTS; ++j)
3593                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3594               else
3595 #endif
3596                 for (j = 0; j < BITSET_UINTS; ++j)
3597                   any_set |= (accepts[j] &= dfa->word_char[j]);
3598               if (!any_set)
3599                 continue;
3600             }
3601           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3602             {
3603               unsigned int any_set = 0;
3604               if (type == CHARACTER && node->word_char)
3605                 {
3606                   bitset_empty (accepts);
3607                   continue;
3608                 }
3609 #ifdef RE_ENABLE_I18N
3610               if (dfa->mb_cur_max > 1)
3611                 for (j = 0; j < BITSET_UINTS; ++j)
3612                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3613               else
3614 #endif
3615                 for (j = 0; j < BITSET_UINTS; ++j)
3616                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3617               if (!any_set)
3618                 continue;
3619             }
3620         }
3621
3622       /* Then divide `accepts' into DFA states, or create a new
3623          state.  Above, we make sure that accepts is not empty.  */
3624       for (j = 0; j < ndests; ++j)
3625         {
3626           bitset intersec; /* Intersection sets, see below.  */
3627           bitset remains;
3628           /* Flags, see below.  */
3629           int has_intersec, not_subset, not_consumed;
3630
3631           /* Optimization, skip if this state doesn't accept the character.  */
3632           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3633             continue;
3634
3635           /* Enumerate the intersection set of this state and `accepts'.  */
3636           has_intersec = 0;
3637           for (k = 0; k < BITSET_UINTS; ++k)
3638             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3639           /* And skip if the intersection set is empty.  */
3640           if (!has_intersec)
3641             continue;
3642
3643           /* Then check if this state is a subset of `accepts'.  */
3644           not_subset = not_consumed = 0;
3645           for (k = 0; k < BITSET_UINTS; ++k)
3646             {
3647               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3648               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3649             }
3650
3651           /* If this state isn't a subset of `accepts', create a
3652              new group state, which has the `remains'. */
3653           if (not_subset)
3654             {
3655               bitset_copy (dests_ch[ndests], remains);
3656               bitset_copy (dests_ch[j], intersec);
3657               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3658               if (BE (err != REG_NOERROR, 0))
3659                 goto error_return;
3660               ++ndests;
3661             }
3662
3663           /* Put the position in the current group. */
3664           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3665           if (BE (result < 0, 0))
3666             goto error_return;
3667
3668           /* If all characters are consumed, go to next node. */
3669           if (!not_consumed)
3670             break;
3671         }
3672       /* Some characters remain, create a new group. */
3673       if (j == ndests)
3674         {
3675           bitset_copy (dests_ch[ndests], accepts);
3676           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3677           if (BE (err != REG_NOERROR, 0))
3678             goto error_return;
3679           ++ndests;
3680           bitset_empty (accepts);
3681         }
3682     }
3683   return ndests;
3684  error_return:
3685   for (j = 0; j < ndests; ++j)
3686     re_node_set_free (dests_node + j);
3687   return -1;
3688 }
3689
3690 #ifdef RE_ENABLE_I18N
3691 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3692    Return the number of the bytes the node accepts.
3693    STR_IDX is the current index of the input string.
3694
3695    This function handles the nodes which can accept one character, or
3696    one collating element like '.', '[a-z]', opposite to the other nodes
3697    can only accept one byte.  */
3698
3699 static int
3700 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3701     re_dfa_t *dfa;
3702     int node_idx, str_idx;
3703     const re_string_t *input;
3704 {
3705   const re_token_t *node = dfa->nodes + node_idx;
3706   int char_len, elem_len;
3707   int i;
3708
3709   if (BE (node->type == OP_UTF8_PERIOD, 0))
3710     {
3711       unsigned char c = re_string_byte_at (input, str_idx), d;
3712       if (BE (c < 0xc2, 1))
3713         return 0;
3714
3715       if (str_idx + 2 > input->len)
3716         return 0;
3717
3718       d = re_string_byte_at (input, str_idx + 1);
3719       if (c < 0xe0)
3720         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3721       else if (c < 0xf0)
3722         {
3723           char_len = 3;
3724           if (c == 0xe0 && d < 0xa0)
3725             return 0;
3726         }
3727       else if (c < 0xf8)
3728         {
3729           char_len = 4;
3730           if (c == 0xf0 && d < 0x90)
3731             return 0;
3732         }
3733       else if (c < 0xfc)
3734         {
3735           char_len = 5;
3736           if (c == 0xf8 && d < 0x88)
3737             return 0;
3738         }
3739       else if (c < 0xfe)
3740         {
3741           char_len = 6;
3742           if (c == 0xfc && d < 0x84)
3743             return 0;
3744         }
3745       else
3746         return 0;
3747
3748       if (str_idx + char_len > input->len)
3749         return 0;
3750
3751       for (i = 1; i < char_len; ++i)
3752         {
3753           d = re_string_byte_at (input, str_idx + i);
3754           if (d < 0x80 || d > 0xbf)
3755             return 0;
3756         }
3757       return char_len;
3758     }
3759
3760   char_len = re_string_char_size_at (input, str_idx);
3761   if (node->type == OP_PERIOD)
3762     {
3763       if (char_len <= 1)
3764         return 0;
3765       /* FIXME: I don't think this if is needed, as both '\n'
3766          and '\0' are char_len == 1.  */
3767       /* '.' accepts any one character except the following two cases.  */
3768       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3769            re_string_byte_at (input, str_idx) == '\n') ||
3770           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3771            re_string_byte_at (input, str_idx) == '\0'))
3772         return 0;
3773       return char_len;
3774     }
3775
3776   elem_len = re_string_elem_size_at (input, str_idx);
3777   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3778     return 0;
3779
3780   if (node->type == COMPLEX_BRACKET)
3781     {
3782       const re_charset_t *cset = node->opr.mbcset;
3783 # ifdef _LIBC
3784       const unsigned char *pin
3785         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3786       int j;
3787       uint32_t nrules;
3788 # endif /* _LIBC */
3789       int match_len = 0;
3790       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3791                     ? re_string_wchar_at (input, str_idx) : 0);
3792
3793       /* match with multibyte character?  */
3794       for (i = 0; i < cset->nmbchars; ++i)
3795         if (wc == cset->mbchars[i])
3796           {
3797             match_len = char_len;
3798             goto check_node_accept_bytes_match;
3799           }
3800       /* match with character_class?  */
3801       for (i = 0; i < cset->nchar_classes; ++i)
3802         {
3803           wctype_t wt = cset->char_classes[i];
3804           if (__iswctype (wc, wt))
3805             {
3806               match_len = char_len;
3807               goto check_node_accept_bytes_match;
3808             }
3809         }
3810
3811 # ifdef _LIBC
3812       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3813       if (nrules != 0)
3814         {
3815           unsigned int in_collseq = 0;
3816           const int32_t *table, *indirect;
3817           const unsigned char *weights, *extra;
3818           const char *collseqwc;
3819           int32_t idx;
3820           /* This #include defines a local function!  */
3821 #  include <locale/weight.h>
3822
3823           /* match with collating_symbol?  */
3824           if (cset->ncoll_syms)
3825             extra = (const unsigned char *)
3826               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3827           for (i = 0; i < cset->ncoll_syms; ++i)
3828             {
3829               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3830               /* Compare the length of input collating element and
3831                  the length of current collating element.  */
3832               if (*coll_sym != elem_len)
3833                 continue;
3834               /* Compare each bytes.  */
3835               for (j = 0; j < *coll_sym; j++)
3836                 if (pin[j] != coll_sym[1 + j])
3837                   break;
3838               if (j == *coll_sym)
3839                 {
3840                   /* Match if every bytes is equal.  */
3841                   match_len = j;
3842                   goto check_node_accept_bytes_match;
3843                 }
3844             }
3845
3846           if (cset->nranges)
3847             {
3848               if (elem_len <= char_len)
3849                 {
3850                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3851                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3852                 }
3853               else
3854                 in_collseq = find_collation_sequence_value (pin, elem_len);
3855             }
3856           /* match with range expression?  */
3857           for (i = 0; i < cset->nranges; ++i)
3858             if (cset->range_starts[i] <= in_collseq
3859                 && in_collseq <= cset->range_ends[i])
3860               {
3861                 match_len = elem_len;
3862                 goto check_node_accept_bytes_match;
3863               }
3864
3865           /* match with equivalence_class?  */
3866           if (cset->nequiv_classes)
3867             {
3868               const unsigned char *cp = pin;
3869               table = (const int32_t *)
3870                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3871               weights = (const unsigned char *)
3872                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3873               extra = (const unsigned char *)
3874                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3875               indirect = (const int32_t *)
3876                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3877               idx = findidx (&cp);
3878               if (idx > 0)
3879                 for (i = 0; i < cset->nequiv_classes; ++i)
3880                   {
3881                     int32_t equiv_class_idx = cset->equiv_classes[i];
3882                     size_t weight_len = weights[idx];
3883                     if (weight_len == weights[equiv_class_idx])
3884                       {
3885                         int cnt = 0;
3886                         while (cnt <= weight_len
3887                                && (weights[equiv_class_idx + 1 + cnt]
3888                                    == weights[idx + 1 + cnt]))
3889                           ++cnt;
3890                         if (cnt > weight_len)
3891                           {
3892                             match_len = elem_len;
3893                             goto check_node_accept_bytes_match;
3894                           }
3895                       }
3896                   }
3897             }
3898         }
3899       else
3900 # endif /* _LIBC */
3901         {
3902           /* match with range expression?  */
3903 #if __GNUC__ >= 2
3904           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3905 #else
3906           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3907           cmp_buf[2] = wc;
3908 #endif
3909           for (i = 0; i < cset->nranges; ++i)
3910             {
3911               cmp_buf[0] = cset->range_starts[i];
3912               cmp_buf[4] = cset->range_ends[i];
3913               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3914                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3915                 {
3916                   match_len = char_len;
3917                   goto check_node_accept_bytes_match;
3918                 }
3919             }
3920         }
3921     check_node_accept_bytes_match:
3922       if (!cset->non_match)
3923         return match_len;
3924       else
3925         {
3926           if (match_len > 0)
3927             return 0;
3928           else
3929             return (elem_len > char_len) ? elem_len : char_len;
3930         }
3931     }
3932   return 0;
3933 }
3934
3935 # ifdef _LIBC
3936 static unsigned int
3937 find_collation_sequence_value (mbs, mbs_len)
3938     const unsigned char *mbs;
3939     size_t mbs_len;
3940 {
3941   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3942   if (nrules == 0)
3943     {
3944       if (mbs_len == 1)
3945         {
3946           /* No valid character.  Match it as a single byte character.  */
3947           const unsigned char *collseq = (const unsigned char *)
3948             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3949           return collseq[mbs[0]];
3950         }
3951       return UINT_MAX;
3952     }
3953   else
3954     {
3955       int32_t idx;
3956       const unsigned char *extra = (const unsigned char *)
3957         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3958       int32_t extrasize = (const unsigned char *)
3959         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3960
3961       for (idx = 0; idx < extrasize;)
3962         {
3963           int mbs_cnt, found = 0;
3964           int32_t elem_mbs_len;
3965           /* Skip the name of collating element name.  */
3966           idx = idx + extra[idx] + 1;
3967           elem_mbs_len = extra[idx++];
3968           if (mbs_len == elem_mbs_len)
3969             {
3970               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3971                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3972                   break;
3973               if (mbs_cnt == elem_mbs_len)
3974                 /* Found the entry.  */
3975                 found = 1;
3976             }
3977           /* Skip the byte sequence of the collating element.  */
3978           idx += elem_mbs_len;
3979           /* Adjust for the alignment.  */
3980           idx = (idx + 3) & ~3;
3981           /* Skip the collation sequence value.  */
3982           idx += sizeof (uint32_t);
3983           /* Skip the wide char sequence of the collating element.  */
3984           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3985           /* If we found the entry, return the sequence value.  */
3986           if (found)
3987             return *(uint32_t *) (extra + idx);
3988           /* Skip the collation sequence value.  */
3989           idx += sizeof (uint32_t);
3990         }
3991       return UINT_MAX;
3992     }
3993 }
3994 # endif /* _LIBC */
3995 #endif /* RE_ENABLE_I18N */
3996
3997 /* Check whether the node accepts the byte which is IDX-th
3998    byte of the INPUT.  */
3999
4000 static int
4001 check_node_accept (mctx, node, idx)
4002     const re_match_context_t *mctx;
4003     const re_token_t *node;
4004     int idx;
4005 {
4006   unsigned char ch;
4007   ch = re_string_byte_at (&mctx->input, idx);
4008   switch (node->type)
4009     {
4010     case CHARACTER:
4011       if (node->opr.c != ch)
4012         return 0;
4013       break;
4014
4015     case SIMPLE_BRACKET:
4016       if (!bitset_contain (node->opr.sbcset, ch))
4017         return 0;
4018       break;
4019
4020 #ifdef RE_ENABLE_I18N
4021     case OP_UTF8_PERIOD:
4022       if (ch >= 0x80)
4023         return 0;
4024       /* FALLTHROUGH */
4025 #endif
4026     case OP_PERIOD:
4027       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4028           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4029         return 0;
4030       break;
4031
4032     default:
4033       return 0;
4034     }
4035
4036   if (node->constraint)
4037     {
4038       /* The node has constraints.  Check whether the current context
4039          satisfies the constraints.  */
4040       unsigned int context = re_string_context_at (&mctx->input, idx,
4041                                                    mctx->eflags);
4042       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4043         return 0;
4044     }
4045
4046   return 1;
4047 }
4048
4049 /* Extend the buffers, if the buffers have run out.  */
4050
4051 static reg_errcode_t
4052 extend_buffers (mctx)
4053      re_match_context_t *mctx;
4054 {
4055   reg_errcode_t ret;
4056   re_string_t *pstr = &mctx->input;
4057
4058   /* Double the lengthes of the buffers.  */
4059   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4060   if (BE (ret != REG_NOERROR, 0))
4061     return ret;
4062
4063   if (mctx->state_log != NULL)
4064     {
4065       /* And double the length of state_log.  */
4066       /* XXX We have no indication of the size of this buffer.  If this
4067          allocation fail we have no indication that the state_log array
4068          does not have the right size.  */
4069       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4070                                               pstr->bufs_len + 1);
4071       if (BE (new_array == NULL, 0))
4072         return REG_ESPACE;
4073       mctx->state_log = new_array;
4074     }
4075
4076   /* Then reconstruct the buffers.  */
4077   if (pstr->icase)
4078     {
4079 #ifdef RE_ENABLE_I18N
4080       if (pstr->mb_cur_max > 1)
4081         {
4082           ret = build_wcs_upper_buffer (pstr);
4083           if (BE (ret != REG_NOERROR, 0))
4084             return ret;
4085         }
4086       else
4087 #endif /* RE_ENABLE_I18N  */
4088         build_upper_buffer (pstr);
4089     }
4090   else
4091     {
4092 #ifdef RE_ENABLE_I18N
4093       if (pstr->mb_cur_max > 1)
4094         build_wcs_buffer (pstr);
4095       else
4096 #endif /* RE_ENABLE_I18N  */
4097         {
4098           if (pstr->trans != NULL)
4099             re_string_translate_buffer (pstr);
4100         }
4101     }
4102   return REG_NOERROR;
4103 }
4104
4105 \f
4106 /* Functions for matching context.  */
4107
4108 /* Initialize MCTX.  */
4109
4110 static reg_errcode_t
4111 match_ctx_init (mctx, eflags, n)
4112     re_match_context_t *mctx;
4113     int eflags, n;
4114 {
4115   mctx->eflags = eflags;
4116   mctx->match_last = -1;
4117   if (n > 0)
4118     {
4119       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4120       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4121       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4122         return REG_ESPACE;
4123     }
4124   /* Already zero-ed by the caller.
4125      else
4126        mctx->bkref_ents = NULL;
4127      mctx->nbkref_ents = 0;
4128      mctx->nsub_tops = 0;  */
4129   mctx->abkref_ents = n;
4130   mctx->max_mb_elem_len = 1;
4131   mctx->asub_tops = n;
4132   return REG_NOERROR;
4133 }
4134
4135 /* Clean the entries which depend on the current input in MCTX.
4136    This function must be invoked when the matcher changes the start index
4137    of the input, or changes the input string.  */
4138
4139 static void
4140 match_ctx_clean (mctx)
4141     re_match_context_t *mctx;
4142 {
4143   int st_idx;
4144   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4145     {
4146       int sl_idx;
4147       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4148       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4149         {
4150           re_sub_match_last_t *last = top->lasts[sl_idx];
4151           re_free (last->path.array);
4152           re_free (last);
4153         }
4154       re_free (top->lasts);
4155       if (top->path)
4156         {
4157           re_free (top->path->array);
4158           re_free (top->path);
4159         }
4160       free (top);
4161     }
4162
4163   mctx->nsub_tops = 0;
4164   mctx->nbkref_ents = 0;
4165 }
4166
4167 /* Free all the memory associated with MCTX.  */
4168
4169 static void
4170 match_ctx_free (mctx)
4171     re_match_context_t *mctx;
4172 {
4173   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4174   match_ctx_clean (mctx);
4175   re_free (mctx->sub_tops);
4176   re_free (mctx->bkref_ents);
4177 }
4178
4179 /* Add a new backreference entry to MCTX.
4180    Note that we assume that caller never call this function with duplicate
4181    entry, and call with STR_IDX which isn't smaller than any existing entry.
4182 */
4183
4184 static reg_errcode_t
4185 match_ctx_add_entry (mctx, node, str_idx, from, to)
4186      re_match_context_t *mctx;
4187      int node, str_idx, from, to;
4188 {
4189   if (mctx->nbkref_ents >= mctx->abkref_ents)
4190     {
4191       struct re_backref_cache_entry* new_entry;
4192       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4193                               mctx->abkref_ents * 2);
4194       if (BE (new_entry == NULL, 0))
4195         {
4196           re_free (mctx->bkref_ents);
4197           return REG_ESPACE;
4198         }
4199       mctx->bkref_ents = new_entry;
4200       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4201               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4202       mctx->abkref_ents *= 2;
4203     }
4204   if (mctx->nbkref_ents > 0
4205       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4206     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4207
4208   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4209   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4210   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4211   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4212
4213   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4214      If bit N is clear, means that this entry won't epsilon-transition to
4215      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4216      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4217      such node.
4218
4219      A backreference does not epsilon-transition unless it is empty, so set
4220      to all zeros if FROM != TO.  */
4221   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4222     = (from == to ? ~0 : 0);
4223
4224   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4225   if (mctx->max_mb_elem_len < to - from)
4226     mctx->max_mb_elem_len = to - from;
4227   return REG_NOERROR;
4228 }
4229
4230 /* Search for the first entry which has the same str_idx, or -1 if none is
4231    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4232
4233 static int
4234 search_cur_bkref_entry (mctx, str_idx)
4235      re_match_context_t *mctx;
4236      int str_idx;
4237 {
4238   int left, right, mid, last;
4239   last = right = mctx->nbkref_ents;
4240   for (left = 0; left < right;)
4241     {
4242       mid = (left + right) / 2;
4243       if (mctx->bkref_ents[mid].str_idx < str_idx)
4244         left = mid + 1;
4245       else
4246         right = mid;
4247     }
4248   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4249     return left;
4250   else
4251     return -1;
4252 }
4253
4254 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4255    at STR_IDX.  */
4256
4257 static reg_errcode_t
4258 match_ctx_add_subtop (mctx, node, str_idx)
4259      re_match_context_t *mctx;
4260      int node, str_idx;
4261 {
4262 #ifdef DEBUG
4263   assert (mctx->sub_tops != NULL);
4264   assert (mctx->asub_tops > 0);
4265 #endif
4266   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4267     {
4268       int new_asub_tops = mctx->asub_tops * 2;
4269       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4270                                                    re_sub_match_top_t *,
4271                                                    new_asub_tops);
4272       if (BE (new_array == NULL, 0))
4273         return REG_ESPACE;
4274       mctx->sub_tops = new_array;
4275       mctx->asub_tops = new_asub_tops;
4276     }
4277   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4278   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4279     return REG_ESPACE;
4280   mctx->sub_tops[mctx->nsub_tops]->node = node;
4281   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4282   return REG_NOERROR;
4283 }
4284
4285 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4286    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4287
4288 static re_sub_match_last_t *
4289 match_ctx_add_sublast (subtop, node, str_idx)
4290      re_sub_match_top_t *subtop;
4291      int node, str_idx;
4292 {
4293   re_sub_match_last_t *new_entry;
4294   if (BE (subtop->nlasts == subtop->alasts, 0))
4295     {
4296       int new_alasts = 2 * subtop->alasts + 1;
4297       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4298                                                     re_sub_match_last_t *,
4299                                                     new_alasts);
4300       if (BE (new_array == NULL, 0))
4301         return NULL;
4302       subtop->lasts = new_array;
4303       subtop->alasts = new_alasts;
4304     }
4305   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4306   if (BE (new_entry != NULL, 1))
4307     {
4308       subtop->lasts[subtop->nlasts] = new_entry;
4309       new_entry->node = node;
4310       new_entry->str_idx = str_idx;
4311       ++subtop->nlasts;
4312     }
4313   return new_entry;
4314 }
4315
4316 static void
4317 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4318     re_sift_context_t *sctx;
4319     re_dfastate_t **sifted_sts, **limited_sts;
4320     int last_node, last_str_idx;
4321 {
4322   sctx->sifted_states = sifted_sts;
4323   sctx->limited_states = limited_sts;
4324   sctx->last_node = last_node;
4325   sctx->last_str_idx = last_str_idx;
4326   re_node_set_init_empty (&sctx->limits);
4327 }