1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 int *char_class_alloc,
97 const unsigned char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
213 re_compile_pattern (pattern, length, bufp)
216 struct re_pattern_buffer *bufp;
220 /* And GNU code determines whether or not to get register information
221 by passing null for the REGS argument to re_match, etc., not by
222 setting no_sub, unless RE_NO_SUB is set. */
223 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
225 /* Match anchors at newline. */
226 bufp->newline_anchor = 1;
228 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 weak_alias (__re_compile_pattern, re_compile_pattern)
238 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
254 re_set_syntax (syntax)
257 reg_syntax_t ret = re_syntax_options;
259 re_syntax_options = syntax;
263 weak_alias (__re_set_syntax, re_set_syntax)
267 re_compile_fastmap (bufp)
268 struct re_pattern_buffer *bufp;
270 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
271 char *fastmap = bufp->fastmap;
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
289 __attribute ((always_inline))
290 re_set_fastmap (char *fastmap, int icase, int ch)
294 fastmap[tolower (ch)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
306 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 int node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
312 if (type == CHARACTER)
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
323 *p++ = dfa->nodes[node].opr.c;
324 while (++node < dfa->nodes_len
325 && dfa->nodes[node].type == CHARACTER
326 && dfa->nodes[node].mb_partial)
327 *p++ = dfa->nodes[node].opr.c;
328 memset (&state, '\0', sizeof (state));
329 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 && (__wcrtomb ((char *) buf, towlower (wc), &state)
333 re_set_fastmap (fastmap, 0, buf[0]);
337 else if (type == SIMPLE_BRACKET)
340 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
344 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
345 if (w & ((bitset_word_t) 1 << j))
346 re_set_fastmap (fastmap, icase, ch);
349 #ifdef RE_ENABLE_I18N
350 else if (type == COMPLEX_BRACKET)
352 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
356 /* See if we have to try all bytes which start multiple collation
358 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359 collation element, and don't catch 'b' since 'b' is
360 the only collation element which starts from 'b' (and
361 it is caught by SIMPLE_BRACKET). */
362 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
363 && (cset->ncoll_syms || cset->nranges))
365 const int32_t *table = (const int32_t *)
366 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
367 for (i = 0; i < SBC_MAX; ++i)
369 re_set_fastmap (fastmap, icase, i);
373 /* See if we have to start the match at all multibyte characters,
374 i.e. where we would not find an invalid sequence. This only
375 applies to multibyte character sets; for single byte character
376 sets, the SIMPLE_BRACKET again suffices. */
377 if (dfa->mb_cur_max > 1
378 && (cset->nchar_classes || cset->non_match || cset->nranges
380 || cset->nequiv_classes
388 memset (&mbs, 0, sizeof (mbs));
389 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
390 re_set_fastmap (fastmap, false, (int) c);
397 /* ... Else catch all bytes which can start the mbchars. */
398 for (i = 0; i < cset->nmbchars; ++i)
402 memset (&state, '\0', sizeof (state));
403 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
404 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
405 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
409 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
414 #endif /* RE_ENABLE_I18N */
415 else if (type == OP_PERIOD
416 #ifdef RE_ENABLE_I18N
417 || type == OP_UTF8_PERIOD
418 #endif /* RE_ENABLE_I18N */
419 || type == END_OF_RE)
421 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
422 if (type == END_OF_RE)
423 bufp->can_be_null = 1;
429 /* Entry point for POSIX code. */
430 /* regcomp takes a regular expression as a string and compiles it.
432 PREG is a regex_t *. We do not expect any fields to be initialized,
433 since POSIX says we shouldn't. Thus, we set
435 `buffer' to the compiled pattern;
436 `used' to the length of the compiled pattern;
437 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438 REG_EXTENDED bit in CFLAGS is set; otherwise, to
439 RE_SYNTAX_POSIX_BASIC;
440 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441 `fastmap' to an allocated space for the fastmap;
442 `fastmap_accurate' to zero;
443 `re_nsub' to the number of subexpressions in PATTERN.
445 PATTERN is the address of the pattern string.
447 CFLAGS is a series of bits which affect compilation.
449 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450 use POSIX basic syntax.
452 If REG_NEWLINE is set, then . and [^...] don't match newline.
453 Also, regexec will try a match beginning after every newline.
455 If REG_ICASE is set, then we considers upper- and lowercase
456 versions of letters to be equivalent when matching.
458 If REG_NOSUB is set, then when PREG is passed to regexec, that
459 routine will report only success or failure, and nothing about the
462 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
463 the return codes and their meanings.) */
466 regcomp (preg, pattern, cflags)
467 regex_t *__restrict preg;
468 const char *__restrict pattern;
472 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
473 : RE_SYNTAX_POSIX_BASIC);
479 /* Try to allocate space for the fastmap. */
480 preg->fastmap = re_malloc (char, SBC_MAX);
481 if (BE (preg->fastmap == NULL, 0))
484 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
486 /* If REG_NEWLINE is set, newlines are treated differently. */
487 if (cflags & REG_NEWLINE)
488 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
489 syntax &= ~RE_DOT_NEWLINE;
490 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
491 /* It also changes the matching behavior. */
492 preg->newline_anchor = 1;
495 preg->newline_anchor = 0;
496 preg->no_sub = !!(cflags & REG_NOSUB);
497 preg->translate = NULL;
499 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
501 /* POSIX doesn't distinguish between an unmatched open-group and an
502 unmatched close-group: both are REG_EPAREN. */
503 if (ret == REG_ERPAREN)
506 /* We have already checked preg->fastmap != NULL. */
507 if (BE (ret == REG_NOERROR, 1))
508 /* Compute the fastmap now, since regexec cannot modify the pattern
509 buffer. This function never fails in this implementation. */
510 (void) re_compile_fastmap (preg);
513 /* Some error occurred while compiling the expression. */
514 re_free (preg->fastmap);
515 preg->fastmap = NULL;
521 weak_alias (__regcomp, regcomp)
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
528 regerror (errcode, preg, errbuf, errbuf_size)
530 const regex_t *__restrict preg;
531 char *__restrict errbuf;
538 || errcode >= (int) (sizeof (__re_error_msgid_idx)
539 / sizeof (__re_error_msgid_idx[0])), 0))
540 /* Only error codes returned by the rest of the code should be passed
541 to this routine. If we are given anything else, or if other regex
542 code generates an invalid error code, then the program has a bug.
543 Dump core so we can fix it. */
546 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
548 msg_size = strlen (msg) + 1; /* Includes the null. */
550 if (BE (errbuf_size != 0, 1))
552 if (BE (msg_size > errbuf_size, 0))
554 #if defined HAVE_MEMPCPY || defined _LIBC
555 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
557 memcpy (errbuf, msg, errbuf_size - 1);
558 errbuf[errbuf_size - 1] = 0;
562 memcpy (errbuf, msg, msg_size);
568 weak_alias (__regerror, regerror)
572 #ifdef RE_ENABLE_I18N
573 /* This static array is used for the map to single-byte characters when
574 UTF-8 is used. Otherwise we would allocate memory just to initialize
575 it the same all the time. UTF-8 is the preferred encoding so this is
576 a worthwhile optimization. */
577 static const bitset_t utf8_sb_map =
579 /* Set the first 128 bits. */
580 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
586 free_dfa_content (re_dfa_t *dfa)
591 for (i = 0; i < dfa->nodes_len; ++i)
592 free_token (dfa->nodes + i);
593 re_free (dfa->nexts);
594 for (i = 0; i < dfa->nodes_len; ++i)
596 if (dfa->eclosures != NULL)
597 re_node_set_free (dfa->eclosures + i);
598 if (dfa->inveclosures != NULL)
599 re_node_set_free (dfa->inveclosures + i);
600 if (dfa->edests != NULL)
601 re_node_set_free (dfa->edests + i);
603 re_free (dfa->edests);
604 re_free (dfa->eclosures);
605 re_free (dfa->inveclosures);
606 re_free (dfa->nodes);
608 if (dfa->state_table)
609 for (i = 0; i <= dfa->state_hash_mask; ++i)
611 struct re_state_table_entry *entry = dfa->state_table + i;
612 for (j = 0; j < entry->num; ++j)
614 re_dfastate_t *state = entry->array[j];
617 re_free (entry->array);
619 re_free (dfa->state_table);
620 #ifdef RE_ENABLE_I18N
621 if (dfa->sb_char != utf8_sb_map)
622 re_free (dfa->sb_char);
624 re_free (dfa->subexp_map);
626 re_free (dfa->re_str);
633 /* Free dynamically allocated space used by PREG. */
639 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
640 if (BE (dfa != NULL, 1))
641 free_dfa_content (dfa);
645 re_free (preg->fastmap);
646 preg->fastmap = NULL;
648 re_free (preg->translate);
649 preg->translate = NULL;
652 weak_alias (__regfree, regfree)
655 /* Entry points compatible with 4.2 BSD regex library. We don't define
656 them unless specifically requested. */
658 #if defined _REGEX_RE_COMP || defined _LIBC
660 /* BSD has one and only one pattern buffer. */
661 static struct re_pattern_buffer re_comp_buf;
665 /* Make these definitions weak in libc, so POSIX programs can redefine
666 these names if they don't use our functions, and still use
667 regcomp/regexec above without link errors. */
678 if (!re_comp_buf.buffer)
679 return gettext ("No previous regular expression");
683 if (re_comp_buf.buffer)
685 fastmap = re_comp_buf.fastmap;
686 re_comp_buf.fastmap = NULL;
687 __regfree (&re_comp_buf);
688 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
689 re_comp_buf.fastmap = fastmap;
692 if (re_comp_buf.fastmap == NULL)
694 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
695 if (re_comp_buf.fastmap == NULL)
696 return (char *) gettext (__re_error_msgid
697 + __re_error_msgid_idx[(int) REG_ESPACE]);
700 /* Since `re_exec' always passes NULL for the `regs' argument, we
701 don't need to initialize the pattern buffer fields which affect it. */
703 /* Match anchors at newlines. */
704 re_comp_buf.newline_anchor = 1;
706 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
711 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
712 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
716 libc_freeres_fn (free_mem)
718 __regfree (&re_comp_buf);
722 #endif /* _REGEX_RE_COMP */
724 /* Internal entry point.
725 Compile the regular expression PATTERN, whose length is LENGTH.
726 SYNTAX indicate regular expression's syntax. */
729 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
732 reg_errcode_t err = REG_NOERROR;
736 /* Initialize the pattern buffer. */
737 preg->fastmap_accurate = 0;
738 preg->syntax = syntax;
739 preg->not_bol = preg->not_eol = 0;
742 preg->can_be_null = 0;
743 preg->regs_allocated = REGS_UNALLOCATED;
745 /* Initialize the dfa. */
746 dfa = (re_dfa_t *) preg->buffer;
747 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
749 /* If zero allocated, but buffer is non-null, try to realloc
750 enough space. This loses if buffer's address is bogus, but
751 that is the user's responsibility. If ->buffer is NULL this
752 is a simple allocation. */
753 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756 preg->allocated = sizeof (re_dfa_t);
757 preg->buffer = (unsigned char *) dfa;
759 preg->used = sizeof (re_dfa_t);
761 err = init_dfa (dfa, length);
762 if (BE (err != REG_NOERROR, 0))
764 free_dfa_content (dfa);
770 /* Note: length+1 will not overflow since it is checked in init_dfa. */
771 dfa->re_str = re_malloc (char, length + 1);
772 strncpy (dfa->re_str, pattern, length + 1);
775 __libc_lock_init (dfa->lock);
777 err = re_string_construct (®exp, pattern, length, preg->translate,
778 syntax & RE_ICASE, dfa);
779 if (BE (err != REG_NOERROR, 0))
781 re_compile_internal_free_return:
782 free_workarea_compile (preg);
783 re_string_destruct (®exp);
784 free_dfa_content (dfa);
790 /* Parse the regular expression, and build a structure tree. */
792 dfa->str_tree = parse (®exp, preg, syntax, &err);
793 if (BE (dfa->str_tree == NULL, 0))
794 goto re_compile_internal_free_return;
796 /* Analyze the tree and create the nfa. */
797 err = analyze (preg);
798 if (BE (err != REG_NOERROR, 0))
799 goto re_compile_internal_free_return;
801 #ifdef RE_ENABLE_I18N
802 /* If possible, do searching in single byte encoding to speed things up. */
803 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
807 /* Then create the initial state of the dfa. */
808 err = create_initial_state (dfa);
810 /* Release work areas. */
811 free_workarea_compile (preg);
812 re_string_destruct (®exp);
814 if (BE (err != REG_NOERROR, 0))
816 free_dfa_content (dfa);
824 /* Initialize DFA. We use the length of the regular expression PAT_LEN
825 as the initial length of some arrays. */
828 init_dfa (re_dfa_t *dfa, size_t pat_len)
830 unsigned int table_size;
835 memset (dfa, '\0', sizeof (re_dfa_t));
837 /* Force allocation of str_tree_storage the first time. */
838 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
840 /* Avoid overflows. */
841 if (pat_len == SIZE_MAX)
844 dfa->nodes_alloc = pat_len + 1;
845 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
847 /* table_size = 2 ^ ceil(log pat_len) */
848 for (table_size = 1; ; table_size <<= 1)
849 if (table_size > pat_len)
852 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
853 dfa->state_hash_mask = table_size - 1;
855 dfa->mb_cur_max = MB_CUR_MAX;
857 if (dfa->mb_cur_max == 6
858 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
860 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
863 # ifdef HAVE_LANGINFO_CODESET
864 codeset_name = nl_langinfo (CODESET);
866 codeset_name = getenv ("LC_ALL");
867 if (codeset_name == NULL || codeset_name[0] == '\0')
868 codeset_name = getenv ("LC_CTYPE");
869 if (codeset_name == NULL || codeset_name[0] == '\0')
870 codeset_name = getenv ("LANG");
871 if (codeset_name == NULL)
873 else if (strchr (codeset_name, '.') != NULL)
874 codeset_name = strchr (codeset_name, '.') + 1;
877 if (strcasecmp (codeset_name, "UTF-8") == 0
878 || strcasecmp (codeset_name, "UTF8") == 0)
881 /* We check exhaustively in the loop below if this charset is a
882 superset of ASCII. */
883 dfa->map_notascii = 0;
886 #ifdef RE_ENABLE_I18N
887 if (dfa->mb_cur_max > 1)
890 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
895 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
896 if (BE (dfa->sb_char == NULL, 0))
899 /* Set the bits corresponding to single byte chars. */
900 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
903 wint_t wch = __btowc (ch);
905 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
907 if (isascii (ch) && wch != ch)
908 dfa->map_notascii = 1;
915 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
920 /* Initialize WORD_CHAR table, which indicate which character is
921 "word". In this case "word" means that it is the word construction
922 character used by some operators like "\<", "\>", etc. */
926 init_word_char (re_dfa_t *dfa)
928 dfa->word_ops_used = 1;
931 if (BE (dfa->map_notascii == 0, 1))
933 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
934 them, an issue when this code is used in Gnulib. */
935 bitset_word_t bits0 = 0x00000000;
936 bitset_word_t bits1 = 0x03ff0000;
937 bitset_word_t bits2 = 0x87fffffe;
938 bitset_word_t bits3 = 0x07fffffe;
940 if (BITSET_WORD_BITS == 64)
942 /* Pacify gcc -Woverflow on 32-bit platformns. */
943 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
944 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
947 else if (BITSET_WORD_BITS == 32)
949 dfa->word_char[0] = bits0;
950 dfa->word_char[1] = bits1;
951 dfa->word_char[2] = bits2;
952 dfa->word_char[3] = bits3;
959 if (BE (dfa->is_utf8, 1))
961 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
967 for (; i < BITSET_WORDS; ++i)
968 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
969 if (isalnum (ch) || ch == '_')
970 dfa->word_char[i] |= (bitset_word_t) 1 << j;
973 /* Free the work area which are only used while compiling. */
976 free_workarea_compile (regex_t *preg)
978 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
979 bin_tree_storage_t *storage, *next;
980 for (storage = dfa->str_tree_storage; storage; storage = next)
982 next = storage->next;
985 dfa->str_tree_storage = NULL;
986 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
987 dfa->str_tree = NULL;
988 re_free (dfa->org_indices);
989 dfa->org_indices = NULL;
992 /* Create initial states for all contexts. */
995 create_initial_state (re_dfa_t *dfa)
999 re_node_set init_nodes;
1001 /* Initial states have the epsilon closure of the node which is
1002 the first node of the regular expression. */
1003 first = dfa->str_tree->first->node_idx;
1004 dfa->init_node = first;
1005 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1006 if (BE (err != REG_NOERROR, 0))
1009 /* The back-references which are in initial states can epsilon transit,
1010 since in this case all of the subexpressions can be null.
1011 Then we add epsilon closures of the nodes which are the next nodes of
1012 the back-references. */
1013 if (dfa->nbackref > 0)
1014 for (i = 0; i < init_nodes.nelem; ++i)
1016 int node_idx = init_nodes.elems[i];
1017 re_token_type_t type = dfa->nodes[node_idx].type;
1020 if (type != OP_BACK_REF)
1022 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1024 re_token_t *clexp_node;
1025 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1026 if (clexp_node->type == OP_CLOSE_SUBEXP
1027 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030 if (clexp_idx == init_nodes.nelem)
1033 if (type == OP_BACK_REF)
1035 int dest_idx = dfa->edests[node_idx].elems[0];
1036 if (!re_node_set_contains (&init_nodes, dest_idx))
1038 reg_errcode_t err = re_node_set_merge (&init_nodes,
1041 if (err != REG_NOERROR)
1048 /* It must be the first time to invoke acquire_state. */
1049 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1050 /* We don't check ERR here, since the initial state must not be NULL. */
1051 if (BE (dfa->init_state == NULL, 0))
1053 if (dfa->init_state->has_constraint)
1055 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1063 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1064 || dfa->init_state_begbuf == NULL, 0))
1068 dfa->init_state_word = dfa->init_state_nl
1069 = dfa->init_state_begbuf = dfa->init_state;
1071 re_node_set_free (&init_nodes);
1075 #ifdef RE_ENABLE_I18N
1076 /* If it is possible to do searching in single byte encoding instead of UTF-8
1077 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1078 DFA nodes where needed. */
1081 optimize_utf8 (re_dfa_t *dfa)
1083 int node, i, mb_chars = 0, has_period = 0;
1085 for (node = 0; node < dfa->nodes_len; ++node)
1086 switch (dfa->nodes[node].type)
1089 if (dfa->nodes[node].opr.c >= 0x80)
1093 switch (dfa->nodes[node].opr.ctx_type)
1101 /* Word anchors etc. cannot be handled. It's okay to test
1102 opr.ctx_type since constraints (for all DFA nodes) are
1103 created by ORing one or more opr.ctx_type values. */
1113 case OP_DUP_ASTERISK:
1114 case OP_OPEN_SUBEXP:
1115 case OP_CLOSE_SUBEXP:
1117 case COMPLEX_BRACKET:
1119 case SIMPLE_BRACKET:
1120 /* Just double check. The non-ASCII range starts at 0x80. */
1121 assert (0x80 % BITSET_WORD_BITS == 0);
1122 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1123 if (dfa->nodes[node].opr.sbcset[i])
1130 if (mb_chars || has_period)
1131 for (node = 0; node < dfa->nodes_len; ++node)
1133 if (dfa->nodes[node].type == CHARACTER
1134 && dfa->nodes[node].opr.c >= 0x80)
1135 dfa->nodes[node].mb_partial = 0;
1136 else if (dfa->nodes[node].type == OP_PERIOD)
1137 dfa->nodes[node].type = OP_UTF8_PERIOD;
1140 /* The search can be in single byte locale. */
1141 dfa->mb_cur_max = 1;
1143 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1147 /* Analyze the structure tree, and calculate "first", "next", "edest",
1148 "eclosure", and "inveclosure". */
1150 static reg_errcode_t
1151 analyze (regex_t *preg)
1153 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1156 /* Allocate arrays. */
1157 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1158 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1159 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1160 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1161 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1162 || dfa->eclosures == NULL, 0))
1165 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1166 if (dfa->subexp_map != NULL)
1169 for (i = 0; i < preg->re_nsub; i++)
1170 dfa->subexp_map[i] = i;
1171 preorder (dfa->str_tree, optimize_subexps, dfa);
1172 for (i = 0; i < preg->re_nsub; i++)
1173 if (dfa->subexp_map[i] != i)
1175 if (i == preg->re_nsub)
1177 free (dfa->subexp_map);
1178 dfa->subexp_map = NULL;
1182 ret = postorder (dfa->str_tree, lower_subexps, preg);
1183 if (BE (ret != REG_NOERROR, 0))
1185 ret = postorder (dfa->str_tree, calc_first, dfa);
1186 if (BE (ret != REG_NOERROR, 0))
1188 preorder (dfa->str_tree, calc_next, dfa);
1189 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1190 if (BE (ret != REG_NOERROR, 0))
1192 ret = calc_eclosure (dfa);
1193 if (BE (ret != REG_NOERROR, 0))
1196 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1197 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1198 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1201 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1202 if (BE (dfa->inveclosures == NULL, 0))
1204 ret = calc_inveclosure (dfa);
1210 /* Our parse trees are very unbalanced, so we cannot use a stack to
1211 implement parse tree visits. Instead, we use parent pointers and
1212 some hairy code in these two functions. */
1213 static reg_errcode_t
1214 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1217 bin_tree_t *node, *prev;
1219 for (node = root; ; )
1221 /* Descend down the tree, preferably to the left (or to the right
1222 if that's the only child). */
1223 while (node->left || node->right)
1231 reg_errcode_t err = fn (extra, node);
1232 if (BE (err != REG_NOERROR, 0))
1234 if (node->parent == NULL)
1237 node = node->parent;
1239 /* Go up while we have a node that is reached from the right. */
1240 while (node->right == prev || node->right == NULL);
1245 static reg_errcode_t
1246 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1251 for (node = root; ; )
1253 reg_errcode_t err = fn (extra, node);
1254 if (BE (err != REG_NOERROR, 0))
1257 /* Go to the left node, or up and to the right. */
1262 bin_tree_t *prev = NULL;
1263 while (node->right == prev || node->right == NULL)
1266 node = node->parent;
1275 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1276 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1277 backreferences as well. Requires a preorder visit. */
1278 static reg_errcode_t
1279 optimize_subexps (void *extra, bin_tree_t *node)
1281 re_dfa_t *dfa = (re_dfa_t *) extra;
1283 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1285 int idx = node->token.opr.idx;
1286 node->token.opr.idx = dfa->subexp_map[idx];
1287 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1290 else if (node->token.type == SUBEXP
1291 && node->left && node->left->token.type == SUBEXP)
1293 int other_idx = node->left->token.opr.idx;
1295 node->left = node->left->left;
1297 node->left->parent = node;
1299 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1300 if (other_idx < BITSET_WORD_BITS)
1301 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1307 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1308 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1309 static reg_errcode_t
1310 lower_subexps (void *extra, bin_tree_t *node)
1312 regex_t *preg = (regex_t *) extra;
1313 reg_errcode_t err = REG_NOERROR;
1315 if (node->left && node->left->token.type == SUBEXP)
1317 node->left = lower_subexp (&err, preg, node->left);
1319 node->left->parent = node;
1321 if (node->right && node->right->token.type == SUBEXP)
1323 node->right = lower_subexp (&err, preg, node->right);
1325 node->right->parent = node;
1332 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1334 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1335 bin_tree_t *body = node->left;
1336 bin_tree_t *op, *cls, *tree1, *tree;
1339 /* We do not optimize empty subexpressions, because otherwise we may
1340 have bad CONCAT nodes with NULL children. This is obviously not
1341 very common, so we do not lose much. An example that triggers
1342 this case is the sed "script" /\(\)/x. */
1343 && node->left != NULL
1344 && (node->token.opr.idx >= BITSET_WORD_BITS
1345 || !(dfa->used_bkref_map
1346 & ((bitset_word_t) 1 << node->token.opr.idx))))
1349 /* Convert the SUBEXP node to the concatenation of an
1350 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1351 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1352 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1353 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1354 tree = create_tree (dfa, op, tree1, CONCAT);
1355 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1361 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1362 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1366 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1367 nodes. Requires a postorder visit. */
1368 static reg_errcode_t
1369 calc_first (void *extra, bin_tree_t *node)
1371 re_dfa_t *dfa = (re_dfa_t *) extra;
1372 if (node->token.type == CONCAT)
1374 node->first = node->left->first;
1375 node->node_idx = node->left->node_idx;
1380 node->node_idx = re_dfa_add_node (dfa, node->token);
1381 if (BE (node->node_idx == -1, 0))
1383 if (node->token.type == ANCHOR)
1384 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1389 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1390 static reg_errcode_t
1391 calc_next (void *extra, bin_tree_t *node)
1393 switch (node->token.type)
1395 case OP_DUP_ASTERISK:
1396 node->left->next = node;
1399 node->left->next = node->right->first;
1400 node->right->next = node->next;
1404 node->left->next = node->next;
1406 node->right->next = node->next;
1412 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1413 static reg_errcode_t
1414 link_nfa_nodes (void *extra, bin_tree_t *node)
1416 re_dfa_t *dfa = (re_dfa_t *) extra;
1417 int idx = node->node_idx;
1418 reg_errcode_t err = REG_NOERROR;
1420 switch (node->token.type)
1426 assert (node->next == NULL);
1429 case OP_DUP_ASTERISK:
1433 dfa->has_plural_match = 1;
1434 if (node->left != NULL)
1435 left = node->left->first->node_idx;
1437 left = node->next->node_idx;
1438 if (node->right != NULL)
1439 right = node->right->first->node_idx;
1441 right = node->next->node_idx;
1443 assert (right > -1);
1444 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1449 case OP_OPEN_SUBEXP:
1450 case OP_CLOSE_SUBEXP:
1451 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1455 dfa->nexts[idx] = node->next->node_idx;
1456 if (node->token.type == OP_BACK_REF)
1457 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1461 assert (!IS_EPSILON_NODE (node->token.type));
1462 dfa->nexts[idx] = node->next->node_idx;
1469 /* Duplicate the epsilon closure of the node ROOT_NODE.
1470 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1471 to their own constraint. */
1473 static reg_errcode_t
1475 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1476 int root_node, unsigned int init_constraint)
1478 int org_node, clone_node, ret;
1479 unsigned int constraint = init_constraint;
1480 for (org_node = top_org_node, clone_node = top_clone_node;;)
1482 int org_dest, clone_dest;
1483 if (dfa->nodes[org_node].type == OP_BACK_REF)
1485 /* If the back reference epsilon-transit, its destination must
1486 also have the constraint. Then duplicate the epsilon closure
1487 of the destination of the back reference, and store it in
1488 edests of the back reference. */
1489 org_dest = dfa->nexts[org_node];
1490 re_node_set_empty (dfa->edests + clone_node);
1491 clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 if (BE (clone_dest == -1, 0))
1494 dfa->nexts[clone_node] = dfa->nexts[org_node];
1495 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 if (BE (ret < 0, 0))
1499 else if (dfa->edests[org_node].nelem == 0)
1501 /* In case of the node can't epsilon-transit, don't duplicate the
1502 destination and store the original destination as the
1503 destination of the node. */
1504 dfa->nexts[clone_node] = dfa->nexts[org_node];
1507 else if (dfa->edests[org_node].nelem == 1)
1509 /* In case of the node can epsilon-transit, and it has only one
1511 org_dest = dfa->edests[org_node].elems[0];
1512 re_node_set_empty (dfa->edests + clone_node);
1513 /* If the node is root_node itself, it means the epsilon clsoure
1514 has a loop. Then tie it to the destination of the root_node. */
1515 if (org_node == root_node && clone_node != org_node)
1517 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1518 if (BE (ret < 0, 0))
1522 /* In case of the node has another constraint, add it. */
1523 constraint |= dfa->nodes[org_node].constraint;
1524 clone_dest = duplicate_node (dfa, org_dest, constraint);
1525 if (BE (clone_dest == -1, 0))
1527 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 if (BE (ret < 0, 0))
1531 else /* dfa->edests[org_node].nelem == 2 */
1533 /* In case of the node can epsilon-transit, and it has two
1534 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1535 org_dest = dfa->edests[org_node].elems[0];
1536 re_node_set_empty (dfa->edests + clone_node);
1537 /* Search for a duplicated node which satisfies the constraint. */
1538 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1539 if (clone_dest == -1)
1541 /* There is no such duplicated node, create a new one. */
1543 clone_dest = duplicate_node (dfa, org_dest, constraint);
1544 if (BE (clone_dest == -1, 0))
1546 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547 if (BE (ret < 0, 0))
1549 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1550 root_node, constraint);
1551 if (BE (err != REG_NOERROR, 0))
1556 /* There is a duplicated node which satisfies the constraint,
1557 use it to avoid infinite loop. */
1558 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 if (BE (ret < 0, 0))
1563 org_dest = dfa->edests[org_node].elems[1];
1564 clone_dest = duplicate_node (dfa, org_dest, constraint);
1565 if (BE (clone_dest == -1, 0))
1567 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568 if (BE (ret < 0, 0))
1571 org_node = org_dest;
1572 clone_node = clone_dest;
1577 /* Search for a node which is duplicated from the node ORG_NODE, and
1578 satisfies the constraint CONSTRAINT. */
1581 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1582 unsigned int constraint)
1585 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1587 if (org_node == dfa->org_indices[idx]
1588 && constraint == dfa->nodes[idx].constraint)
1589 return idx; /* Found. */
1591 return -1; /* Not found. */
1594 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1595 Return the index of the new node, or -1 if insufficient storage is
1599 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1601 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1602 if (BE (dup_idx != -1, 1))
1604 dfa->nodes[dup_idx].constraint = constraint;
1605 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1606 dfa->nodes[dup_idx].duplicated = 1;
1608 /* Store the index of the original node. */
1609 dfa->org_indices[dup_idx] = org_idx;
1614 static reg_errcode_t
1615 calc_inveclosure (re_dfa_t *dfa)
1618 for (idx = 0; idx < dfa->nodes_len; ++idx)
1619 re_node_set_init_empty (dfa->inveclosures + idx);
1621 for (src = 0; src < dfa->nodes_len; ++src)
1623 int *elems = dfa->eclosures[src].elems;
1624 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1626 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1627 if (BE (ret == -1, 0))
1635 /* Calculate "eclosure" for all the node in DFA. */
1637 static reg_errcode_t
1638 calc_eclosure (re_dfa_t *dfa)
1640 int node_idx, incomplete;
1642 assert (dfa->nodes_len > 0);
1645 /* For each nodes, calculate epsilon closure. */
1646 for (node_idx = 0; ; ++node_idx)
1649 re_node_set eclosure_elem;
1650 if (node_idx == dfa->nodes_len)
1659 assert (dfa->eclosures[node_idx].nelem != -1);
1662 /* If we have already calculated, skip it. */
1663 if (dfa->eclosures[node_idx].nelem != 0)
1665 /* Calculate epsilon closure of `node_idx'. */
1666 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1667 if (BE (err != REG_NOERROR, 0))
1670 if (dfa->eclosures[node_idx].nelem == 0)
1673 re_node_set_free (&eclosure_elem);
1679 /* Calculate epsilon closure of NODE. */
1681 static reg_errcode_t
1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1686 re_node_set eclosure;
1689 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690 if (BE (err != REG_NOERROR, 0))
1693 /* This indicates that we are calculating this node now.
1694 We reference this value to avoid infinite loop. */
1695 dfa->eclosures[node].nelem = -1;
1697 /* If the current node has constraints, duplicate all nodes
1698 since they must inherit the constraints. */
1699 if (dfa->nodes[node].constraint
1700 && dfa->edests[node].nelem
1701 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1703 err = duplicate_node_closure (dfa, node, node, node,
1704 dfa->nodes[node].constraint);
1705 if (BE (err != REG_NOERROR, 0))
1709 /* Expand each epsilon destination nodes. */
1710 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711 for (i = 0; i < dfa->edests[node].nelem; ++i)
1713 re_node_set eclosure_elem;
1714 int edest = dfa->edests[node].elems[i];
1715 /* If calculating the epsilon closure of `edest' is in progress,
1716 return intermediate result. */
1717 if (dfa->eclosures[edest].nelem == -1)
1722 /* If we haven't calculated the epsilon closure of `edest' yet,
1723 calculate now. Otherwise use calculated epsilon closure. */
1724 if (dfa->eclosures[edest].nelem == 0)
1726 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1727 if (BE (err != REG_NOERROR, 0))
1731 eclosure_elem = dfa->eclosures[edest];
1732 /* Merge the epsilon closure of `edest'. */
1733 err = re_node_set_merge (&eclosure, &eclosure_elem);
1734 if (BE (err != REG_NOERROR, 0))
1736 /* If the epsilon closure of `edest' is incomplete,
1737 the epsilon closure of this node is also incomplete. */
1738 if (dfa->eclosures[edest].nelem == 0)
1741 re_node_set_free (&eclosure_elem);
1745 /* An epsilon closure includes itself. */
1746 ret = re_node_set_insert (&eclosure, node);
1747 if (BE (ret < 0, 0))
1749 if (incomplete && !root)
1750 dfa->eclosures[node].nelem = 0;
1752 dfa->eclosures[node] = eclosure;
1753 *new_set = eclosure;
1757 /* Functions for token which are used in the parser. */
1759 /* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1766 re_string_skip_bytes (input, peek_token (result, input, syntax));
1769 /* Peek a token from INPUT, and return the length of the token.
1770 We must not use this function inside bracket expressions. */
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1778 if (re_string_eoi (input))
1780 token->type = END_OF_RE;
1784 c = re_string_peek_byte (input, 0);
1787 token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789 token->mb_partial = 0;
1790 if (input->mb_cur_max > 1 &&
1791 !re_string_first_byte (input, re_string_cur_idx (input)))
1793 token->type = CHARACTER;
1794 token->mb_partial = 1;
1801 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1803 token->type = BACK_SLASH;
1807 c2 = re_string_peek_byte_case (input, 1);
1809 token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811 if (input->mb_cur_max > 1)
1813 wint_t wc = re_string_wchar_at (input,
1814 re_string_cur_idx (input) + 1);
1815 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1819 token->word_char = IS_WORD_CHAR (c2) != 0;
1824 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825 token->type = OP_ALT;
1827 case '1': case '2': case '3': case '4': case '5':
1828 case '6': case '7': case '8': case '9':
1829 if (!(syntax & RE_NO_BK_REFS))
1831 token->type = OP_BACK_REF;
1832 token->opr.idx = c2 - '1';
1836 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = WORD_FIRST;
1843 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_LAST;
1850 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_DELIM;
1857 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = NOT_WORD_DELIM;
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 token->type = OP_WORD;
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_NOTWORD;
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_SPACE;
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTSPACE;
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = BUF_FIRST;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_LAST;
1894 if (!(syntax & RE_NO_BK_PARENS))
1895 token->type = OP_OPEN_SUBEXP;
1898 if (!(syntax & RE_NO_BK_PARENS))
1899 token->type = OP_CLOSE_SUBEXP;
1902 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903 token->type = OP_DUP_PLUS;
1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_QUESTION;
1910 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911 token->type = OP_OPEN_DUP_NUM;
1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915 token->type = OP_CLOSE_DUP_NUM;
1923 token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925 if (input->mb_cur_max > 1)
1927 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1932 token->word_char = IS_WORD_CHAR (token->opr.c);
1937 if (syntax & RE_NEWLINE_ALT)
1938 token->type = OP_ALT;
1941 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942 token->type = OP_ALT;
1945 token->type = OP_DUP_ASTERISK;
1948 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949 token->type = OP_DUP_PLUS;
1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953 token->type = OP_DUP_QUESTION;
1956 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957 token->type = OP_OPEN_DUP_NUM;
1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961 token->type = OP_CLOSE_DUP_NUM;
1964 if (syntax & RE_NO_BK_PARENS)
1965 token->type = OP_OPEN_SUBEXP;
1968 if (syntax & RE_NO_BK_PARENS)
1969 token->type = OP_CLOSE_SUBEXP;
1972 token->type = OP_OPEN_BRACKET;
1975 token->type = OP_PERIOD;
1978 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979 re_string_cur_idx (input) != 0)
1981 char prev = re_string_peek_byte (input, -1);
1982 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1985 token->type = ANCHOR;
1986 token->opr.ctx_type = LINE_FIRST;
1989 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990 re_string_cur_idx (input) + 1 != re_string_length (input))
1993 re_string_skip_bytes (input, 1);
1994 peek_token (&next, input, syntax);
1995 re_string_skip_bytes (input, -1);
1996 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1999 token->type = ANCHOR;
2000 token->opr.ctx_type = LINE_LAST;
2008 /* Peek a token from INPUT, and return the length of the token.
2009 We must not use this function out of bracket expressions. */
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2016 if (re_string_eoi (input))
2018 token->type = END_OF_RE;
2021 c = re_string_peek_byte (input, 0);
2024 #ifdef RE_ENABLE_I18N
2025 if (input->mb_cur_max > 1 &&
2026 !re_string_first_byte (input, re_string_cur_idx (input)))
2028 token->type = CHARACTER;
2031 #endif /* RE_ENABLE_I18N */
2033 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034 && re_string_cur_idx (input) + 1 < re_string_length (input))
2036 /* In this case, '\' escape a character. */
2038 re_string_skip_bytes (input, 1);
2039 c2 = re_string_peek_byte (input, 0);
2041 token->type = CHARACTER;
2044 if (c == '[') /* '[' is a special char in a bracket exps. */
2048 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049 c2 = re_string_peek_byte (input, 1);
2057 token->type = OP_OPEN_COLL_ELEM;
2060 token->type = OP_OPEN_EQUIV_CLASS;
2063 if (syntax & RE_CHAR_CLASSES)
2065 token->type = OP_OPEN_CHAR_CLASS;
2068 /* else fall through. */
2070 token->type = CHARACTER;
2080 token->type = OP_CHARSET_RANGE;
2083 token->type = OP_CLOSE_BRACKET;
2086 token->type = OP_NON_MATCH_LIST;
2089 token->type = CHARACTER;
2094 /* Functions for parser. */
2096 /* Entry point of the parser.
2097 Parse the regular expression REGEXP and return the structure tree.
2098 If an error is occured, ERR is set by error code, and return NULL.
2099 This function build the following tree, from regular expression <reg_exp>:
2105 CAT means concatenation.
2106 EOR means end of regular expression. */
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2112 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113 bin_tree_t *tree, *eor, *root;
2114 re_token_t current_token;
2115 dfa->syntax = syntax;
2116 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2120 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2122 root = create_tree (dfa, tree, eor, CONCAT);
2125 if (BE (eor == NULL || root == NULL, 0))
2133 /* This function build the following tree, from regular expression
2134 <branch1>|<branch2>:
2140 ALT means alternative, which represents the operator `|'. */
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2146 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147 bin_tree_t *tree, *branch = NULL;
2148 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152 while (token->type == OP_ALT)
2154 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155 if (token->type != OP_ALT && token->type != END_OF_RE
2156 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2158 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2164 tree = create_tree (dfa, tree, branch, OP_ALT);
2165 if (BE (tree == NULL, 0))
2174 /* This function build the following tree, from regular expression
2181 CAT means concatenation. */
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2187 bin_tree_t *tree, *exp;
2188 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193 while (token->type != OP_ALT && token->type != END_OF_RE
2194 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2196 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2197 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2200 postorder (tree, free_tree, NULL);
2203 if (tree != NULL && exp != NULL)
2205 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2206 if (newtree == NULL)
2208 postorder (exp, free_tree, NULL);
2209 postorder (tree, free_tree, NULL);
2215 else if (tree == NULL)
2217 /* Otherwise exp == NULL, we don't need to create new tree. */
2222 /* This function build the following tree, from regular expression a*:
2229 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2230 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2232 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2234 switch (token->type)
2237 tree = create_token_tree (dfa, NULL, NULL, token);
2238 if (BE (tree == NULL, 0))
2243 #ifdef RE_ENABLE_I18N
2244 if (dfa->mb_cur_max > 1)
2246 while (!re_string_eoi (regexp)
2247 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2249 bin_tree_t *mbc_remain;
2250 fetch_token (token, regexp, syntax);
2251 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2252 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2253 if (BE (mbc_remain == NULL || tree == NULL, 0))
2262 case OP_OPEN_SUBEXP:
2263 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267 case OP_OPEN_BRACKET:
2268 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2269 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2273 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2278 dfa->used_bkref_map |= 1 << token->opr.idx;
2279 tree = create_token_tree (dfa, NULL, NULL, token);
2280 if (BE (tree == NULL, 0))
2286 dfa->has_mb_node = 1;
2288 case OP_OPEN_DUP_NUM:
2289 if (syntax & RE_CONTEXT_INVALID_DUP)
2295 case OP_DUP_ASTERISK:
2297 case OP_DUP_QUESTION:
2298 if (syntax & RE_CONTEXT_INVALID_OPS)
2303 else if (syntax & RE_CONTEXT_INDEP_OPS)
2305 fetch_token (token, regexp, syntax);
2306 return parse_expression (regexp, preg, token, syntax, nest, err);
2308 /* else fall through */
2309 case OP_CLOSE_SUBEXP:
2310 if ((token->type == OP_CLOSE_SUBEXP) &&
2311 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2316 /* else fall through */
2317 case OP_CLOSE_DUP_NUM:
2318 /* We treat it as a normal character. */
2320 /* Then we can these characters as normal characters. */
2321 token->type = CHARACTER;
2322 /* mb_partial and word_char bits should be initialized already
2324 tree = create_token_tree (dfa, NULL, NULL, token);
2325 if (BE (tree == NULL, 0))
2332 if ((token->opr.ctx_type
2333 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2334 && dfa->word_ops_used == 0)
2335 init_word_char (dfa);
2336 if (token->opr.ctx_type == WORD_DELIM
2337 || token->opr.ctx_type == NOT_WORD_DELIM)
2339 bin_tree_t *tree_first, *tree_last;
2340 if (token->opr.ctx_type == WORD_DELIM)
2342 token->opr.ctx_type = WORD_FIRST;
2343 tree_first = create_token_tree (dfa, NULL, NULL, token);
2344 token->opr.ctx_type = WORD_LAST;
2348 token->opr.ctx_type = INSIDE_WORD;
2349 tree_first = create_token_tree (dfa, NULL, NULL, token);
2350 token->opr.ctx_type = INSIDE_NOTWORD;
2352 tree_last = create_token_tree (dfa, NULL, NULL, token);
2353 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2354 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2362 tree = create_token_tree (dfa, NULL, NULL, token);
2363 if (BE (tree == NULL, 0))
2369 /* We must return here, since ANCHORs can't be followed
2370 by repetition operators.
2371 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2372 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2373 fetch_token (token, regexp, syntax);
2376 tree = create_token_tree (dfa, NULL, NULL, token);
2377 if (BE (tree == NULL, 0))
2382 if (dfa->mb_cur_max > 1)
2383 dfa->has_mb_node = 1;
2387 tree = build_charclass_op (dfa, regexp->trans,
2388 (const unsigned char *) "alnum",
2389 (const unsigned char *) "_",
2390 token->type == OP_NOTWORD, err);
2391 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396 tree = build_charclass_op (dfa, regexp->trans,
2397 (const unsigned char *) "space",
2398 (const unsigned char *) "",
2399 token->type == OP_NOTSPACE, err);
2400 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2410 /* Must not happen? */
2416 fetch_token (token, regexp, syntax);
2418 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2419 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2421 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2422 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2424 /* In BRE consecutive duplications are not allowed. */
2425 if ((syntax & RE_CONTEXT_INVALID_DUP)
2426 && (token->type == OP_DUP_ASTERISK
2427 || token->type == OP_OPEN_DUP_NUM))
2437 /* This function build the following tree, from regular expression
2445 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2446 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2448 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2451 cur_nsub = preg->re_nsub++;
2453 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2455 /* The subexpression may be a null string. */
2456 if (token->type == OP_CLOSE_SUBEXP)
2460 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2461 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2464 postorder (tree, free_tree, NULL);
2467 if (BE (*err != REG_NOERROR, 0))
2471 if (cur_nsub <= '9' - '1')
2472 dfa->completed_bkref_map |= 1 << cur_nsub;
2474 tree = create_tree (dfa, tree, NULL, SUBEXP);
2475 if (BE (tree == NULL, 0))
2480 tree->token.opr.idx = cur_nsub;
2484 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2487 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2488 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2490 bin_tree_t *tree = NULL, *old_tree = NULL;
2491 int i, start, end, start_idx = re_string_cur_idx (regexp);
2492 re_token_t start_token = *token;
2494 if (token->type == OP_OPEN_DUP_NUM)
2497 start = fetch_number (regexp, token, syntax);
2500 if (token->type == CHARACTER && token->opr.c == ',')
2501 start = 0; /* We treat "{,m}" as "{0,m}". */
2504 *err = REG_BADBR; /* <re>{} is invalid. */
2508 if (BE (start != -2, 1))
2510 /* We treat "{n}" as "{n,n}". */
2511 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2512 : ((token->type == CHARACTER && token->opr.c == ',')
2513 ? fetch_number (regexp, token, syntax) : -2));
2515 if (BE (start == -2 || end == -2, 0))
2517 /* Invalid sequence. */
2518 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2520 if (token->type == END_OF_RE)
2528 /* If the syntax bit is set, rollback. */
2529 re_string_set_index (regexp, start_idx);
2530 *token = start_token;
2531 token->type = CHARACTER;
2532 /* mb_partial and word_char bits should be already initialized by
2537 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2539 /* First number greater than second. */
2546 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2547 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2550 fetch_token (token, regexp, syntax);
2552 if (BE (elem == NULL, 0))
2554 if (BE (start == 0 && end == 0, 0))
2556 postorder (elem, free_tree, NULL);
2560 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2561 if (BE (start > 0, 0))
2564 for (i = 2; i <= start; ++i)
2566 elem = duplicate_tree (elem, dfa);
2567 tree = create_tree (dfa, tree, elem, CONCAT);
2568 if (BE (elem == NULL || tree == NULL, 0))
2569 goto parse_dup_op_espace;
2575 /* Duplicate ELEM before it is marked optional. */
2576 elem = duplicate_tree (elem, dfa);
2582 if (elem->token.type == SUBEXP)
2583 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2585 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2586 if (BE (tree == NULL, 0))
2587 goto parse_dup_op_espace;
2589 /* This loop is actually executed only when end != -1,
2590 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2591 already created the start+1-th copy. */
2592 for (i = start + 2; i <= end; ++i)
2594 elem = duplicate_tree (elem, dfa);
2595 tree = create_tree (dfa, tree, elem, CONCAT);
2596 if (BE (elem == NULL || tree == NULL, 0))
2597 goto parse_dup_op_espace;
2599 tree = create_tree (dfa, tree, NULL, OP_ALT);
2600 if (BE (tree == NULL, 0))
2601 goto parse_dup_op_espace;
2605 tree = create_tree (dfa, old_tree, tree, CONCAT);
2609 parse_dup_op_espace:
2614 /* Size of the names for collating symbol/equivalence_class/character_class.
2615 I'm not sure, but maybe enough. */
2616 #define BRACKET_NAME_BUF_SIZE 32
2619 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2620 Build the range expression which starts from START_ELEM, and ends
2621 at END_ELEM. The result are written to MBCSET and SBCSET.
2622 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2623 mbcset->range_ends, is a pointer argument sinse we may
2626 static reg_errcode_t
2628 # ifdef RE_ENABLE_I18N
2629 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2630 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2631 # else /* not RE_ENABLE_I18N */
2632 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2633 bracket_elem_t *end_elem)
2634 # endif /* not RE_ENABLE_I18N */
2636 unsigned int start_ch, end_ch;
2637 /* Equivalence Classes and Character Classes can't be a range start/end. */
2638 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2639 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2643 /* We can handle no multi character collating elements without libc
2645 if (BE ((start_elem->type == COLL_SYM
2646 && strlen ((char *) start_elem->opr.name) > 1)
2647 || (end_elem->type == COLL_SYM
2648 && strlen ((char *) end_elem->opr.name) > 1), 0))
2649 return REG_ECOLLATE;
2651 # ifdef RE_ENABLE_I18N
2656 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2658 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2659 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2661 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2662 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2664 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2665 ? __btowc (start_ch) : start_elem->opr.wch);
2666 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2667 ? __btowc (end_ch) : end_elem->opr.wch);
2668 if (start_wc == WEOF || end_wc == WEOF)
2669 return REG_ECOLLATE;
2670 cmp_buf[0] = start_wc;
2671 cmp_buf[4] = end_wc;
2672 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2675 /* Got valid collation sequence values, add them as a new entry.
2676 However, for !_LIBC we have no collation elements: if the
2677 character set is single byte, the single byte character set
2678 that we build below suffices. parse_bracket_exp passes
2679 no MBCSET if dfa->mb_cur_max == 1. */
2682 /* Check the space of the arrays. */
2683 if (BE (*range_alloc == mbcset->nranges, 0))
2685 /* There is not enough space, need realloc. */
2686 wchar_t *new_array_start, *new_array_end;
2689 /* +1 in case of mbcset->nranges is 0. */
2690 new_nranges = 2 * mbcset->nranges + 1;
2691 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2692 are NULL if *range_alloc == 0. */
2693 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2695 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2698 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2701 mbcset->range_starts = new_array_start;
2702 mbcset->range_ends = new_array_end;
2703 *range_alloc = new_nranges;
2706 mbcset->range_starts[mbcset->nranges] = start_wc;
2707 mbcset->range_ends[mbcset->nranges++] = end_wc;
2710 /* Build the table for single byte characters. */
2711 for (wc = 0; wc < SBC_MAX; ++wc)
2714 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2715 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2716 bitset_set (sbcset, wc);
2719 # else /* not RE_ENABLE_I18N */
2722 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2723 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2725 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2726 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2728 if (start_ch > end_ch)
2730 /* Build the table for single byte characters. */
2731 for (ch = 0; ch < SBC_MAX; ++ch)
2732 if (start_ch <= ch && ch <= end_ch)
2733 bitset_set (sbcset, ch);
2735 # endif /* not RE_ENABLE_I18N */
2738 #endif /* not _LIBC */
2741 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2742 Build the collating element which is represented by NAME.
2743 The result are written to MBCSET and SBCSET.
2744 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2745 pointer argument since we may update it. */
2747 static reg_errcode_t
2749 # ifdef RE_ENABLE_I18N
2750 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2751 int *coll_sym_alloc, const unsigned char *name)
2752 # else /* not RE_ENABLE_I18N */
2753 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2754 # endif /* not RE_ENABLE_I18N */
2756 size_t name_len = strlen ((const char *) name);
2757 if (BE (name_len != 1, 0))
2758 return REG_ECOLLATE;
2761 bitset_set (sbcset, name[0]);
2765 #endif /* not _LIBC */
2767 /* This function parse bracket expression like "[abc]", "[a-c]",
2771 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2772 reg_syntax_t syntax, reg_errcode_t *err)
2775 const unsigned char *collseqmb;
2776 const char *collseqwc;
2779 const int32_t *symb_table;
2780 const unsigned char *extra;
2782 /* Local function for parse_bracket_exp used in _LIBC environement.
2783 Seek the collating symbol entry correspondings to NAME.
2784 Return the index of the symbol in the SYMB_TABLE,
2785 or -1 if not found. */
2788 __attribute ((always_inline))
2789 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2793 for (elem = 0; elem < table_size; elem++)
2794 if (symb_table[2 * elem] != 0)
2796 int32_t idx = symb_table[2 * elem + 1];
2797 /* Skip the name of collating element name. */
2798 idx += 1 + extra[idx];
2799 if (/* Compare the length of the name. */
2800 name_len == extra[idx]
2801 /* Compare the name. */
2802 && memcmp (name, &extra[idx + 1], name_len) == 0)
2803 /* Yep, this is the entry. */
2809 /* Local function for parse_bracket_exp used in _LIBC environment.
2810 Look up the collation sequence value of BR_ELEM.
2811 Return the value if succeeded, UINT_MAX otherwise. */
2813 auto inline unsigned int
2814 __attribute ((always_inline))
2815 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2817 if (br_elem->type == SB_CHAR)
2820 if (MB_CUR_MAX == 1)
2823 return collseqmb[br_elem->opr.ch];
2826 wint_t wc = __btowc (br_elem->opr.ch);
2827 return __collseq_table_lookup (collseqwc, wc);
2830 else if (br_elem->type == MB_CHAR)
2833 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2835 else if (br_elem->type == COLL_SYM)
2837 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2841 elem = seek_collating_symbol_entry (br_elem->opr.name,
2845 /* We found the entry. */
2846 idx = symb_table[2 * elem + 1];
2847 /* Skip the name of collating element name. */
2848 idx += 1 + extra[idx];
2849 /* Skip the byte sequence of the collating element. */
2850 idx += 1 + extra[idx];
2851 /* Adjust for the alignment. */
2852 idx = (idx + 3) & ~3;
2853 /* Skip the multibyte collation sequence value. */
2854 idx += sizeof (unsigned int);
2855 /* Skip the wide char sequence of the collating element. */
2856 idx += sizeof (unsigned int) *
2857 (1 + *(unsigned int *) (extra + idx));
2858 /* Return the collation sequence value. */
2859 return *(unsigned int *) (extra + idx);
2861 else if (sym_name_len == 1)
2863 /* No valid character. Match it as a single byte
2865 return collseqmb[br_elem->opr.name[0]];
2868 else if (sym_name_len == 1)
2869 return collseqmb[br_elem->opr.name[0]];
2874 /* Local function for parse_bracket_exp used in _LIBC environement.
2875 Build the range expression which starts from START_ELEM, and ends
2876 at END_ELEM. The result are written to MBCSET and SBCSET.
2877 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2878 mbcset->range_ends, is a pointer argument sinse we may
2881 auto inline reg_errcode_t
2882 __attribute ((always_inline))
2883 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2884 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2887 uint32_t start_collseq;
2888 uint32_t end_collseq;
2890 /* Equivalence Classes and Character Classes can't be a range
2892 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2893 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2897 start_collseq = lookup_collation_sequence_value (start_elem);
2898 end_collseq = lookup_collation_sequence_value (end_elem);
2899 /* Check start/end collation sequence values. */
2900 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2901 return REG_ECOLLATE;
2902 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2905 /* Got valid collation sequence values, add them as a new entry.
2906 However, if we have no collation elements, and the character set
2907 is single byte, the single byte character set that we
2908 build below suffices. */
2909 if (nrules > 0 || dfa->mb_cur_max > 1)
2911 /* Check the space of the arrays. */
2912 if (BE (*range_alloc == mbcset->nranges, 0))
2914 /* There is not enough space, need realloc. */
2915 uint32_t *new_array_start;
2916 uint32_t *new_array_end;
2919 /* +1 in case of mbcset->nranges is 0. */
2920 new_nranges = 2 * mbcset->nranges + 1;
2921 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2923 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2926 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2929 mbcset->range_starts = new_array_start;
2930 mbcset->range_ends = new_array_end;
2931 *range_alloc = new_nranges;
2934 mbcset->range_starts[mbcset->nranges] = start_collseq;
2935 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2938 /* Build the table for single byte characters. */
2939 for (ch = 0; ch < SBC_MAX; ch++)
2941 uint32_t ch_collseq;
2943 if (MB_CUR_MAX == 1)
2946 ch_collseq = collseqmb[ch];
2948 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2949 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2950 bitset_set (sbcset, ch);
2955 /* Local function for parse_bracket_exp used in _LIBC environement.
2956 Build the collating element which is represented by NAME.
2957 The result are written to MBCSET and SBCSET.
2958 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2959 pointer argument sinse we may update it. */
2961 auto inline reg_errcode_t
2962 __attribute ((always_inline))
2963 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2964 int *coll_sym_alloc, const unsigned char *name)
2967 size_t name_len = strlen ((const char *) name);
2970 elem = seek_collating_symbol_entry (name, name_len);
2973 /* We found the entry. */
2974 idx = symb_table[2 * elem + 1];
2975 /* Skip the name of collating element name. */
2976 idx += 1 + extra[idx];
2978 else if (name_len == 1)
2980 /* No valid character, treat it as a normal
2982 bitset_set (sbcset, name[0]);
2986 return REG_ECOLLATE;
2988 /* Got valid collation sequence, add it as a new entry. */
2989 /* Check the space of the arrays. */
2990 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2992 /* Not enough, realloc it. */
2993 /* +1 in case of mbcset->ncoll_syms is 0. */
2994 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2995 /* Use realloc since mbcset->coll_syms is NULL
2997 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2998 new_coll_sym_alloc);
2999 if (BE (new_coll_syms == NULL, 0))
3001 mbcset->coll_syms = new_coll_syms;
3002 *coll_sym_alloc = new_coll_sym_alloc;
3004 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3009 if (BE (name_len != 1, 0))
3010 return REG_ECOLLATE;
3013 bitset_set (sbcset, name[0]);
3020 re_token_t br_token;
3021 re_bitset_ptr_t sbcset;
3022 #ifdef RE_ENABLE_I18N
3023 re_charset_t *mbcset;
3024 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3025 int equiv_class_alloc = 0, char_class_alloc = 0;
3026 #endif /* not RE_ENABLE_I18N */
3028 bin_tree_t *work_tree;
3030 int first_round = 1;
3032 collseqmb = (const unsigned char *)
3033 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3034 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3040 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3041 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3042 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3043 _NL_COLLATE_SYMB_TABLEMB);
3044 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3045 _NL_COLLATE_SYMB_EXTRAMB);
3048 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3049 #ifdef RE_ENABLE_I18N
3050 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3051 #endif /* RE_ENABLE_I18N */
3052 #ifdef RE_ENABLE_I18N
3053 if (BE (sbcset == NULL || mbcset == NULL, 0))
3055 if (BE (sbcset == NULL, 0))
3056 #endif /* RE_ENABLE_I18N */
3059 #ifdef RE_ENABLE_I18N
3066 token_len = peek_token_bracket (token, regexp, syntax);
3067 if (BE (token->type == END_OF_RE, 0))
3070 goto parse_bracket_exp_free_return;
3072 if (token->type == OP_NON_MATCH_LIST)
3074 #ifdef RE_ENABLE_I18N
3075 mbcset->non_match = 1;
3076 #endif /* not RE_ENABLE_I18N */
3078 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3079 bitset_set (sbcset, '\n');
3080 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3081 token_len = peek_token_bracket (token, regexp, syntax);
3082 if (BE (token->type == END_OF_RE, 0))
3085 goto parse_bracket_exp_free_return;
3089 /* We treat the first ']' as a normal character. */
3090 if (token->type == OP_CLOSE_BRACKET)
3091 token->type = CHARACTER;
3095 bracket_elem_t start_elem, end_elem;
3096 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3097 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3099 int token_len2 = 0, is_range_exp = 0;
3102 start_elem.opr.name = start_name_buf;
3103 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3104 syntax, first_round);
3105 if (BE (ret != REG_NOERROR, 0))
3108 goto parse_bracket_exp_free_return;
3112 /* Get information about the next token. We need it in any case. */
3113 token_len = peek_token_bracket (token, regexp, syntax);
3115 /* Do not check for ranges if we know they are not allowed. */
3116 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3118 if (BE (token->type == END_OF_RE, 0))
3121 goto parse_bracket_exp_free_return;
3123 if (token->type == OP_CHARSET_RANGE)
3125 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3126 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3127 if (BE (token2.type == END_OF_RE, 0))
3130 goto parse_bracket_exp_free_return;
3132 if (token2.type == OP_CLOSE_BRACKET)
3134 /* We treat the last '-' as a normal character. */
3135 re_string_skip_bytes (regexp, -token_len);
3136 token->type = CHARACTER;
3143 if (is_range_exp == 1)
3145 end_elem.opr.name = end_name_buf;
3146 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3148 if (BE (ret != REG_NOERROR, 0))
3151 goto parse_bracket_exp_free_return;
3154 token_len = peek_token_bracket (token, regexp, syntax);
3157 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3158 &start_elem, &end_elem);
3160 # ifdef RE_ENABLE_I18N
3161 *err = build_range_exp (sbcset,
3162 dfa->mb_cur_max > 1 ? mbcset : NULL,
3163 &range_alloc, &start_elem, &end_elem);
3165 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3167 #endif /* RE_ENABLE_I18N */
3168 if (BE (*err != REG_NOERROR, 0))
3169 goto parse_bracket_exp_free_return;
3173 switch (start_elem.type)
3176 bitset_set (sbcset, start_elem.opr.ch);
3178 #ifdef RE_ENABLE_I18N
3180 /* Check whether the array has enough space. */
3181 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3183 wchar_t *new_mbchars;
3184 /* Not enough, realloc it. */
3185 /* +1 in case of mbcset->nmbchars is 0. */
3186 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3187 /* Use realloc since array is NULL if *alloc == 0. */
3188 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3190 if (BE (new_mbchars == NULL, 0))
3191 goto parse_bracket_exp_espace;
3192 mbcset->mbchars = new_mbchars;
3194 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3196 #endif /* RE_ENABLE_I18N */
3198 *err = build_equiv_class (sbcset,
3199 #ifdef RE_ENABLE_I18N
3200 mbcset, &equiv_class_alloc,
3201 #endif /* RE_ENABLE_I18N */
3202 start_elem.opr.name);
3203 if (BE (*err != REG_NOERROR, 0))
3204 goto parse_bracket_exp_free_return;
3207 *err = build_collating_symbol (sbcset,
3208 #ifdef RE_ENABLE_I18N
3209 mbcset, &coll_sym_alloc,
3210 #endif /* RE_ENABLE_I18N */
3211 start_elem.opr.name);
3212 if (BE (*err != REG_NOERROR, 0))
3213 goto parse_bracket_exp_free_return;
3216 *err = build_charclass (regexp->trans, sbcset,
3217 #ifdef RE_ENABLE_I18N
3218 mbcset, &char_class_alloc,
3219 #endif /* RE_ENABLE_I18N */
3220 start_elem.opr.name, syntax);
3221 if (BE (*err != REG_NOERROR, 0))
3222 goto parse_bracket_exp_free_return;
3229 if (BE (token->type == END_OF_RE, 0))
3232 goto parse_bracket_exp_free_return;
3234 if (token->type == OP_CLOSE_BRACKET)
3238 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3240 /* If it is non-matching list. */
3242 bitset_not (sbcset);
3244 #ifdef RE_ENABLE_I18N
3245 /* Ensure only single byte characters are set. */
3246 if (dfa->mb_cur_max > 1)
3247 bitset_mask (sbcset, dfa->sb_char);
3249 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3250 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3251 || mbcset->non_match)))
3253 bin_tree_t *mbc_tree;
3255 /* Build a tree for complex bracket. */
3256 dfa->has_mb_node = 1;
3257 br_token.type = COMPLEX_BRACKET;
3258 br_token.opr.mbcset = mbcset;
3259 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3260 if (BE (mbc_tree == NULL, 0))
3261 goto parse_bracket_exp_espace;
3262 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3263 if (sbcset[sbc_idx])
3265 /* If there are no bits set in sbcset, there is no point
3266 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3267 if (sbc_idx < BITSET_WORDS)
3269 /* Build a tree for simple bracket. */
3270 br_token.type = SIMPLE_BRACKET;
3271 br_token.opr.sbcset = sbcset;
3272 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3273 if (BE (work_tree == NULL, 0))
3274 goto parse_bracket_exp_espace;
3276 /* Then join them by ALT node. */
3277 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3278 if (BE (work_tree == NULL, 0))
3279 goto parse_bracket_exp_espace;
3284 work_tree = mbc_tree;
3288 #endif /* not RE_ENABLE_I18N */
3290 #ifdef RE_ENABLE_I18N
3291 free_charset (mbcset);
3293 /* Build a tree for simple bracket. */
3294 br_token.type = SIMPLE_BRACKET;
3295 br_token.opr.sbcset = sbcset;
3296 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3297 if (BE (work_tree == NULL, 0))
3298 goto parse_bracket_exp_espace;
3302 parse_bracket_exp_espace:
3304 parse_bracket_exp_free_return:
3306 #ifdef RE_ENABLE_I18N
3307 free_charset (mbcset);
3308 #endif /* RE_ENABLE_I18N */
3312 /* Parse an element in the bracket expression. */
3314 static reg_errcode_t
3315 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3316 re_token_t *token, int token_len, re_dfa_t *dfa,
3317 reg_syntax_t syntax, int accept_hyphen)
3319 #ifdef RE_ENABLE_I18N
3321 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3322 if (cur_char_size > 1)
3324 elem->type = MB_CHAR;
3325 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3326 re_string_skip_bytes (regexp, cur_char_size);
3329 #endif /* RE_ENABLE_I18N */
3330 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3331 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3332 || token->type == OP_OPEN_EQUIV_CLASS)
3333 return parse_bracket_symbol (elem, regexp, token);
3334 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3336 /* A '-' must only appear as anything but a range indicator before
3337 the closing bracket. Everything else is an error. */
3339 (void) peek_token_bracket (&token2, regexp, syntax);
3340 if (token2.type != OP_CLOSE_BRACKET)
3341 /* The actual error value is not standardized since this whole
3342 case is undefined. But ERANGE makes good sense. */
3345 elem->type = SB_CHAR;
3346 elem->opr.ch = token->opr.c;
3350 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3351 such as [:<character_class>:], [.<collating_element>.], and
3352 [=<equivalent_class>=]. */
3354 static reg_errcode_t
3355 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3358 unsigned char ch, delim = token->opr.c;
3360 if (re_string_eoi(regexp))
3364 if (i >= BRACKET_NAME_BUF_SIZE)
3366 if (token->type == OP_OPEN_CHAR_CLASS)
3367 ch = re_string_fetch_byte_case (regexp);
3369 ch = re_string_fetch_byte (regexp);
3370 if (re_string_eoi(regexp))
3372 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3374 elem->opr.name[i] = ch;
3376 re_string_skip_bytes (regexp, 1);
3377 elem->opr.name[i] = '\0';
3378 switch (token->type)
3380 case OP_OPEN_COLL_ELEM:
3381 elem->type = COLL_SYM;
3383 case OP_OPEN_EQUIV_CLASS:
3384 elem->type = EQUIV_CLASS;
3386 case OP_OPEN_CHAR_CLASS:
3387 elem->type = CHAR_CLASS;
3395 /* Helper function for parse_bracket_exp.
3396 Build the equivalence class which is represented by NAME.
3397 The result are written to MBCSET and SBCSET.
3398 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3399 is a pointer argument sinse we may update it. */
3401 static reg_errcode_t
3402 #ifdef RE_ENABLE_I18N
3403 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3404 int *equiv_class_alloc, const unsigned char *name)
3405 #else /* not RE_ENABLE_I18N */
3406 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3407 #endif /* not RE_ENABLE_I18N */
3410 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3413 const int32_t *table, *indirect;
3414 const unsigned char *weights, *extra, *cp;
3415 unsigned char char_buf[2];
3419 /* This #include defines a local function! */
3420 # include <locale/weight.h>
3421 /* Calculate the index for equivalence class. */
3423 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3424 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3425 _NL_COLLATE_WEIGHTMB);
3426 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3427 _NL_COLLATE_EXTRAMB);
3428 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3429 _NL_COLLATE_INDIRECTMB);
3430 idx1 = findidx (&cp, -1);
3431 if (BE (idx1 == 0 || *cp != '\0', 0))
3432 /* This isn't a valid character. */
3433 return REG_ECOLLATE;
3435 /* Build single byte matcing table for this equivalence class. */
3436 len = weights[idx1 & 0xffffff];
3437 for (ch = 0; ch < SBC_MAX; ++ch)
3441 idx2 = findidx (&cp, 1);
3446 /* This isn't a valid character. */
3448 /* Compare only if the length matches and the collation rule
3449 index is the same. */
3450 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3454 while (cnt <= len &&
3455 weights[(idx1 & 0xffffff) + 1 + cnt]
3456 == weights[(idx2 & 0xffffff) + 1 + cnt])
3460 bitset_set (sbcset, ch);
3463 /* Check whether the array has enough space. */
3464 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3466 /* Not enough, realloc it. */
3467 /* +1 in case of mbcset->nequiv_classes is 0. */
3468 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3469 /* Use realloc since the array is NULL if *alloc == 0. */
3470 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3472 new_equiv_class_alloc);
3473 if (BE (new_equiv_classes == NULL, 0))
3475 mbcset->equiv_classes = new_equiv_classes;
3476 *equiv_class_alloc = new_equiv_class_alloc;
3478 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3483 if (BE (strlen ((const char *) name) != 1, 0))
3484 return REG_ECOLLATE;
3485 bitset_set (sbcset, *name);
3490 /* Helper function for parse_bracket_exp.
3491 Build the character class which is represented by NAME.
3492 The result are written to MBCSET and SBCSET.
3493 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3494 is a pointer argument sinse we may update it. */
3496 static reg_errcode_t
3497 #ifdef RE_ENABLE_I18N
3498 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3499 re_charset_t *mbcset, int *char_class_alloc,
3500 const unsigned char *class_name, reg_syntax_t syntax)
3501 #else /* not RE_ENABLE_I18N */
3502 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3503 const unsigned char *class_name, reg_syntax_t syntax)
3504 #endif /* not RE_ENABLE_I18N */
3507 const char *name = (const char *) class_name;
3509 /* In case of REG_ICASE "upper" and "lower" match the both of
3510 upper and lower cases. */
3511 if ((syntax & RE_ICASE)
3512 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3515 #ifdef RE_ENABLE_I18N
3516 /* Check the space of the arrays. */
3517 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3519 /* Not enough, realloc it. */
3520 /* +1 in case of mbcset->nchar_classes is 0. */
3521 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3522 /* Use realloc since array is NULL if *alloc == 0. */
3523 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3524 new_char_class_alloc);
3525 if (BE (new_char_classes == NULL, 0))
3527 mbcset->char_classes = new_char_classes;
3528 *char_class_alloc = new_char_class_alloc;
3530 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3531 #endif /* RE_ENABLE_I18N */
3533 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3535 if (BE (trans != NULL, 0)) \
3537 for (i = 0; i < SBC_MAX; ++i) \
3538 if (ctype_func (i)) \
3539 bitset_set (sbcset, trans[i]); \
3543 for (i = 0; i < SBC_MAX; ++i) \
3544 if (ctype_func (i)) \
3545 bitset_set (sbcset, i); \
3549 if (strcmp (name, "alnum") == 0)
3550 BUILD_CHARCLASS_LOOP (isalnum);
3551 else if (strcmp (name, "cntrl") == 0)
3552 BUILD_CHARCLASS_LOOP (iscntrl);
3553 else if (strcmp (name, "lower") == 0)
3554 BUILD_CHARCLASS_LOOP (islower);
3555 else if (strcmp (name, "space") == 0)
3556 BUILD_CHARCLASS_LOOP (isspace);
3557 else if (strcmp (name, "alpha") == 0)
3558 BUILD_CHARCLASS_LOOP (isalpha);
3559 else if (strcmp (name, "digit") == 0)
3560 BUILD_CHARCLASS_LOOP (isdigit);
3561 else if (strcmp (name, "print") == 0)
3562 BUILD_CHARCLASS_LOOP (isprint);
3563 else if (strcmp (name, "upper") == 0)
3564 BUILD_CHARCLASS_LOOP (isupper);
3565 else if (strcmp (name, "blank") == 0)
3566 BUILD_CHARCLASS_LOOP (isblank);
3567 else if (strcmp (name, "graph") == 0)
3568 BUILD_CHARCLASS_LOOP (isgraph);
3569 else if (strcmp (name, "punct") == 0)
3570 BUILD_CHARCLASS_LOOP (ispunct);
3571 else if (strcmp (name, "xdigit") == 0)
3572 BUILD_CHARCLASS_LOOP (isxdigit);
3580 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3581 const unsigned char *class_name,
3582 const unsigned char *extra, int non_match,
3585 re_bitset_ptr_t sbcset;
3586 #ifdef RE_ENABLE_I18N
3587 re_charset_t *mbcset;
3589 #endif /* not RE_ENABLE_I18N */
3591 re_token_t br_token;
3594 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3595 #ifdef RE_ENABLE_I18N
3596 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3597 #endif /* RE_ENABLE_I18N */
3599 #ifdef RE_ENABLE_I18N
3600 if (BE (sbcset == NULL || mbcset == NULL, 0))
3601 #else /* not RE_ENABLE_I18N */
3602 if (BE (sbcset == NULL, 0))
3603 #endif /* not RE_ENABLE_I18N */
3611 #ifdef RE_ENABLE_I18N
3612 mbcset->non_match = 1;
3613 #endif /* not RE_ENABLE_I18N */
3616 /* We don't care the syntax in this case. */
3617 ret = build_charclass (trans, sbcset,
3618 #ifdef RE_ENABLE_I18N
3620 #endif /* RE_ENABLE_I18N */
3623 if (BE (ret != REG_NOERROR, 0))
3626 #ifdef RE_ENABLE_I18N
3627 free_charset (mbcset);
3628 #endif /* RE_ENABLE_I18N */
3632 /* \w match '_' also. */
3633 for (; *extra; extra++)
3634 bitset_set (sbcset, *extra);
3636 /* If it is non-matching list. */
3638 bitset_not (sbcset);
3640 #ifdef RE_ENABLE_I18N
3641 /* Ensure only single byte characters are set. */
3642 if (dfa->mb_cur_max > 1)
3643 bitset_mask (sbcset, dfa->sb_char);
3646 /* Build a tree for simple bracket. */
3647 br_token.type = SIMPLE_BRACKET;
3648 br_token.opr.sbcset = sbcset;
3649 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3650 if (BE (tree == NULL, 0))
3651 goto build_word_op_espace;
3653 #ifdef RE_ENABLE_I18N
3654 if (dfa->mb_cur_max > 1)
3656 bin_tree_t *mbc_tree;
3657 /* Build a tree for complex bracket. */
3658 br_token.type = COMPLEX_BRACKET;
3659 br_token.opr.mbcset = mbcset;
3660 dfa->has_mb_node = 1;
3661 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3662 if (BE (mbc_tree == NULL, 0))
3663 goto build_word_op_espace;
3664 /* Then join them by ALT node. */
3665 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3666 if (BE (mbc_tree != NULL, 1))
3671 free_charset (mbcset);
3674 #else /* not RE_ENABLE_I18N */
3676 #endif /* not RE_ENABLE_I18N */
3678 build_word_op_espace:
3680 #ifdef RE_ENABLE_I18N
3681 free_charset (mbcset);
3682 #endif /* RE_ENABLE_I18N */
3687 /* This is intended for the expressions like "a{1,3}".
3688 Fetch a number from `input', and return the number.
3689 Return -1, if the number field is empty like "{,1}".
3690 Return -2, If an error is occured. */
3693 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3699 fetch_token (token, input, syntax);
3701 if (BE (token->type == END_OF_RE, 0))
3703 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3705 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3706 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3707 num = (num > RE_DUP_MAX) ? -2 : num;
3712 #ifdef RE_ENABLE_I18N
3714 free_charset (re_charset_t *cset)
3716 re_free (cset->mbchars);
3718 re_free (cset->coll_syms);
3719 re_free (cset->equiv_classes);
3720 re_free (cset->range_starts);
3721 re_free (cset->range_ends);
3723 re_free (cset->char_classes);
3726 #endif /* RE_ENABLE_I18N */
3728 /* Functions for binary tree operation. */
3730 /* Create a tree node. */
3733 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3734 re_token_type_t type)
3738 return create_token_tree (dfa, left, right, &t);
3742 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3743 const re_token_t *token)
3746 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3748 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3750 if (storage == NULL)
3752 storage->next = dfa->str_tree_storage;
3753 dfa->str_tree_storage = storage;
3754 dfa->str_tree_storage_idx = 0;
3756 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3758 tree->parent = NULL;
3760 tree->right = right;
3761 tree->token = *token;
3762 tree->token.duplicated = 0;
3763 tree->token.opt_subexp = 0;
3766 tree->node_idx = -1;
3769 left->parent = tree;
3771 right->parent = tree;
3775 /* Mark the tree SRC as an optional subexpression.
3776 To be called from preorder or postorder. */
3778 static reg_errcode_t
3779 mark_opt_subexp (void *extra, bin_tree_t *node)
3781 int idx = (int) (long) extra;
3782 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3783 node->token.opt_subexp = 1;
3788 /* Free the allocated memory inside NODE. */
3791 free_token (re_token_t *node)
3793 #ifdef RE_ENABLE_I18N
3794 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3795 free_charset (node->opr.mbcset);
3797 #endif /* RE_ENABLE_I18N */
3798 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3799 re_free (node->opr.sbcset);
3802 /* Worker function for tree walking. Free the allocated memory inside NODE
3803 and its children. */
3805 static reg_errcode_t
3806 free_tree (void *extra, bin_tree_t *node)
3808 free_token (&node->token);
3813 /* Duplicate the node SRC, and return new node. This is a preorder
3814 visit similar to the one implemented by the generic visitor, but
3815 we need more infrastructure to maintain two parallel trees --- so,
3816 it's easier to duplicate. */
3819 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3821 const bin_tree_t *node;
3822 bin_tree_t *dup_root;
3823 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3825 for (node = root; ; )
3827 /* Create a new tree and link it back to the current parent. */
3828 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3831 (*p_new)->parent = dup_node;
3832 (*p_new)->token.duplicated = 1;
3835 /* Go to the left node, or up and to the right. */
3839 p_new = &dup_node->left;
3843 const bin_tree_t *prev = NULL;
3844 while (node->right == prev || node->right == NULL)
3847 node = node->parent;
3848 dup_node = dup_node->parent;
3853 p_new = &dup_node->right;