1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
18 /* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
31 #include <sys/types.h>
35 extern char *calloc(), *malloc(), *realloc();
39 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
49 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
50 /* We can handle multibyte string. */
59 #ifndef DEBUG /* use the same approach as regex.c */
65 #define isgraph(C) (isprint(C) && !isspace(C))
68 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
69 #define ISALPHA(C) isalpha(C)
70 #define ISUPPER(C) isupper(C)
71 #define ISLOWER(C) islower(C)
72 #define ISDIGIT(C) isdigit(C)
73 #define ISXDIGIT(C) isxdigit(C)
74 #define ISSPACE(C) isspace(C)
75 #define ISPUNCT(C) ispunct(C)
76 #define ISALNUM(C) isalnum(C)
77 #define ISPRINT(C) isprint(C)
78 #define ISGRAPH(C) isgraph(C)
79 #define ISCNTRL(C) iscntrl(C)
81 #define ISALPHA(C) (isascii(C) && isalpha(C))
82 #define ISUPPER(C) (isascii(C) && isupper(C))
83 #define ISLOWER(C) (isascii(C) && islower(C))
84 #define ISDIGIT(C) (isascii(C) && isdigit(C))
85 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
86 #define ISSPACE(C) (isascii(C) && isspace(C))
87 #define ISPUNCT(C) (isascii(C) && ispunct(C))
88 #define ISALNUM(C) (isascii(C) && isalnum(C))
89 #define ISPRINT(C) (isascii(C) && isprint(C))
90 #define ISGRAPH(C) (isascii(C) && isgraph(C))
91 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
94 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
95 - Its arg may be any int or unsigned int; it need not be an unsigned char.
96 - It's guaranteed to evaluate its argument exactly once.
97 - It's typically faster.
98 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
99 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
100 it's important to use the locale's definition of `digit' even when the
101 host does not conform to Posix. */
102 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
104 /* If we (don't) have I18N. */
105 /* glibc defines _ */
107 # ifdef HAVE_LIBINTL_H
108 # include <libintl.h>
110 # define _(Str) gettext (Str)
113 # define _(Str) (Str)
119 #include "hard-locale.h"
121 /* HPUX, define those as macros in sys/param.h */
129 static void dfamust PARAMS ((struct dfa *dfa));
130 static void regexp PARAMS ((int toplevel));
133 xcalloc (size_t n, size_t s)
135 ptr_t r = calloc(n, s);
138 dfaerror(_("Memory exhausted"));
149 dfaerror(_("Memory exhausted"));
154 xrealloc (ptr_t p, size_t n)
156 ptr_t r = realloc(p, n);
160 dfaerror(_("Memory exhausted"));
164 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
165 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
166 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
168 /* Reallocate an array of type t if nalloc is too small for index. */
169 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
170 if ((index) >= (nalloc)) \
174 while ((index) >= (nalloc)); \
175 REALLOC(p, t, nalloc); \
186 fprintf(stderr, "END");
187 else if (t < NOTCHAR)
188 fprintf(stderr, "%c", t);
193 case EMPTY: s = "EMPTY"; break;
194 case BACKREF: s = "BACKREF"; break;
195 case BEGLINE: s = "BEGLINE"; break;
196 case ENDLINE: s = "ENDLINE"; break;
197 case BEGWORD: s = "BEGWORD"; break;
198 case ENDWORD: s = "ENDWORD"; break;
199 case LIMWORD: s = "LIMWORD"; break;
200 case NOTLIMWORD: s = "NOTLIMWORD"; break;
201 case QMARK: s = "QMARK"; break;
202 case STAR: s = "STAR"; break;
203 case PLUS: s = "PLUS"; break;
204 case CAT: s = "CAT"; break;
205 case OR: s = "OR"; break;
206 case ORTOP: s = "ORTOP"; break;
207 case LPAREN: s = "LPAREN"; break;
208 case RPAREN: s = "RPAREN"; break;
209 case CRANGE: s = "CRANGE"; break;
211 case ANYCHAR: s = "ANYCHAR"; break;
212 case MBCSET: s = "MBCSET"; break;
213 #endif /* MBS_SUPPORT */
214 default: s = "CSET"; break;
216 fprintf(stderr, "%s", s);
221 /* Stuff pertaining to charclasses. */
224 tstbit (unsigned b, charclass c)
226 return c[b / INTBITS] & 1 << b % INTBITS;
230 setbit (unsigned b, charclass c)
232 c[b / INTBITS] |= 1 << b % INTBITS;
236 clrbit (unsigned b, charclass c)
238 c[b / INTBITS] &= ~(1 << b % INTBITS);
242 copyset (charclass src, charclass dst)
244 memcpy (dst, src, sizeof (charclass));
248 zeroset (charclass s)
250 memset (s, 0, sizeof (charclass));
258 for (i = 0; i < CHARCLASS_INTS; ++i)
263 equal (charclass s1, charclass s2)
265 return memcmp (s1, s2, sizeof (charclass)) == 0;
268 /* A pointer to the current dfa is kept here during parsing. */
269 static struct dfa *dfa;
271 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
273 charclass_index (charclass s)
277 for (i = 0; i < dfa->cindex; ++i)
278 if (equal(s, dfa->charclasses[i]))
280 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
282 copyset(s, dfa->charclasses[i]);
286 /* Syntax bits controlling the behavior of the lexical analyzer. */
287 static reg_syntax_t syntax_bits, syntax_bits_set;
289 /* Flag for case-folding letters into sets. */
290 static int case_fold;
292 /* End-of-line byte in data. */
293 static unsigned char eolbyte;
295 /* Entry point to set syntax options. */
297 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
305 /* Like setbit, but if case is folded, set both cases of a letter. */
307 setbit_case_fold (unsigned b, charclass c)
313 setbit (tolower (b), c);
314 else if (ISLOWER (b))
315 setbit (toupper (b), c);
319 /* Lexical analyzer. All the dross that deals with the obnoxious
320 GNU Regex syntax bits is located here. The poor, suffering
321 reader is referred to the GNU Regex documentation for the
322 meaning of the @#%!@#%^!@ syntax bits. */
324 static char const *lexstart; /* Pointer to beginning of input string. */
325 static char const *lexptr; /* Pointer to next input character. */
326 static int lexleft; /* Number of characters remaining. */
327 static token lasttok; /* Previous token returned; initially END. */
328 static int laststart; /* True if we're separated from beginning or (, |
329 only by zero-width characters. */
330 static int parens; /* Count of outstanding left parens. */
331 static int minrep, maxrep; /* Repeat counts for {m,n}. */
332 static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
335 /* These variables are used only if (MB_CUR_MAX > 1). */
336 static mbstate_t mbs; /* Mbstate for mbrlen(). */
337 static ssize_t cur_mb_len; /* Byte length of the current scanning
338 multibyte character. Must also handle
339 negative result from mbrlen(). */
340 static ssize_t cur_mb_index; /* Byte index of the current scanning multibyte
343 singlebyte character : cur_mb_index = 0
345 1st byte : cur_mb_index = 1
346 2nd byte : cur_mb_index = 2
348 nth byte : cur_mb_index = n */
349 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
350 Each element store the amount of remain
351 byte of corresponding multibyte character
352 in the input string. A element's value
353 is 0 if corresponding character is a
355 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
356 mblen_buf : 0, 3, 2, 1
358 static wchar_t *inputwcs; /* Wide character representation of input
360 The length of this array is same as
361 the length of input string(char array).
362 inputstring[i] is a single-byte char,
363 or 1st byte of a multibyte char.
364 And inputwcs[i] is the codepoint. */
365 static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */
366 static unsigned char const *buf_end; /* refference to end in dfaexec(). */
367 #endif /* MBS_SUPPORT */
370 /* This function update cur_mb_len, and cur_mb_index.
371 p points current lexptr, len is the remaining buffer length. */
373 update_mb_len_index (unsigned char const *p, size_t len)
375 /* If last character is a part of a multibyte character,
376 we update cur_mb_index. */
378 cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
381 /* If last character is a single byte character, or the
382 last portion of a multibyte character, we check whether
383 next character is a multibyte character or not. */
386 cur_mb_len = mbrlen(p, len, &mbs);
388 /* It is a multibyte character.
389 cur_mb_len was already set by mbrlen(). */
391 else if (cur_mb_len < 1)
392 /* Invalid sequence. We treat it as a singlebyte character.
393 cur_mb_index is aleady 0. */
395 /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
396 cur_mb_index is aleady 0. */
399 #endif /* MBS_SUPPORT */
402 /* Note that characters become unsigned here. */
403 # define FETCH(c, eoferr) \
410 return lasttok = END; \
412 if (MB_CUR_MAX > 1) \
413 update_mb_len_index(lexptr, lexleft); \
414 (c) = (unsigned char) *lexptr++; \
418 /* This function fetch a wide character, and update cur_mb_len,
419 used only if the current locale is a multibyte environment. */
421 fetch_wc (char const *eoferr)
432 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
438 lexptr += cur_mb_len;
439 lexleft -= cur_mb_len;
443 /* Note that characters become unsigned here. */
444 # define FETCH(c, eoferr) \
451 return lasttok = END; \
453 (c) = (unsigned char) *lexptr++; \
456 #endif /* MBS_SUPPORT */
459 /* Multibyte character handling sub-routin for lex.
460 This function parse a bracket expression and build a struct
463 parse_bracket_exp_mb ()
467 /* Work area to build a mb_char_classes. */
468 struct mb_char_classes *work_mbc;
469 int chars_al, range_sts_al, range_ends_al, ch_classes_al,
470 equivs_al, coll_elems_al;
472 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
473 dfa->mbcsets_alloc, dfa->nmbcsets + 1);
474 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
475 We will update dfa->multibyte_prop in addtok(), because we can't
476 decide the index in dfa->tokens[]. */
478 /* Initialize work are */
479 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
482 range_sts_al = range_ends_al = 0;
483 ch_classes_al = equivs_al = coll_elems_al = 0;
484 MALLOC(work_mbc->chars, wchar_t, chars_al);
486 work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
487 work_mbc->nequivs = work_mbc->ncoll_elems = 0;
488 work_mbc->chars = work_mbc->ch_classes = NULL;
489 work_mbc->range_sts = work_mbc->range_ends = NULL;
490 work_mbc->equivs = work_mbc->coll_elems = NULL;
492 wc = fetch_wc(_("Unbalanced ["));
495 wc = fetch_wc(_("Unbalanced ["));
496 work_mbc->invert = 1;
499 work_mbc->invert = 0;
502 wc1 = WEOF; /* mark wc1 is not initialized". */
504 /* Note that if we're looking at some other [:...:] construct,
505 we just treat it as a bunch of ordinary characters. We can do
506 this because we assume regex has checked for syntax errors before
507 dfa is ever called. */
508 if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
510 #define BRACKET_BUFFER_SIZE 128
511 char str[BRACKET_BUFFER_SIZE];
513 wc = fetch_wc(_("Unbalanced ["));
515 /* If pattern contains `[[:', `[[.', or `[[='. */
516 if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
519 unsigned char delim = (unsigned char)wc;
524 dfaerror (_("Unbalanced ["));
525 c = (unsigned char) *lexptr++;
528 if ((c == delim && *lexptr == ']') || lexleft == 0)
530 if (len < BRACKET_BUFFER_SIZE)
533 /* This is in any case an invalid class name. */
540 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
541 work_mbc->nchars + 2);
542 work_mbc->chars[work_mbc->nchars++] = L'[';
543 work_mbc->chars[work_mbc->nchars++] = delim;
547 if (--lexleft, *lexptr++ != ']')
548 dfaerror (_("Unbalanced ["));
550 /* build character class. */
553 /* Query the character class as wctype_t. */
556 if (ch_classes_al == 0)
557 MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al);
558 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
560 work_mbc->nch_classes + 1);
561 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
564 else if (delim == '=' || delim == '.')
567 MALLOC(elem, char, len + 1);
568 strncpy(elem, str, len + 1);
571 /* build equivalent class. */
574 MALLOC(work_mbc->equivs, char*, ++equivs_al);
575 REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
577 work_mbc->nequivs + 1);
578 work_mbc->equivs[work_mbc->nequivs++] = elem;
582 /* build collating element. */
584 if (coll_elems_al == 0)
585 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
586 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
588 work_mbc->ncoll_elems + 1);
589 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
595 /* We treat '[' as a normal character here. */
597 wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
602 if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
603 wc = fetch_wc(("Unbalanced ["));
607 wc1 = fetch_wc(_("Unbalanced ["));
610 /* build range characters. */
612 wc2 = fetch_wc(_("Unbalanced ["));
615 /* In the case [x-], the - is an ordinary hyphen,
616 which is left in c1, the lookahead character. */
617 lexptr -= cur_mb_len;
618 lexleft += cur_mb_len;
624 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
625 wc2 = fetch_wc(_("Unbalanced ["));
626 wc1 = fetch_wc(_("Unbalanced ["));
629 if (range_sts_al == 0)
631 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
632 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
634 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
635 range_sts_al, work_mbc->nranges + 1);
636 work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
637 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
638 range_ends_al, work_mbc->nranges + 1);
639 work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
642 /* build normal characters. */
644 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
645 work_mbc->nchars + 1);
646 work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
649 while ((wc = wc1) != L']');
651 #endif /* MBS_SUPPORT */
654 #define FUNC(F, P) static int F(int c) { return P(c); }
656 #define FUNC(F, P) static int F(c) int c; { return P(c); }
659 FUNC(is_alpha, ISALPHA)
660 FUNC(is_upper, ISUPPER)
661 FUNC(is_lower, ISLOWER)
662 FUNC(is_digit, ISDIGIT)
663 FUNC(is_xdigit, ISXDIGIT)
664 FUNC(is_space, ISSPACE)
665 FUNC(is_punct, ISPUNCT)
666 FUNC(is_alnum, ISALNUM)
667 FUNC(is_print, ISPRINT)
668 FUNC(is_graph, ISGRAPH)
669 FUNC(is_cntrl, ISCNTRL)
674 return (c == ' ' || c == '\t');
677 /* The following list maps the names of the Posix named character classes
678 to predicate functions that determine whether a given character is in
679 the class. The leading [ has already been eaten by the lexical analyzer. */
682 int (*pred) PARAMS ((int));
683 } const prednames[] = {
684 { ":alpha:]", is_alpha },
685 { ":upper:]", is_upper },
686 { ":lower:]", is_lower },
687 { ":digit:]", is_digit },
688 { ":xdigit:]", is_xdigit },
689 { ":space:]", is_space },
690 { ":punct:]", is_punct },
691 { ":alnum:]", is_alnum },
692 { ":print:]", is_print },
693 { ":graph:]", is_graph },
694 { ":cntrl:]", is_cntrl },
695 { ":blank:]", is_blank },
699 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
700 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
703 looking_at (char const *s)
710 return strncmp(s, lexptr, len) == 0;
717 int backslash = 0, invert;
721 /* Basic plan: We fetch a character. If it's a backslash,
722 we set the backslash flag and go through the loop again.
723 On the plus side, this avoids having a duplicate of the
724 main switch inside the backslash case. On the minus side,
725 it means that just about every case begins with
726 "if (backslash) ...". */
727 for (i = 0; i < 2; ++i)
731 if (MB_CUR_MAX > 1 && cur_mb_index)
732 /* If this is a part of a multi-byte character, we must treat
733 this byte data as a normal character.
734 e.g. In case of SJIS encoding, some character contains '\',
735 but they must not be backslash. */
737 #endif /* MBS_SUPPORT */
744 dfaerror(_("Unfinished \\ escape"));
751 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
755 return lasttok = BEGLINE;
761 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
763 || (syntax_bits & RE_NO_BK_PARENS
764 ? lexleft > 0 && *lexptr == ')'
765 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
766 || (syntax_bits & RE_NO_BK_VBAR
767 ? lexleft > 0 && *lexptr == '|'
768 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
769 || ((syntax_bits & RE_NEWLINE_ALT)
770 && lexleft > 0 && *lexptr == '\n'))
771 return lasttok = ENDLINE;
783 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
786 return lasttok = BACKREF;
791 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
792 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
796 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
797 return lasttok = ENDLINE; /* FIXME: should be end of string */
801 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
802 return lasttok = BEGWORD;
806 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
807 return lasttok = ENDWORD;
811 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
812 return lasttok = LIMWORD;
816 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
817 return lasttok = NOTLIMWORD;
821 if (syntax_bits & RE_LIMITED_OPS)
823 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
825 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
827 return lasttok = QMARK;
832 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
834 return lasttok = STAR;
837 if (syntax_bits & RE_LIMITED_OPS)
839 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
841 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
843 return lasttok = PLUS;
846 if (!(syntax_bits & RE_INTERVALS))
848 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
850 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
853 if (syntax_bits & RE_NO_BK_BRACES)
855 /* Scan ahead for a valid interval; if it's not valid,
856 treat it as a literal '{'. */
857 int lo = -1, hi = -1;
858 char const *p = lexptr;
859 char const *lim = p + lexleft;
860 for (; p != lim && ISASCIIDIGIT (*p); p++)
861 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
862 if (p != lim && *p == ',')
863 while (++p != lim && ISASCIIDIGIT (*p))
864 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
867 if (p == lim || *p != '}'
868 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
875 {M,} - minimum count, maximum is infinity
876 {M,N} - M through N */
877 FETCH(c, _("unfinished repeat count"));
878 if (ISASCIIDIGIT (c))
883 FETCH(c, _("unfinished repeat count"));
884 if (! ISASCIIDIGIT (c))
886 minrep = 10 * minrep + c - '0';
890 dfaerror(_("malformed repeat count"));
893 FETCH (c, _("unfinished repeat count"));
894 if (! ISASCIIDIGIT (c))
901 FETCH (c, _("unfinished repeat count"));
902 if (! ISASCIIDIGIT (c))
904 maxrep = 10 * maxrep + c - '0';
906 if (0 <= maxrep && maxrep < minrep)
907 dfaerror (_("malformed repeat count"));
912 if (!(syntax_bits & RE_NO_BK_BRACES))
915 dfaerror(_("malformed repeat count"));
916 FETCH(c, _("unfinished repeat count"));
919 dfaerror(_("malformed repeat count"));
921 return lasttok = REPMN;
924 if (syntax_bits & RE_LIMITED_OPS)
926 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
932 if (syntax_bits & RE_LIMITED_OPS
934 || !(syntax_bits & RE_NEWLINE_ALT))
940 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
944 return lasttok = LPAREN;
947 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
949 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
953 return lasttok = RPAREN;
961 /* In multibyte environment period must match with a single
962 character not a byte. So we use ANYCHAR. */
964 return lasttok = ANYCHAR;
966 #endif /* MBS_SUPPORT */
969 if (!(syntax_bits & RE_DOT_NEWLINE))
970 clrbit(eolbyte, ccl);
971 if (syntax_bits & RE_DOT_NOT_NULL)
974 return lasttok = CSET + charclass_index(ccl);
978 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
981 for (c2 = 0; c2 < NOTCHAR; ++c2)
982 if (IS_WORD_CONSTITUENT(c2))
987 return lasttok = CSET + charclass_index(ccl);
996 /* In multibyte environment a bracket expression may contain
997 multibyte characters, which must be treated as characters
998 (not bytes). So we parse it by parse_bracket_exp_mb(). */
999 parse_bracket_exp_mb();
1000 return lasttok = MBCSET;
1004 FETCH(c, _("Unbalanced ["));
1007 FETCH(c, _("Unbalanced ["));
1014 /* Nobody ever said this had to be fast. :-)
1015 Note that if we're looking at some other [:...:]
1016 construct, we just treat it as a bunch of ordinary
1017 characters. We can do this because we assume
1018 regex has checked for syntax errors before
1019 dfa is ever called. */
1020 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1021 for (c1 = 0; prednames[c1].name; ++c1)
1022 if (looking_at(prednames[c1].name))
1024 int (*pred) PARAMS ((int)) = prednames[c1].pred;
1026 for (c2 = 0; c2 < NOTCHAR; ++c2)
1028 setbit_case_fold (c2, ccl);
1029 lexptr += strlen(prednames[c1].name);
1030 lexleft -= strlen(prednames[c1].name);
1031 FETCH(c1, _("Unbalanced ["));
1034 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1035 FETCH(c, _("Unbalanced ["));
1036 FETCH(c1, _("Unbalanced ["));
1039 FETCH(c2, _("Unbalanced ["));
1042 /* In the case [x-], the - is an ordinary hyphen,
1043 which is left in c1, the lookahead character. */
1050 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1051 FETCH(c2, _("Unbalanced ["));
1052 FETCH(c1, _("Unbalanced ["));
1053 if (!hard_LC_COLLATE) {
1054 for (; c <= c2; c++)
1055 setbit_case_fold (c, ccl);
1057 /* POSIX locales are painful - leave the decision to libc */
1058 char expr[6] = { '[', c, '-', c2, ']', '\0' };
1060 if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1061 for (c = 0; c < NOTCHAR; ++c) {
1062 char buf[2] = { c, '\0' };
1064 if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1065 && mat.rm_so == 0 && mat.rm_eo == 1)
1066 setbit_case_fold (c, ccl);
1075 setbit_case_fold (c, ccl);
1080 while ((c = c1) != ']');
1084 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1085 clrbit(eolbyte, ccl);
1087 return lasttok = CSET + charclass_index(ccl);
1092 if (case_fold && ISALPHA(c))
1095 setbit_case_fold (c, ccl);
1096 return lasttok = CSET + charclass_index(ccl);
1102 /* The above loop should consume at most a backslash
1103 and some other character. */
1105 return END; /* keeps pedantic compilers happy. */
1108 /* Recursive descent parser for regular expressions. */
1110 static token tok; /* Lookahead token. */
1111 static int depth; /* Current depth of a hypothetical stack
1112 holding deferred productions. This is
1113 used to determine the depth that will be
1114 required of the real stack later on in
1117 /* Add the given token to the parse tree, maintaining the depth count and
1118 updating the maximum depth if necessary. */
1125 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1127 /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */
1129 dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1130 else if (t < NOTCHAR)
1131 dfa->multibyte_prop[dfa->tindex]
1132 = (cur_mb_len == 1)? 3 /* single-byte char */
1133 : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1134 + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1136 /* It may be unnecesssary, but it is safer to treat other
1137 symbols as singlebyte characters. */
1138 dfa->multibyte_prop[dfa->tindex] = 3;
1142 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1143 dfa->tokens[dfa->tindex++] = t;
1164 if (depth > dfa->depth)
1168 /* The grammar understood by the parser is as follows.
1187 <multibyte character>
1199 LPAREN regexp RPAREN
1202 The parser builds a parse tree in postfix form in an array of tokens. */
1207 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1208 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1210 || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1211 #endif /* MBS_SUPPORT */
1212 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1217 /* We treat a multibyte character as a single atom, so that DFA
1218 can treat a multibyte character as a single expression.
1220 e.g. We construct following tree from "<mb1><mb2>".
1221 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1222 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1226 while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1233 #endif /* MBS_SUPPORT */
1235 else if (tok == CRANGE)
1237 /* A character range like "[a-z]" in a locale other than "C" or
1238 "POSIX". This range might any sequence of one or more
1239 characters. Unfortunately the POSIX locale primitives give
1240 us no practical way to find what character sequences might be
1241 matched. Treat this approximately like "(.\1)" -- i.e. match
1242 one character, and then punt to the full matcher. */
1246 addtok (CSET + charclass_index (ccl));
1251 else if (tok == LPAREN)
1256 dfaerror(_("Unbalanced ("));
1263 /* Return the number of tokens in the given subexpression. */
1265 nsubtoks (int tindex)
1269 switch (dfa->tokens[tindex - 1])
1276 return 1 + nsubtoks(tindex - 1);
1280 ntoks1 = nsubtoks(tindex - 1);
1281 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1285 /* Copy the given subexpression to the top of the tree. */
1287 copytoks (int tindex, int ntokens)
1291 for (i = 0; i < ntokens; ++i)
1292 addtok(dfa->tokens[tindex + i]);
1298 int tindex, ntokens, i;
1301 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1304 ntokens = nsubtoks(dfa->tindex);
1305 tindex = dfa->tindex - ntokens;
1310 for (i = 1; i < minrep; ++i)
1312 copytoks(tindex, ntokens);
1315 for (; i < maxrep; ++i)
1317 copytoks(tindex, ntokens);
1334 while (tok != RPAREN && tok != OR && tok >= 0)
1342 regexp (int toplevel)
1356 /* Main entry point for the parser. S is a string to be parsed, len is the
1357 length of the string, so s can include NUL characters. D is a pointer to
1358 the struct dfa to parse into. */
1360 dfaparse (char const *s, size_t len, struct dfa *d)
1363 lexstart = lexptr = s;
1368 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1374 memset(&mbs, 0, sizeof(mbstate_t));
1376 #endif /* MBS_SUPPORT */
1378 if (! syntax_bits_set)
1379 dfaerror(_("No syntax specified"));
1387 dfaerror(_("Unbalanced )"));
1389 addtok(END - d->nregexps);
1398 /* Some primitives for operating on sets of positions. */
1400 /* Copy one set to another; the destination must be large enough. */
1402 copy (position_set const *src, position_set *dst)
1406 for (i = 0; i < src->nelem; ++i)
1407 dst->elems[i] = src->elems[i];
1408 dst->nelem = src->nelem;
1411 /* Insert a position in a set. Position sets are maintained in sorted
1412 order according to index. If position already exists in the set with
1413 the same index then their constraints are logically or'd together.
1414 S->elems must point to an array large enough to hold the resulting set. */
1416 insert (position p, position_set *s)
1421 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1423 if (i < s->nelem && p.index == s->elems[i].index)
1424 s->elems[i].constraint |= p.constraint;
1429 while (i < s->nelem)
1438 /* Merge two sets of positions into a third. The result is exactly as if
1439 the positions of both sets were inserted into an initially empty set. */
1441 merge (position_set const *s1, position_set const *s2, position_set *m)
1446 while (i < s1->nelem && j < s2->nelem)
1447 if (s1->elems[i].index > s2->elems[j].index)
1448 m->elems[m->nelem++] = s1->elems[i++];
1449 else if (s1->elems[i].index < s2->elems[j].index)
1450 m->elems[m->nelem++] = s2->elems[j++];
1453 m->elems[m->nelem] = s1->elems[i++];
1454 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1456 while (i < s1->nelem)
1457 m->elems[m->nelem++] = s1->elems[i++];
1458 while (j < s2->nelem)
1459 m->elems[m->nelem++] = s2->elems[j++];
1462 /* Delete a position from a set. */
1464 delete (position p, position_set *s)
1468 for (i = 0; i < s->nelem; ++i)
1469 if (p.index == s->elems[i].index)
1472 for (--s->nelem; i < s->nelem; ++i)
1473 s->elems[i] = s->elems[i + 1];
1476 /* Find the index of the state corresponding to the given position set with
1477 the given preceding context, or create a new state if there is no such
1478 state. Newline and letter tell whether we got here on a newline or
1479 letter, respectively. */
1481 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1487 newline = newline ? 1 : 0;
1488 letter = letter ? 1 : 0;
1490 for (i = 0; i < s->nelem; ++i)
1491 hash ^= s->elems[i].index + s->elems[i].constraint;
1493 /* Try to find a state that exactly matches the proposed one. */
1494 for (i = 0; i < d->sindex; ++i)
1496 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1497 || newline != d->states[i].newline || letter != d->states[i].letter)
1499 for (j = 0; j < s->nelem; ++j)
1500 if (s->elems[j].constraint
1501 != d->states[i].elems.elems[j].constraint
1502 || s->elems[j].index != d->states[i].elems.elems[j].index)
1508 /* We'll have to create a new state. */
1509 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1510 d->states[i].hash = hash;
1511 MALLOC(d->states[i].elems.elems, position, s->nelem);
1512 copy(s, &d->states[i].elems);
1513 d->states[i].newline = newline;
1514 d->states[i].letter = letter;
1515 d->states[i].backref = 0;
1516 d->states[i].constraint = 0;
1517 d->states[i].first_end = 0;
1520 d->states[i].mbps.nelem = 0;
1522 for (j = 0; j < s->nelem; ++j)
1523 if (d->tokens[s->elems[j].index] < 0)
1525 constraint = s->elems[j].constraint;
1526 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1527 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1528 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1529 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1530 d->states[i].constraint |= constraint;
1531 if (! d->states[i].first_end)
1532 d->states[i].first_end = d->tokens[s->elems[j].index];
1534 else if (d->tokens[s->elems[j].index] == BACKREF)
1536 d->states[i].constraint = NO_CONSTRAINT;
1537 d->states[i].backref = 1;
1545 /* Find the epsilon closure of a set of positions. If any position of the set
1546 contains a symbol that matches the empty string in some context, replace
1547 that position with the elements of its follow labeled with an appropriate
1548 constraint. Repeat exhaustively until no funny positions are left.
1549 S->elems must be large enough to hold the result. */
1551 epsclosure (position_set *s, struct dfa const *d)
1557 MALLOC(visited, int, d->tindex);
1558 for (i = 0; i < d->tindex; ++i)
1561 for (i = 0; i < s->nelem; ++i)
1562 if (d->tokens[s->elems[i].index] >= NOTCHAR
1563 && d->tokens[s->elems[i].index] != BACKREF
1565 && d->tokens[s->elems[i].index] != ANYCHAR
1566 && d->tokens[s->elems[i].index] != MBCSET
1568 && d->tokens[s->elems[i].index] < CSET)
1571 p.constraint = old.constraint;
1572 delete(s->elems[i], s);
1573 if (visited[old.index])
1578 visited[old.index] = 1;
1579 switch (d->tokens[old.index])
1582 p.constraint &= BEGLINE_CONSTRAINT;
1585 p.constraint &= ENDLINE_CONSTRAINT;
1588 p.constraint &= BEGWORD_CONSTRAINT;
1591 p.constraint &= ENDWORD_CONSTRAINT;
1594 p.constraint &= LIMWORD_CONSTRAINT;
1597 p.constraint &= NOTLIMWORD_CONSTRAINT;
1602 for (j = 0; j < d->follows[old.index].nelem; ++j)
1604 p.index = d->follows[old.index].elems[j].index;
1607 /* Force rescan to start at the beginning. */
1614 /* Perform bottom-up analysis on the parse tree, computing various functions.
1615 Note that at this point, we're pretending constructs like \< are real
1616 characters rather than constraints on what can follow them.
1618 Nullable: A node is nullable if it is at the root of a regexp that can
1619 match the empty string.
1620 * EMPTY leaves are nullable.
1621 * No other leaf is nullable.
1622 * A QMARK or STAR node is nullable.
1623 * A PLUS node is nullable if its argument is nullable.
1624 * A CAT node is nullable if both its arguments are nullable.
1625 * An OR node is nullable if either argument is nullable.
1627 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1628 that could correspond to the first character of a string matching the
1629 regexp rooted at the given node.
1630 * EMPTY leaves have empty firstpos.
1631 * The firstpos of a nonempty leaf is that leaf itself.
1632 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1634 * The firstpos of a CAT node is the firstpos of the left argument, union
1635 the firstpos of the right if the left argument is nullable.
1636 * The firstpos of an OR node is the union of firstpos of each argument.
1638 Lastpos: The lastpos of a node is the set of positions that could
1639 correspond to the last character of a string matching the regexp at
1641 * EMPTY leaves have empty lastpos.
1642 * The lastpos of a nonempty leaf is that leaf itself.
1643 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1645 * The lastpos of a CAT node is the lastpos of its right argument, union
1646 the lastpos of the left if the right argument is nullable.
1647 * The lastpos of an OR node is the union of the lastpos of each argument.
1649 Follow: The follow of a position is the set of positions that could
1650 correspond to the character following a character matching the node in
1651 a string matching the regexp. At this point we consider special symbols
1652 that match the empty string in some context to be just normal characters.
1653 Later, if we find that a special symbol is in a follow set, we will
1654 replace it with the elements of its follow, labeled with an appropriate
1656 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1657 the follow of every node in the lastpos.
1658 * Every node in the firstpos of the second argument of a CAT node is in
1659 the follow of every node in the lastpos of the first argument.
1661 Because of the postfix representation of the parse tree, the depth-first
1662 analysis is conveniently done by a linear scan with the aid of a stack.
1663 Sets are stored as arrays of the elements, obeying a stack-like allocation
1664 scheme; the number of elements in each set deeper in the stack can be
1665 used to determine the address of a particular set's array. */
1667 dfaanalyze (struct dfa *d, int searchflag)
1669 int *nullable; /* Nullable stack. */
1670 int *nfirstpos; /* Element count stack for firstpos sets. */
1671 position *firstpos; /* Array where firstpos elements are stored. */
1672 int *nlastpos; /* Element count stack for lastpos sets. */
1673 position *lastpos; /* Array where lastpos elements are stored. */
1674 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1675 position_set tmp; /* Temporary set for merging sets. */
1676 position_set merged; /* Result of merging sets. */
1677 int wants_newline; /* True if some position wants newline info. */
1679 int *o_nfirst, *o_nlast;
1680 position *o_firstpos, *o_lastpos;
1685 fprintf(stderr, "dfaanalyze:\n");
1686 for (i = 0; i < d->tindex; ++i)
1688 fprintf(stderr, " %d:", i);
1689 prtok(d->tokens[i]);
1694 d->searchflag = searchflag;
1696 MALLOC(nullable, int, d->depth);
1697 o_nullable = nullable;
1698 MALLOC(nfirstpos, int, d->depth);
1699 o_nfirst = nfirstpos;
1700 MALLOC(firstpos, position, d->nleaves);
1701 o_firstpos = firstpos, firstpos += d->nleaves;
1702 MALLOC(nlastpos, int, d->depth);
1704 MALLOC(lastpos, position, d->nleaves);
1705 o_lastpos = lastpos, lastpos += d->nleaves;
1706 MALLOC(nalloc, int, d->tindex);
1707 for (i = 0; i < d->tindex; ++i)
1709 MALLOC(merged.elems, position, d->nleaves);
1711 CALLOC(d->follows, position_set, d->tindex);
1713 for (i = 0; i < d->tindex; ++i)
1715 { /* Nonsyntactic #ifdef goo... */
1717 switch (d->tokens[i])
1720 /* The empty set is nullable. */
1723 /* The firstpos and lastpos of the empty leaf are both empty. */
1724 *nfirstpos++ = *nlastpos++ = 0;
1729 /* Every element in the firstpos of the argument is in the follow
1730 of every element in the lastpos. */
1731 tmp.nelem = nfirstpos[-1];
1732 tmp.elems = firstpos;
1734 for (j = 0; j < nlastpos[-1]; ++j)
1736 merge(&tmp, &d->follows[pos[j].index], &merged);
1737 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1738 nalloc[pos[j].index], merged.nelem - 1);
1739 copy(&merged, &d->follows[pos[j].index]);
1743 /* A QMARK or STAR node is automatically nullable. */
1744 if (d->tokens[i] != PLUS)
1749 /* Every element in the firstpos of the second argument is in the
1750 follow of every element in the lastpos of the first argument. */
1751 tmp.nelem = nfirstpos[-1];
1752 tmp.elems = firstpos;
1753 pos = lastpos + nlastpos[-1];
1754 for (j = 0; j < nlastpos[-2]; ++j)
1756 merge(&tmp, &d->follows[pos[j].index], &merged);
1757 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1758 nalloc[pos[j].index], merged.nelem - 1);
1759 copy(&merged, &d->follows[pos[j].index]);
1762 /* The firstpos of a CAT node is the firstpos of the first argument,
1763 union that of the second argument if the first is nullable. */
1765 nfirstpos[-2] += nfirstpos[-1];
1767 firstpos += nfirstpos[-1];
1770 /* The lastpos of a CAT node is the lastpos of the second argument,
1771 union that of the first argument if the second is nullable. */
1773 nlastpos[-2] += nlastpos[-1];
1776 pos = lastpos + nlastpos[-2];
1777 for (j = nlastpos[-1] - 1; j >= 0; --j)
1778 pos[j] = lastpos[j];
1779 lastpos += nlastpos[-2];
1780 nlastpos[-2] = nlastpos[-1];
1784 /* A CAT node is nullable if both arguments are nullable. */
1785 nullable[-2] = nullable[-1] && nullable[-2];
1791 /* The firstpos is the union of the firstpos of each argument. */
1792 nfirstpos[-2] += nfirstpos[-1];
1795 /* The lastpos is the union of the lastpos of each argument. */
1796 nlastpos[-2] += nlastpos[-1];
1799 /* An OR node is nullable if either argument is nullable. */
1800 nullable[-2] = nullable[-1] || nullable[-2];
1805 /* Anything else is a nonempty position. (Note that special
1806 constructs like \< are treated as nonempty strings here;
1807 an "epsilon closure" effectively makes them nullable later.
1808 Backreferences have to get a real position so we can detect
1809 transitions on them later. But they are nullable. */
1810 *nullable++ = d->tokens[i] == BACKREF;
1812 /* This position is in its own firstpos and lastpos. */
1813 *nfirstpos++ = *nlastpos++ = 1;
1814 --firstpos, --lastpos;
1815 firstpos->index = lastpos->index = i;
1816 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1818 /* Allocate the follow set for this position. */
1820 MALLOC(d->follows[i].elems, position, nalloc[i]);
1824 /* ... balance the above nonsyntactic #ifdef goo... */
1825 fprintf(stderr, "node %d:", i);
1826 prtok(d->tokens[i]);
1828 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1829 fprintf(stderr, " firstpos:");
1830 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1832 fprintf(stderr, " %d:", firstpos[j].index);
1833 prtok(d->tokens[firstpos[j].index]);
1835 fprintf(stderr, "\n lastpos:");
1836 for (j = nlastpos[-1] - 1; j >= 0; --j)
1838 fprintf(stderr, " %d:", lastpos[j].index);
1839 prtok(d->tokens[lastpos[j].index]);
1845 /* For each follow set that is the follow set of a real position, replace
1846 it with its epsilon closure. */
1847 for (i = 0; i < d->tindex; ++i)
1848 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1850 || d->tokens[i] == ANYCHAR
1851 || d->tokens[i] == MBCSET
1853 || d->tokens[i] >= CSET)
1856 fprintf(stderr, "follows(%d:", i);
1857 prtok(d->tokens[i]);
1858 fprintf(stderr, "):");
1859 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1861 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1862 prtok(d->tokens[d->follows[i].elems[j].index]);
1866 copy(&d->follows[i], &merged);
1867 epsclosure(&merged, d);
1868 if (d->follows[i].nelem < merged.nelem)
1869 REALLOC(d->follows[i].elems, position, merged.nelem);
1870 copy(&merged, &d->follows[i]);
1873 /* Get the epsilon closure of the firstpos of the regexp. The result will
1874 be the set of positions of state 0. */
1876 for (i = 0; i < nfirstpos[-1]; ++i)
1877 insert(firstpos[i], &merged);
1878 epsclosure(&merged, d);
1880 /* Check if any of the positions of state 0 will want newline context. */
1882 for (i = 0; i < merged.nelem; ++i)
1883 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1886 /* Build the initial state. */
1889 MALLOC(d->states, dfa_state, d->salloc);
1890 state_index(d, &merged, wants_newline, 0);
1901 /* Find, for each character, the transition out of state s of d, and store
1902 it in the appropriate slot of trans.
1904 We divide the positions of s into groups (positions can appear in more
1905 than one group). Each group is labeled with a set of characters that
1906 every position in the group matches (taking into account, if necessary,
1907 preceding context information of s). For each group, find the union
1908 of the its elements' follows. This set is the set of positions of the
1909 new state. For each character in the group's label, set the transition
1910 on this character to be to a state corresponding to the set's positions,
1911 and its associated backward context information, if necessary.
1913 If we are building a searching matcher, we include the positions of state
1916 The collection of groups is constructed by building an equivalence-class
1917 partition of the positions of s.
1919 For each position, find the set of characters C that it matches. Eliminate
1920 any characters from C that fail on grounds of backward context.
1922 Search through the groups, looking for a group whose label L has nonempty
1923 intersection with C. If L - C is nonempty, create a new group labeled
1924 L - C and having the same positions as the current group, and set L to
1925 the intersection of L and C. Insert the position in this group, set
1926 C = C - L, and resume scanning.
1928 If after comparing with every group there are characters remaining in C,
1929 create a new group labeled with the characters of C and insert this
1930 position in that group. */
1932 dfastate (int s, struct dfa *d, int trans[])
1934 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1935 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1936 int ngrps = 0; /* Number of groups actually used. */
1937 position pos; /* Current position being considered. */
1938 charclass matches; /* Set of matching characters. */
1939 int matchesf; /* True if matches is nonempty. */
1940 charclass intersect; /* Intersection with some label set. */
1941 int intersectf; /* True if intersect is nonempty. */
1942 charclass leftovers; /* Stuff in the label that didn't match. */
1943 int leftoversf; /* True if leftovers is nonempty. */
1944 static charclass letters; /* Set of characters considered letters. */
1945 static charclass newline; /* Set of characters that aren't newline. */
1946 position_set follows; /* Union of the follows of some group. */
1947 position_set tmp; /* Temporary space for merging sets. */
1948 int state; /* New state. */
1949 int wants_newline; /* New state wants to know newline context. */
1950 int state_newline; /* New state on a newline transition. */
1951 int wants_letter; /* New state wants to know letter context. */
1952 int state_letter; /* New state on a letter transition. */
1953 static int initialized; /* Flag for static initialization. */
1955 int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */
1959 /* Initialize the set of letters, if necessary. */
1963 for (i = 0; i < NOTCHAR; ++i)
1964 if (IS_WORD_CONSTITUENT(i))
1966 setbit(eolbyte, newline);
1971 for (i = 0; i < d->states[s].elems.nelem; ++i)
1973 pos = d->states[s].elems.elems[i];
1974 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1975 setbit(d->tokens[pos.index], matches);
1976 else if (d->tokens[pos.index] >= CSET)
1977 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1979 else if (d->tokens[pos.index] == ANYCHAR
1980 || d->tokens[pos.index] == MBCSET)
1981 /* MB_CUR_MAX > 1 */
1983 /* ANYCHAR and MBCSET must match with a single character, so we
1984 must put it to d->states[s].mbps, which contains the positions
1985 which can match with a single character not a byte. */
1986 if (d->states[s].mbps.nelem == 0)
1988 MALLOC(d->states[s].mbps.elems, position,
1989 d->states[s].elems.nelem);
1991 insert(pos, &(d->states[s].mbps));
1994 #endif /* MBS_SUPPORT */
1998 /* Some characters may need to be eliminated from matches because
1999 they fail in the current context. */
2000 if (pos.constraint != 0xFF)
2002 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2003 d->states[s].newline, 1))
2004 clrbit(eolbyte, matches);
2005 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2006 d->states[s].newline, 0))
2007 for (j = 0; j < CHARCLASS_INTS; ++j)
2008 matches[j] &= newline[j];
2009 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2010 d->states[s].letter, 1))
2011 for (j = 0; j < CHARCLASS_INTS; ++j)
2012 matches[j] &= ~letters[j];
2013 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2014 d->states[s].letter, 0))
2015 for (j = 0; j < CHARCLASS_INTS; ++j)
2016 matches[j] &= letters[j];
2018 /* If there are no characters left, there's no point in going on. */
2019 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2021 if (j == CHARCLASS_INTS)
2025 for (j = 0; j < ngrps; ++j)
2027 /* If matches contains a single character only, and the current
2028 group's label doesn't contain that character, go on to the
2030 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2031 && !tstbit(d->tokens[pos.index], labels[j]))
2034 /* Check if this group's label has a nonempty intersection with
2037 for (k = 0; k < CHARCLASS_INTS; ++k)
2038 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2042 /* It does; now find the set differences both ways. */
2043 leftoversf = matchesf = 0;
2044 for (k = 0; k < CHARCLASS_INTS; ++k)
2046 /* Even an optimizing compiler can't know this for sure. */
2047 int match = matches[k], label = labels[j][k];
2049 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2050 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2053 /* If there were leftovers, create a new group labeled with them. */
2056 copyset(leftovers, labels[ngrps]);
2057 copyset(intersect, labels[j]);
2058 MALLOC(grps[ngrps].elems, position, d->nleaves);
2059 copy(&grps[j], &grps[ngrps]);
2063 /* Put the position in the current group. Note that there is no
2064 reason to call insert() here. */
2065 grps[j].elems[grps[j].nelem++] = pos;
2067 /* If every character matching the current position has been
2068 accounted for, we're done. */
2073 /* If we've passed the last group, and there are still characters
2074 unaccounted for, then we'll have to create a new group. */
2077 copyset(matches, labels[ngrps]);
2079 MALLOC(grps[ngrps].elems, position, d->nleaves);
2080 grps[ngrps].nelem = 1;
2081 grps[ngrps].elems[0] = pos;
2086 MALLOC(follows.elems, position, d->nleaves);
2087 MALLOC(tmp.elems, position, d->nleaves);
2089 /* If we are a searching matcher, the default transition is to a state
2090 containing the positions of state 0, otherwise the default transition
2091 is to fail miserably. */
2096 for (i = 0; i < d->states[0].elems.nelem; ++i)
2098 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2100 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2103 copy(&d->states[0].elems, &follows);
2104 state = state_index(d, &follows, 0, 0);
2106 state_newline = state_index(d, &follows, 1, 0);
2108 state_newline = state;
2110 state_letter = state_index(d, &follows, 0, 1);
2112 state_letter = state;
2113 for (i = 0; i < NOTCHAR; ++i)
2114 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2115 trans[eolbyte] = state_newline;
2118 for (i = 0; i < NOTCHAR; ++i)
2121 for (i = 0; i < ngrps; ++i)
2125 /* Find the union of the follows of the positions of the group.
2126 This is a hideously inefficient loop. Fix it someday. */
2127 for (j = 0; j < grps[i].nelem; ++j)
2128 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2129 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2134 /* If a token in follows.elems is not 1st byte of a multibyte
2135 character, or the states of follows must accept the bytes
2136 which are not 1st byte of the multibyte character.
2137 Then, if a state of follows encounter a byte, it must not be
2138 a 1st byte of a multibyte character nor singlebyte character.
2139 We cansel to add state[0].follows to next state, because
2140 state[0] must accept 1st-byte
2142 For example, we assume <sb a> is a certain singlebyte
2143 character, <mb A> is a certain multibyte character, and the
2144 codepoint of <sb a> equals the 2nd byte of the codepoint of
2146 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2147 by accepting accepts 1st byte of <mb A>, and state[i+1]
2148 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2149 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2150 <mb A>, so we can not add state[0]. */
2152 next_isnt_1st_byte = 0;
2153 for (j = 0; j < follows.nelem; ++j)
2155 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2157 next_isnt_1st_byte = 1;
2164 /* If we are building a searching matcher, throw in the positions
2165 of state 0 as well. */
2167 if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2171 for (j = 0; j < d->states[0].elems.nelem; ++j)
2172 insert(d->states[0].elems.elems[j], &follows);
2174 /* Find out if the new state will want any context information. */
2176 if (tstbit(eolbyte, labels[i]))
2177 for (j = 0; j < follows.nelem; ++j)
2178 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2182 for (j = 0; j < CHARCLASS_INTS; ++j)
2183 if (labels[i][j] & letters[j])
2185 if (j < CHARCLASS_INTS)
2186 for (j = 0; j < follows.nelem; ++j)
2187 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2190 /* Find the state(s) corresponding to the union of the follows. */
2191 state = state_index(d, &follows, 0, 0);
2193 state_newline = state_index(d, &follows, 1, 0);
2195 state_newline = state;
2197 state_letter = state_index(d, &follows, 0, 1);
2199 state_letter = state;
2201 /* Set the transitions for each character in the current label. */
2202 for (j = 0; j < CHARCLASS_INTS; ++j)
2203 for (k = 0; k < INTBITS; ++k)
2204 if (labels[i][j] & 1 << k)
2206 int c = j * INTBITS + k;
2209 trans[c] = state_newline;
2210 else if (IS_WORD_CONSTITUENT(c))
2211 trans[c] = state_letter;
2212 else if (c < NOTCHAR)
2217 for (i = 0; i < ngrps; ++i)
2218 free(grps[i].elems);
2219 free(follows.elems);
2223 /* Some routines for manipulating a compiled dfa's transition tables.
2224 Each state may or may not have a transition table; if it does, and it
2225 is a non-accepting state, then d->trans[state] points to its table.
2226 If it is an accepting state then d->fails[state] points to its table.
2227 If it has no table at all, then d->trans[state] is NULL.
2228 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2231 build_state (int s, struct dfa *d)
2233 int *trans; /* The new transition table. */
2236 /* Set an upper limit on the number of transition tables that will ever
2237 exist at once. 1024 is arbitrary. The idea is that the frequently
2238 used transition tables will be quickly rebuilt, whereas the ones that
2239 were only needed once or twice will be cleared away. */
2240 if (d->trcount >= 1024)
2242 for (i = 0; i < d->tralloc; ++i)
2245 free((ptr_t) d->trans[i]);
2248 else if (d->fails[i])
2250 free((ptr_t) d->fails[i]);
2258 /* Set up the success bits for this state. */
2260 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2263 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2266 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2270 MALLOC(trans, int, NOTCHAR);
2271 dfastate(s, d, trans);
2273 /* Now go through the new transition table, and make sure that the trans
2274 and fail arrays are allocated large enough to hold a pointer for the
2275 largest state mentioned in the table. */
2276 for (i = 0; i < NOTCHAR; ++i)
2277 if (trans[i] >= d->tralloc)
2279 int oldalloc = d->tralloc;
2281 while (trans[i] >= d->tralloc)
2283 REALLOC(d->realtrans, int *, d->tralloc + 1);
2284 d->trans = d->realtrans + 1;
2285 REALLOC(d->fails, int *, d->tralloc);
2286 REALLOC(d->success, int, d->tralloc);
2287 while (oldalloc < d->tralloc)
2289 d->trans[oldalloc] = NULL;
2290 d->fails[oldalloc++] = NULL;
2294 /* Newline is a sentinel. */
2295 trans[eolbyte] = -1;
2297 if (ACCEPTING(s, *d))
2298 d->fails[s] = trans;
2300 d->trans[s] = trans;
2304 build_state_zero (struct dfa *d)
2308 CALLOC(d->realtrans, int *, d->tralloc + 1);
2309 d->trans = d->realtrans + 1;
2310 CALLOC(d->fails, int *, d->tralloc);
2311 MALLOC(d->success, int, d->tralloc);
2316 /* Multibyte character handling sub-routins for dfaexec. */
2318 /* Initial state may encounter the byte which is not a singlebyte character
2319 nor 1st byte of a multibyte character. But it is incorrect for initial
2320 state to accept such a byte.
2321 For example, in sjis encoding the regular expression like "\\" accepts
2322 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2323 0x815c. Then Initial state must skip the bytes which are not a singlebyte
2324 character nor 1st byte of a multibyte character. */
2325 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2328 while (inputwcs[p - buf_begin] == 0 \
2329 && mblen_buf[p - buf_begin] > 0 \
2336 return (size_t) -1; \
2341 realloc_trans_if_necessary(struct dfa *d, int new_state)
2343 /* Make sure that the trans and fail arrays are allocated large enough
2344 to hold a pointer for the new state. */
2345 if (new_state >= d->tralloc)
2347 int oldalloc = d->tralloc;
2349 while (new_state >= d->tralloc)
2351 REALLOC(d->realtrans, int *, d->tralloc + 1);
2352 d->trans = d->realtrans + 1;
2353 REALLOC(d->fails, int *, d->tralloc);
2354 REALLOC(d->success, int, d->tralloc);
2355 while (oldalloc < d->tralloc)
2357 d->trans[oldalloc] = NULL;
2358 d->fails[oldalloc++] = NULL;
2363 /* Return values of transit_state_singlebyte(), and
2364 transit_state_consume_1char. */
2367 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2368 TRANSIT_STATE_DONE, /* State transition has finished. */
2369 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2370 } status_transit_state;
2372 /* Consume a single byte and transit state from 's' to '*next_state'.
2373 This function is almost same as the state transition routin in dfaexec().
2374 But state transition is done just once, otherwise matching succeed or
2375 reach the end of the buffer. */
2376 static status_transit_state
2377 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2383 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2385 while (rval == TRANSIT_STATE_IN_PROGRESS)
2387 if ((t = d->trans[works]) != NULL)
2390 rval = TRANSIT_STATE_DONE;
2397 /* At the moment, it must not happen. */
2398 return TRANSIT_STATE_END_BUFFER;
2401 else if (d->fails[works])
2403 works = d->fails[works][*p];
2404 rval = TRANSIT_STATE_DONE;
2408 build_state(works, d);
2411 *next_state = works;
2415 /* Check whether period can match or not in the current context. If it can,
2416 return the amount of the bytes with which period can match, otherwise
2418 `pos' is the position of the period. `index' is the index from the
2419 buf_begin, and it is the current position in the buffer. */
2421 match_anychar (struct dfa *d, int s, position pos, int index)
2428 wc = inputwcs[index];
2429 mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2431 /* Check context. */
2432 if (wc == (wchar_t)eolbyte)
2434 if (!(syntax_bits & RE_DOT_NEWLINE))
2438 else if (wc == (wchar_t)'\0')
2440 if (syntax_bits & RE_DOT_NOT_NULL)
2445 if (iswalnum(wc) || wc == L'_')
2448 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2449 newline, d->states[s].letter, letter))
2455 /* Check whether bracket expression can match or not in the current context.
2456 If it can, return the amount of the bytes with which expression can match,
2458 `pos' is the position of the bracket expression. `index' is the index
2459 from the buf_begin, and it is the current position in the buffer. */
2461 match_mb_charset (struct dfa *d, int s, position pos, int index)
2464 int match; /* Flag which represent that matching succeed. */
2465 int match_len; /* Length of the character (or collating element)
2466 with which this operator match. */
2467 size_t op_len; /* Length of the operator. */
2471 /* Pointer to the structure to which we are currently reffering. */
2472 struct mb_char_classes *work_mbc;
2476 wchar_t wc; /* Current reffering character. */
2478 wc = inputwcs[index];
2480 /* Check context. */
2481 if (wc == (wchar_t)eolbyte)
2483 if (!(syntax_bits & RE_DOT_NEWLINE))
2487 else if (wc == (wchar_t)'\0')
2489 if (syntax_bits & RE_DOT_NOT_NULL)
2493 if (iswalnum(wc) || wc == L'_')
2495 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2496 newline, d->states[s].letter, letter))
2499 /* Assign the current reffering operator to work_mbc. */
2500 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2501 match = !work_mbc->invert;
2502 match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2504 /* match with a character class? */
2505 for (i = 0; i<work_mbc->nch_classes; i++)
2507 if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2508 goto charset_matched;
2511 strncpy(buffer, buf_begin + index, match_len);
2512 buffer[match_len] = '\0';
2514 /* match with an equivalent class? */
2515 for (i = 0; i<work_mbc->nequivs; i++)
2517 op_len = strlen(work_mbc->equivs[i]);
2518 strncpy(buffer, buf_begin + index, op_len);
2519 buffer[op_len] = '\0';
2520 if (strcoll(work_mbc->equivs[i], buffer) == 0)
2523 goto charset_matched;
2527 /* match with a collating element? */
2528 for (i = 0; i<work_mbc->ncoll_elems; i++)
2530 op_len = strlen(work_mbc->coll_elems[i]);
2531 strncpy(buffer, buf_begin + index, op_len);
2532 buffer[op_len] = '\0';
2534 if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2537 goto charset_matched;
2542 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2544 /* match with a range? */
2545 for (i = 0; i<work_mbc->nranges; i++)
2547 wcbuf[2] = work_mbc->range_sts[i];
2548 wcbuf[4] = work_mbc->range_ends[i];
2550 if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2551 wcscoll(wcbuf+4, wcbuf) >= 0)
2552 goto charset_matched;
2555 /* match with a character? */
2558 for (i = 0; i<work_mbc->nchars; i++)
2560 if (wc == work_mbc->chars[i])
2561 goto charset_matched;
2567 return match ? match_len : 0;
2570 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2571 array which corresponds to `d->states[s].mbps.elem' and each element of
2572 the array contains the amount of the bytes with which the element can
2574 `index' is the index from the buf_begin, and it is the current position
2576 Caller MUST free the array which this function return. */
2578 check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2583 MALLOC(rarray, int, d->states[s].mbps.nelem);
2584 for (i = 0; i < d->states[s].mbps.nelem; ++i)
2586 position pos = d->states[s].mbps.elems[i];
2587 switch(d->tokens[pos.index])
2590 rarray[i] = match_anychar(d, s, pos, index);
2593 rarray[i] = match_mb_charset(d, s, pos, index);
2596 break; /* can not happen. */
2602 /* Consume a single character and enumerate all of the positions which can
2603 be next position from the state `s'.
2604 `match_lens' is the input. It can be NULL, but it can also be the output
2605 of check_matching_with_multibyte_ops() for optimization.
2606 `mbclen' and `pps' are the output. `mbclen' is the length of the
2607 character consumed, and `pps' is the set this function enumerate. */
2608 static status_transit_state
2609 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2610 int *match_lens, int *mbclen, position_set *pps)
2615 status_transit_state rs = TRANSIT_STATE_DONE;
2617 /* Calculate the length of the (single/multi byte) character
2618 to which p points. */
2619 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2620 : mblen_buf[*pp - buf_begin];
2622 /* Calculate the state which can be reached from the state `s' by
2623 consuming `*mbclen' single bytes from the buffer. */
2625 for (i = 0; i < *mbclen; i++)
2628 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2630 /* Copy the positions contained by `s1' to the set `pps'. */
2631 copy(&(d->states[s1].elems), pps);
2633 /* Check (inputed)match_lens, and initialize if it is NULL. */
2634 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2635 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2637 work_mbls = match_lens;
2639 /* Add all of the positions which can be reached from `s' by consuming
2640 a single character. */
2641 for (i = 0; i < d->states[s].mbps.nelem ; i++)
2643 if (work_mbls[i] == *mbclen)
2644 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2646 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2650 if (match_lens == NULL && work_mbls != NULL)
2655 /* Transit state from s, then return new state and update the pointer of the
2656 buffer. This function is for some operator which can match with a multi-
2657 byte character or a collating element(which may be multi characters). */
2659 transit_state (struct dfa *d, int s, unsigned char const **pp)
2662 int mbclen; /* The length of current input multibyte character. */
2665 int *match_lens = NULL;
2666 int nelem = d->states[s].mbps.nelem; /* Just a alias. */
2667 position_set follows;
2668 unsigned char const *p1 = *pp;
2669 status_transit_state rs;
2673 /* This state has (a) multibyte operator(s).
2674 We check whether each of them can match or not. */
2676 /* Note: caller must free the return value of this function. */
2677 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2679 for (i = 0; i < nelem; i++)
2680 /* Search the operator which match the longest string,
2683 if (match_lens[i] > maxlen)
2684 maxlen = match_lens[i];
2688 if (nelem == 0 || maxlen == 0)
2689 /* This state has no multibyte operator which can match.
2690 We need to check only one singlebyte character. */
2692 status_transit_state rs;
2693 rs = transit_state_singlebyte(d, s, *pp, &s1);
2695 /* We must update the pointer if state transition succeeded. */
2696 if (rs == TRANSIT_STATE_DONE)
2699 if (match_lens != NULL)
2704 /* This state has some operators which can match a multibyte character. */
2706 MALLOC(follows.elems, position, d->nleaves);
2708 /* `maxlen' may be longer than the length of a character, because it may
2709 not be a character but a (multi character) collating element.
2710 We enumerate all of the positions which `s' can reach by consuming
2712 rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2714 wc = inputwcs[*pp - mbclen - buf_begin];
2715 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2716 realloc_trans_if_necessary(d, s1);
2718 while (*pp - p1 < maxlen)
2721 rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2723 for (i = 0; i < nelem ; i++)
2725 if (match_lens[i] == *pp - p1)
2727 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2728 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2732 wc = inputwcs[*pp - mbclen - buf_begin];
2733 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2734 realloc_trans_if_necessary(d, s1);
2737 free(follows.elems);
2743 /* Search through a buffer looking for a match to the given struct dfa.
2744 Find the first occurrence of a string matching the regexp in the buffer,
2745 and the shortest possible version thereof. Return the offset of the first
2746 character after the match, or (size_t) -1 if none is found. BEGIN points to
2747 the beginning of the buffer, and SIZE is the size of the buffer. If SIZE
2748 is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place
2749 where we're supposed to store a 1 if backreferencing happened and the
2750 match needs to be verified by a backtracking matcher. Otherwise
2751 we store a 0 in *backref. */
2753 dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2755 register int s; /* Current state. */
2756 register unsigned char const *p; /* Current input character. */
2757 register unsigned char const *end; /* One past the last input character. */
2758 register int **trans, *t; /* Copy of d->trans so it can be optimized
2760 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
2761 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
2762 static int sbit_init;
2769 for (i = 0; i < NOTCHAR; ++i)
2770 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2775 build_state_zero(d);
2778 p = (unsigned char const *) begin;
2785 int remain_bytes, i;
2789 /* initialize mblen_buf, and inputwcs. */
2790 MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2791 MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2792 memset(&mbs, 0, sizeof(mbstate_t));
2794 for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2796 if (remain_bytes == 0)
2799 = mbrtowc(inputwcs + i, begin + i,
2800 end - (unsigned char const *)begin - i + 1, &mbs);
2801 if (remain_bytes <= 1)
2804 inputwcs[i] = (wchar_t)begin[i];
2809 mblen_buf[i] = remain_bytes;
2815 mblen_buf[i] = remain_bytes;
2821 inputwcs[i] = 0; /* sentinel */
2823 #endif /* MBS_SUPPORT */
2829 while ((t = trans[s]))
2831 if (d->states[s].mbps.nelem != 0)
2833 /* Can match with a multibyte character( and multi character
2834 collating element). */
2835 unsigned char const *nextp;
2837 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2840 s = transit_state(d, s, &nextp);
2843 /* Trans table might be updated. */
2848 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2853 #endif /* MBS_SUPPORT */
2854 while ((t = trans[s]))
2867 #endif /* MBS_SUPPORT */
2872 else if ((t = d->fails[s]))
2874 if (d->success[s] & sbit[*p])
2877 *backref = (d->states[s].backref != 0);
2884 #endif /* MBS_SUPPORT */
2885 return (char const *) p - begin;
2891 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2892 if (d->states[s].mbps.nelem != 0)
2894 /* Can match with a multibyte character( and multi
2895 character collating element). */
2896 unsigned char const *nextp;
2898 s = transit_state(d, s, &nextp);
2901 /* Trans table might be updated. */
2908 #endif /* MBS_SUPPORT */
2919 /* Initialize the components of a dfa that the other routines don't
2920 initialize for themselves. */
2922 dfainit (struct dfa *d)
2925 MALLOC(d->charclasses, charclass, d->calloc);
2929 MALLOC(d->tokens, token, d->talloc);
2930 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2934 d->nmultibyte_prop = 1;
2935 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2937 d->mbcsets_alloc = 1;
2938 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2948 /* Parse and analyze a single string of the given length. */
2950 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2952 if (case_fold) /* dummy folding in service of dfamust() */
2957 lcopy = malloc(len);
2959 dfaerror(_("out of memory"));
2961 /* This is a kludge. */
2963 for (i = 0; i < len; ++i)
2964 if (ISUPPER ((unsigned char) s[i]))
2965 lcopy[i] = tolower ((unsigned char) s[i]);
2970 dfaparse(lcopy, len, d);
2973 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2975 dfaparse(s, len, d);
2976 dfaanalyze(d, searchflag);
2981 dfaparse(s, len, d);
2983 dfaanalyze(d, searchflag);
2987 /* Free the storage held by the components of a dfa. */
2989 dfafree (struct dfa *d)
2992 struct dfamust *dm, *ndm;
2994 free((ptr_t) d->charclasses);
2995 free((ptr_t) d->tokens);
3000 free((ptr_t) d->multibyte_prop);
3001 for (i = 0; i < d->nmbcsets; ++i)
3004 struct mb_char_classes *p = &(d->mbcsets[i]);
3005 if (p->chars != NULL)
3007 if (p->ch_classes != NULL)
3008 free(p->ch_classes);
3009 if (p->range_sts != NULL)
3011 if (p->range_ends != NULL)
3012 free(p->range_ends);
3014 for (j = 0; j < p->nequivs; ++j)
3016 if (p->equivs != NULL)
3019 for (j = 0; j < p->ncoll_elems; ++j)
3020 free(p->coll_elems[j]);
3021 if (p->coll_elems != NULL)
3022 free(p->coll_elems);
3024 free((ptr_t) d->mbcsets);
3026 #endif /* MBS_SUPPORT */
3028 for (i = 0; i < d->sindex; ++i)
3029 free((ptr_t) d->states[i].elems.elems);
3030 free((ptr_t) d->states);
3031 for (i = 0; i < d->tindex; ++i)
3032 if (d->follows[i].elems)
3033 free((ptr_t) d->follows[i].elems);
3034 free((ptr_t) d->follows);
3035 for (i = 0; i < d->tralloc; ++i)
3037 free((ptr_t) d->trans[i]);
3038 else if (d->fails[i])
3039 free((ptr_t) d->fails[i]);
3040 if (d->realtrans) free((ptr_t) d->realtrans);
3041 if (d->fails) free((ptr_t) d->fails);
3042 if (d->success) free((ptr_t) d->success);
3043 for (dm = d->musts; dm; dm = ndm)
3051 /* Having found the postfix representation of the regular expression,
3052 try to find a long sequence of characters that must appear in any line
3054 Finding a "longest" sequence is beyond the scope here;
3055 we take an easy way out and hope for the best.
3056 (Take "(ab|a)b"--please.)
3058 We do a bottom-up calculation of sequences of characters that must appear
3059 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3061 sequences that must appear at the left of the match ("left")
3062 sequences that must appear at the right of the match ("right")
3063 lists of sequences that must appear somewhere in the match ("in")
3064 sequences that must constitute the match ("is")
3066 When we get to the root of the tree, we use one of the longest of its
3067 calculated "in" sequences as our answer. The sequence we find is returned in
3068 d->must (where "d" is the single argument passed to "dfamust");
3069 the length of the sequence is returned in d->mustn.
3071 The sequences calculated for the various types of node (in pseudo ANSI c)
3072 are shown below. "p" is the operand of unary operators (and the left-hand
3073 operand of binary operators); "q" is the right-hand operand of binary
3076 "ZERO" means "a zero-length sequence" below.
3078 Type left right is in
3079 ---- ---- ----- -- --
3080 char c # c # c # c # c
3082 ANYCHAR ZERO ZERO ZERO ZERO
3084 MBCSET ZERO ZERO ZERO ZERO
3086 CSET ZERO ZERO ZERO ZERO
3088 STAR ZERO ZERO ZERO ZERO
3090 QMARK ZERO ZERO ZERO ZERO
3092 PLUS p->left p->right ZERO p->in
3094 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3095 p->left : q->right : q->is!=ZERO) ? q->in plus
3096 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3099 OR longest common longest common (do p->is and substrings common to
3100 leading trailing q->is have same p->in and q->in
3101 (sub)sequence (sub)sequence length and
3102 of p->left of p->right content) ?
3103 and q->left and q->right p->is : NULL
3105 If there's anything else we recognize in the tree, all four sequences get set
3106 to zero-length sequences. If there's something we don't recognize in the tree,
3107 we just return a zero-length sequence.
3109 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3112 And. . .is it here or someplace that we might ponder "optimizations" such as
3113 egrep 'psi|epsilon' -> egrep 'psi'
3114 egrep 'pepsi|epsilon' -> egrep 'epsi'
3115 (Yes, we now find "epsi" as a "string
3116 that must occur", but we might also
3117 simplify the *entire* r.e. being sought)
3118 grep '[c]' -> grep 'c'
3119 grep '(ab|a)b' -> grep 'ab'
3120 grep 'ab*' -> grep 'a'
3121 grep 'a*b' -> grep 'b'
3123 There are several issues:
3125 Is optimization easy (enough)?
3127 Does optimization actually accomplish anything,
3128 or is the automaton you get from "psi|epsilon" (for example)
3129 the same as the one you get from "psi" (for example)?
3131 Are optimizable r.e.'s likely to be used in real-life situations
3132 (something like 'ab*' is probably unlikely; something like is
3133 'psi|epsilon' is likelier)? */
3136 icatalloc (char *old, char *new)
3139 size_t oldsize, newsize;
3141 newsize = (new == NULL) ? 0 : strlen(new);
3144 else if (newsize == 0)
3146 else oldsize = strlen(old);
3148 result = (char *) malloc(newsize + 1);
3150 result = (char *) realloc((void *) old, oldsize + newsize + 1);
3151 if (result != NULL && new != NULL)
3152 (void) strcpy(result + oldsize, new);
3157 icpyalloc (char *string)
3159 return icatalloc((char *) NULL, string);
3163 istrstr (char *lookin, char *lookfor)
3168 len = strlen(lookfor);
3169 for (cp = lookin; *cp != '\0'; ++cp)
3170 if (strncmp(cp, lookfor, len) == 0)
3183 freelist (char **cpp)
3189 for (i = 0; cpp[i] != NULL; ++i)
3197 enlist (char **cpp, char *new, size_t len)
3203 if ((new = icpyalloc(new)) == NULL)
3209 /* Is there already something in the list that's new (or longer)? */
3210 for (i = 0; cpp[i] != NULL; ++i)
3211 if (istrstr(cpp[i], new) != NULL)
3216 /* Eliminate any obsoleted strings. */
3218 while (cpp[j] != NULL)
3219 if (istrstr(new, cpp[j]) == NULL)
3229 /* Add the new string. */
3230 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3238 /* Given pointers to two strings, return a pointer to an allocated
3239 list of their distinct common substrings. Return NULL if something
3242 comsubs (char *left, char *right)
3249 if (left == NULL || right == NULL)
3251 cpp = (char **) malloc(sizeof *cpp);
3255 for (lcp = left; *lcp != '\0'; ++lcp)
3258 rcp = strchr (right, *lcp);
3261 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3265 rcp = strchr (rcp + 1, *lcp);
3269 if ((cpp = enlist(cpp, lcp, len)) == NULL)
3276 addlists (char **old, char **new)
3280 if (old == NULL || new == NULL)
3282 for (i = 0; new[i] != NULL; ++i)
3284 old = enlist(old, new[i], strlen(new[i]));
3291 /* Given two lists of substrings, return a new list giving substrings
3294 inboth (char **left, char **right)
3300 if (left == NULL || right == NULL)
3302 both = (char **) malloc(sizeof *both);
3306 for (lnum = 0; left[lnum] != NULL; ++lnum)
3308 for (rnum = 0; right[rnum] != NULL; ++rnum)
3310 temp = comsubs(left[lnum], right[rnum]);
3316 both = addlists(both, temp);
3335 resetmust (must *mp)
3337 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3342 dfamust (struct dfa *dfa)
3353 static char empty_string[] = "";
3355 result = empty_string;
3357 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3361 for (i = 0; i <= dfa->tindex; ++i)
3363 for (i = 0; i <= dfa->tindex; ++i)
3365 mp[i].in = (char **) malloc(sizeof *mp[i].in);
3366 mp[i].left = malloc(2);
3367 mp[i].right = malloc(2);
3368 mp[i].is = malloc(2);
3369 if (mp[i].in == NULL || mp[i].left == NULL ||
3370 mp[i].right == NULL || mp[i].is == NULL)
3372 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3376 fprintf(stderr, "dfamust:\n");
3377 for (i = 0; i < dfa->tindex; ++i)
3379 fprintf(stderr, " %d:", i);
3380 prtok(dfa->tokens[i]);
3384 for (ri = 0; ri < dfa->tindex; ++ri)
3386 switch (t = dfa->tokens[ri])
3390 goto done; /* "cannot happen" */
3404 goto done; /* "cannot happen" */
3411 goto done; /* "cannot happen" */
3420 /* Guaranteed to be. Unlikely, but. . . */
3421 if (strcmp(lmp->is, rmp->is) != 0)
3423 /* Left side--easy */
3425 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3427 lmp->left[i] = '\0';
3429 ln = strlen(lmp->right);
3430 rn = strlen(rmp->right);
3434 for (i = 0; i < n; ++i)
3435 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3437 for (j = 0; j < i; ++j)
3438 lmp->right[j] = lmp->right[(ln - i) + j];
3439 lmp->right[j] = '\0';
3440 new = inboth(lmp->in, rmp->in);
3444 free((char *) lmp->in);
3450 goto done; /* "cannot happen" */
3455 if (mp != &musts[1])
3456 goto done; /* "cannot happen" */
3457 for (i = 0; musts[0].in[i] != NULL; ++i)
3458 if (strlen(musts[0].in[i]) > strlen(result))
3459 result = musts[0].in[i];
3460 if (strcmp(result, musts[0].is) == 0)
3465 goto done; /* "cannot happen" */
3472 /* In. Everything in left, plus everything in
3473 right, plus catenation of
3474 left's right and right's left. */
3475 lmp->in = addlists(lmp->in, rmp->in);
3476 if (lmp->in == NULL)
3478 if (lmp->right[0] != '\0' &&
3479 rmp->left[0] != '\0')
3483 tp = icpyalloc(lmp->right);
3486 tp = icatalloc(tp, rmp->left);
3489 lmp->in = enlist(lmp->in, tp,
3492 if (lmp->in == NULL)
3496 if (lmp->is[0] != '\0')
3498 lmp->left = icatalloc(lmp->left,
3500 if (lmp->left == NULL)
3504 if (rmp->is[0] == '\0')
3505 lmp->right[0] = '\0';
3506 lmp->right = icatalloc(lmp->right, rmp->right);
3507 if (lmp->right == NULL)
3509 /* Guaranteed to be */
3510 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3512 lmp->is = icatalloc(lmp->is, rmp->is);
3513 if (lmp->is == NULL)
3523 /* "cannot happen" */
3528 /* not on *my* shift */
3535 #endif /* MBS_SUPPORT */
3543 /* plain character */
3545 mp->is[0] = mp->left[0] = mp->right[0] = t;
3546 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3547 mp->in = enlist(mp->in, mp->is, (size_t)1);
3554 fprintf(stderr, " node: %d:", ri);
3555 prtok(dfa->tokens[ri]);
3556 fprintf(stderr, "\n in:");
3557 for (i = 0; mp->in[i]; ++i)
3558 fprintf(stderr, " \"%s\"", mp->in[i]);
3559 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
3560 fprintf(stderr, " left: \"%s\"\n", mp->left);
3561 fprintf(stderr, " right: \"%s\"\n", mp->right);
3568 dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3570 dm->must = malloc(strlen(result) + 1);
3571 strcpy(dm->must, result);
3572 dm->next = dfa->musts;
3576 for (i = 0; i <= dfa->tindex; ++i)
3579 ifree((char *) mp[i].in);
3586 /* vim:set shiftwidth=2: */