1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
18 /* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
31 #include <sys/types.h>
35 extern char *calloc(), *malloc(), *realloc();
39 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
49 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
50 /* We can handle multibyte string. */
59 #ifndef DEBUG /* use the same approach as regex.c */
65 #define isgraph(C) (isprint(C) && !isspace(C))
68 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
69 #define ISALPHA(C) isalpha(C)
70 #define ISUPPER(C) isupper(C)
71 #define ISLOWER(C) islower(C)
72 #define ISDIGIT(C) isdigit(C)
73 #define ISXDIGIT(C) isxdigit(C)
74 #define ISSPACE(C) isspace(C)
75 #define ISPUNCT(C) ispunct(C)
76 #define ISALNUM(C) isalnum(C)
77 #define ISPRINT(C) isprint(C)
78 #define ISGRAPH(C) isgraph(C)
79 #define ISCNTRL(C) iscntrl(C)
81 #define ISALPHA(C) (isascii(C) && isalpha(C))
82 #define ISUPPER(C) (isascii(C) && isupper(C))
83 #define ISLOWER(C) (isascii(C) && islower(C))
84 #define ISDIGIT(C) (isascii(C) && isdigit(C))
85 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
86 #define ISSPACE(C) (isascii(C) && isspace(C))
87 #define ISPUNCT(C) (isascii(C) && ispunct(C))
88 #define ISALNUM(C) (isascii(C) && isalnum(C))
89 #define ISPRINT(C) (isascii(C) && isprint(C))
90 #define ISGRAPH(C) (isascii(C) && isgraph(C))
91 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
94 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
95 - Its arg may be any int or unsigned int; it need not be an unsigned char.
96 - It's guaranteed to evaluate its argument exactly once.
97 - It's typically faster.
98 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
99 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
100 it's important to use the locale's definition of `digit' even when the
101 host does not conform to Posix. */
102 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
104 /* If we (don't) have I18N. */
105 /* glibc defines _ */
107 # ifdef HAVE_LIBINTL_H
108 # include <libintl.h>
110 # define _(Str) gettext (Str)
113 # define _(Str) (Str)
119 #include "hard-locale.h"
121 /* HPUX, define those as macros in sys/param.h */
129 static void dfamust PARAMS ((struct dfa *dfa));
130 static void regexp PARAMS ((int toplevel));
133 xcalloc (size_t n, size_t s)
135 ptr_t r = calloc(n, s);
138 dfaerror(_("Memory exhausted"));
149 dfaerror(_("Memory exhausted"));
154 xrealloc (ptr_t p, size_t n)
156 ptr_t r = realloc(p, n);
160 dfaerror(_("Memory exhausted"));
164 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
165 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
166 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
168 /* Reallocate an array of type t if nalloc is too small for index. */
169 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
170 if ((index) >= (nalloc)) \
174 while ((index) >= (nalloc)); \
175 REALLOC(p, t, nalloc); \
186 fprintf(stderr, "END");
187 else if (t < NOTCHAR)
188 fprintf(stderr, "%c", t);
193 case EMPTY: s = "EMPTY"; break;
194 case BACKREF: s = "BACKREF"; break;
195 case BEGLINE: s = "BEGLINE"; break;
196 case ENDLINE: s = "ENDLINE"; break;
197 case BEGWORD: s = "BEGWORD"; break;
198 case ENDWORD: s = "ENDWORD"; break;
199 case LIMWORD: s = "LIMWORD"; break;
200 case NOTLIMWORD: s = "NOTLIMWORD"; break;
201 case QMARK: s = "QMARK"; break;
202 case STAR: s = "STAR"; break;
203 case PLUS: s = "PLUS"; break;
204 case CAT: s = "CAT"; break;
205 case OR: s = "OR"; break;
206 case ORTOP: s = "ORTOP"; break;
207 case LPAREN: s = "LPAREN"; break;
208 case RPAREN: s = "RPAREN"; break;
209 case CRANGE: s = "CRANGE"; break;
211 case ANYCHAR: s = "ANYCHAR"; break;
212 case MBCSET: s = "MBCSET"; break;
213 #endif /* MBS_SUPPORT */
214 default: s = "CSET"; break;
216 fprintf(stderr, "%s", s);
221 /* Stuff pertaining to charclasses. */
224 tstbit (unsigned b, charclass c)
226 return c[b / INTBITS] & 1 << b % INTBITS;
230 setbit (unsigned b, charclass c)
232 c[b / INTBITS] |= 1 << b % INTBITS;
236 clrbit (unsigned b, charclass c)
238 c[b / INTBITS] &= ~(1 << b % INTBITS);
242 copyset (charclass src, charclass dst)
244 memcpy (dst, src, sizeof (charclass));
248 zeroset (charclass s)
250 memset (s, 0, sizeof (charclass));
258 for (i = 0; i < CHARCLASS_INTS; ++i)
263 equal (charclass s1, charclass s2)
265 return memcmp (s1, s2, sizeof (charclass)) == 0;
268 /* A pointer to the current dfa is kept here during parsing. */
269 static struct dfa *dfa;
271 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
273 charclass_index (charclass s)
277 for (i = 0; i < dfa->cindex; ++i)
278 if (equal(s, dfa->charclasses[i]))
280 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
282 copyset(s, dfa->charclasses[i]);
286 /* Syntax bits controlling the behavior of the lexical analyzer. */
287 static reg_syntax_t syntax_bits, syntax_bits_set;
289 /* Flag for case-folding letters into sets. */
290 static int case_fold;
292 /* End-of-line byte in data. */
293 static unsigned char eolbyte;
295 /* Entry point to set syntax options. */
297 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
305 /* Like setbit, but if case is folded, set both cases of a letter. */
307 setbit_case_fold (unsigned b, charclass c)
313 setbit (tolower (b), c);
314 else if (ISLOWER (b))
315 setbit (toupper (b), c);
319 /* Lexical analyzer. All the dross that deals with the obnoxious
320 GNU Regex syntax bits is located here. The poor, suffering
321 reader is referred to the GNU Regex documentation for the
322 meaning of the @#%!@#%^!@ syntax bits. */
324 static char const *lexstart; /* Pointer to beginning of input string. */
325 static char const *lexptr; /* Pointer to next input character. */
326 static int lexleft; /* Number of characters remaining. */
327 static token lasttok; /* Previous token returned; initially END. */
328 static int laststart; /* True if we're separated from beginning or (, |
329 only by zero-width characters. */
330 static int parens; /* Count of outstanding left parens. */
331 static int minrep, maxrep; /* Repeat counts for {m,n}. */
332 static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
335 /* These variables are used only if (MB_CUR_MAX > 1). */
336 static mbstate_t mbs; /* Mbstate for mbrlen(). */
337 static int cur_mb_len; /* Byte length of the current scanning
338 multibyte character. */
339 static int cur_mb_index; /* Byte index of the current scanning multibyte
342 singlebyte character : cur_mb_index = 0
344 1st byte : cur_mb_index = 1
345 2nd byte : cur_mb_index = 2
347 nth byte : cur_mb_index = n */
348 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
349 Each element store the amount of remain
350 byte of corresponding multibyte character
351 in the input string. A element's value
352 is 0 if corresponding character is a
354 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
355 mblen_buf : 0, 3, 2, 1
357 static wchar_t *inputwcs; /* Wide character representation of input
359 The length of this array is same as
360 the length of input string(char array).
361 inputstring[i] is a single-byte char,
362 or 1st byte of a multibyte char.
363 And inputwcs[i] is the codepoint. */
364 static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */
365 static unsigned char const *buf_end; /* refference to end in dfaexec(). */
366 #endif /* MBS_SUPPORT */
369 /* This function update cur_mb_len, and cur_mb_index.
370 p points current lexptr, len is the remaining buffer length. */
372 update_mb_len_index (unsigned char const *p, int len)
374 /* If last character is a part of a multibyte character,
375 we update cur_mb_index. */
377 cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
380 /* If last character is a single byte character, or the
381 last portion of a multibyte character, we check whether
382 next character is a multibyte character or not. */
385 cur_mb_len = mbrlen(p, len, &mbs);
387 /* It is a multibyte character.
388 cur_mb_len was already set by mbrlen(). */
390 else if (cur_mb_len < 1)
391 /* Invalid sequence. We treat it as a singlebyte character.
392 cur_mb_index is aleady 0. */
394 /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
395 cur_mb_index is aleady 0. */
398 #endif /* MBS_SUPPORT */
401 /* Note that characters become unsigned here. */
402 # define FETCH(c, eoferr) \
409 return lasttok = END; \
411 if (MB_CUR_MAX > 1) \
412 update_mb_len_index(lexptr, lexleft); \
413 (c) = (unsigned char) *lexptr++; \
417 /* This function fetch a wide character, and update cur_mb_len,
418 used only if the current locale is a multibyte environment. */
420 fetch_wc (char const *eoferr)
431 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
437 lexptr += cur_mb_len;
438 lexleft -= cur_mb_len;
442 /* Note that characters become unsigned here. */
443 # define FETCH(c, eoferr) \
450 return lasttok = END; \
452 (c) = (unsigned char) *lexptr++; \
455 #endif /* MBS_SUPPORT */
458 /* Multibyte character handling sub-routin for lex.
459 This function parse a bracket expression and build a struct
462 parse_bracket_exp_mb ()
466 /* Work area to build a mb_char_classes. */
467 struct mb_char_classes *work_mbc;
468 int chars_al, range_sts_al, range_ends_al, ch_classes_al,
469 equivs_al, coll_elems_al;
471 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
472 dfa->mbcsets_alloc, dfa->nmbcsets + 1);
473 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
474 We will update dfa->multibyte_prop in addtok(), because we can't
475 decide the index in dfa->tokens[]. */
477 /* Initialize work are */
478 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
481 range_sts_al = range_ends_al = 0;
482 ch_classes_al = equivs_al = coll_elems_al = 0;
483 MALLOC(work_mbc->chars, wchar_t, chars_al);
485 work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
486 work_mbc->nequivs = work_mbc->ncoll_elems = 0;
487 work_mbc->chars = work_mbc->ch_classes = NULL;
488 work_mbc->range_sts = work_mbc->range_ends = NULL;
489 work_mbc->equivs = work_mbc->coll_elems = NULL;
491 wc = fetch_wc(_("Unbalanced ["));
494 wc = fetch_wc(_("Unbalanced ["));
495 work_mbc->invert = 1;
498 work_mbc->invert = 0;
501 wc1 = WEOF; /* mark wc1 is not initialized". */
503 /* Note that if we're looking at some other [:...:] construct,
504 we just treat it as a bunch of ordinary characters. We can do
505 this because we assume regex has checked for syntax errors before
506 dfa is ever called. */
507 if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
509 #define BRACKET_BUFFER_SIZE 128
510 char str[BRACKET_BUFFER_SIZE];
512 wc = fetch_wc(_("Unbalanced ["));
514 /* If pattern contains `[[:', `[[.', or `[[='. */
515 if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
518 unsigned char delim = (unsigned char)wc;
523 dfaerror (_("Unbalanced ["));
524 c = (unsigned char) *lexptr++;
527 if ((c == delim && *lexptr == ']') || lexleft == 0)
529 if (len < BRACKET_BUFFER_SIZE)
532 /* This is in any case an invalid class name. */
539 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
540 work_mbc->nchars + 2);
541 work_mbc->chars[work_mbc->nchars++] = L'[';
542 work_mbc->chars[work_mbc->nchars++] = delim;
546 if (--lexleft, *lexptr++ != ']')
547 dfaerror (_("Unbalanced ["));
549 /* build character class. */
552 /* Query the character class as wctype_t. */
555 if (ch_classes_al == 0)
556 MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al);
557 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
559 work_mbc->nch_classes + 1);
560 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
563 else if (delim == '=' || delim == '.')
566 MALLOC(elem, char, len + 1);
567 strncpy(elem, str, len + 1);
570 /* build equivalent class. */
573 MALLOC(work_mbc->equivs, char*, ++equivs_al);
574 REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
576 work_mbc->nequivs + 1);
577 work_mbc->equivs[work_mbc->nequivs++] = elem;
581 /* build collating element. */
583 if (coll_elems_al == 0)
584 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
585 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
587 work_mbc->ncoll_elems + 1);
588 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
594 /* We treat '[' as a normal character here. */
596 wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
601 if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
602 wc = fetch_wc(("Unbalanced ["));
606 wc1 = fetch_wc(_("Unbalanced ["));
609 /* build range characters. */
611 wc2 = fetch_wc(_("Unbalanced ["));
614 /* In the case [x-], the - is an ordinary hyphen,
615 which is left in c1, the lookahead character. */
616 lexptr -= cur_mb_len;
617 lexleft += cur_mb_len;
623 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
624 wc2 = fetch_wc(_("Unbalanced ["));
625 wc1 = fetch_wc(_("Unbalanced ["));
628 if (range_sts_al == 0)
630 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
631 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
633 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
634 range_sts_al, work_mbc->nranges + 1);
635 work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
636 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
637 range_ends_al, work_mbc->nranges + 1);
638 work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
641 /* build normal characters. */
643 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
644 work_mbc->nchars + 1);
645 work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
648 while ((wc = wc1) != L']');
650 #endif /* MBS_SUPPORT */
653 #define FUNC(F, P) static int F(int c) { return P(c); }
655 #define FUNC(F, P) static int F(c) int c; { return P(c); }
658 FUNC(is_alpha, ISALPHA)
659 FUNC(is_upper, ISUPPER)
660 FUNC(is_lower, ISLOWER)
661 FUNC(is_digit, ISDIGIT)
662 FUNC(is_xdigit, ISXDIGIT)
663 FUNC(is_space, ISSPACE)
664 FUNC(is_punct, ISPUNCT)
665 FUNC(is_alnum, ISALNUM)
666 FUNC(is_print, ISPRINT)
667 FUNC(is_graph, ISGRAPH)
668 FUNC(is_cntrl, ISCNTRL)
673 return (c == ' ' || c == '\t');
676 /* The following list maps the names of the Posix named character classes
677 to predicate functions that determine whether a given character is in
678 the class. The leading [ has already been eaten by the lexical analyzer. */
681 int (*pred) PARAMS ((int));
682 } const prednames[] = {
683 { ":alpha:]", is_alpha },
684 { ":upper:]", is_upper },
685 { ":lower:]", is_lower },
686 { ":digit:]", is_digit },
687 { ":xdigit:]", is_xdigit },
688 { ":space:]", is_space },
689 { ":punct:]", is_punct },
690 { ":alnum:]", is_alnum },
691 { ":print:]", is_print },
692 { ":graph:]", is_graph },
693 { ":cntrl:]", is_cntrl },
694 { ":blank:]", is_blank },
698 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
699 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
702 looking_at (char const *s)
709 return strncmp(s, lexptr, len) == 0;
716 int backslash = 0, invert;
720 /* Basic plan: We fetch a character. If it's a backslash,
721 we set the backslash flag and go through the loop again.
722 On the plus side, this avoids having a duplicate of the
723 main switch inside the backslash case. On the minus side,
724 it means that just about every case begins with
725 "if (backslash) ...". */
726 for (i = 0; i < 2; ++i)
730 if (MB_CUR_MAX > 1 && cur_mb_index)
731 /* If this is a part of a multi-byte character, we must treat
732 this byte data as a normal character.
733 e.g. In case of SJIS encoding, some character contains '\',
734 but they must not be backslash. */
736 #endif /* MBS_SUPPORT */
743 dfaerror(_("Unfinished \\ escape"));
750 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
754 return lasttok = BEGLINE;
760 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
762 || (syntax_bits & RE_NO_BK_PARENS
763 ? lexleft > 0 && *lexptr == ')'
764 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
765 || (syntax_bits & RE_NO_BK_VBAR
766 ? lexleft > 0 && *lexptr == '|'
767 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
768 || ((syntax_bits & RE_NEWLINE_ALT)
769 && lexleft > 0 && *lexptr == '\n'))
770 return lasttok = ENDLINE;
782 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
785 return lasttok = BACKREF;
790 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
791 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
795 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
796 return lasttok = ENDLINE; /* FIXME: should be end of string */
800 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
801 return lasttok = BEGWORD;
805 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
806 return lasttok = ENDWORD;
810 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
811 return lasttok = LIMWORD;
815 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
816 return lasttok = NOTLIMWORD;
820 if (syntax_bits & RE_LIMITED_OPS)
822 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
824 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
826 return lasttok = QMARK;
831 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
833 return lasttok = STAR;
836 if (syntax_bits & RE_LIMITED_OPS)
838 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
840 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
842 return lasttok = PLUS;
845 if (!(syntax_bits & RE_INTERVALS))
847 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
849 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
852 if (syntax_bits & RE_NO_BK_BRACES)
854 /* Scan ahead for a valid interval; if it's not valid,
855 treat it as a literal '{'. */
856 int lo = -1, hi = -1;
857 char const *p = lexptr;
858 char const *lim = p + lexleft;
859 for (; p != lim && ISASCIIDIGIT (*p); p++)
860 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
861 if (p != lim && *p == ',')
862 while (++p != lim && ISASCIIDIGIT (*p))
863 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
866 if (p == lim || *p != '}'
867 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
874 {M,} - minimum count, maximum is infinity
875 {M,N} - M through N */
876 FETCH(c, _("unfinished repeat count"));
877 if (ISASCIIDIGIT (c))
882 FETCH(c, _("unfinished repeat count"));
883 if (! ISASCIIDIGIT (c))
885 minrep = 10 * minrep + c - '0';
889 dfaerror(_("malformed repeat count"));
892 FETCH (c, _("unfinished repeat count"));
893 if (! ISASCIIDIGIT (c))
900 FETCH (c, _("unfinished repeat count"));
901 if (! ISASCIIDIGIT (c))
903 maxrep = 10 * maxrep + c - '0';
905 if (0 <= maxrep && maxrep < minrep)
906 dfaerror (_("malformed repeat count"));
911 if (!(syntax_bits & RE_NO_BK_BRACES))
914 dfaerror(_("malformed repeat count"));
915 FETCH(c, _("unfinished repeat count"));
918 dfaerror(_("malformed repeat count"));
920 return lasttok = REPMN;
923 if (syntax_bits & RE_LIMITED_OPS)
925 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
931 if (syntax_bits & RE_LIMITED_OPS
933 || !(syntax_bits & RE_NEWLINE_ALT))
939 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
943 return lasttok = LPAREN;
946 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
948 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
952 return lasttok = RPAREN;
960 /* In multibyte environment period must match with a single
961 character not a byte. So we use ANYCHAR. */
963 return lasttok = ANYCHAR;
965 #endif /* MBS_SUPPORT */
968 if (!(syntax_bits & RE_DOT_NEWLINE))
969 clrbit(eolbyte, ccl);
970 if (syntax_bits & RE_DOT_NOT_NULL)
973 return lasttok = CSET + charclass_index(ccl);
977 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
980 for (c2 = 0; c2 < NOTCHAR; ++c2)
981 if (IS_WORD_CONSTITUENT(c2))
986 return lasttok = CSET + charclass_index(ccl);
995 /* In multibyte environment a bracket expression may contain
996 multibyte characters, which must be treated as characters
997 (not bytes). So we parse it by parse_bracket_exp_mb(). */
998 parse_bracket_exp_mb();
999 return lasttok = MBCSET;
1003 FETCH(c, _("Unbalanced ["));
1006 FETCH(c, _("Unbalanced ["));
1013 /* Nobody ever said this had to be fast. :-)
1014 Note that if we're looking at some other [:...:]
1015 construct, we just treat it as a bunch of ordinary
1016 characters. We can do this because we assume
1017 regex has checked for syntax errors before
1018 dfa is ever called. */
1019 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1020 for (c1 = 0; prednames[c1].name; ++c1)
1021 if (looking_at(prednames[c1].name))
1023 int (*pred) PARAMS ((int)) = prednames[c1].pred;
1025 for (c2 = 0; c2 < NOTCHAR; ++c2)
1027 setbit_case_fold (c2, ccl);
1028 lexptr += strlen(prednames[c1].name);
1029 lexleft -= strlen(prednames[c1].name);
1030 FETCH(c1, _("Unbalanced ["));
1033 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1034 FETCH(c, _("Unbalanced ["));
1035 FETCH(c1, _("Unbalanced ["));
1038 FETCH(c2, _("Unbalanced ["));
1041 /* In the case [x-], the - is an ordinary hyphen,
1042 which is left in c1, the lookahead character. */
1049 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1050 FETCH(c2, _("Unbalanced ["));
1051 FETCH(c1, _("Unbalanced ["));
1052 if (!hard_LC_COLLATE) {
1053 for (; c <= c2; c++)
1054 setbit_case_fold (c, ccl);
1056 /* POSIX locales are painful - leave the decision to libc */
1057 char expr[6] = { '[', c, '-', c2, ']', '\0' };
1059 if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1060 for (c = 0; c < NOTCHAR; ++c) {
1061 char buf[2] = { c, '\0' };
1063 if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1064 && mat.rm_so == 0 && mat.rm_eo == 1)
1065 setbit_case_fold (c, ccl);
1074 setbit_case_fold (c, ccl);
1079 while ((c = c1) != ']');
1083 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1084 clrbit(eolbyte, ccl);
1086 return lasttok = CSET + charclass_index(ccl);
1091 if (case_fold && ISALPHA(c))
1094 setbit_case_fold (c, ccl);
1095 return lasttok = CSET + charclass_index(ccl);
1101 /* The above loop should consume at most a backslash
1102 and some other character. */
1104 return END; /* keeps pedantic compilers happy. */
1107 /* Recursive descent parser for regular expressions. */
1109 static token tok; /* Lookahead token. */
1110 static int depth; /* Current depth of a hypothetical stack
1111 holding deferred productions. This is
1112 used to determine the depth that will be
1113 required of the real stack later on in
1116 /* Add the given token to the parse tree, maintaining the depth count and
1117 updating the maximum depth if necessary. */
1124 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1126 /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */
1128 dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1129 else if (t < NOTCHAR)
1130 dfa->multibyte_prop[dfa->tindex]
1131 = (cur_mb_len == 1)? 3 /* single-byte char */
1132 : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1133 + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1135 /* It may be unnecesssary, but it is safer to treat other
1136 symbols as singlebyte characters. */
1137 dfa->multibyte_prop[dfa->tindex] = 3;
1141 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1142 dfa->tokens[dfa->tindex++] = t;
1163 if (depth > dfa->depth)
1167 /* The grammar understood by the parser is as follows.
1186 <multibyte character>
1198 LPAREN regexp RPAREN
1201 The parser builds a parse tree in postfix form in an array of tokens. */
1206 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1207 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1209 || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1210 #endif /* MBS_SUPPORT */
1211 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1216 /* We treat a multibyte character as a single atom, so that DFA
1217 can treat a multibyte character as a single expression.
1219 e.g. We construct following tree from "<mb1><mb2>".
1220 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1221 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1225 while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1232 #endif /* MBS_SUPPORT */
1234 else if (tok == CRANGE)
1236 /* A character range like "[a-z]" in a locale other than "C" or
1237 "POSIX". This range might any sequence of one or more
1238 characters. Unfortunately the POSIX locale primitives give
1239 us no practical way to find what character sequences might be
1240 matched. Treat this approximately like "(.\1)" -- i.e. match
1241 one character, and then punt to the full matcher. */
1245 addtok (CSET + charclass_index (ccl));
1250 else if (tok == LPAREN)
1255 dfaerror(_("Unbalanced ("));
1262 /* Return the number of tokens in the given subexpression. */
1264 nsubtoks (int tindex)
1268 switch (dfa->tokens[tindex - 1])
1275 return 1 + nsubtoks(tindex - 1);
1279 ntoks1 = nsubtoks(tindex - 1);
1280 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1284 /* Copy the given subexpression to the top of the tree. */
1286 copytoks (int tindex, int ntokens)
1290 for (i = 0; i < ntokens; ++i)
1291 addtok(dfa->tokens[tindex + i]);
1297 int tindex, ntokens, i;
1300 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1303 ntokens = nsubtoks(dfa->tindex);
1304 tindex = dfa->tindex - ntokens;
1309 for (i = 1; i < minrep; ++i)
1311 copytoks(tindex, ntokens);
1314 for (; i < maxrep; ++i)
1316 copytoks(tindex, ntokens);
1333 while (tok != RPAREN && tok != OR && tok >= 0)
1341 regexp (int toplevel)
1355 /* Main entry point for the parser. S is a string to be parsed, len is the
1356 length of the string, so s can include NUL characters. D is a pointer to
1357 the struct dfa to parse into. */
1359 dfaparse (char const *s, size_t len, struct dfa *d)
1362 lexstart = lexptr = s;
1367 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1373 memset(&mbs, 0, sizeof(mbstate_t));
1375 #endif /* MBS_SUPPORT */
1377 if (! syntax_bits_set)
1378 dfaerror(_("No syntax specified"));
1386 dfaerror(_("Unbalanced )"));
1388 addtok(END - d->nregexps);
1397 /* Some primitives for operating on sets of positions. */
1399 /* Copy one set to another; the destination must be large enough. */
1401 copy (position_set const *src, position_set *dst)
1405 for (i = 0; i < src->nelem; ++i)
1406 dst->elems[i] = src->elems[i];
1407 dst->nelem = src->nelem;
1410 /* Insert a position in a set. Position sets are maintained in sorted
1411 order according to index. If position already exists in the set with
1412 the same index then their constraints are logically or'd together.
1413 S->elems must point to an array large enough to hold the resulting set. */
1415 insert (position p, position_set *s)
1420 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1422 if (i < s->nelem && p.index == s->elems[i].index)
1423 s->elems[i].constraint |= p.constraint;
1428 while (i < s->nelem)
1437 /* Merge two sets of positions into a third. The result is exactly as if
1438 the positions of both sets were inserted into an initially empty set. */
1440 merge (position_set const *s1, position_set const *s2, position_set *m)
1445 while (i < s1->nelem && j < s2->nelem)
1446 if (s1->elems[i].index > s2->elems[j].index)
1447 m->elems[m->nelem++] = s1->elems[i++];
1448 else if (s1->elems[i].index < s2->elems[j].index)
1449 m->elems[m->nelem++] = s2->elems[j++];
1452 m->elems[m->nelem] = s1->elems[i++];
1453 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1455 while (i < s1->nelem)
1456 m->elems[m->nelem++] = s1->elems[i++];
1457 while (j < s2->nelem)
1458 m->elems[m->nelem++] = s2->elems[j++];
1461 /* Delete a position from a set. */
1463 delete (position p, position_set *s)
1467 for (i = 0; i < s->nelem; ++i)
1468 if (p.index == s->elems[i].index)
1471 for (--s->nelem; i < s->nelem; ++i)
1472 s->elems[i] = s->elems[i + 1];
1475 /* Find the index of the state corresponding to the given position set with
1476 the given preceding context, or create a new state if there is no such
1477 state. Newline and letter tell whether we got here on a newline or
1478 letter, respectively. */
1480 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1486 newline = newline ? 1 : 0;
1487 letter = letter ? 1 : 0;
1489 for (i = 0; i < s->nelem; ++i)
1490 hash ^= s->elems[i].index + s->elems[i].constraint;
1492 /* Try to find a state that exactly matches the proposed one. */
1493 for (i = 0; i < d->sindex; ++i)
1495 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1496 || newline != d->states[i].newline || letter != d->states[i].letter)
1498 for (j = 0; j < s->nelem; ++j)
1499 if (s->elems[j].constraint
1500 != d->states[i].elems.elems[j].constraint
1501 || s->elems[j].index != d->states[i].elems.elems[j].index)
1507 /* We'll have to create a new state. */
1508 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1509 d->states[i].hash = hash;
1510 MALLOC(d->states[i].elems.elems, position, s->nelem);
1511 copy(s, &d->states[i].elems);
1512 d->states[i].newline = newline;
1513 d->states[i].letter = letter;
1514 d->states[i].backref = 0;
1515 d->states[i].constraint = 0;
1516 d->states[i].first_end = 0;
1519 d->states[i].mbps.nelem = 0;
1521 for (j = 0; j < s->nelem; ++j)
1522 if (d->tokens[s->elems[j].index] < 0)
1524 constraint = s->elems[j].constraint;
1525 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1526 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1527 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1528 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1529 d->states[i].constraint |= constraint;
1530 if (! d->states[i].first_end)
1531 d->states[i].first_end = d->tokens[s->elems[j].index];
1533 else if (d->tokens[s->elems[j].index] == BACKREF)
1535 d->states[i].constraint = NO_CONSTRAINT;
1536 d->states[i].backref = 1;
1544 /* Find the epsilon closure of a set of positions. If any position of the set
1545 contains a symbol that matches the empty string in some context, replace
1546 that position with the elements of its follow labeled with an appropriate
1547 constraint. Repeat exhaustively until no funny positions are left.
1548 S->elems must be large enough to hold the result. */
1550 epsclosure (position_set *s, struct dfa const *d)
1556 MALLOC(visited, int, d->tindex);
1557 for (i = 0; i < d->tindex; ++i)
1560 for (i = 0; i < s->nelem; ++i)
1561 if (d->tokens[s->elems[i].index] >= NOTCHAR
1562 && d->tokens[s->elems[i].index] != BACKREF
1564 && d->tokens[s->elems[i].index] != ANYCHAR
1565 && d->tokens[s->elems[i].index] != MBCSET
1567 && d->tokens[s->elems[i].index] < CSET)
1570 p.constraint = old.constraint;
1571 delete(s->elems[i], s);
1572 if (visited[old.index])
1577 visited[old.index] = 1;
1578 switch (d->tokens[old.index])
1581 p.constraint &= BEGLINE_CONSTRAINT;
1584 p.constraint &= ENDLINE_CONSTRAINT;
1587 p.constraint &= BEGWORD_CONSTRAINT;
1590 p.constraint &= ENDWORD_CONSTRAINT;
1593 p.constraint &= LIMWORD_CONSTRAINT;
1596 p.constraint &= NOTLIMWORD_CONSTRAINT;
1601 for (j = 0; j < d->follows[old.index].nelem; ++j)
1603 p.index = d->follows[old.index].elems[j].index;
1606 /* Force rescan to start at the beginning. */
1613 /* Perform bottom-up analysis on the parse tree, computing various functions.
1614 Note that at this point, we're pretending constructs like \< are real
1615 characters rather than constraints on what can follow them.
1617 Nullable: A node is nullable if it is at the root of a regexp that can
1618 match the empty string.
1619 * EMPTY leaves are nullable.
1620 * No other leaf is nullable.
1621 * A QMARK or STAR node is nullable.
1622 * A PLUS node is nullable if its argument is nullable.
1623 * A CAT node is nullable if both its arguments are nullable.
1624 * An OR node is nullable if either argument is nullable.
1626 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1627 that could correspond to the first character of a string matching the
1628 regexp rooted at the given node.
1629 * EMPTY leaves have empty firstpos.
1630 * The firstpos of a nonempty leaf is that leaf itself.
1631 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1633 * The firstpos of a CAT node is the firstpos of the left argument, union
1634 the firstpos of the right if the left argument is nullable.
1635 * The firstpos of an OR node is the union of firstpos of each argument.
1637 Lastpos: The lastpos of a node is the set of positions that could
1638 correspond to the last character of a string matching the regexp at
1640 * EMPTY leaves have empty lastpos.
1641 * The lastpos of a nonempty leaf is that leaf itself.
1642 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1644 * The lastpos of a CAT node is the lastpos of its right argument, union
1645 the lastpos of the left if the right argument is nullable.
1646 * The lastpos of an OR node is the union of the lastpos of each argument.
1648 Follow: The follow of a position is the set of positions that could
1649 correspond to the character following a character matching the node in
1650 a string matching the regexp. At this point we consider special symbols
1651 that match the empty string in some context to be just normal characters.
1652 Later, if we find that a special symbol is in a follow set, we will
1653 replace it with the elements of its follow, labeled with an appropriate
1655 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1656 the follow of every node in the lastpos.
1657 * Every node in the firstpos of the second argument of a CAT node is in
1658 the follow of every node in the lastpos of the first argument.
1660 Because of the postfix representation of the parse tree, the depth-first
1661 analysis is conveniently done by a linear scan with the aid of a stack.
1662 Sets are stored as arrays of the elements, obeying a stack-like allocation
1663 scheme; the number of elements in each set deeper in the stack can be
1664 used to determine the address of a particular set's array. */
1666 dfaanalyze (struct dfa *d, int searchflag)
1668 int *nullable; /* Nullable stack. */
1669 int *nfirstpos; /* Element count stack for firstpos sets. */
1670 position *firstpos; /* Array where firstpos elements are stored. */
1671 int *nlastpos; /* Element count stack for lastpos sets. */
1672 position *lastpos; /* Array where lastpos elements are stored. */
1673 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1674 position_set tmp; /* Temporary set for merging sets. */
1675 position_set merged; /* Result of merging sets. */
1676 int wants_newline; /* True if some position wants newline info. */
1678 int *o_nfirst, *o_nlast;
1679 position *o_firstpos, *o_lastpos;
1684 fprintf(stderr, "dfaanalyze:\n");
1685 for (i = 0; i < d->tindex; ++i)
1687 fprintf(stderr, " %d:", i);
1688 prtok(d->tokens[i]);
1693 d->searchflag = searchflag;
1695 MALLOC(nullable, int, d->depth);
1696 o_nullable = nullable;
1697 MALLOC(nfirstpos, int, d->depth);
1698 o_nfirst = nfirstpos;
1699 MALLOC(firstpos, position, d->nleaves);
1700 o_firstpos = firstpos, firstpos += d->nleaves;
1701 MALLOC(nlastpos, int, d->depth);
1703 MALLOC(lastpos, position, d->nleaves);
1704 o_lastpos = lastpos, lastpos += d->nleaves;
1705 MALLOC(nalloc, int, d->tindex);
1706 for (i = 0; i < d->tindex; ++i)
1708 MALLOC(merged.elems, position, d->nleaves);
1710 CALLOC(d->follows, position_set, d->tindex);
1712 for (i = 0; i < d->tindex; ++i)
1714 { /* Nonsyntactic #ifdef goo... */
1716 switch (d->tokens[i])
1719 /* The empty set is nullable. */
1722 /* The firstpos and lastpos of the empty leaf are both empty. */
1723 *nfirstpos++ = *nlastpos++ = 0;
1728 /* Every element in the firstpos of the argument is in the follow
1729 of every element in the lastpos. */
1730 tmp.nelem = nfirstpos[-1];
1731 tmp.elems = firstpos;
1733 for (j = 0; j < nlastpos[-1]; ++j)
1735 merge(&tmp, &d->follows[pos[j].index], &merged);
1736 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1737 nalloc[pos[j].index], merged.nelem - 1);
1738 copy(&merged, &d->follows[pos[j].index]);
1742 /* A QMARK or STAR node is automatically nullable. */
1743 if (d->tokens[i] != PLUS)
1748 /* Every element in the firstpos of the second argument is in the
1749 follow of every element in the lastpos of the first argument. */
1750 tmp.nelem = nfirstpos[-1];
1751 tmp.elems = firstpos;
1752 pos = lastpos + nlastpos[-1];
1753 for (j = 0; j < nlastpos[-2]; ++j)
1755 merge(&tmp, &d->follows[pos[j].index], &merged);
1756 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1757 nalloc[pos[j].index], merged.nelem - 1);
1758 copy(&merged, &d->follows[pos[j].index]);
1761 /* The firstpos of a CAT node is the firstpos of the first argument,
1762 union that of the second argument if the first is nullable. */
1764 nfirstpos[-2] += nfirstpos[-1];
1766 firstpos += nfirstpos[-1];
1769 /* The lastpos of a CAT node is the lastpos of the second argument,
1770 union that of the first argument if the second is nullable. */
1772 nlastpos[-2] += nlastpos[-1];
1775 pos = lastpos + nlastpos[-2];
1776 for (j = nlastpos[-1] - 1; j >= 0; --j)
1777 pos[j] = lastpos[j];
1778 lastpos += nlastpos[-2];
1779 nlastpos[-2] = nlastpos[-1];
1783 /* A CAT node is nullable if both arguments are nullable. */
1784 nullable[-2] = nullable[-1] && nullable[-2];
1790 /* The firstpos is the union of the firstpos of each argument. */
1791 nfirstpos[-2] += nfirstpos[-1];
1794 /* The lastpos is the union of the lastpos of each argument. */
1795 nlastpos[-2] += nlastpos[-1];
1798 /* An OR node is nullable if either argument is nullable. */
1799 nullable[-2] = nullable[-1] || nullable[-2];
1804 /* Anything else is a nonempty position. (Note that special
1805 constructs like \< are treated as nonempty strings here;
1806 an "epsilon closure" effectively makes them nullable later.
1807 Backreferences have to get a real position so we can detect
1808 transitions on them later. But they are nullable. */
1809 *nullable++ = d->tokens[i] == BACKREF;
1811 /* This position is in its own firstpos and lastpos. */
1812 *nfirstpos++ = *nlastpos++ = 1;
1813 --firstpos, --lastpos;
1814 firstpos->index = lastpos->index = i;
1815 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1817 /* Allocate the follow set for this position. */
1819 MALLOC(d->follows[i].elems, position, nalloc[i]);
1823 /* ... balance the above nonsyntactic #ifdef goo... */
1824 fprintf(stderr, "node %d:", i);
1825 prtok(d->tokens[i]);
1827 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1828 fprintf(stderr, " firstpos:");
1829 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1831 fprintf(stderr, " %d:", firstpos[j].index);
1832 prtok(d->tokens[firstpos[j].index]);
1834 fprintf(stderr, "\n lastpos:");
1835 for (j = nlastpos[-1] - 1; j >= 0; --j)
1837 fprintf(stderr, " %d:", lastpos[j].index);
1838 prtok(d->tokens[lastpos[j].index]);
1844 /* For each follow set that is the follow set of a real position, replace
1845 it with its epsilon closure. */
1846 for (i = 0; i < d->tindex; ++i)
1847 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1849 || d->tokens[i] == ANYCHAR
1850 || d->tokens[i] == MBCSET
1852 || d->tokens[i] >= CSET)
1855 fprintf(stderr, "follows(%d:", i);
1856 prtok(d->tokens[i]);
1857 fprintf(stderr, "):");
1858 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1860 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1861 prtok(d->tokens[d->follows[i].elems[j].index]);
1865 copy(&d->follows[i], &merged);
1866 epsclosure(&merged, d);
1867 if (d->follows[i].nelem < merged.nelem)
1868 REALLOC(d->follows[i].elems, position, merged.nelem);
1869 copy(&merged, &d->follows[i]);
1872 /* Get the epsilon closure of the firstpos of the regexp. The result will
1873 be the set of positions of state 0. */
1875 for (i = 0; i < nfirstpos[-1]; ++i)
1876 insert(firstpos[i], &merged);
1877 epsclosure(&merged, d);
1879 /* Check if any of the positions of state 0 will want newline context. */
1881 for (i = 0; i < merged.nelem; ++i)
1882 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1885 /* Build the initial state. */
1888 MALLOC(d->states, dfa_state, d->salloc);
1889 state_index(d, &merged, wants_newline, 0);
1900 /* Find, for each character, the transition out of state s of d, and store
1901 it in the appropriate slot of trans.
1903 We divide the positions of s into groups (positions can appear in more
1904 than one group). Each group is labeled with a set of characters that
1905 every position in the group matches (taking into account, if necessary,
1906 preceding context information of s). For each group, find the union
1907 of the its elements' follows. This set is the set of positions of the
1908 new state. For each character in the group's label, set the transition
1909 on this character to be to a state corresponding to the set's positions,
1910 and its associated backward context information, if necessary.
1912 If we are building a searching matcher, we include the positions of state
1915 The collection of groups is constructed by building an equivalence-class
1916 partition of the positions of s.
1918 For each position, find the set of characters C that it matches. Eliminate
1919 any characters from C that fail on grounds of backward context.
1921 Search through the groups, looking for a group whose label L has nonempty
1922 intersection with C. If L - C is nonempty, create a new group labeled
1923 L - C and having the same positions as the current group, and set L to
1924 the intersection of L and C. Insert the position in this group, set
1925 C = C - L, and resume scanning.
1927 If after comparing with every group there are characters remaining in C,
1928 create a new group labeled with the characters of C and insert this
1929 position in that group. */
1931 dfastate (int s, struct dfa *d, int trans[])
1933 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1934 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1935 int ngrps = 0; /* Number of groups actually used. */
1936 position pos; /* Current position being considered. */
1937 charclass matches; /* Set of matching characters. */
1938 int matchesf; /* True if matches is nonempty. */
1939 charclass intersect; /* Intersection with some label set. */
1940 int intersectf; /* True if intersect is nonempty. */
1941 charclass leftovers; /* Stuff in the label that didn't match. */
1942 int leftoversf; /* True if leftovers is nonempty. */
1943 static charclass letters; /* Set of characters considered letters. */
1944 static charclass newline; /* Set of characters that aren't newline. */
1945 position_set follows; /* Union of the follows of some group. */
1946 position_set tmp; /* Temporary space for merging sets. */
1947 int state; /* New state. */
1948 int wants_newline; /* New state wants to know newline context. */
1949 int state_newline; /* New state on a newline transition. */
1950 int wants_letter; /* New state wants to know letter context. */
1951 int state_letter; /* New state on a letter transition. */
1952 static int initialized; /* Flag for static initialization. */
1954 int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */
1958 /* Initialize the set of letters, if necessary. */
1962 for (i = 0; i < NOTCHAR; ++i)
1963 if (IS_WORD_CONSTITUENT(i))
1965 setbit(eolbyte, newline);
1970 for (i = 0; i < d->states[s].elems.nelem; ++i)
1972 pos = d->states[s].elems.elems[i];
1973 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1974 setbit(d->tokens[pos.index], matches);
1975 else if (d->tokens[pos.index] >= CSET)
1976 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1978 else if (d->tokens[pos.index] == ANYCHAR
1979 || d->tokens[pos.index] == MBCSET)
1980 /* MB_CUR_MAX > 1 */
1982 /* ANYCHAR and MBCSET must match with a single character, so we
1983 must put it to d->states[s].mbps, which contains the positions
1984 which can match with a single character not a byte. */
1985 if (d->states[s].mbps.nelem == 0)
1987 MALLOC(d->states[s].mbps.elems, position,
1988 d->states[s].elems.nelem);
1990 insert(pos, &(d->states[s].mbps));
1993 #endif /* MBS_SUPPORT */
1997 /* Some characters may need to be eliminated from matches because
1998 they fail in the current context. */
1999 if (pos.constraint != 0xFF)
2001 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2002 d->states[s].newline, 1))
2003 clrbit(eolbyte, matches);
2004 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2005 d->states[s].newline, 0))
2006 for (j = 0; j < CHARCLASS_INTS; ++j)
2007 matches[j] &= newline[j];
2008 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2009 d->states[s].letter, 1))
2010 for (j = 0; j < CHARCLASS_INTS; ++j)
2011 matches[j] &= ~letters[j];
2012 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2013 d->states[s].letter, 0))
2014 for (j = 0; j < CHARCLASS_INTS; ++j)
2015 matches[j] &= letters[j];
2017 /* If there are no characters left, there's no point in going on. */
2018 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2020 if (j == CHARCLASS_INTS)
2024 for (j = 0; j < ngrps; ++j)
2026 /* If matches contains a single character only, and the current
2027 group's label doesn't contain that character, go on to the
2029 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2030 && !tstbit(d->tokens[pos.index], labels[j]))
2033 /* Check if this group's label has a nonempty intersection with
2036 for (k = 0; k < CHARCLASS_INTS; ++k)
2037 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2041 /* It does; now find the set differences both ways. */
2042 leftoversf = matchesf = 0;
2043 for (k = 0; k < CHARCLASS_INTS; ++k)
2045 /* Even an optimizing compiler can't know this for sure. */
2046 int match = matches[k], label = labels[j][k];
2048 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2049 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2052 /* If there were leftovers, create a new group labeled with them. */
2055 copyset(leftovers, labels[ngrps]);
2056 copyset(intersect, labels[j]);
2057 MALLOC(grps[ngrps].elems, position, d->nleaves);
2058 copy(&grps[j], &grps[ngrps]);
2062 /* Put the position in the current group. Note that there is no
2063 reason to call insert() here. */
2064 grps[j].elems[grps[j].nelem++] = pos;
2066 /* If every character matching the current position has been
2067 accounted for, we're done. */
2072 /* If we've passed the last group, and there are still characters
2073 unaccounted for, then we'll have to create a new group. */
2076 copyset(matches, labels[ngrps]);
2078 MALLOC(grps[ngrps].elems, position, d->nleaves);
2079 grps[ngrps].nelem = 1;
2080 grps[ngrps].elems[0] = pos;
2085 MALLOC(follows.elems, position, d->nleaves);
2086 MALLOC(tmp.elems, position, d->nleaves);
2088 /* If we are a searching matcher, the default transition is to a state
2089 containing the positions of state 0, otherwise the default transition
2090 is to fail miserably. */
2095 for (i = 0; i < d->states[0].elems.nelem; ++i)
2097 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2099 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2102 copy(&d->states[0].elems, &follows);
2103 state = state_index(d, &follows, 0, 0);
2105 state_newline = state_index(d, &follows, 1, 0);
2107 state_newline = state;
2109 state_letter = state_index(d, &follows, 0, 1);
2111 state_letter = state;
2112 for (i = 0; i < NOTCHAR; ++i)
2113 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2114 trans[eolbyte] = state_newline;
2117 for (i = 0; i < NOTCHAR; ++i)
2120 for (i = 0; i < ngrps; ++i)
2124 /* Find the union of the follows of the positions of the group.
2125 This is a hideously inefficient loop. Fix it someday. */
2126 for (j = 0; j < grps[i].nelem; ++j)
2127 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2128 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2133 /* If a token in follows.elems is not 1st byte of a multibyte
2134 character, or the states of follows must accept the bytes
2135 which are not 1st byte of the multibyte character.
2136 Then, if a state of follows encounter a byte, it must not be
2137 a 1st byte of a multibyte character nor singlebyte character.
2138 We cansel to add state[0].follows to next state, because
2139 state[0] must accept 1st-byte
2141 For example, we assume <sb a> is a certain singlebyte
2142 character, <mb A> is a certain multibyte character, and the
2143 codepoint of <sb a> equals the 2nd byte of the codepoint of
2145 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2146 by accepting accepts 1st byte of <mb A>, and state[i+1]
2147 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2148 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2149 <mb A>, so we can not add state[0]. */
2151 next_isnt_1st_byte = 0;
2152 for (j = 0; j < follows.nelem; ++j)
2154 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2156 next_isnt_1st_byte = 1;
2163 /* If we are building a searching matcher, throw in the positions
2164 of state 0 as well. */
2166 if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2170 for (j = 0; j < d->states[0].elems.nelem; ++j)
2171 insert(d->states[0].elems.elems[j], &follows);
2173 /* Find out if the new state will want any context information. */
2175 if (tstbit(eolbyte, labels[i]))
2176 for (j = 0; j < follows.nelem; ++j)
2177 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2181 for (j = 0; j < CHARCLASS_INTS; ++j)
2182 if (labels[i][j] & letters[j])
2184 if (j < CHARCLASS_INTS)
2185 for (j = 0; j < follows.nelem; ++j)
2186 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2189 /* Find the state(s) corresponding to the union of the follows. */
2190 state = state_index(d, &follows, 0, 0);
2192 state_newline = state_index(d, &follows, 1, 0);
2194 state_newline = state;
2196 state_letter = state_index(d, &follows, 0, 1);
2198 state_letter = state;
2200 /* Set the transitions for each character in the current label. */
2201 for (j = 0; j < CHARCLASS_INTS; ++j)
2202 for (k = 0; k < INTBITS; ++k)
2203 if (labels[i][j] & 1 << k)
2205 int c = j * INTBITS + k;
2208 trans[c] = state_newline;
2209 else if (IS_WORD_CONSTITUENT(c))
2210 trans[c] = state_letter;
2211 else if (c < NOTCHAR)
2216 for (i = 0; i < ngrps; ++i)
2217 free(grps[i].elems);
2218 free(follows.elems);
2222 /* Some routines for manipulating a compiled dfa's transition tables.
2223 Each state may or may not have a transition table; if it does, and it
2224 is a non-accepting state, then d->trans[state] points to its table.
2225 If it is an accepting state then d->fails[state] points to its table.
2226 If it has no table at all, then d->trans[state] is NULL.
2227 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2230 build_state (int s, struct dfa *d)
2232 int *trans; /* The new transition table. */
2235 /* Set an upper limit on the number of transition tables that will ever
2236 exist at once. 1024 is arbitrary. The idea is that the frequently
2237 used transition tables will be quickly rebuilt, whereas the ones that
2238 were only needed once or twice will be cleared away. */
2239 if (d->trcount >= 1024)
2241 for (i = 0; i < d->tralloc; ++i)
2244 free((ptr_t) d->trans[i]);
2247 else if (d->fails[i])
2249 free((ptr_t) d->fails[i]);
2257 /* Set up the success bits for this state. */
2259 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2262 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2265 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2269 MALLOC(trans, int, NOTCHAR);
2270 dfastate(s, d, trans);
2272 /* Now go through the new transition table, and make sure that the trans
2273 and fail arrays are allocated large enough to hold a pointer for the
2274 largest state mentioned in the table. */
2275 for (i = 0; i < NOTCHAR; ++i)
2276 if (trans[i] >= d->tralloc)
2278 int oldalloc = d->tralloc;
2280 while (trans[i] >= d->tralloc)
2282 REALLOC(d->realtrans, int *, d->tralloc + 1);
2283 d->trans = d->realtrans + 1;
2284 REALLOC(d->fails, int *, d->tralloc);
2285 REALLOC(d->success, int, d->tralloc);
2286 while (oldalloc < d->tralloc)
2288 d->trans[oldalloc] = NULL;
2289 d->fails[oldalloc++] = NULL;
2293 /* Newline is a sentinel. */
2294 trans[eolbyte] = -1;
2296 if (ACCEPTING(s, *d))
2297 d->fails[s] = trans;
2299 d->trans[s] = trans;
2303 build_state_zero (struct dfa *d)
2307 CALLOC(d->realtrans, int *, d->tralloc + 1);
2308 d->trans = d->realtrans + 1;
2309 CALLOC(d->fails, int *, d->tralloc);
2310 MALLOC(d->success, int, d->tralloc);
2315 /* Multibyte character handling sub-routins for dfaexec. */
2317 /* Initial state may encounter the byte which is not a singlebyte character
2318 nor 1st byte of a multibyte character. But it is incorrect for initial
2319 state to accept such a byte.
2320 For example, in sjis encoding the regular expression like "\\" accepts
2321 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2322 0x815c. Then Initial state must skip the bytes which are not a singlebyte
2323 character nor 1st byte of a multibyte character. */
2324 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2327 while (inputwcs[p - buf_begin] == 0 \
2328 && mblen_buf[p - buf_begin] > 0 \
2335 return (size_t) -1; \
2340 realloc_trans_if_necessary(struct dfa *d, int new_state)
2342 /* Make sure that the trans and fail arrays are allocated large enough
2343 to hold a pointer for the new state. */
2344 if (new_state >= d->tralloc)
2346 int oldalloc = d->tralloc;
2348 while (new_state >= d->tralloc)
2350 REALLOC(d->realtrans, int *, d->tralloc + 1);
2351 d->trans = d->realtrans + 1;
2352 REALLOC(d->fails, int *, d->tralloc);
2353 REALLOC(d->success, int, d->tralloc);
2354 while (oldalloc < d->tralloc)
2356 d->trans[oldalloc] = NULL;
2357 d->fails[oldalloc++] = NULL;
2362 /* Return values of transit_state_singlebyte(), and
2363 transit_state_consume_1char. */
2366 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2367 TRANSIT_STATE_DONE, /* State transition has finished. */
2368 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2369 } status_transit_state;
2371 /* Consume a single byte and transit state from 's' to '*next_state'.
2372 This function is almost same as the state transition routin in dfaexec().
2373 But state transition is done just once, otherwise matching succeed or
2374 reach the end of the buffer. */
2375 static status_transit_state
2376 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2382 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2384 while (rval == TRANSIT_STATE_IN_PROGRESS)
2386 if ((t = d->trans[works]) != NULL)
2389 rval = TRANSIT_STATE_DONE;
2396 /* At the moment, it must not happen. */
2397 return TRANSIT_STATE_END_BUFFER;
2400 else if (d->fails[works])
2402 works = d->fails[works][*p];
2403 rval = TRANSIT_STATE_DONE;
2407 build_state(works, d);
2410 *next_state = works;
2414 /* Check whether period can match or not in the current context. If it can,
2415 return the amount of the bytes with which period can match, otherwise
2417 `pos' is the position of the period. `index' is the index from the
2418 buf_begin, and it is the current position in the buffer. */
2420 match_anychar (struct dfa *d, int s, position pos, int index)
2427 wc = inputwcs[index];
2428 mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2430 /* Check context. */
2431 if (wc == (wchar_t)eolbyte)
2433 if (!(syntax_bits & RE_DOT_NEWLINE))
2437 else if (wc == (wchar_t)'\0')
2439 if (syntax_bits & RE_DOT_NOT_NULL)
2444 if (iswalnum(wc) || wc == L'_')
2447 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2448 newline, d->states[s].letter, letter))
2454 /* Check whether bracket expression can match or not in the current context.
2455 If it can, return the amount of the bytes with which expression can match,
2457 `pos' is the position of the bracket expression. `index' is the index
2458 from the buf_begin, and it is the current position in the buffer. */
2460 match_mb_charset (struct dfa *d, int s, position pos, int index)
2463 int match; /* Flag which represent that matching succeed. */
2464 int match_len; /* Length of the character (or collating element)
2465 with which this operator match. */
2466 int op_len; /* Length of the operator. */
2470 /* Pointer to the structure to which we are currently reffering. */
2471 struct mb_char_classes *work_mbc;
2475 wchar_t wc; /* Current reffering character. */
2477 wc = inputwcs[index];
2479 /* Check context. */
2480 if (wc == (wchar_t)eolbyte)
2482 if (!(syntax_bits & RE_DOT_NEWLINE))
2486 else if (wc == (wchar_t)'\0')
2488 if (syntax_bits & RE_DOT_NOT_NULL)
2492 if (iswalnum(wc) || wc == L'_')
2494 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2495 newline, d->states[s].letter, letter))
2498 /* Assign the current reffering operator to work_mbc. */
2499 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2500 match = !work_mbc->invert;
2501 match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2503 /* match with a character class? */
2504 for (i = 0; i<work_mbc->nch_classes; i++)
2506 if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2507 goto charset_matched;
2510 strncpy(buffer, buf_begin + index, match_len);
2511 buffer[match_len] = '\0';
2513 /* match with an equivalent class? */
2514 for (i = 0; i<work_mbc->nequivs; i++)
2516 op_len = strlen(work_mbc->equivs[i]);
2517 strncpy(buffer, buf_begin + index, op_len);
2518 buffer[op_len] = '\0';
2519 if (strcoll(work_mbc->equivs[i], buffer) == 0)
2522 goto charset_matched;
2526 /* match with a collating element? */
2527 for (i = 0; i<work_mbc->ncoll_elems; i++)
2529 op_len = strlen(work_mbc->coll_elems[i]);
2530 strncpy(buffer, buf_begin + index, op_len);
2531 buffer[op_len] = '\0';
2533 if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2536 goto charset_matched;
2541 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2543 /* match with a range? */
2544 for (i = 0; i<work_mbc->nranges; i++)
2546 wcbuf[2] = work_mbc->range_sts[i];
2547 wcbuf[4] = work_mbc->range_ends[i];
2549 if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2550 wcscoll(wcbuf+4, wcbuf) >= 0)
2551 goto charset_matched;
2554 /* match with a character? */
2557 for (i = 0; i<work_mbc->nchars; i++)
2559 if (wc == work_mbc->chars[i])
2560 goto charset_matched;
2566 return match ? match_len : 0;
2569 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2570 array which corresponds to `d->states[s].mbps.elem' and each element of
2571 the array contains the amount of the bytes with which the element can
2573 `index' is the index from the buf_begin, and it is the current position
2575 Caller MUST free the array which this function return. */
2577 check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2582 MALLOC(rarray, int, d->states[s].mbps.nelem);
2583 for (i = 0; i < d->states[s].mbps.nelem; ++i)
2585 position pos = d->states[s].mbps.elems[i];
2586 switch(d->tokens[pos.index])
2589 rarray[i] = match_anychar(d, s, pos, index);
2592 rarray[i] = match_mb_charset(d, s, pos, index);
2595 break; /* can not happen. */
2601 /* Consume a single character and enumerate all of the positions which can
2602 be next position from the state `s'.
2603 `match_lens' is the input. It can be NULL, but it can also be the output
2604 of check_matching_with_multibyte_ops() for optimization.
2605 `mbclen' and `pps' are the output. `mbclen' is the length of the
2606 character consumed, and `pps' is the set this function enumerate. */
2607 static status_transit_state
2608 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2609 int *match_lens, int *mbclen, position_set *pps)
2614 status_transit_state rs = TRANSIT_STATE_DONE;
2616 /* Calculate the length of the (single/multi byte) character
2617 to which p points. */
2618 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2619 : mblen_buf[*pp - buf_begin];
2621 /* Calculate the state which can be reached from the state `s' by
2622 consuming `*mbclen' single bytes from the buffer. */
2624 for (i = 0; i < *mbclen; i++)
2627 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2629 /* Copy the positions contained by `s1' to the set `pps'. */
2630 copy(&(d->states[s1].elems), pps);
2632 /* Check (inputed)match_lens, and initialize if it is NULL. */
2633 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2634 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2636 work_mbls = match_lens;
2638 /* Add all of the positions which can be reached from `s' by consuming
2639 a single character. */
2640 for (i = 0; i < d->states[s].mbps.nelem ; i++)
2642 if (work_mbls[i] == *mbclen)
2643 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2645 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2649 if (match_lens == NULL && work_mbls != NULL)
2654 /* Transit state from s, then return new state and update the pointer of the
2655 buffer. This function is for some operator which can match with a multi-
2656 byte character or a collating element(which may be multi characters). */
2658 transit_state (struct dfa *d, int s, unsigned char const **pp)
2661 int mbclen; /* The length of current input multibyte character. */
2664 int *match_lens = NULL;
2665 int nelem = d->states[s].mbps.nelem; /* Just a alias. */
2666 position_set follows;
2667 unsigned char const *p1 = *pp;
2668 status_transit_state rs;
2672 /* This state has (a) multibyte operator(s).
2673 We check whether each of them can match or not. */
2675 /* Note: caller must free the return value of this function. */
2676 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2678 for (i = 0; i < nelem; i++)
2679 /* Search the operator which match the longest string,
2682 if (match_lens[i] > maxlen)
2683 maxlen = match_lens[i];
2687 if (nelem == 0 || maxlen == 0)
2688 /* This state has no multibyte operator which can match.
2689 We need to check only one singlebyte character. */
2691 status_transit_state rs;
2692 rs = transit_state_singlebyte(d, s, *pp, &s1);
2694 /* We must update the pointer if state transition succeeded. */
2695 if (rs == TRANSIT_STATE_DONE)
2698 if (match_lens != NULL)
2703 /* This state has some operators which can match a multibyte character. */
2705 MALLOC(follows.elems, position, d->nleaves);
2707 /* `maxlen' may be longer than the length of a character, because it may
2708 not be a character but a (multi character) collating element.
2709 We enumerate all of the positions which `s' can reach by consuming
2711 rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2713 wc = inputwcs[*pp - mbclen - buf_begin];
2714 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2715 realloc_trans_if_necessary(d, s1);
2717 while (*pp - p1 < maxlen)
2720 rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2722 for (i = 0; i < nelem ; i++)
2724 if (match_lens[i] == *pp - p1)
2726 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2727 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2731 wc = inputwcs[*pp - mbclen - buf_begin];
2732 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2733 realloc_trans_if_necessary(d, s1);
2736 free(follows.elems);
2742 /* Search through a buffer looking for a match to the given struct dfa.
2743 Find the first occurrence of a string matching the regexp in the buffer,
2744 and the shortest possible version thereof. Return the offset of the first
2745 character after the match, or (size_t) -1 if none is found. BEGIN points to
2746 the beginning of the buffer, and SIZE is the size of the buffer. If SIZE
2747 is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place
2748 where we're supposed to store a 1 if backreferencing happened and the
2749 match needs to be verified by a backtracking matcher. Otherwise
2750 we store a 0 in *backref. */
2752 dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2754 register int s; /* Current state. */
2755 register unsigned char const *p; /* Current input character. */
2756 register unsigned char const *end; /* One past the last input character. */
2757 register int **trans, *t; /* Copy of d->trans so it can be optimized
2759 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
2760 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
2761 static int sbit_init;
2768 for (i = 0; i < NOTCHAR; ++i)
2769 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2774 build_state_zero(d);
2777 p = (unsigned char const *) begin;
2784 int remain_bytes, i;
2788 /* initialize mblen_buf, and inputwcs. */
2789 MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2790 MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2791 memset(&mbs, 0, sizeof(mbstate_t));
2793 for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2795 if (remain_bytes == 0)
2798 = mbrtowc(inputwcs + i, begin + i,
2799 end - (unsigned char const *)begin - i + 1, &mbs);
2800 if (remain_bytes <= 1)
2803 inputwcs[i] = (wchar_t)begin[i];
2808 mblen_buf[i] = remain_bytes;
2814 mblen_buf[i] = remain_bytes;
2820 inputwcs[i] = 0; /* sentinel */
2822 #endif /* MBS_SUPPORT */
2828 while ((t = trans[s]))
2830 if (d->states[s].mbps.nelem != 0)
2832 /* Can match with a multibyte character( and multi character
2833 collating element). */
2834 unsigned char const *nextp;
2836 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2839 s = transit_state(d, s, &nextp);
2842 /* Trans table might be updated. */
2847 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2852 #endif /* MBS_SUPPORT */
2853 while ((t = trans[s]))
2866 #endif /* MBS_SUPPORT */
2871 else if ((t = d->fails[s]))
2873 if (d->success[s] & sbit[*p])
2876 *backref = (d->states[s].backref != 0);
2883 #endif /* MBS_SUPPORT */
2884 return (char const *) p - begin;
2890 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2891 if (d->states[s].mbps.nelem != 0)
2893 /* Can match with a multibyte character( and multi
2894 character collating element). */
2895 unsigned char const *nextp;
2897 s = transit_state(d, s, &nextp);
2900 /* Trans table might be updated. */
2907 #endif /* MBS_SUPPORT */
2918 /* Initialize the components of a dfa that the other routines don't
2919 initialize for themselves. */
2921 dfainit (struct dfa *d)
2924 MALLOC(d->charclasses, charclass, d->calloc);
2928 MALLOC(d->tokens, token, d->talloc);
2929 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2933 d->nmultibyte_prop = 1;
2934 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2936 d->mbcsets_alloc = 1;
2937 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2947 /* Parse and analyze a single string of the given length. */
2949 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2951 if (case_fold) /* dummy folding in service of dfamust() */
2956 lcopy = malloc(len);
2958 dfaerror(_("out of memory"));
2960 /* This is a kludge. */
2962 for (i = 0; i < len; ++i)
2963 if (ISUPPER ((unsigned char) s[i]))
2964 lcopy[i] = tolower ((unsigned char) s[i]);
2969 dfaparse(lcopy, len, d);
2972 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2974 dfaparse(s, len, d);
2975 dfaanalyze(d, searchflag);
2980 dfaparse(s, len, d);
2982 dfaanalyze(d, searchflag);
2986 /* Free the storage held by the components of a dfa. */
2988 dfafree (struct dfa *d)
2991 struct dfamust *dm, *ndm;
2993 free((ptr_t) d->charclasses);
2994 free((ptr_t) d->tokens);
2999 free((ptr_t) d->multibyte_prop);
3000 for (i = 0; i < d->nmbcsets; ++i)
3003 struct mb_char_classes *p = &(d->mbcsets[i]);
3004 if (p->chars != NULL)
3006 if (p->ch_classes != NULL)
3007 free(p->ch_classes);
3008 if (p->range_sts != NULL)
3010 if (p->range_ends != NULL)
3011 free(p->range_ends);
3013 for (j = 0; j < p->nequivs; ++j)
3015 if (p->equivs != NULL)
3018 for (j = 0; j < p->ncoll_elems; ++j)
3019 free(p->coll_elems[j]);
3020 if (p->coll_elems != NULL)
3021 free(p->coll_elems);
3023 free((ptr_t) d->mbcsets);
3025 #endif /* MBS_SUPPORT */
3027 for (i = 0; i < d->sindex; ++i)
3028 free((ptr_t) d->states[i].elems.elems);
3029 free((ptr_t) d->states);
3030 for (i = 0; i < d->tindex; ++i)
3031 if (d->follows[i].elems)
3032 free((ptr_t) d->follows[i].elems);
3033 free((ptr_t) d->follows);
3034 for (i = 0; i < d->tralloc; ++i)
3036 free((ptr_t) d->trans[i]);
3037 else if (d->fails[i])
3038 free((ptr_t) d->fails[i]);
3039 if (d->realtrans) free((ptr_t) d->realtrans);
3040 if (d->fails) free((ptr_t) d->fails);
3041 if (d->success) free((ptr_t) d->success);
3042 for (dm = d->musts; dm; dm = ndm)
3050 /* Having found the postfix representation of the regular expression,
3051 try to find a long sequence of characters that must appear in any line
3053 Finding a "longest" sequence is beyond the scope here;
3054 we take an easy way out and hope for the best.
3055 (Take "(ab|a)b"--please.)
3057 We do a bottom-up calculation of sequences of characters that must appear
3058 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3060 sequences that must appear at the left of the match ("left")
3061 sequences that must appear at the right of the match ("right")
3062 lists of sequences that must appear somewhere in the match ("in")
3063 sequences that must constitute the match ("is")
3065 When we get to the root of the tree, we use one of the longest of its
3066 calculated "in" sequences as our answer. The sequence we find is returned in
3067 d->must (where "d" is the single argument passed to "dfamust");
3068 the length of the sequence is returned in d->mustn.
3070 The sequences calculated for the various types of node (in pseudo ANSI c)
3071 are shown below. "p" is the operand of unary operators (and the left-hand
3072 operand of binary operators); "q" is the right-hand operand of binary
3075 "ZERO" means "a zero-length sequence" below.
3077 Type left right is in
3078 ---- ---- ----- -- --
3079 char c # c # c # c # c
3081 ANYCHAR ZERO ZERO ZERO ZERO
3083 MBCSET ZERO ZERO ZERO ZERO
3085 CSET ZERO ZERO ZERO ZERO
3087 STAR ZERO ZERO ZERO ZERO
3089 QMARK ZERO ZERO ZERO ZERO
3091 PLUS p->left p->right ZERO p->in
3093 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3094 p->left : q->right : q->is!=ZERO) ? q->in plus
3095 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3098 OR longest common longest common (do p->is and substrings common to
3099 leading trailing q->is have same p->in and q->in
3100 (sub)sequence (sub)sequence length and
3101 of p->left of p->right content) ?
3102 and q->left and q->right p->is : NULL
3104 If there's anything else we recognize in the tree, all four sequences get set
3105 to zero-length sequences. If there's something we don't recognize in the tree,
3106 we just return a zero-length sequence.
3108 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3111 And. . .is it here or someplace that we might ponder "optimizations" such as
3112 egrep 'psi|epsilon' -> egrep 'psi'
3113 egrep 'pepsi|epsilon' -> egrep 'epsi'
3114 (Yes, we now find "epsi" as a "string
3115 that must occur", but we might also
3116 simplify the *entire* r.e. being sought)
3117 grep '[c]' -> grep 'c'
3118 grep '(ab|a)b' -> grep 'ab'
3119 grep 'ab*' -> grep 'a'
3120 grep 'a*b' -> grep 'b'
3122 There are several issues:
3124 Is optimization easy (enough)?
3126 Does optimization actually accomplish anything,
3127 or is the automaton you get from "psi|epsilon" (for example)
3128 the same as the one you get from "psi" (for example)?
3130 Are optimizable r.e.'s likely to be used in real-life situations
3131 (something like 'ab*' is probably unlikely; something like is
3132 'psi|epsilon' is likelier)? */
3135 icatalloc (char *old, char *new)
3138 size_t oldsize, newsize;
3140 newsize = (new == NULL) ? 0 : strlen(new);
3143 else if (newsize == 0)
3145 else oldsize = strlen(old);
3147 result = (char *) malloc(newsize + 1);
3149 result = (char *) realloc((void *) old, oldsize + newsize + 1);
3150 if (result != NULL && new != NULL)
3151 (void) strcpy(result + oldsize, new);
3156 icpyalloc (char *string)
3158 return icatalloc((char *) NULL, string);
3162 istrstr (char *lookin, char *lookfor)
3167 len = strlen(lookfor);
3168 for (cp = lookin; *cp != '\0'; ++cp)
3169 if (strncmp(cp, lookfor, len) == 0)
3182 freelist (char **cpp)
3188 for (i = 0; cpp[i] != NULL; ++i)
3196 enlist (char **cpp, char *new, size_t len)
3202 if ((new = icpyalloc(new)) == NULL)
3208 /* Is there already something in the list that's new (or longer)? */
3209 for (i = 0; cpp[i] != NULL; ++i)
3210 if (istrstr(cpp[i], new) != NULL)
3215 /* Eliminate any obsoleted strings. */
3217 while (cpp[j] != NULL)
3218 if (istrstr(new, cpp[j]) == NULL)
3228 /* Add the new string. */
3229 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3237 /* Given pointers to two strings, return a pointer to an allocated
3238 list of their distinct common substrings. Return NULL if something
3241 comsubs (char *left, char *right)
3248 if (left == NULL || right == NULL)
3250 cpp = (char **) malloc(sizeof *cpp);
3254 for (lcp = left; *lcp != '\0'; ++lcp)
3257 rcp = strchr (right, *lcp);
3260 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3264 rcp = strchr (rcp + 1, *lcp);
3268 if ((cpp = enlist(cpp, lcp, len)) == NULL)
3275 addlists (char **old, char **new)
3279 if (old == NULL || new == NULL)
3281 for (i = 0; new[i] != NULL; ++i)
3283 old = enlist(old, new[i], strlen(new[i]));
3290 /* Given two lists of substrings, return a new list giving substrings
3293 inboth (char **left, char **right)
3299 if (left == NULL || right == NULL)
3301 both = (char **) malloc(sizeof *both);
3305 for (lnum = 0; left[lnum] != NULL; ++lnum)
3307 for (rnum = 0; right[rnum] != NULL; ++rnum)
3309 temp = comsubs(left[lnum], right[rnum]);
3315 both = addlists(both, temp);
3334 resetmust (must *mp)
3336 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3341 dfamust (struct dfa *dfa)
3352 static char empty_string[] = "";
3354 result = empty_string;
3356 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3360 for (i = 0; i <= dfa->tindex; ++i)
3362 for (i = 0; i <= dfa->tindex; ++i)
3364 mp[i].in = (char **) malloc(sizeof *mp[i].in);
3365 mp[i].left = malloc(2);
3366 mp[i].right = malloc(2);
3367 mp[i].is = malloc(2);
3368 if (mp[i].in == NULL || mp[i].left == NULL ||
3369 mp[i].right == NULL || mp[i].is == NULL)
3371 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3375 fprintf(stderr, "dfamust:\n");
3376 for (i = 0; i < dfa->tindex; ++i)
3378 fprintf(stderr, " %d:", i);
3379 prtok(dfa->tokens[i]);
3383 for (ri = 0; ri < dfa->tindex; ++ri)
3385 switch (t = dfa->tokens[ri])
3389 goto done; /* "cannot happen" */
3403 goto done; /* "cannot happen" */
3410 goto done; /* "cannot happen" */
3419 /* Guaranteed to be. Unlikely, but. . . */
3420 if (strcmp(lmp->is, rmp->is) != 0)
3422 /* Left side--easy */
3424 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3426 lmp->left[i] = '\0';
3428 ln = strlen(lmp->right);
3429 rn = strlen(rmp->right);
3433 for (i = 0; i < n; ++i)
3434 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3436 for (j = 0; j < i; ++j)
3437 lmp->right[j] = lmp->right[(ln - i) + j];
3438 lmp->right[j] = '\0';
3439 new = inboth(lmp->in, rmp->in);
3443 free((char *) lmp->in);
3449 goto done; /* "cannot happen" */
3454 if (mp != &musts[1])
3455 goto done; /* "cannot happen" */
3456 for (i = 0; musts[0].in[i] != NULL; ++i)
3457 if (strlen(musts[0].in[i]) > strlen(result))
3458 result = musts[0].in[i];
3459 if (strcmp(result, musts[0].is) == 0)
3464 goto done; /* "cannot happen" */
3471 /* In. Everything in left, plus everything in
3472 right, plus catenation of
3473 left's right and right's left. */
3474 lmp->in = addlists(lmp->in, rmp->in);
3475 if (lmp->in == NULL)
3477 if (lmp->right[0] != '\0' &&
3478 rmp->left[0] != '\0')
3482 tp = icpyalloc(lmp->right);
3485 tp = icatalloc(tp, rmp->left);
3488 lmp->in = enlist(lmp->in, tp,
3491 if (lmp->in == NULL)
3495 if (lmp->is[0] != '\0')
3497 lmp->left = icatalloc(lmp->left,
3499 if (lmp->left == NULL)
3503 if (rmp->is[0] == '\0')
3504 lmp->right[0] = '\0';
3505 lmp->right = icatalloc(lmp->right, rmp->right);
3506 if (lmp->right == NULL)
3508 /* Guaranteed to be */
3509 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3511 lmp->is = icatalloc(lmp->is, rmp->is);
3512 if (lmp->is == NULL)
3522 /* "cannot happen" */
3527 /* not on *my* shift */
3534 #endif /* MBS_SUPPORT */
3542 /* plain character */
3544 mp->is[0] = mp->left[0] = mp->right[0] = t;
3545 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3546 mp->in = enlist(mp->in, mp->is, (size_t)1);
3553 fprintf(stderr, " node: %d:", ri);
3554 prtok(dfa->tokens[ri]);
3555 fprintf(stderr, "\n in:");
3556 for (i = 0; mp->in[i]; ++i)
3557 fprintf(stderr, " \"%s\"", mp->in[i]);
3558 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
3559 fprintf(stderr, " left: \"%s\"\n", mp->left);
3560 fprintf(stderr, " right: \"%s\"\n", mp->right);
3567 dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3569 dm->must = malloc(strlen(result) + 1);
3570 strcpy(dm->must, result);
3571 dm->next = dfa->musts;
3575 for (i = 0; i <= dfa->tindex; ++i)
3578 ifree((char *) mp[i].in);
3585 /* vim:set shiftwidth=2: */