1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 ****************************************************************/
25 /* lasciate ogne speranza, voi ch'intrate. */
35 #include "awkgram.tab.h"
39 #define type(v) (v)->nobj /* badly overloaded here */
40 #define info(v) (v)->ntype /* badly overloaded here */
41 #define left(v) (v)->narg[0]
42 #define right(v) (v)->narg[1]
43 #define parent(v) (v)->nnext
45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF case EMPTYRE: /* empty string in regexp */
47 #define UNARY case STAR: case PLUS: case QUEST:
49 /* encoding in tree Nodes:
50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 left is index, right contains value or pointer to value
52 unary (STAR, PLUS, QUEST): left is child, right is null
53 binary (CAT, OR): left and right are children
54 parent contains pointer to parent
62 int rtok; /* next token in current re */
64 static const uschar *rlxstr;
65 static const uschar *prestr; /* current position in current re */
66 static const uschar *lastre; /* origin of last re */
67 static const uschar *lastatom; /* origin of last Atom */
68 static const uschar *starttok;
69 static const uschar *basestr; /* starts with original, replaced during
70 repetition processing */
71 static const uschar *firstbasestr;
79 #define NFA 128 /* cache this many dynamic fa's */
81 int nfatab = 0; /* entries in fatab */
84 intalloc(size_t n, const char *f)
86 int *p = (int *) calloc(n, sizeof(int));
93 resizesetvec(const char *f)
99 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
100 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
101 if (setvec == NULL || tmpset == NULL)
106 resize_state(fa *f, int state)
113 if (++state < f->state_count)
116 new_count = state + 10; /* needs to be tuned */
118 p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
123 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
128 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
133 for (i = f->state_count; i < new_count; ++i) {
134 f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
135 if (f->gototab[i] == NULL)
140 f->state_count = new_count;
146 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
152 if (setvec == NULL) { /* first time through any RE */
153 resizesetvec(__func__);
156 if (compile_time != RUNNING) /* a constant for sure */
157 return mkdfa(s, anchor);
158 for (i = 0; i < nfatab; i++) /* is it there already? */
159 if (fatab[i]->anchor == anchor
160 && strcmp((const char *) fatab[i]->restr, s) == 0) {
161 fatab[i]->use = now++;
164 pfa = mkdfa(s, anchor);
165 if (nfatab < NFA) { /* room for another */
167 fatab[nfatab]->use = now++;
171 use = fatab[0]->use; /* replace least-recently used */
173 for (i = 1; i < nfatab; i++)
174 if (fatab[i]->use < use) {
184 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
185 /* anchor = true for anchored matches, else false */
190 firstbasestr = (const uschar *) s;
191 basestr = firstbasestr;
193 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
194 /* put ALL STAR in front of reg. exp. */
195 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
196 /* put FINAL after reg. exp. */
199 penter(p1); /* enter parent pointers and leaf indices */
200 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
202 f->accept = poscnt-1; /* penter has computed number of positions in re */
203 cfoll(f, p1); /* set up follow sets */
206 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
207 f->posns[1] = intalloc(1, __func__);
209 f->initstat = makeinit(f, anchor);
211 f->restr = (uschar *) tostring(s);
212 if (firstbasestr != basestr) {
219 int makeinit(fa *f, bool anchor)
225 k = *(f->re[0].lfollow);
227 f->posns[2] = intalloc(k + 1, __func__);
228 for (i = 0; i <= k; i++) {
229 (f->posns[2])[i] = (f->re[0].lfollow)[i];
231 if ((f->posns[2])[1] == f->accept)
233 for (i = 0; i < NCHARS; i++)
234 f->gototab[2][i] = 0;
235 f->curstat = cgoto(f, 2, HAT);
237 *f->posns[2] = k-1; /* leave out position 0 */
238 for (i = 0; i < k; i++) {
239 (f->posns[0])[i] = (f->posns[2])[i];
242 f->out[0] = f->out[2];
244 --(*f->posns[f->curstat]);
249 void penter(Node *p) /* set up parent pointers and leaf indices */
266 parent(right(p)) = p;
270 default: /* can't happen */
271 FATAL("can't happen: unknown type %d in penter", type(p));
276 void freetr(Node *p) /* free parse tree */
294 default: /* can't happen */
295 FATAL("can't happen: unknown type %d in freetr", type(p));
300 /* in the parsing of regular expressions, metacharacters like . have */
301 /* to be seen literally; \056 is not a metacharacter. */
303 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
304 { /* only pick up one 8-bit byte (2 chars) */
309 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
311 n = 16 * n + *p - '0';
312 else if (*p >= 'a' && *p <= 'f')
313 n = 16 * n + *p - 'a' + 10;
314 else if (*p >= 'A' && *p <= 'F')
315 n = 16 * n + *p - 'A' + 10;
321 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
323 int quoted(const uschar **pp) /* pick up next thing after a \\ */
324 /* and increment *pp */
326 const uschar *p = *pp;
329 if ((c = *p++) == 't')
345 else if (c == 'x') { /* hexadecimal goo follows */
346 c = hexstr(&p); /* this adds a null if number is invalid */
347 } else if (isoctdigit(c)) { /* \d \dd \ddd */
349 if (isoctdigit(*p)) {
350 n = 8 * n + *p++ - '0';
352 n = 8 * n + *p++ - '0';
361 char *cclenter(const char *argp) /* add a character class */
364 const uschar *op, *p = (const uschar *) argp;
366 static uschar *buf = NULL;
367 static int bufsz = 100;
370 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
371 FATAL("out of space for character class [%.10s...] 1", p);
373 for (i = 0; (c = *p++) != 0; ) {
376 } else if (c == '-' && i > 0 && bp[-1] != 0) {
382 if (c > c2) { /* empty; ignore */
388 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
389 FATAL("out of space for character class [%.10s...] 2", p);
396 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
397 FATAL("out of space for character class [%.10s...] 3", p);
402 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
404 return (char *) tostring((char *) buf);
407 void overflo(const char *s)
409 FATAL("regular expression too big: out of space in %.30s...", s);
412 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
420 f->re[info(v)].ltype = type(v);
421 f->re[info(v)].lval.np = right(v);
422 while (f->accept >= maxsetvec) { /* guessing here! */
423 resizesetvec(__func__);
425 for (i = 0; i <= f->accept; i++)
428 follow(v); /* computes setvec and setcnt */
429 p = intalloc(setcnt + 1, __func__);
430 f->re[info(v)].lfollow = p;
432 for (i = f->accept; i >= 0; i--)
446 default: /* can't happen */
447 FATAL("can't happen: unknown type %d in cfoll", type(v));
451 int first(Node *p) /* collects initially active leaves of p into setvec */
452 /* returns 0 if p matches empty string */
459 lp = info(p); /* look for high-water mark of subscripts */
460 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
461 resizesetvec(__func__);
463 if (type(p) == EMPTYRE) {
467 if (setvec[lp] != 1) {
471 if (type(p) == CCL && (*(char *) right(p)) == '\0')
472 return(0); /* empty CCL */
475 if (first(left(p)) == 0)
483 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
487 if (first(left(p)) == 0 || b == 0) return(0);
492 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
496 void follow(Node *v) /* collects leaves that can follow v into setvec */
500 if (type(v) == FINAL)
516 if (v == left(p)) { /* v is left child of p */
517 if (first(right(p)) == 0) {
521 } else /* v is right child */
527 int member(int c, const char *sarg) /* is c in s? */
529 const uschar *s = (const uschar *) sarg;
537 int match(fa *f, const char *p0) /* shortest match ? */
540 const uschar *p = (const uschar *) p0;
543 assert (s < f->state_count);
548 /* assert(*p < NCHARS); */
549 if ((ns = f->gototab[s][*p]) != 0)
559 int pmatch(fa *f, const char *p0) /* longest match, for sub */
562 const uschar *p = (const uschar *) p0;
566 assert(s < f->state_count);
568 patbeg = (const char *)p;
573 if (f->out[s]) /* final state */
575 /* assert(*q < NCHARS); */
576 if ((ns = f->gototab[s][*q]) != 0)
581 assert(s < f->state_count);
583 if (s == 1) { /* no transition */
585 patbeg = (const char *) p;
589 goto nextin; /* no match */
593 patlen = q-p-1; /* don't count $ */
595 patbeg = (const char *) p;
604 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
607 const uschar *p = (const uschar *) p0;
611 assert(s < f->state_count);
613 patbeg = (const char *)p;
618 if (f->out[s]) /* final state */
620 /* assert(*q < NCHARS); */
621 if ((ns = f->gototab[s][*q]) != 0)
625 if (s == 1) { /* no transition */
627 patbeg = (const char *) p;
630 goto nnextin; /* no nonempty match */
634 patlen = q-p-1; /* don't count $ */
636 patbeg = (const char *) p;
652 * A stream-fed version of nematch which transfers characters to a
653 * null-terminated buffer. All characters up to and including the last
654 * character of the matching text or EOF are placed in the buffer. If
655 * a match is found, patbeg and patlen are set appropriately.
658 * false No match found.
662 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
665 int bufsize = *pbufsize;
666 int c, i, j, k, ns, s;
672 * All indices relative to buf.
673 * i <= j <= k <= bufsize
675 * i: origin of active substring
676 * j: current character
677 * k: destination of next getc()
685 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
686 FATAL("stream '%.30s...' too long", buf);
687 buf[k++] = (c = getc(f)) != EOF ? c : 0;
690 /* assert(c < NCHARS); */
692 if ((ns = pfa->gototab[s][c]) != 0)
695 s = cgoto(pfa, s, c);
697 if (pfa->out[s]) { /* final state */
699 if (c == 0) /* don't count $ */
702 } while (buf[j] && s != 1);
704 } while (buf[i] && !patlen);
706 /* adjbuf() may have relocated a resized buffer. Inform the world. */
711 patbeg = (char *) buf + i;
713 * Under no circumstances is the last character fed to
714 * the automaton part of the match. It is EOF's nullbyte,
715 * or it sent the automaton into a state with no further
716 * transitions available (s==1), or both. Room for a
717 * terminating nullbyte is guaranteed.
719 * ungetc any chars after the end of matching text
720 * (except for EOF's nullbyte, if present) and null
721 * terminate the buffer.
724 if (buf[--k] && ungetc(buf[k], f) == EOF)
725 FATAL("unable to ungetc '%c'", buf[k]);
726 while (k > i + patlen);
734 Node *reparse(const char *p) /* parses regular expression pointed to by p */
735 { /* uses relex() to scan regular expression */
738 DPRINTF("reparse <%s>\n", p);
739 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
741 /* GNU compatibility: an empty regexp matches anything */
743 /* FATAL("empty regular expression"); previous */
744 return(op2(EMPTYRE, NIL, NIL));
748 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
752 Node *regexp(void) /* top-level parse of reg expr */
754 return (alt(concat(primary())));
765 np = op2(CHAR, NIL, itonp(rlxval));
770 return (unary(op2(ALL, NIL, NIL)));
773 return (unary(op2(EMPTYRE, NIL, NIL)));
777 return (unary(op2(DOT, NIL, NIL)));
779 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
784 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
790 return (unary(op2(CHAR, NIL, itonp(HAT))));
793 return (unary(op2(CHAR, NIL, NIL)));
796 savelastatom = starttok - basestr; /* Retain over recursion */
798 if (rtok == ')') { /* special pleading for () */
800 return unary(op2(CCL, NIL, (Node *) tostring("")));
804 lastatom = basestr + savelastatom; /* Restore */
809 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
811 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
813 return 0; /*NOTREACHED*/
816 Node *concat(Node *np)
819 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
820 return (concat(op2(CAT, np, primary())));
823 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
833 return (alt(op2(OR, np, concat(primary()))));
838 Node *unary(Node *np)
843 return (unary(op2(STAR, np, NIL)));
846 return (unary(op2(PLUS, np, NIL)));
849 return (unary(op2(QUEST, np, NIL)));
852 return (unary(op2(ZERO, np, NIL)));
859 * Character class definitions conformant to the POSIX locale as
860 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
861 * and operating character sets are both ASCII (ISO646) or supersets
864 * Note that to avoid overflowing the temporary buffer used in
865 * relex(), the expanded character class (prior to range expansion)
866 * must be less than twice the size of their full name.
869 /* Because isblank doesn't show up in any of the header files on any
870 * system i use, it's defined here. if some other locale has a richer
871 * definition of "blank", define HAS_ISBLANK and provide your own
873 * the parentheses here are an attempt to find a path through the maze
874 * of macro definition and/or function and/or version provided. thanks
875 * to nelson beebe for the suggestion; let's see if it works everywhere.
878 /* #define HAS_ISBLANK */
881 int (xisblank)(int c)
883 return c==' ' || c=='\t';
888 static const struct charclass {
893 { "alnum", 5, isalnum },
894 { "alpha", 5, isalpha },
896 { "blank", 5, xisblank },
898 { "blank", 5, isblank },
900 { "cntrl", 5, iscntrl },
901 { "digit", 5, isdigit },
902 { "graph", 5, isgraph },
903 { "lower", 5, islower },
904 { "print", 5, isprint },
905 { "punct", 5, ispunct },
906 { "space", 5, isspace },
907 { "upper", 5, isupper },
908 { "xdigit", 6, isxdigit },
912 #define REPEAT_SIMPLE 0
913 #define REPEAT_PLUS_APPENDED 1
914 #define REPEAT_WITH_Q 2
915 #define REPEAT_ZERO 3
918 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
919 int atomlen, int firstnum, int secondnum, int special_case)
924 int init_q = (firstnum == 0); /* first added char will be ? */
925 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
926 int prefix_length = reptok - basestr; /* prefix includes first rep */
927 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
928 int size = prefix_length + suffix_length;
930 if (firstnum > 1) { /* add room for reps 2 through firstnum */
931 size += atomlen*(firstnum-1);
934 /* Adjust size of buffer for special cases */
935 if (special_case == REPEAT_PLUS_APPENDED) {
936 size++; /* for the final + */
937 } else if (special_case == REPEAT_WITH_Q) {
938 size += init_q + (atomlen+1)* (n_q_reps-init_q);
939 } else if (special_case == REPEAT_ZERO) {
940 size += 2; /* just a null ERE: () */
942 if ((buf = (uschar *) malloc(size + 1)) == NULL)
943 FATAL("out of space in reg expr %.10s..", lastre);
944 memcpy(buf, basestr, prefix_length); /* copy prefix */
946 if (special_case == REPEAT_ZERO) {
951 for (i = 1; i < firstnum; i++) { /* copy x reps */
952 memcpy(&buf[j], atom, atomlen);
955 if (special_case == REPEAT_PLUS_APPENDED) {
957 } else if (special_case == REPEAT_WITH_Q) {
960 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
961 memcpy(&buf[j], atom, atomlen);
966 memcpy(&buf[j], reptok+reptoklen, suffix_length);
969 /* free old basestr */
970 if (firstbasestr != basestr) {
975 prestr = buf + prefix_length;
976 if (special_case == REPEAT_ZERO) {
983 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
984 int atomlen, int firstnum, int secondnum)
987 In general, the repetition specifier or "bound" is replaced here
988 by an equivalent ERE string, repeating the immediately previous atom
989 and appending ? and + as needed. Note that the first copy of the
990 atom is left in place, except in the special_case of a zero-repeat
993 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
995 /* 0 or 1: should be handled before you get here */
996 FATAL("internal error");
998 return replace_repeat(reptok, reptoklen, atom, atomlen,
999 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1001 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1002 if (firstnum == 0) { /* {0} or {0,0} */
1003 /* This case is unusual because the resulting
1004 replacement string might actually be SMALLER than
1006 return replace_repeat(reptok, reptoklen, atom, atomlen,
1007 firstnum, secondnum, REPEAT_ZERO);
1008 } else { /* (firstnum >= 1) */
1009 return replace_repeat(reptok, reptoklen, atom, atomlen,
1010 firstnum, secondnum, REPEAT_SIMPLE);
1012 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1013 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1014 return replace_repeat(reptok, reptoklen, atom, atomlen,
1015 firstnum, secondnum, REPEAT_WITH_Q);
1016 } else { /* Error - shouldn't be here (n>m) */
1017 FATAL("internal error");
1022 int relex(void) /* lexical analyzer for reparse */
1026 static uschar *buf = NULL;
1027 static int bufsz = 100;
1029 const struct charclass *cc;
1032 bool commafound, digitfound;
1033 const uschar *startreptok;
1034 static int parens = 0;
1039 switch (c = *prestr++) {
1040 case '|': return OR;
1041 case '*': return STAR;
1042 case '+': return PLUS;
1043 case '?': return QUEST;
1044 case '.': return DOT;
1045 case '\0': prestr--; return '\0';
1057 /* unmatched close parenthesis; per POSIX, treat as literal */
1061 rlxval = quoted(&prestr);
1067 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1068 FATAL("out of space in reg expr %.10s..", lastre);
1070 if (*prestr == '^') {
1076 n = 2 * strlen((const char *) prestr)+1;
1077 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1078 FATAL("out of space for reg expr %.10s...", lastre);
1080 if ((c = *prestr++) == '\\') {
1082 if ((c = *prestr++) == '\0')
1083 FATAL("nonterminated character class %.20s...", lastre);
1085 /* } else if (c == '\n') { */
1086 /* FATAL("newline in character class %.20s...", lastre); */
1087 } else if (c == '[' && *prestr == ':') {
1088 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1089 for (cc = charclasses; cc->cc_name; cc++)
1090 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1092 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1093 prestr[2 + cc->cc_namelen] == ']') {
1094 prestr += cc->cc_namelen + 3;
1096 * BUG: We begin at 1, instead of 0, since we
1097 * would otherwise prematurely terminate the
1098 * string for classes like [[:cntrl:]]. This
1099 * means that we can't match the NUL character,
1100 * not without first adapting the entire
1101 * program to track each string's length.
1103 for (i = 1; i <= UCHAR_MAX; i++) {
1104 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1105 FATAL("out of space for reg expr %.10s...", lastre);
1106 if (cc->cc_func(i)) {
1107 /* escape backslash */
1119 } else if (c == '[' && *prestr == '.') {
1122 collate_char = *prestr++;
1123 if (*prestr == '.' && prestr[1] == ']') {
1125 /* Found it: map via locale TBD: for
1126 now, simply return this char. This
1127 is sufficient to pass conformance
1130 if (*prestr == ']') {
1132 rlxval = collate_char;
1136 } else if (c == '[' && *prestr == '=') {
1139 equiv_char = *prestr++;
1140 if (*prestr == '=' && prestr[1] == ']') {
1142 /* Found it: map via locale TBD: for now
1143 simply return this char. This is
1144 sufficient to pass conformance test
1147 if (*prestr == ']') {
1149 rlxval = equiv_char;
1153 } else if (c == '\0') {
1154 FATAL("nonterminated character class %.20s", lastre);
1155 } else if (bp == buf) { /* 1st char is special */
1157 } else if (c == ']') {
1159 rlxstr = (uschar *) tostring((char *) buf);
1169 if (isdigit(*(prestr))) {
1170 num = 0; /* Process as a repetition */
1174 startreptok = prestr-1;
1175 /* Remember start of previous atom here ? */
1176 } else { /* just a { char, not a repetition */
1181 if ((c = *prestr++) == '}') {
1183 if (digitfound) { /* {n,m} */
1186 FATAL("illegal repetition expression: class %.20s",
1188 if (n == 0 && m == 1) {
1198 if (digitfound) { /* {n} same as {n,n} */
1202 FATAL("illegal repetition expression: class %.20s",
1206 if (repeat(starttok, prestr-starttok, lastatom,
1207 startreptok - lastatom, n, m) > 0) {
1208 if (n == 0 && m == 0) {
1211 /* must rescan input for next token */
1214 /* Failed to replace: eat up {...} characters
1215 and treat like just PLUS */
1217 } else if (c == '\0') {
1218 FATAL("nonterminated character class %.20s",
1220 } else if (isdigit(c)) {
1221 num = 10 * num + c - '0';
1223 } else if (c == ',') {
1225 FATAL("illegal repetition expression: class %.20s",
1227 /* looking for {n,} or {n,m} */
1230 digitfound = false; /* reset */
1233 FATAL("illegal repetition expression: class %.20s",
1241 int cgoto(fa *f, int s, int c)
1246 assert(c == HAT || c < NCHARS);
1247 while (f->accept >= maxsetvec) { /* guessing here! */
1248 resizesetvec(__func__);
1250 for (i = 0; i <= f->accept; i++)
1254 /* compute positions of gototab[s,c] into setvec */
1256 for (i = 1; i <= *p; i++) {
1257 if ((k = f->re[p[i]].ltype) != FINAL) {
1258 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1259 || (k == DOT && c != 0 && c != HAT)
1260 || (k == ALL && c != 0)
1261 || (k == EMPTYRE && c != 0)
1262 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1263 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1264 q = f->re[p[i]].lfollow;
1265 for (j = 1; j <= *q; j++) {
1266 if (q[j] >= maxsetvec) {
1267 resizesetvec(__func__);
1269 if (setvec[q[j]] == 0) {
1277 /* determine if setvec is a previous state */
1280 for (i = f->accept; i >= 0; i--)
1284 resize_state(f, f->curstat > s ? f->curstat : s);
1285 /* tmpset == previous state? */
1286 for (i = 1; i <= f->curstat; i++) {
1288 if ((k = tmpset[0]) != p[0])
1290 for (j = 1; j <= k; j++)
1291 if (tmpset[j] != p[j])
1293 /* setvec is state i */
1295 f->gototab[s][c] = i;
1300 /* add tmpset to current set of states */
1302 resize_state(f, f->curstat);
1303 for (i = 0; i < NCHARS; i++)
1304 f->gototab[f->curstat][i] = 0;
1305 xfree(f->posns[f->curstat]);
1306 p = intalloc(setcnt + 1, __func__);
1308 f->posns[f->curstat] = p;
1310 f->gototab[s][c] = f->curstat;
1311 for (i = 0; i <= setcnt; i++)
1313 if (setvec[f->accept])
1314 f->out[f->curstat] = 1;
1316 f->out[f->curstat] = 0;
1321 void freefa(fa *f) /* free a finite automaton */
1327 for (i = 0; i < f->state_count; i++)
1328 xfree(f->gototab[i])
1329 for (i = 0; i <= f->curstat; i++)
1331 for (i = 0; i <= f->accept; i++) {
1332 xfree(f->re[i].lfollow);
1333 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1334 xfree(f->re[i].lval.np);