1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 ****************************************************************/
25 /* lasciate ogne speranza, voi ch'intrate. */
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
38 #include "awkgram.tab.h"
42 #define type(v) (v)->nobj /* badly overloaded here */
43 #define info(v) (v)->ntype /* badly overloaded here */
44 #define left(v) (v)->narg[0]
45 #define right(v) (v)->narg[1]
46 #define parent(v) (v)->nnext
48 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
49 #define ELEAF case EMPTYRE: /* empty string in regexp */
50 #define UNARY case STAR: case PLUS: case QUEST:
52 /* encoding in tree Nodes:
53 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
54 left is index, right contains value or pointer to value
55 unary (STAR, PLUS, QUEST): left is child, right is null
56 binary (CAT, OR): left and right are children
57 parent contains pointer to parent
65 int rtok; /* next token in current re */
67 static const uschar *rlxstr;
68 static const uschar *prestr; /* current position in current re */
69 static const uschar *lastre; /* origin of last re */
70 static const uschar *lastatom; /* origin of last Atom */
71 static const uschar *starttok;
72 static const uschar *basestr; /* starts with original, replaced during
73 repetition processing */
74 static const uschar *firstbasestr;
82 #define NFA 128 /* cache this many dynamic fa's */
84 int nfatab = 0; /* entries in fatab */
87 intalloc(size_t n, const char *f)
89 int *p = (int *) calloc(n, sizeof(int));
96 resizesetvec(const char *f)
102 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
103 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
104 if (setvec == NULL || tmpset == NULL)
109 resize_state(fa *f, int state)
116 if (++state < f->state_count)
119 new_count = state + 10; /* needs to be tuned */
121 p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
126 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
131 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
136 for (i = f->state_count; i < new_count; ++i) {
137 f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
138 if (f->gototab[i] == NULL)
143 f->state_count = new_count;
149 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
155 if (setvec == NULL) { /* first time through any RE */
156 resizesetvec(__func__);
159 if (compile_time != RUNNING) /* a constant for sure */
160 return mkdfa(s, anchor);
161 for (i = 0; i < nfatab; i++) /* is it there already? */
162 if (fatab[i]->anchor == anchor
163 && strcmp((const char *) fatab[i]->restr, s) == 0) {
164 fatab[i]->use = now++;
167 pfa = mkdfa(s, anchor);
168 if (nfatab < NFA) { /* room for another */
170 fatab[nfatab]->use = now++;
174 use = fatab[0]->use; /* replace least-recently used */
176 for (i = 1; i < nfatab; i++)
177 if (fatab[i]->use < use) {
187 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
188 /* anchor = true for anchored matches, else false */
193 firstbasestr = (const uschar *) s;
194 basestr = firstbasestr;
196 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
197 /* put ALL STAR in front of reg. exp. */
198 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
199 /* put FINAL after reg. exp. */
202 penter(p1); /* enter parent pointers and leaf indices */
203 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
205 f->accept = poscnt-1; /* penter has computed number of positions in re */
206 cfoll(f, p1); /* set up follow sets */
209 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
210 f->posns[1] = intalloc(1, __func__);
212 f->initstat = makeinit(f, anchor);
214 f->restr = (uschar *) tostring(s);
215 if (firstbasestr != basestr) {
222 int makeinit(fa *f, bool anchor)
228 k = *(f->re[0].lfollow);
230 f->posns[2] = intalloc(k + 1, __func__);
231 for (i = 0; i <= k; i++) {
232 (f->posns[2])[i] = (f->re[0].lfollow)[i];
234 if ((f->posns[2])[1] == f->accept)
236 for (i = 0; i < NCHARS; i++)
237 f->gototab[2][i] = 0;
238 f->curstat = cgoto(f, 2, HAT);
240 *f->posns[2] = k-1; /* leave out position 0 */
241 for (i = 0; i < k; i++) {
242 (f->posns[0])[i] = (f->posns[2])[i];
245 f->out[0] = f->out[2];
247 --(*f->posns[f->curstat]);
252 void penter(Node *p) /* set up parent pointers and leaf indices */
269 parent(right(p)) = p;
273 default: /* can't happen */
274 FATAL("can't happen: unknown type %d in penter", type(p));
279 void freetr(Node *p) /* free parse tree */
297 default: /* can't happen */
298 FATAL("can't happen: unknown type %d in freetr", type(p));
303 /* in the parsing of regular expressions, metacharacters like . have */
304 /* to be seen literally; \056 is not a metacharacter. */
306 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
307 { /* only pick up one 8-bit byte (2 chars) */
312 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
314 n = 16 * n + *p - '0';
315 else if (*p >= 'a' && *p <= 'f')
316 n = 16 * n + *p - 'a' + 10;
317 else if (*p >= 'A' && *p <= 'F')
318 n = 16 * n + *p - 'A' + 10;
324 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
326 int quoted(const uschar **pp) /* pick up next thing after a \\ */
327 /* and increment *pp */
329 const uschar *p = *pp;
332 if ((c = *p++) == 't')
348 else if (c == 'x') { /* hexadecimal goo follows */
349 c = hexstr(&p); /* this adds a null if number is invalid */
350 } else if (isoctdigit(c)) { /* \d \dd \ddd */
352 if (isoctdigit(*p)) {
353 n = 8 * n + *p++ - '0';
355 n = 8 * n + *p++ - '0';
364 static int collate_range_cmp(int a, int b)
368 if ((uschar)a == (uschar)b)
372 return (strcoll(s[0], s[1]));
375 char *cclenter(const char *argp) /* add a character class */
378 const uschar *op, *p = (const uschar *) argp;
380 static uschar *buf = NULL;
381 static int bufsz = 100;
384 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
385 FATAL("out of space for character class [%.10s...] 1", p);
387 for (i = 0; (c = *p++) != 0; ) {
390 } else if (c == '-' && i > 0 && bp[-1] != 0) {
396 if (collate_range_cmp(c, c2) > 0) {
401 for (j = 0; j < NCHARS; j++) {
402 if ((collate_range_cmp(c, j) > 0) ||
403 collate_range_cmp(j, c2) > 0)
405 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
406 FATAL("out of space for character class [%.10s...] 2", p);
413 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
414 FATAL("out of space for character class [%.10s...] 3", p);
419 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
421 return (char *) tostring((char *) buf);
424 void overflo(const char *s)
426 FATAL("regular expression too big: out of space in %.30s...", s);
429 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
437 f->re[info(v)].ltype = type(v);
438 f->re[info(v)].lval.np = right(v);
439 while (f->accept >= maxsetvec) { /* guessing here! */
440 resizesetvec(__func__);
442 for (i = 0; i <= f->accept; i++)
445 follow(v); /* computes setvec and setcnt */
446 p = intalloc(setcnt + 1, __func__);
447 f->re[info(v)].lfollow = p;
449 for (i = f->accept; i >= 0; i--)
463 default: /* can't happen */
464 FATAL("can't happen: unknown type %d in cfoll", type(v));
468 int first(Node *p) /* collects initially active leaves of p into setvec */
469 /* returns 0 if p matches empty string */
476 lp = info(p); /* look for high-water mark of subscripts */
477 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
478 resizesetvec(__func__);
480 if (type(p) == EMPTYRE) {
484 if (setvec[lp] != 1) {
488 if (type(p) == CCL && (*(char *) right(p)) == '\0')
489 return(0); /* empty CCL */
492 if (first(left(p)) == 0)
500 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
504 if (first(left(p)) == 0 || b == 0) return(0);
509 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
513 void follow(Node *v) /* collects leaves that can follow v into setvec */
517 if (type(v) == FINAL)
533 if (v == left(p)) { /* v is left child of p */
534 if (first(right(p)) == 0) {
538 } else /* v is right child */
544 int member(int c, const char *sarg) /* is c in s? */
546 const uschar *s = (const uschar *) sarg;
554 int match(fa *f, const char *p0) /* shortest match ? */
557 const uschar *p = (const uschar *) p0;
560 assert (s < f->state_count);
565 /* assert(*p < NCHARS); */
566 if ((ns = f->gototab[s][*p]) != 0)
576 int pmatch(fa *f, const char *p0) /* longest match, for sub */
579 const uschar *p = (const uschar *) p0;
583 assert(s < f->state_count);
585 patbeg = (const char *)p;
590 if (f->out[s]) /* final state */
592 /* assert(*q < NCHARS); */
593 if ((ns = f->gototab[s][*q]) != 0)
598 assert(s < f->state_count);
600 if (s == 1) { /* no transition */
602 patbeg = (const char *) p;
606 goto nextin; /* no match */
610 patlen = q-p-1; /* don't count $ */
612 patbeg = (const char *) p;
621 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
624 const uschar *p = (const uschar *) p0;
628 assert(s < f->state_count);
630 patbeg = (const char *)p;
635 if (f->out[s]) /* final state */
637 /* assert(*q < NCHARS); */
638 if ((ns = f->gototab[s][*q]) != 0)
642 if (s == 1) { /* no transition */
644 patbeg = (const char *) p;
647 goto nnextin; /* no nonempty match */
651 patlen = q-p-1; /* don't count $ */
653 patbeg = (const char *) p;
669 * A stream-fed version of nematch which transfers characters to a
670 * null-terminated buffer. All characters up to and including the last
671 * character of the matching text or EOF are placed in the buffer. If
672 * a match is found, patbeg and patlen are set appropriately.
675 * false No match found.
679 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
682 int bufsize = *pbufsize;
683 int c, i, j, k, ns, s;
689 * All indices relative to buf.
690 * i <= j <= k <= bufsize
692 * i: origin of active substring
693 * j: current character
694 * k: destination of next getc()
702 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
703 FATAL("stream '%.30s...' too long", buf);
704 buf[k++] = (c = getc(f)) != EOF ? c : 0;
707 /* assert(c < NCHARS); */
709 if ((ns = pfa->gototab[s][c]) != 0)
712 s = cgoto(pfa, s, c);
714 if (pfa->out[s]) { /* final state */
716 if (c == 0) /* don't count $ */
719 } while (buf[j] && s != 1);
721 } while (buf[i] && !patlen);
723 /* adjbuf() may have relocated a resized buffer. Inform the world. */
728 patbeg = (char *) buf + i;
730 * Under no circumstances is the last character fed to
731 * the automaton part of the match. It is EOF's nullbyte,
732 * or it sent the automaton into a state with no further
733 * transitions available (s==1), or both. Room for a
734 * terminating nullbyte is guaranteed.
736 * ungetc any chars after the end of matching text
737 * (except for EOF's nullbyte, if present) and null
738 * terminate the buffer.
741 if (buf[--k] && ungetc(buf[k], f) == EOF)
742 FATAL("unable to ungetc '%c'", buf[k]);
743 while (k > i + patlen);
751 Node *reparse(const char *p) /* parses regular expression pointed to by p */
752 { /* uses relex() to scan regular expression */
755 DPRINTF("reparse <%s>\n", p);
756 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
758 /* GNU compatibility: an empty regexp matches anything */
760 /* FATAL("empty regular expression"); previous */
761 return(op2(EMPTYRE, NIL, NIL));
765 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
769 Node *regexp(void) /* top-level parse of reg expr */
771 return (alt(concat(primary())));
782 np = op2(CHAR, NIL, itonp(rlxval));
787 return (unary(op2(ALL, NIL, NIL)));
790 return (unary(op2(EMPTYRE, NIL, NIL)));
794 return (unary(op2(DOT, NIL, NIL)));
796 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
801 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
807 return (unary(op2(CHAR, NIL, itonp(HAT))));
810 return (unary(op2(CHAR, NIL, NIL)));
813 savelastatom = starttok - basestr; /* Retain over recursion */
815 if (rtok == ')') { /* special pleading for () */
817 return unary(op2(CCL, NIL, (Node *) tostring("")));
821 lastatom = basestr + savelastatom; /* Restore */
826 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
828 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
830 return 0; /*NOTREACHED*/
833 Node *concat(Node *np)
836 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
837 return (concat(op2(CAT, np, primary())));
840 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
850 return (alt(op2(OR, np, concat(primary()))));
855 Node *unary(Node *np)
860 return (unary(op2(STAR, np, NIL)));
863 return (unary(op2(PLUS, np, NIL)));
866 return (unary(op2(QUEST, np, NIL)));
869 return (unary(op2(ZERO, np, NIL)));
876 * Character class definitions conformant to the POSIX locale as
877 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
878 * and operating character sets are both ASCII (ISO646) or supersets
881 * Note that to avoid overflowing the temporary buffer used in
882 * relex(), the expanded character class (prior to range expansion)
883 * must be less than twice the size of their full name.
886 /* Because isblank doesn't show up in any of the header files on any
887 * system i use, it's defined here. if some other locale has a richer
888 * definition of "blank", define HAS_ISBLANK and provide your own
890 * the parentheses here are an attempt to find a path through the maze
891 * of macro definition and/or function and/or version provided. thanks
892 * to nelson beebe for the suggestion; let's see if it works everywhere.
895 /* #define HAS_ISBLANK */
898 int (xisblank)(int c)
900 return c==' ' || c=='\t';
905 static const struct charclass {
910 { "alnum", 5, isalnum },
911 { "alpha", 5, isalpha },
913 { "blank", 5, xisblank },
915 { "blank", 5, isblank },
917 { "cntrl", 5, iscntrl },
918 { "digit", 5, isdigit },
919 { "graph", 5, isgraph },
920 { "lower", 5, islower },
921 { "print", 5, isprint },
922 { "punct", 5, ispunct },
923 { "space", 5, isspace },
924 { "upper", 5, isupper },
925 { "xdigit", 6, isxdigit },
929 #define REPEAT_SIMPLE 0
930 #define REPEAT_PLUS_APPENDED 1
931 #define REPEAT_WITH_Q 2
932 #define REPEAT_ZERO 3
935 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
936 int atomlen, int firstnum, int secondnum, int special_case)
941 int init_q = (firstnum == 0); /* first added char will be ? */
942 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
943 int prefix_length = reptok - basestr; /* prefix includes first rep */
944 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
945 int size = prefix_length + suffix_length;
947 if (firstnum > 1) { /* add room for reps 2 through firstnum */
948 size += atomlen*(firstnum-1);
951 /* Adjust size of buffer for special cases */
952 if (special_case == REPEAT_PLUS_APPENDED) {
953 size++; /* for the final + */
954 } else if (special_case == REPEAT_WITH_Q) {
955 size += init_q + (atomlen+1)* n_q_reps;
956 } else if (special_case == REPEAT_ZERO) {
957 size += 2; /* just a null ERE: () */
959 if ((buf = (uschar *) malloc(size + 1)) == NULL)
960 FATAL("out of space in reg expr %.10s..", lastre);
961 memcpy(buf, basestr, prefix_length); /* copy prefix */
963 if (special_case == REPEAT_ZERO) {
968 for (i = 1; i < firstnum; i++) { /* copy x reps */
969 memcpy(&buf[j], atom, atomlen);
972 if (special_case == REPEAT_PLUS_APPENDED) {
974 } else if (special_case == REPEAT_WITH_Q) {
977 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
978 memcpy(&buf[j], atom, atomlen);
983 memcpy(&buf[j], reptok+reptoklen, suffix_length);
984 if (special_case == REPEAT_ZERO) {
985 buf[j+suffix_length] = '\0';
989 /* free old basestr */
990 if (firstbasestr != basestr) {
995 prestr = buf + prefix_length;
996 if (special_case == REPEAT_ZERO) {
1003 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1004 int atomlen, int firstnum, int secondnum)
1007 In general, the repetition specifier or "bound" is replaced here
1008 by an equivalent ERE string, repeating the immediately previous atom
1009 and appending ? and + as needed. Note that the first copy of the
1010 atom is left in place, except in the special_case of a zero-repeat
1013 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1015 /* 0 or 1: should be handled before you get here */
1016 FATAL("internal error");
1018 return replace_repeat(reptok, reptoklen, atom, atomlen,
1019 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1021 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1022 if (firstnum == 0) { /* {0} or {0,0} */
1023 /* This case is unusual because the resulting
1024 replacement string might actually be SMALLER than
1026 return replace_repeat(reptok, reptoklen, atom, atomlen,
1027 firstnum, secondnum, REPEAT_ZERO);
1028 } else { /* (firstnum >= 1) */
1029 return replace_repeat(reptok, reptoklen, atom, atomlen,
1030 firstnum, secondnum, REPEAT_SIMPLE);
1032 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1033 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1034 return replace_repeat(reptok, reptoklen, atom, atomlen,
1035 firstnum, secondnum, REPEAT_WITH_Q);
1036 } else { /* Error - shouldn't be here (n>m) */
1037 FATAL("internal error");
1042 int relex(void) /* lexical analyzer for reparse */
1046 static uschar *buf = NULL;
1047 static int bufsz = 100;
1049 const struct charclass *cc;
1052 bool commafound, digitfound;
1053 const uschar *startreptok;
1054 static int parens = 0;
1059 switch (c = *prestr++) {
1060 case '|': return OR;
1061 case '*': return STAR;
1062 case '+': return PLUS;
1063 case '?': return QUEST;
1064 case '.': return DOT;
1065 case '\0': prestr--; return '\0';
1077 /* unmatched close parenthesis; per POSIX, treat as literal */
1081 rlxval = quoted(&prestr);
1087 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1088 FATAL("out of space in reg expr %.10s..", lastre);
1090 if (*prestr == '^') {
1096 n = 2 * strlen((const char *) prestr)+1;
1097 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1098 FATAL("out of space for reg expr %.10s...", lastre);
1100 if ((c = *prestr++) == '\\') {
1102 if ((c = *prestr++) == '\0')
1103 FATAL("nonterminated character class %.20s...", lastre);
1105 /* } else if (c == '\n') { */
1106 /* FATAL("newline in character class %.20s...", lastre); */
1107 } else if (c == '[' && *prestr == ':') {
1108 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1109 for (cc = charclasses; cc->cc_name; cc++)
1110 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1112 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1113 prestr[2 + cc->cc_namelen] == ']') {
1114 prestr += cc->cc_namelen + 3;
1116 * BUG: We begin at 1, instead of 0, since we
1117 * would otherwise prematurely terminate the
1118 * string for classes like [[:cntrl:]]. This
1119 * means that we can't match the NUL character,
1120 * not without first adapting the entire
1121 * program to track each string's length.
1123 for (i = 1; i <= UCHAR_MAX; i++) {
1124 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1125 FATAL("out of space for reg expr %.10s...", lastre);
1126 if (cc->cc_func(i)) {
1127 /* escape backslash */
1139 } else if (c == '[' && *prestr == '.') {
1142 collate_char = *prestr++;
1143 if (*prestr == '.' && prestr[1] == ']') {
1145 /* Found it: map via locale TBD: for
1146 now, simply return this char. This
1147 is sufficient to pass conformance
1150 if (*prestr == ']') {
1152 rlxval = collate_char;
1156 } else if (c == '[' && *prestr == '=') {
1159 equiv_char = *prestr++;
1160 if (*prestr == '=' && prestr[1] == ']') {
1162 /* Found it: map via locale TBD: for now
1163 simply return this char. This is
1164 sufficient to pass conformance test
1167 if (*prestr == ']') {
1169 rlxval = equiv_char;
1173 } else if (c == '\0') {
1174 FATAL("nonterminated character class %.20s", lastre);
1175 } else if (bp == buf) { /* 1st char is special */
1177 } else if (c == ']') {
1179 rlxstr = (uschar *) tostring((char *) buf);
1189 if (isdigit(*(prestr))) {
1190 num = 0; /* Process as a repetition */
1194 startreptok = prestr-1;
1195 /* Remember start of previous atom here ? */
1196 } else { /* just a { char, not a repetition */
1201 if ((c = *prestr++) == '}') {
1203 if (digitfound) { /* {n,m} */
1206 FATAL("illegal repetition expression: class %.20s",
1208 if (n == 0 && m == 1) {
1218 if (digitfound) { /* {n} same as {n,n} */
1222 FATAL("illegal repetition expression: class %.20s",
1226 if (repeat(starttok, prestr-starttok, lastatom,
1227 startreptok - lastatom, n, m) > 0) {
1228 if (n == 0 && m == 0) {
1231 /* must rescan input for next token */
1234 /* Failed to replace: eat up {...} characters
1235 and treat like just PLUS */
1237 } else if (c == '\0') {
1238 FATAL("nonterminated character class %.20s",
1240 } else if (isdigit(c)) {
1241 num = 10 * num + c - '0';
1243 } else if (c == ',') {
1245 FATAL("illegal repetition expression: class %.20s",
1247 /* looking for {n,} or {n,m} */
1250 digitfound = false; /* reset */
1253 FATAL("illegal repetition expression: class %.20s",
1261 int cgoto(fa *f, int s, int c)
1266 assert(c == HAT || c < NCHARS);
1267 while (f->accept >= maxsetvec) { /* guessing here! */
1268 resizesetvec(__func__);
1270 for (i = 0; i <= f->accept; i++)
1274 /* compute positions of gototab[s,c] into setvec */
1276 for (i = 1; i <= *p; i++) {
1277 if ((k = f->re[p[i]].ltype) != FINAL) {
1278 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1279 || (k == DOT && c != 0 && c != HAT)
1280 || (k == ALL && c != 0)
1281 || (k == EMPTYRE && c != 0)
1282 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1283 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1284 q = f->re[p[i]].lfollow;
1285 for (j = 1; j <= *q; j++) {
1286 if (q[j] >= maxsetvec) {
1287 resizesetvec(__func__);
1289 if (setvec[q[j]] == 0) {
1297 /* determine if setvec is a previous state */
1300 for (i = f->accept; i >= 0; i--)
1304 resize_state(f, f->curstat > s ? f->curstat : s);
1305 /* tmpset == previous state? */
1306 for (i = 1; i <= f->curstat; i++) {
1308 if ((k = tmpset[0]) != p[0])
1310 for (j = 1; j <= k; j++)
1311 if (tmpset[j] != p[j])
1313 /* setvec is state i */
1315 f->gototab[s][c] = i;
1320 /* add tmpset to current set of states */
1322 resize_state(f, f->curstat);
1323 for (i = 0; i < NCHARS; i++)
1324 f->gototab[f->curstat][i] = 0;
1325 xfree(f->posns[f->curstat]);
1326 p = intalloc(setcnt + 1, __func__);
1328 f->posns[f->curstat] = p;
1330 f->gototab[s][c] = f->curstat;
1331 for (i = 0; i <= setcnt; i++)
1333 if (setvec[f->accept])
1334 f->out[f->curstat] = 1;
1336 f->out[f->curstat] = 0;
1341 void freefa(fa *f) /* free a finite automaton */
1347 for (i = 0; i < f->state_count; i++)
1348 xfree(f->gototab[i])
1349 for (i = 0; i <= f->curstat; i++)
1351 for (i = 0; i <= f->accept; i++) {
1352 xfree(f->re[i].lfollow);
1353 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1354 xfree(f->re[i].lval.np);