1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 ****************************************************************/
25 /* lasciate ogne speranza, voi ch'intrate. */
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
39 #define HAT (NCHARS+2) /* matches ^ in regular expr */
43 #define type(v) (v)->nobj /* badly overloaded here */
44 #define info(v) (v)->ntype /* badly overloaded here */
45 #define left(v) (v)->narg[0]
46 #define right(v) (v)->narg[1]
47 #define parent(v) (v)->nnext
49 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
50 #define ELEAF case EMPTYRE: /* empty string in regexp */
51 #define UNARY case STAR: case PLUS: case QUEST:
53 /* encoding in tree Nodes:
54 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
55 left is index, right contains value or pointer to value
56 unary (STAR, PLUS, QUEST): left is child, right is null
57 binary (CAT, OR): left and right are children
58 parent contains pointer to parent
66 int rtok; /* next token in current re */
68 static uschar *rlxstr;
69 static uschar *prestr; /* current position in current re */
70 static uschar *lastre; /* origin of last re */
78 #define NFA 20 /* cache this many dynamic fa's */
80 int nfatab = 0; /* entries in fatab */
82 fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
88 if (setvec == 0) { /* first time through any RE */
90 setvec = (int *) malloc(maxsetvec * sizeof(int));
91 tmpset = (int *) malloc(maxsetvec * sizeof(int));
92 if (setvec == 0 || tmpset == 0)
93 overflo("out of space initializing makedfa");
96 if (compile_time) /* a constant for sure */
97 return mkdfa(s, anchor);
98 for (i = 0; i < nfatab; i++) /* is it there already? */
99 if (fatab[i]->anchor == anchor
100 && strcmp((const char *) fatab[i]->restr, s) == 0) {
101 fatab[i]->use = now++;
104 pfa = mkdfa(s, anchor);
105 if (nfatab < NFA) { /* room for another */
107 fatab[nfatab]->use = now++;
111 use = fatab[0]->use; /* replace least-recently used */
113 for (i = 1; i < nfatab; i++)
114 if (fatab[i]->use < use) {
124 fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
125 /* anchor = 1 for anchored matches, else 0 */
131 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
132 /* put ALL STAR in front of reg. exp. */
133 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
134 /* put FINAL after reg. exp. */
137 penter(p1); /* enter parent pointers and leaf indices */
138 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
139 overflo("out of space for fa");
140 f->accept = poscnt-1; /* penter has computed number of positions in re */
141 cfoll(f, p1); /* set up follow sets */
143 if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
144 overflo("out of space in makedfa");
145 if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
146 overflo("out of space in makedfa");
148 f->initstat = makeinit(f, anchor);
150 f->restr = (uschar *) tostring(s);
154 int makeinit(fa *f, int anchor)
161 k = *(f->re[0].lfollow);
163 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
164 overflo("out of space in makeinit");
165 for (i=0; i <= k; i++) {
166 (f->posns[2])[i] = (f->re[0].lfollow)[i];
168 if ((f->posns[2])[1] == f->accept)
170 for (i=0; i < NCHARS; i++)
171 f->gototab[2][i] = 0;
172 f->curstat = cgoto(f, 2, HAT);
174 *f->posns[2] = k-1; /* leave out position 0 */
175 for (i=0; i < k; i++) {
176 (f->posns[0])[i] = (f->posns[2])[i];
179 f->out[0] = f->out[2];
181 --(*f->posns[f->curstat]);
186 void penter(Node *p) /* set up parent pointers and leaf indices */
203 parent(right(p)) = p;
205 default: /* can't happen */
206 FATAL("can't happen: unknown type %d in penter", type(p));
211 void freetr(Node *p) /* free parse tree */
228 default: /* can't happen */
229 FATAL("can't happen: unknown type %d in freetr", type(p));
234 /* in the parsing of regular expressions, metacharacters like . have */
235 /* to be seen literally; \056 is not a metacharacter. */
237 int hexstr(uschar **pp) /* find and eval hex string at pp, return new p */
238 { /* only pick up one 8-bit byte (2 chars) */
243 for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
245 n = 16 * n + *p - '0';
246 else if (*p >= 'a' && *p <= 'f')
247 n = 16 * n + *p - 'a' + 10;
248 else if (*p >= 'A' && *p <= 'F')
249 n = 16 * n + *p - 'A' + 10;
255 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
257 int quoted(uschar **pp) /* pick up next thing after a \\ */
258 /* and increment *pp */
263 if ((c = *p++) == 't')
275 else if (c == 'x') { /* hexadecimal goo follows */
276 c = hexstr(&p); /* this adds a null if number is invalid */
277 } else if (isoctdigit(c)) { /* \d \dd \ddd */
279 if (isoctdigit(*p)) {
280 n = 8 * n + *p++ - '0';
282 n = 8 * n + *p++ - '0';
291 static int collate_range_cmp(int a, int b)
295 if ((uschar)a == (uschar)b)
299 return (strcoll(s[0], s[1]));
302 char *cclenter(const char *argp) /* add a character class */
306 uschar *p = (uschar *) argp;
308 static uschar *buf = 0;
309 static int bufsz = 100;
312 if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
313 FATAL("out of space for character class [%.10s...] 1", p);
315 for (i = 0; (c = *p++) != 0; ) {
318 } else if (c == '-' && i > 0 && bp[-1] != 0) {
324 if (collate_range_cmp(c, c2) > 0) {
329 for (j = 0; j < NCHARS; j++) {
330 if ((collate_range_cmp(c, j) > 0) ||
331 collate_range_cmp(j, c2) > 0)
333 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
334 FATAL("out of space for character class [%.10s...] 2", p);
341 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
342 FATAL("out of space for character class [%.10s...] 3", p);
347 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
349 return (char *) tostring((char *) buf);
352 void overflo(const char *s)
354 FATAL("regular expression too big: %.30s...", s);
357 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
365 f->re[info(v)].ltype = type(v);
366 f->re[info(v)].lval.np = right(v);
367 while (f->accept >= maxsetvec) { /* guessing here! */
369 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
370 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
371 if (setvec == 0 || tmpset == 0)
372 overflo("out of space in cfoll()");
374 for (i = 0; i <= f->accept; i++)
377 follow(v); /* computes setvec and setcnt */
378 if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
379 overflo("out of space building follow set");
380 f->re[info(v)].lfollow = p;
382 for (i = f->accept; i >= 0; i--)
394 default: /* can't happen */
395 FATAL("can't happen: unknown type %d in cfoll", type(v));
399 int first(Node *p) /* collects initially active leaves of p into setvec */
400 /* returns 0 if p matches empty string */
407 lp = info(p); /* look for high-water mark of subscripts */
408 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
410 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
411 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
412 if (setvec == 0 || tmpset == 0)
413 overflo("out of space in first()");
415 if (type(p) == EMPTYRE) {
419 if (setvec[lp] != 1) {
423 if (type(p) == CCL && (*(char *) right(p)) == '\0')
424 return(0); /* empty CCL */
427 if (first(left(p)) == 0) return(0);
434 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
438 if (first(left(p)) == 0 || b == 0) return(0);
441 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
445 void follow(Node *v) /* collects leaves that can follow v into setvec */
449 if (type(v) == FINAL)
465 if (v == left(p)) { /* v is left child of p */
466 if (first(right(p)) == 0) {
470 } else /* v is right child */
476 int member(int c, const char *sarg) /* is c in s? */
478 uschar *s = (uschar *) sarg;
486 int match(fa *f, const char *p0) /* shortest match ? */
489 uschar *p = (uschar *) p0;
491 s = f->reset ? makeinit(f,0) : f->initstat;
495 /* assert(*p < NCHARS); */
496 if ((ns = f->gototab[s][*p]) != 0)
506 int pmatch(fa *f, const char *p0) /* longest match, for sub */
509 uschar *p = (uschar *) p0;
513 /* s = f->reset ? makeinit(f,1) : f->initstat; */
515 f->initstat = s = makeinit(f,1);
524 if (f->out[s]) /* final state */
526 /* assert(*q < NCHARS); */
527 if ((ns = f->gototab[s][*q]) != 0)
531 if (s == 1) { /* no transition */
537 goto nextin; /* no match */
541 patlen = q-p-1; /* don't count $ */
549 for (i = 2; i <= f->curstat; i++)
552 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
553 overflo("out of space in pmatch");
554 for (i = 0; i <= k; i++)
555 (f->posns[2])[i] = (f->posns[0])[i];
556 f->initstat = f->curstat = 2;
557 f->out[2] = f->out[0];
558 for (i = 0; i < NCHARS; i++)
559 f->gototab[2][i] = 0;
565 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
568 uschar *p = (uschar *) p0;
572 /* s = f->reset ? makeinit(f,1) : f->initstat; */
574 f->initstat = s = makeinit(f,1);
582 if (f->out[s]) /* final state */
584 /* assert(*q < NCHARS); */
585 if ((ns = f->gototab[s][*q]) != 0)
589 if (s == 1) { /* no transition */
594 goto nnextin; /* no nonempty match */
598 patlen = q-p-1; /* don't count $ */
606 for (i = 2; i <= f->curstat; i++)
609 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
610 overflo("out of state space");
611 for (i = 0; i <= k; i++)
612 (f->posns[2])[i] = (f->posns[0])[i];
613 f->initstat = f->curstat = 2;
614 f->out[2] = f->out[0];
615 for (i = 0; i < NCHARS; i++)
616 f->gototab[2][i] = 0;
623 Node *reparse(const char *p) /* parses regular expression pointed to by p */
624 { /* uses relex() to scan regular expression */
627 dprintf( ("reparse <%s>\n", p) );
628 lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */
630 /* GNU compatibility: an empty regexp matches anything */
632 /* FATAL("empty regular expression"); previous */
633 return(op2(EMPTYRE, NIL, NIL));
637 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
641 Node *regexp(void) /* top-level parse of reg expr */
643 return (alt(concat(primary())));
652 np = op2(CHAR, NIL, itonp(rlxval));
657 return (unary(op2(ALL, NIL, NIL)));
660 return (unary(op2(ALL, NIL, NIL)));
663 return (unary(op2(DOT, NIL, NIL)));
665 np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
669 np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
674 return (unary(op2(CHAR, NIL, itonp(HAT))));
677 return (unary(op2(CHAR, NIL, NIL)));
680 if (rtok == ')') { /* special pleading for () */
682 return unary(op2(CCL, NIL, (Node *) tostring("")));
690 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
692 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
694 return 0; /*NOTREACHED*/
697 Node *concat(Node *np)
700 case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
701 return (concat(op2(CAT, np, primary())));
710 return (alt(op2(OR, np, concat(primary()))));
715 Node *unary(Node *np)
720 return (unary(op2(STAR, np, NIL)));
723 return (unary(op2(PLUS, np, NIL)));
726 return (unary(op2(QUEST, np, NIL)));
733 * Character class definitions conformant to the POSIX locale as
734 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
735 * and operating character sets are both ASCII (ISO646) or supersets
738 * Note that to avoid overflowing the temporary buffer used in
739 * relex(), the expanded character class (prior to range expansion)
740 * must be less than twice the size of their full name.
743 /* Because isblank doesn't show up in any of the header files on any
744 * system i use, it's defined here. if some other locale has a richer
745 * definition of "blank", define HAS_ISBLANK and provide your own
747 * the parentheses here are an attempt to find a path through the maze
748 * of macro definition and/or function and/or version provided. thanks
749 * to nelson beebe for the suggestion; let's see if it works everywhere.
752 /* #define HAS_ISBLANK */
755 int (xisblank)(int c)
757 return c==' ' || c=='\t';
767 { "alnum", 5, isalnum },
768 { "alpha", 5, isalpha },
770 { "blank", 5, isspace }, /* was isblank */
772 { "blank", 5, isblank },
774 { "cntrl", 5, iscntrl },
775 { "digit", 5, isdigit },
776 { "graph", 5, isgraph },
777 { "lower", 5, islower },
778 { "print", 5, isprint },
779 { "punct", 5, ispunct },
780 { "space", 5, isspace },
781 { "upper", 5, isupper },
782 { "xdigit", 6, isxdigit },
787 int relex(void) /* lexical analyzer for reparse */
791 static uschar *buf = 0;
792 static int bufsz = 100;
794 struct charclass *cc;
797 switch (c = *prestr++) {
799 case '*': return STAR;
800 case '+': return PLUS;
801 case '?': return QUEST;
802 case '.': return DOT;
803 case '\0': prestr--; return '\0';
810 rlxval = quoted(&prestr);
816 if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
817 FATAL("out of space in reg expr %.10s..", lastre);
819 if (*prestr == '^') {
825 n = 2 * strlen((const char *) prestr)+1;
826 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
827 FATAL("out of space for reg expr %.10s...", lastre);
829 if ((c = *prestr++) == '\\') {
831 if ((c = *prestr++) == '\0')
832 FATAL("nonterminated character class %.20s...", lastre);
834 /* } else if (c == '\n') { */
835 /* FATAL("newline in character class %.20s...", lastre); */
836 } else if (c == '[' && *prestr == ':') {
837 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
838 for (cc = charclasses; cc->cc_name; cc++)
839 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
841 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
842 prestr[2 + cc->cc_namelen] == ']') {
843 prestr += cc->cc_namelen + 3;
844 for (i = 0; i < NCHARS; i++) {
845 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
846 FATAL("out of space for reg expr %.10s...", lastre);
847 if (cc->cc_func(i)) {
854 } else if (c == '\0') {
855 FATAL("nonterminated character class %.20s", lastre);
856 } else if (bp == buf) { /* 1st char is special */
858 } else if (c == ']') {
860 rlxstr = (uschar *) tostring((char *) buf);
871 int cgoto(fa *f, int s, int c)
876 assert(c == HAT || c < NCHARS);
877 while (f->accept >= maxsetvec) { /* guessing here! */
879 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
880 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
881 if (setvec == 0 || tmpset == 0)
882 overflo("out of space in cgoto()");
884 for (i = 0; i <= f->accept; i++)
887 /* compute positions of gototab[s,c] into setvec */
889 for (i = 1; i <= *p; i++) {
890 if ((k = f->re[p[i]].ltype) != FINAL) {
891 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
892 || (k == DOT && c != 0 && c != HAT)
893 || (k == ALL && c != 0)
894 || (k == EMPTYRE && c != 0)
895 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
896 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
897 q = f->re[p[i]].lfollow;
898 for (j = 1; j <= *q; j++) {
899 if (q[j] >= maxsetvec) {
901 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
902 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
903 if (setvec == 0 || tmpset == 0)
904 overflo("cgoto overflow");
906 if (setvec[q[j]] == 0) {
914 /* determine if setvec is a previous state */
917 for (i = f->accept; i >= 0; i--)
921 /* tmpset == previous state? */
922 for (i = 1; i <= f->curstat; i++) {
924 if ((k = tmpset[0]) != p[0])
926 for (j = 1; j <= k; j++)
927 if (tmpset[j] != p[j])
929 /* setvec is state i */
930 f->gototab[s][c] = i;
935 /* add tmpset to current set of states */
936 if (f->curstat >= NSTATES-1) {
939 for (i = 2; i < NSTATES; i++)
943 for (i = 0; i < NCHARS; i++)
944 f->gototab[f->curstat][i] = 0;
945 xfree(f->posns[f->curstat]);
946 if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
947 overflo("out of space in cgoto");
949 f->posns[f->curstat] = p;
950 f->gototab[s][c] = f->curstat;
951 for (i = 0; i <= setcnt; i++)
953 if (setvec[f->accept])
954 f->out[f->curstat] = 1;
956 f->out[f->curstat] = 0;
961 void freefa(fa *f) /* free a finite automaton */
967 for (i = 0; i <= f->curstat; i++)
969 for (i = 0; i <= f->accept; i++) {
970 xfree(f->re[i].lfollow);
971 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
972 xfree((f->re[i].lval.np));