1 /* $NetBSD: engine.c,v 1.7 2011/11/19 17:45:11 tnozaki Exp $ */
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer of the University of Toronto.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * @(#)engine.c 8.4 (Berkeley) 3/19/94
39 * The matching engine and friends. This file is #included by regexec.c
40 * after suitable #defines of a variety of macros used herein, so that
41 * different state representations can be used without duplicating masses
46 #define matcher smatcher
49 #define dissect sdissect
50 #define backref sbackref
57 #define matcher lmatcher
60 #define dissect ldissect
61 #define backref lbackref
68 /* another structure passed up and down to avoid zillions of parameters */
72 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
73 const RCHAR_T *offp; /* offsets work from here */
74 const RCHAR_T *beginp; /* start of string -- virtual NUL precedes */
75 const RCHAR_T *endp; /* end of string -- virtual NUL here */
76 const RCHAR_T *coldp; /* can be no match starting before here */
77 const RCHAR_T **lastpos; /* [nplus+1] */
79 states st; /* current states */
80 states fresh; /* states for a fresh start */
81 states tmp; /* temporary */
82 states empty; /* empty set of states */
85 /* ========= begin header generated by ./mkh ========= */
90 /* === engine.c === */
91 static int matcher(struct re_guts *g, const RCHAR_T *string, size_t nmatch, regmatch_t pmatch[], int eflags);
92 static const RCHAR_T *dissect(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst);
93 static const RCHAR_T *backref(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst, sopno lev);
94 static const RCHAR_T *fast(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst);
95 static const RCHAR_T *slow(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst);
96 static states step(struct re_guts *g, sopno start, sopno stop, states bef, int flag, RCHAR_T ch, states aft);
99 #define BOLEOL (BOL+2)
100 #define NOTHING (BOL+3)
104 static void print(struct match *m, char *caption, states st, int ch, FILE *d);
107 static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst);
110 static char *pchar(int ch);
116 /* ========= end header generated by ./mkh ========= */
119 #define SP(t, s, c) print(m, t, s, c, stdout)
120 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
121 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
123 #define SP(t, s, c) /* nothing */
124 #define AT(t, p1, p2, s1, s2) /* nothing */
125 #define NOTE(s) /* nothing */
129 - matcher - the actual matching engine
131 static int /* 0 success, REG_NOMATCH failure */
132 matcher(struct re_guts *g, const RCHAR_T *string, size_t nmatch,
133 regmatch_t pmatch[], int eflags)
138 struct match *m = &mv;
140 const sopno gf = g->firststate+1; /* +1 for OEND */
141 const sopno gl = g->laststate;
142 const RCHAR_T *start;
145 /* simplify the situation where possible */
146 if (g->cflags®_NOSUB)
148 if (eflags®_STARTEND) {
149 start = string + pmatch[0].rm_so;
150 stop = string + pmatch[0].rm_eo;
153 stop = start + STRLEN(start);
158 /* prescreening; this does wonders for this rather slow code */
159 if (g->must != NULL) {
160 for (dp = start; dp < stop; dp++)
161 if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen &&
162 MEMCMP(dp, g->must, g->mlen) == 0)
164 if (dp == stop) /* we didn't find g->must */
168 /* match struct setup */
183 /* this loop does only one repetition except for backrefs */
185 endp = fast(m, start, stop, gf, gl);
186 if (endp == NULL) { /* a miss */
190 if (nmatch == 0 && !g->backrefs)
191 break; /* no further info needed */
194 assert(m->coldp != NULL);
196 NOTE("finding start");
197 endp = slow(m, m->coldp, stop, gf, gl);
200 assert(m->coldp < m->endp);
203 if (nmatch == 1 && !g->backrefs)
204 break; /* no further info needed */
206 /* oh my, he wants the subexpressions... */
207 if (m->pmatch == NULL)
208 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
210 if (m->pmatch == NULL) {
214 for (i = 1; i <= m->g->nsub; i++)
215 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
216 if (!g->backrefs && !(m->eflags®_BACKR)) {
218 dp = dissect(m, m->coldp, endp, gf, gl);
220 if (g->nplus > 0 && m->lastpos == NULL)
221 m->lastpos = (const RCHAR_T **)malloc((g->nplus+1) *
222 sizeof(const RCHAR_T *));
223 if (g->nplus > 0 && m->lastpos == NULL) {
228 NOTE("backref dissect");
229 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
234 /* uh-oh... we couldn't find a subexpression-level match */
235 assert(g->backrefs); /* must be back references doing it */
236 assert(g->nplus == 0 || m->lastpos != NULL);
238 if (dp != NULL || endp <= m->coldp)
241 endp = slow(m, m->coldp, endp-1, gf, gl);
244 /* try it on a shorter possibility */
246 for (i = 1; i <= m->g->nsub; i++) {
247 assert(m->pmatch[i].rm_so == -1);
248 assert(m->pmatch[i].rm_eo == -1);
251 NOTE("backoff dissect");
252 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
254 assert(dp == NULL || dp == endp);
255 if (dp != NULL) /* found a shorter one */
258 /* despite initial appearances, there is no match here */
260 start = m->coldp + 1; /* recycle starting later */
261 assert(start <= stop);
264 /* fill in the details if requested */
266 pmatch[0].rm_so = m->coldp - m->offp;
267 pmatch[0].rm_eo = endp - m->offp;
270 assert(m->pmatch != NULL);
271 for (i = 1; i < nmatch; i++)
273 pmatch[i] = m->pmatch[i];
275 pmatch[i].rm_so = -1;
276 pmatch[i].rm_eo = -1;
280 if (m->pmatch != NULL)
281 free((char *)m->pmatch);
282 if (m->lastpos != NULL)
283 free((char *)m->lastpos);
289 - dissect - figure out what matched what, no back references
291 static const RCHAR_T * /* == stop (success) always */
292 dissect(struct match *m, const RCHAR_T *start, const RCHAR_T *stop,
293 sopno startst, sopno stopst)
296 sopno ss; /* start sop of current subRE */
297 sopno es; /* end sop of current subRE */
298 const RCHAR_T *sp; /* start of string matched by it */
299 const RCHAR_T *stp; /* string matched by it cannot pass here */
300 const RCHAR_T *rest; /* start of rest of string */
301 const RCHAR_T *tail; /* string unmatched by rest of RE */
302 sopno ssub; /* start sop of subsubRE */
303 sopno esub; /* end sop of subsubRE */
304 const RCHAR_T *ssp; /* start of string matched by subsubRE */
305 const RCHAR_T *sep; /* end of string matched by subsubRE */
306 const RCHAR_T *oldssp; /* previous ssp */
309 AT("diss", start, stop, startst, stopst);
311 for (ss = startst; ss < stopst; ss = es) {
312 /* identify end of subRE */
314 switch (m->g->strip[es]) {
317 es += m->g->stripdata[es];
320 while (m->g->strip[es] != O_CH)
321 es += m->g->stripdata[es];
326 /* figure out what it matched */
327 switch (m->g->strip[ss]) {
347 /* cases where length of match is hard to find */
351 /* how long could this one be? */
352 rest = slow(m, sp, stp, ss, es);
353 assert(rest != NULL); /* it did match */
354 /* could the rest match the rest? */
355 tail = slow(m, rest, stop, es, stopst);
358 /* no -- try a shorter match for this one */
360 assert(stp >= sp); /* it did work */
364 /* did innards match? */
365 if (slow(m, sp, rest, ssub, esub) != NULL) {
366 dp = dissect(m, sp, rest, ssub, esub);
375 /* how long could this one be? */
376 rest = slow(m, sp, stp, ss, es);
377 assert(rest != NULL); /* it did match */
378 /* could the rest match the rest? */
379 tail = slow(m, rest, stop, es, stopst);
382 /* no -- try a shorter match for this one */
384 assert(stp >= sp); /* it did work */
390 for (;;) { /* find last match of innards */
391 sep = slow(m, ssp, rest, ssub, esub);
392 if (sep == NULL || sep == ssp)
393 break; /* failed or matched null */
394 oldssp = ssp; /* on to next try */
398 /* last successful match */
402 assert(sep == rest); /* must exhaust substring */
403 assert(slow(m, ssp, sep, ssub, esub) == rest);
404 dp = dissect(m, ssp, sep, ssub, esub);
411 /* how long could this one be? */
412 rest = slow(m, sp, stp, ss, es);
413 assert(rest != NULL); /* it did match */
414 /* could the rest match the rest? */
415 tail = slow(m, rest, stop, es, stopst);
418 /* no -- try a shorter match for this one */
420 assert(stp >= sp); /* it did work */
423 esub = ss + m->g->stripdata[ss] - 1;
424 assert(m->g->strip[esub] == OOR1);
425 for (;;) { /* find first matching branch */
426 if (slow(m, sp, rest, ssub, esub) == rest)
427 break; /* it matched all of it */
428 /* that one missed, try next one */
429 assert(m->g->strip[esub] == OOR1);
431 assert(m->g->strip[esub] == OOR2);
433 esub += m->g->stripdata[esub];
434 if (m->g->strip[esub] == OOR2)
437 assert(m->g->strip[esub] == O_CH);
439 dp = dissect(m, sp, rest, ssub, esub);
451 i = m->g->stripdata[ss];
452 assert(0 < i && i <= m->g->nsub);
453 m->pmatch[i].rm_so = sp - m->offp;
456 i = m->g->stripdata[ss];
457 assert(0 < i && i <= m->g->nsub);
458 m->pmatch[i].rm_eo = sp - m->offp;
471 - backref - figure out what matched what, figuring in back references
473 static const RCHAR_T * /* == stop (success) or NULL (failure) */
474 backref(struct match *m, const RCHAR_T *start, const RCHAR_T *stop,
475 sopno startst, sopno stopst, sopno lev) /* PLUS nesting level */
478 sopno ss; /* start sop of current subRE */
479 const RCHAR_T *sp; /* start of string matched by it */
480 sopno ssub; /* start sop of subsubRE */
481 sopno esub; /* end sop of subsubRE */
482 const RCHAR_T *ssp; /* start of string matched by subsubRE */
491 AT("back", start, stop, startst, stopst);
494 /* get as far as we can with easy stuff */
496 for (ss = startst; !hard && ss < stopst; ss++) {
498 d = m->g->stripdata[ss];
501 if (sp == stop || *sp++ != d)
511 if (sp == stop || !CHIN(cs, *sp++))
515 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
516 (sp < m->endp && *(sp-1) == '\n' &&
517 (m->g->cflags®_NEWLINE)) )
523 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
524 (sp < m->endp && *sp == '\n' &&
525 (m->g->cflags®_NEWLINE)) )
531 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
532 (sp < m->endp && *(sp-1) == '\n' &&
533 (m->g->cflags®_NEWLINE)) ||
535 !ISWORD(*(sp-1))) ) &&
536 (sp < m->endp && ISWORD(*sp)) )
542 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
543 (sp < m->endp && *sp == '\n' &&
544 (m->g->cflags®_NEWLINE)) ||
545 (sp < m->endp && !ISWORD(*sp)) ) &&
546 (sp > m->beginp && ISWORD(*(sp-1))) )
553 case OOR1: /* matches null but needs to skip */
556 d = m->g->stripdata[ss];
561 d = m->g->stripdata[ss];
563 /* note that the ss++ gets us past the O_CH */
565 default: /* have to make a choice */
570 if (!hard) { /* that was it! */
575 ss--; /* adjust for the for's final increment */
578 AT("hard", sp, stop, ss, stopst);
580 d = m->g->stripdata[ss];
582 case OBACK_: /* the vilest depths */
584 assert(0 < i && i <= m->g->nsub);
585 if (m->pmatch[i].rm_eo == -1)
587 assert(m->pmatch[i].rm_so != -1);
588 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
589 assert(stop - m->beginp >= len);
591 return(NULL); /* not enough left to match */
592 ssp = m->offp + m->pmatch[i].rm_so;
593 if (memcmp(sp, ssp, len) != 0)
595 while (m->g->strip[ss] != O_BACK || m->g->stripdata[ss] != i)
597 return(backref(m, sp+len, stop, ss+1, stopst, lev));
599 case OQUEST_: /* to null or not */
600 dp = backref(m, sp, stop, ss+1, stopst, lev);
602 return(dp); /* not */
603 return(backref(m, sp, stop, ss+d+1, stopst, lev));
606 assert(m->lastpos != NULL);
607 assert(lev+1 <= m->g->nplus);
608 m->lastpos[lev+1] = sp;
609 return(backref(m, sp, stop, ss+1, stopst, lev+1));
612 if (sp == m->lastpos[lev]) /* last pass matched null */
613 return(backref(m, sp, stop, ss+1, stopst, lev-1));
614 /* try another pass */
615 m->lastpos[lev] = sp;
616 dp = backref(m, sp, stop, ss-d+1, stopst, lev);
618 return(backref(m, sp, stop, ss+1, stopst, lev-1));
622 case OCH_: /* find the right one, if any */
625 assert(m->g->strip[esub] == OOR1);
626 for (;;) { /* find first matching branch */
627 dp = backref(m, sp, stop, ssub, esub, lev);
630 /* that one missed, try next one */
631 if (m->g->strip[esub] == O_CH)
632 return(NULL); /* there is none */
634 assert(m->g->strip[esub] == OOR2);
636 esub += m->g->stripdata[esub];
637 if (m->g->strip[esub] == OOR2)
640 assert(m->g->strip[esub] == O_CH);
643 case OLPAREN: /* must undo assignment if rest fails */
645 assert(0 < i && i <= m->g->nsub);
646 offsave = m->pmatch[i].rm_so;
647 m->pmatch[i].rm_so = sp - m->offp;
648 dp = backref(m, sp, stop, ss+1, stopst, lev);
651 m->pmatch[i].rm_so = offsave;
654 case ORPAREN: /* must undo assignment if rest fails */
656 assert(0 < i && i <= m->g->nsub);
657 offsave = m->pmatch[i].rm_eo;
658 m->pmatch[i].rm_eo = sp - m->offp;
659 dp = backref(m, sp, stop, ss+1, stopst, lev);
662 m->pmatch[i].rm_eo = offsave;
677 - fast - step through the string at top speed
679 static const RCHAR_T * /* where tentative match ended, or NULL */
680 fast(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst,
684 states fresh = m->fresh;
686 const RCHAR_T *p = start;
687 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1);
688 RCHAR_T lastc; /* previous c */
691 const RCHAR_T *coldp; /* last p after which no match was underway */
695 st = step(m->g, startst, stopst, st, NOTHING, OUT, st);
702 c = (p == m->endp) ? OUT : *p;
706 /* is there an EOL and/or BOL between lastc and c? */
709 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
710 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
714 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
715 (c == OUT && !(m->eflags®_NOTEOL)) ) {
716 flag = (flag == BOL) ? BOLEOL : EOL;
721 st = step(m->g, startst, stopst, st, flag, OUT, st);
725 /* how about a word boundary? */
726 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) &&
727 (c != OUT && ISWORD(c)) ) {
730 if ( (lastc != OUT && ISWORD(lastc)) &&
731 (flag == EOL || (c != OUT && !ISWORD(c))) ) {
734 if (flag == BOW || flag == EOW) {
735 st = step(m->g, startst, stopst, st, flag, OUT, st);
740 if (ISSET(st, stopst) || p == stop)
741 break; /* NOTE BREAK OUT */
743 /* no, we must deal with this character */
747 st = step(m->g, startst, stopst, tmp, 0, c, st);
749 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st));
753 assert(coldp != NULL);
755 if (ISSET(st, stopst))
762 - slow - step through the string more deliberately
764 static const RCHAR_T * /* where it ended */
765 slow(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst,
769 states empty = m->empty;
771 const RCHAR_T *p = start;
772 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1);
773 RCHAR_T lastc; /* previous c */
776 const RCHAR_T *matchp; /* last p at which a match ended */
778 AT("slow", start, stop, startst, stopst);
781 SP("sstart", st, *p);
782 st = step(m->g, startst, stopst, st, NOTHING, OUT, st);
787 c = (p == m->endp) ? OUT : *p;
789 /* is there an EOL and/or BOL between lastc and c? */
792 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
793 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
797 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
798 (c == OUT && !(m->eflags®_NOTEOL)) ) {
799 flag = (flag == BOL) ? BOLEOL : EOL;
804 st = step(m->g, startst, stopst, st, flag, OUT, st);
805 SP("sboleol", st, c);
808 /* how about a word boundary? */
809 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) &&
810 (c != OUT && ISWORD(c)) ) {
813 if ( (lastc != OUT && ISWORD(lastc)) &&
814 (flag == EOL || (c != OUT && !ISWORD(c))) ) {
817 if (flag == BOW || flag == EOW) {
818 st = step(m->g, startst, stopst, st, flag, OUT, st);
819 SP("sboweow", st, c);
823 if (ISSET(st, stopst))
825 if (EQ(st, empty) || p == stop)
826 break; /* NOTE BREAK OUT */
828 /* no, we must deal with this character */
832 st = step(m->g, startst, stopst, tmp, 0, c, st);
834 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st));
843 - step - map set of states reachable before char to set reachable after
846 step(struct re_guts *g,
847 sopno start, /* start state within strip */
848 sopno stop, /* state after stop state within strip */
849 states bef, /* states reachable before */
850 int flag, /* NONCHAR flag */
851 RCHAR_T ch, /* character code */
852 states aft) /* states already known reachable after */
858 onestate here; /* note, macros know this name */
862 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
864 d = g->stripdata[pc];
867 assert(pc == stop-1);
870 /* only characters can match */
871 assert(!flag || ch != d);
876 if (flag == BOL || flag == BOLEOL)
880 if (flag == EOL || flag == BOLEOL)
897 if (!flag && CHIN(cs, ch))
900 case OBACK_: /* ignored here */
904 case OPLUS_: /* forward, this is just an empty */
907 case O_PLUS: /* both forward and back */
909 i = ISSETBACK(aft, d);
911 if (!i && ISSETBACK(aft, d)) {
912 /* oho, must reconsider loop body */
917 case OQUEST_: /* two branches, both forward */
921 case O_QUEST: /* just an empty */
924 case OLPAREN: /* not significant here */
928 case OCH_: /* mark the first two branches */
930 assert(OP(g->strip[pc+d]) == OOR2);
933 case OOR1: /* done a branch, find the O_CH */
934 if (ISSTATEIN(aft, here)) {
935 for (look = 1; /**/; look += d) {
936 s = g->strip[pc+look];
937 d = g->stripdata[pc+look];
945 case OOR2: /* propagate OCH_'s marking */
947 if (g->strip[pc+d] != O_CH) {
948 assert(g->strip[pc+d] == OOR2);
952 case O_CH: /* just empty */
955 default: /* ooooops... */
966 - print - print a set of states
969 print(struct match *m, char *caption, states st, int ch, FILE *d)
971 struct re_guts *g = m->g;
975 if (!(m->eflags®_TRACE))
978 fprintf(d, "%s", caption);
980 fprintf(d, " %s", pchar(ch));
981 for (i = 0; i < g->nstates; i++)
983 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
990 - at - print current situation
993 at(struct match *m, char *title, char *start, char *stop, sopno startst,
996 if (!(m->eflags®_TRACE))
999 printf("%s %s-", title, pchar(*start));
1000 printf("%s ", pchar(*stop));
1001 printf("%ld-%ld\n", (long)startst, (long)stopst);
1005 #define PCHARDONE /* never again */
1007 - pchar - make a character printable
1009 * Is this identical to regchar() over in debug.c? Well, yes. But a
1010 * duplicate here avoids having a debugging-capable regexec.o tied to
1011 * a matching debug.o, and this is convenient. It all disappears in
1012 * the non-debug compilation anyway, so it doesn't matter much.
1014 static char * /* -> representation */
1017 static char pbuf[10];
1019 if (isprint(ch) || ch == ' ')
1020 snprintf(pbuf, sizeof(pbuf), "%c", ch);
1022 snprintf(pbuf, sizeof(pbuf), "\\%o", ch);