2 * Copyright (c) 1991, 1993
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 static char copyright[] =
36 "@(#) Copyright (c) 1991, 1993\n\
37 The Regents of the University of California. All rights reserved.\n";
41 static char sccsid[] = "@(#)egrep.c 8.1 (Berkeley) 6/6/93";
45 Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
46 table as in original paper (CACM, October, 1977). No delta1 or delta2.
47 According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
48 minimal practical value. However, to improve for worst case input,
49 integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
50 Comput., Feb. 1986) deserves consideration.
52 Method: extract longest metacharacter-free string from expression.
53 this is done using a side-effect from henry spencer's regcomp().
54 use boyer-moore to match such, then pass submatching lines
55 to either regexp() or standard 'egrep', depending on certain
56 criteria within execstrategy() below. [this tradeoff is due
57 to the general slowness of the regexp() nondeterministic
58 machine on complex expressions, as well as the startup time
59 of standard 'egrep' on short files.] alternatively, one may
60 change the vendor-supplied 'egrep' automaton to include
61 boyer-moore directly. see accompanying writeup for discussion
62 of kanji expression treatment.
64 late addition: apply trickbag for fast match of simple
65 alternations (sublinear, in common low-cardinality cases).
66 trap fgrep into this lair.
68 gnu additions: -f, newline as |, \< and \> [in regexec()], more
69 comments. inspire better dfa exec() strategy.
70 serious testing and help with special cases.
72 Algorithm amalgam summary:
74 dfa e?grep (aho/thompson)
75 ndfa regexp() (spencer/aho)
76 bmg (boyer/moore/gosper)
77 "superimposed" bmg (jaw)
80 sorry, but the knuth/morris/pratt machine, horspool's
81 "frequentist" code, and the rabin/karp matcher, however cute,
82 just don't cut it for this production.
84 James A. Woods Copyright (c) 1986
85 NASA Ames Research Center
87 #include <sys/types.h>
90 #include <regexp.h> /* must be henry spencer's version */
93 #include "pathnames.h"
95 #define MIN(A, B) ((A) > (B) ? (B) : (A))
101 #define BUFSIZE 8192 /* make higher for cray */
103 #define LARGE BUFSIZE + PATSIZE
105 #define NALT 7 /* tied to scanf() size in alternate() */
106 #define NMUSH 6 /* loosely relates to expected alt length */
108 #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */
109 #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input
110 * was processed by regexp(), exec std egrep. */
113 #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */
114 #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */
115 #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */
116 #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */
122 int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */
123 int sflag, hflag; /* v7, v8, bsd */
125 int firstflag; /* Stop at first match */
126 int grepflag; /* Called as "grep" */
127 int fgrepflag; /* Called as "fgrep" */
128 int altflag; /* Simple alternation in pattern */
129 int boyonly; /* No regexp needed -- all simple */
131 int grepold, egrepold, fgrepold;
133 int nalt; /* Number of alternatives */
134 int nsuccess; /* 1 for match, 2 for error */
135 int altmin; /* Minimum length of all the alternate
137 int firstfile; /* argv index of first file argument */
138 int patind; /* argv index of pattern */
139 long nmatch; /* Number of matches in this file */
140 long incount, counted; /* Amount of input consumed */
141 long rxcount; /* Bytes of input processed by regexec() */
142 int boyfound; /* accumulated partial matches (tripped by
144 int prevmatch; /* next three lines aid fast -n */
145 long nline, prevnline;
150 char *patboy; /* Pattern for simple Boyer-Moore */
151 char *patfile; /* Filename containing pattern(s) */
153 int delta0[256]; /* Boyer-Moore algorithm core */
154 char cmap[256]; /* Usually 0-255, but if -i, maps upper to
156 char str[BUFSIZE + 2];
158 char linetemp[BUFSIZE];
159 char *altpat[NALT]; /* alternation component storage */
161 short altset[NMUSH + 1][256];
162 char preamble[200]; /* match prefix (filename, line no.) */
166 strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
168 grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
169 char *gotamatch(), *kanji(), *linesave(), *submatch();
181 if ((progname = strrchr(argv[0], '/')) != 0)
185 if (strcmp(progname, "grep") == 0)
187 else if (strcmp(progname, "fgrep") == 0)
191 while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
200 egrepold++; /* boyer-moore of little help here */
215 case '1': /* Stop at very first match */
216 firstflag++; /* spead freaks only */
236 case 'x': /* needs more work, like -b above */
245 if (errflag || ((argc <= optind) && !fflag && !eflag)) {
247 oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
249 oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
250 else /* encourage SVID options, though we provide
252 oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
255 pattern = pfile(patfile);
258 pattern = argv[optind++];
261 if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */
263 if (pattern[0] == EOS)
264 kernighan(argv); /* same as it ever was */
266 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
267 * exprs \( \), word matches \< and \>, -w and -y options, char
268 * classes with '-' at end (egrep bug?), and patterns beginning with
269 * an asterisk (don't ask why). otherwise, characters meaningful to
270 * 'egrep' but not to 'grep' are escaped; the entire expr is then
273 if (grepflag && !grepold) {
274 if (strindex(pattern, "\\(") >= 0 ||
275 strindex(pattern, "\\<") >= 0 ||
276 strindex(pattern, "\\>") >= 0 ||
277 strindex(pattern, "-]") >= 0 ||
278 pattern[0] == '*') /* grep bug */
281 pattern = grepxlat(pattern);
283 if (grepold || egrepold || fgrepold)
287 strcpy(pattern, fold(pattern));
289 * If the pattern is a plain string, just run boyer-moore. If it
290 * consists of meta-free alternatives, run "superimposed" bmg.
291 * Otherwise, find best string, and compile pattern for regexec().
293 if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */
297 if ((patboy = alternate(pattern)) != NULL)
300 if ((patboy = isolate(pattern)) == NULL)
301 kernighan(argv); /* expr too involved */
303 for (c = 0; pattern[c] != EOS; c++)
304 if (pattern[c] & NONASCII) /* kanji + meta */
307 if ((rspencer = regcomp(pattern)) == NULL)
308 oops("regcomp failure");
311 gosper(patboy); /* "pre-conditioning is wonderful"
314 if ((firstfile = optind) >= argc) {
315 /* Grep standard input */
316 if (lflag) /* We don't know its name! */
318 egsecute((char *) NULL);
320 while (optind < argc) {
321 egsecute(argv[optind]);
323 if (firstflag && (nsuccess == 1))
327 exit((nsuccess == 2) ? 2 : (nsuccess == 0));
331 pfile(pfname) /* absorb expression from file */
338 if ((fd = open(pfname, O_RDONLY, 0)) < 0)
339 oops("can't read pattern file");
340 if (fstat(fd, &patstat) != 0)
341 oops("can't stat pattern file");
342 if (patstat.st_size > PATSIZE) {
343 if (fgrepflag) { /* defer to unix version */
347 oops("pattern file too big");
349 if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
350 oops("out of memory to read pattern file");
351 if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
352 oops("error reading pattern file");
355 pat[patstat.st_size] = EOS;
356 if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */
357 pat[patstat.st_size - 1] = EOS;
359 if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
361 fgrepold++; /* "what's it all about, alfie?" */
375 else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
377 "%s: %s: %s\n", progname, file, strerror(errno));
381 chimaera(file, patboy);
383 if (!boyonly && !flushflag && file != NULL)
389 chimaera(file, pat) /* "reach out and boyer-moore search someone" */
390 char *file, *pat; /* -- soon-to-be-popular bumper sticker */
392 register char *k, *strend, *s;
393 register int j, count;
394 register int *deltazero = delta0;
398 nleftover = boyfound = flushflag = 0;
401 nmatch = counted = rxcount = 0L;
403 while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
406 strend = linesave(str, count);
408 for (k = str + patlen - 1; k < strend;) {
410 * for a large class of patterns, upwards of 80% of
411 * match time is spent on the next line. we beat
412 * existing microcode (vax 'matchc') this way.
414 while ((k += deltazero[*(unsigned char *) k]) < strend);
415 if (k < (str + LARGE))
421 * Parallel Boyer-Moore. Check whether each
422 * of the previous <altmin> chars COULD be
423 * from one of the alternative strings.
427 while (altset[--j][(unsigned char)
428 cmap[*(unsigned char *) s--]]);
430 * quick test fails. in this life, compare
431 * 'em all. but, a "reverse trie" would
432 * attenuate worst case (linear w/delta2?).
442 (cmap[*(unsigned char *) s--]
450 /* One string -- check it */
453 while (cmap[*(unsigned char *) s--] == pat[--j]);
456 * delta-less shortcut for literati. short shrift for
460 k++; /* no match; restart next char */
463 k = submatch(file, pat, str, strend, k, count);
469 nline = prevnline + nlcount(prevloc, k);
471 nline = nline + nlcount(str, k);
474 strncpy(str, linetemp, nleftover);
477 /* Bug from old grep: -c overrides -h. We fix the bug. */
480 printf("%ld\n", nmatch);
485 linesave(str, count) /* accumulate partial line at end of buffer */
492 if (count != BUFSIZE && fd != 0)
493 str[count++] = NL; /* insurance for broken last line */
495 for (j = count - 1; str[j] != NL && j >= 0;)
498 * break up these lines: long line (> BUFSIZE), last line of file, or
499 * short return from read(), as from tee(1) input
501 if (j < 0 && (count == (BUFSIZE - nleftover))) {
506 return (str + count);
508 nleftover = count - j - 1;
509 strncpy(linetemp, str + j + 1, nleftover);
515 * Process partial match. First check for mis-aligned Kanji, then match line
516 * against full compiled r.e. if statistics do not warrant handing off to
520 submatch(file, pat, str, strend, k, altindex)
521 char file[], pat[], str[];
522 register char *strend, *k;
529 s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
531 c = ((altflag) ? altpat[altindex][0] : pat[0]);
533 if ((s = kanji(str, s, k)) == NULL)
534 return (++k); /* reject false kanji */
537 while (*s != NL && --s >= str);
538 k = s + 1; /* now at line start */
541 return (gotamatch(file, k));
543 incount = counted - (strend - k);
544 if (boyfound++ == FIRSTFEW)
553 * "quick henry -- the flit" (after theodor geisel)
555 if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
557 if (gotamatch(file, k) == NULL)
566 * EUC code disambiguation -- scan backwards to first 7-bit code, while
567 * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern.
568 * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
572 register char *str, *s, *k;
576 for (s--; s >= str; s--) {
577 if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
585 return ((j & 01) ? NULL : k);
590 * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
593 char *pattern; /* ... HAKMEM lives ... */
598 /* Make one-string case look like simple alternatives case */
601 altmin = altlen[0] = strlen(pattern);
604 /* For chars that aren't in any string, skip by string length. */
605 for (j = 0; j < 256; j++) {
607 cmap[j] = j; /* Sneak in initialization of cmap */
610 /* For chars in a string, skip distance from char to end of string. */
611 /* (If char appears more than once, skip minimum distance.) */
612 for (i = 0; i < nalt; i++)
613 for (j = 0; j < altlen[i] - 1; j++) {
615 delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
616 if (iflag && islower((int) c))
617 delta0[toupper((int) c)] = delta0[c];
620 /* For last char of each string, fall out of search loop. */
621 for (i = 0; i < nalt; i++) {
622 c = altpat[i][altlen[i] - 1];
624 if (iflag && islower((int) c))
625 delta0[toupper((int) c)] = LARGE;
628 for (j = 'A'; j <= 'Z'; j++)
629 cmap[j] = tolower((int) j);
633 * Print, count, or stop on full match. Result is either the location for
634 * continued search, or NULL to stop.
638 register char *file, *s;
641 int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */
645 if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
649 return (NULL); /* -s usurps all flags (unlike some versions) */
650 if (cflag) { /* -c overrides -l, we guess */
661 (void)sprintf(preamble, "%s:", file);
664 prevnline = prevnline + nlcount(prevloc, s);
666 prevnline = nline + nlcount(str, s);
670 printf("%ld:", prevnline);
672 (void)sprintf(preamble + strlen(preamble),
685 return ((firstflag && !cflag) ? NULL : s);
692 static char fline[BUFSIZE];
693 register char *s, *t = fline;
695 for (s = line; *s != EOS; s++)
696 *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
701 strindex(s, t) /* the easy way, as in K&P, p. 192 */
707 for (i = 0; s[i] != '\0'; i++)
708 if (strncmp(s + i, t, n) == 0)
714 grepxlat(pattern) /* grep pattern meta conversion */
717 register char *p, *s;
718 static char newpat[BUFSIZE];
720 for (s = newpat, p = pattern; *p != EOS;) {
721 if (*p == '\\') { /* skip escapes ... */
725 } else if (*p == '[') { /* ... and char classes */
726 while (*p != EOS && *p != ']')
728 } else if (strchr("+?|()", *p) != NULL) {
729 *s++ = '\\'; /* insert protection */
735 grepflag = ((patind) ? 0 : 1);
740 * Test for simple alternation. Result is NULL if it's not so simple, or is
741 * a pointer to the first string if it is. Warning: sscanf size is a
742 * fixpoint, beyond which the speedup linearity starts to break down. In the
743 * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
744 * altpat[] to arbitrary size is not useful.
751 register char *start, *stop;
754 if (fgrepflag && strchr(regexpr, '|'))
758 * break pattern up into altpat array; delimit on newline, bar,
759 * or EOS. We know we won't overflow, we've already checked the
760 * number of patterns we're going to find against NALT.
761 * Also, set length of pattern and find minimum pattern length.
765 for (start = stop = regexpr;; ++stop)
766 if (!*stop || *stop == '|' || *stop == NL) {
767 altlen[nalt] = j = stop - start;
770 if (!(altpat[nalt] = malloc((u_int)(j + 1))))
771 oops("out of memory");
772 bcopy(start, altpat[nalt], j);
773 altpat[nalt][j] = EOS;
784 if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
786 if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
787 || strindex(regexpr, "||") >= 0)
791 if (nalt > 1) { /* build superimposed "pre-match" sets per
794 for (j = 0; j < nalt; j++)
795 for (i = 0; i < altmin; i++) {
796 c = altpat[j][altlen[j] - altmin + i];
797 altset[i + 1][c] = 1; /* offset for sentinel */
804 * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
805 * determine whether to use dfa-based egrep: We do FIRSTFEW matches with
806 * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT
807 * of the input, the r.e. is likely to be underspecified, so do old *grep,
808 * which is faster on complex patterns than regexp(). At FIRSTFEW,
809 * dump the saved matches collected by savematch(). They are saved
810 * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic,
811 * since it's hard to rewind.
819 pctmatch = (100 * rxcount) / incount;
820 if (pctmatch > PUNTPERCENT && file != NULL)
826 nlcount(bstart, bstop) /* flail interval to totalize newlines. */
827 char *bstart, *bstop;
829 register char *s = bstart;
830 register char *t = bstop;
831 register int count = 0;
833 do { /* loop unroll for older architectures */
834 if (*t == NL) /* ... ask ames!jaw for sample code */
842 isolate(regexpr) /* isolate longest metacharacter-free string */
848 * We add (.)* because Henry's regcomp only figures regmust if it
849 * sees a leading * pattern. Foo!
851 dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
852 (void)sprintf(dummyexpr, "(.)*%s", regexpr);
853 if ((rspencer = regcomp(dummyexpr)) == NULL)
855 return (rspencer->regmust);
858 char *matches[FIRSTFEW];
859 static int mcount = 0;
862 savematch(s) /* horde matches during statistics gathering */
868 int psize = strlen(preamble);
874 p = malloc((unsigned) msize + 1 + psize);
876 strcpy(p + psize, start);
877 matches[mcount++] = p;
889 for (n = 0; n < mcount; n++)
890 printf("%s\n", matches[n]);
897 fprintf(stderr, "%s: %s\n", progname, message);
901 kernighan(args) /* "let others do the hard part ..." */
905 * We may have already run grep on some of the files; remove them
906 * from the arg list we pass on. Note that we can't delete them
907 * totally because the number of file names affects the output
910 /* better would be fork/exec per punted file -- jaw */
912 while (firstfile && optind > firstfile)
913 args[firstfile++] = _PATH_DEVNULL;
915 args[patind] = pattern;
916 (void) fflush(stdout);
919 execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'");
921 execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'");
923 execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'");