]> CyberLeo.Net >> Repos - FreeBSD/releng/9.2.git/blob - usr.bin/grep/regex/tre-fastmatch.c
- Copy stable/9 to releng/9.2 as part of the 9.2-RELEASE cycle.
[FreeBSD/releng/9.2.git] / usr.bin / grep / regex / tre-fastmatch.c
1 /* $FreeBSD$ */
2
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5  * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29
30 #include "glue.h"
31
32 #include <ctype.h>
33 #include <limits.h>
34 #include <regex.h>
35 #include <stdbool.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #ifdef TRE_WCHAR
39 #include <wchar.h>
40 #include <wctype.h>
41 #endif
42
43 #include "hashtable.h"
44 #include "tre-fastmatch.h"
45 #include "xmalloc.h"
46
47 static int      fastcmp(const fastmatch_t *fg, const void *data,
48                         tre_str_type_t type);
49
50 /*
51  * Clean up if pattern compilation fails.
52  */
53 #define FAIL_COMP(errcode)                                              \
54   {                                                                     \
55     if (fg->pattern)                                                    \
56       xfree(fg->pattern);                                               \
57     if (fg->wpattern)                                                   \
58       xfree(fg->wpattern);                                              \
59     if (fg->qsBc_table)                                                 \
60       hashtable_free(fg->qsBc_table);                                   \
61     fg = NULL;                                                          \
62     return errcode;                                                     \
63   }
64
65 /*
66  * Skips n characters in the input string and assigns the start
67  * address to startptr. Note: as per IEEE Std 1003.1-2008
68  * matching is based on bit pattern not character representations
69  * so we can handle MB strings as byte sequences just like
70  * SB strings.
71  */
72 #define SKIP_CHARS(n)                                                   \
73   switch (type)                                                         \
74     {                                                                   \
75       case STR_WIDE:                                                    \
76         startptr = str_wide + n;                                        \
77         break;                                                          \
78       default:                                                          \
79         startptr = str_byte + n;                                        \
80     }
81
82 /*
83  * Converts the wide string pattern to SB/MB string and stores
84  * it in fg->pattern. Sets fg->len to the byte length of the
85  * converted string.
86  */
87 #define STORE_MBS_PAT                                                   \
88   {                                                                     \
89     size_t siz;                                                         \
90                                                                         \
91     siz = wcstombs(NULL, fg->wpattern, 0);                              \
92     if (siz == (size_t)-1)                                              \
93       return REG_BADPAT;                                                \
94     fg->len = siz;                                                      \
95     fg->pattern = xmalloc(siz + 1);                                     \
96     if (fg->pattern == NULL)                                            \
97       return REG_ESPACE;                                                \
98     wcstombs(fg->pattern, fg->wpattern, siz);                           \
99     fg->pattern[siz] = '\0';                                            \
100   }                                                                     \
101
102 #define IS_OUT_OF_BOUNDS                                                \
103   ((!fg->reversed                                                       \
104     ? ((type == STR_WIDE) ? ((j + fg->wlen) > len)                      \
105                           : ((j + fg->len) > len))                      \
106     : (j < 0)))
107
108 /*
109  * Checks whether the new position after shifting in the input string
110  * is out of the bounds and break out from the loop if so.
111  */
112 #define CHECKBOUNDS                                                     \
113   if (IS_OUT_OF_BOUNDS)                                                 \
114     break;                                                              \
115
116 /*
117  * Shifts in the input string after a mismatch. The position of the
118  * mismatch is stored in the mismatch variable.
119  */
120 #define SHIFT                                                           \
121   CHECKBOUNDS;                                                          \
122                                                                         \
123   {                                                                     \
124     int r = -1;                                                         \
125     unsigned int bc = 0, gs = 0, ts;                                    \
126                                                                         \
127     switch (type)                                                       \
128       {                                                                 \
129         case STR_WIDE:                                                  \
130           if (!fg->hasdot)                                              \
131             {                                                           \
132               if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \
133                 mismatch -= u;                                          \
134               v = fg->wlen - 1 - mismatch;                              \
135               r = hashtable_get(fg->qsBc_table,                         \
136                 &str_wide[!fg->reversed ? (size_t)j + fg->wlen          \
137                           : (size_t)j - 1], &bc);                       \
138               gs = fg->bmGs[mismatch];                                  \
139             }                                                           \
140             bc = (r == HASH_OK) ? bc : fg->defBc;                       \
141             DPRINT(("tre_fast_match: mismatch on character" CHF ", "    \
142                     "BC %d, GS %d\n",                                   \
143                     ((const tre_char_t *)startptr)[mismatch + 1],       \
144                     bc, gs));                                           \
145             break;                                                      \
146         default:                                                        \
147           if (!fg->hasdot)                                              \
148             {                                                           \
149               if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift)  \
150                 mismatch -= u;                                          \
151               v = fg->len - 1 - mismatch;                               \
152               gs = fg->sbmGs[mismatch];                                 \
153             }                                                           \
154           bc = fg->qsBc[((const unsigned char *)str_byte)               \
155                         [!fg->reversed ? (size_t)j + fg->len            \
156                          : (size_t)j - 1]];                             \
157           DPRINT(("tre_fast_match: mismatch on character %c, "          \
158                  "BC %d, GS %d\n",                                      \
159                  ((const unsigned char *)startptr)[mismatch + 1],       \
160                  bc, gs));                                              \
161       }                                                                 \
162     if (fg->hasdot)                                                     \
163       shift = bc;                                                       \
164     else                                                                \
165       {                                                                 \
166         ts = (u >= v) ? (u - v) : 0;                                    \
167         shift = MAX(ts, bc);                                            \
168         shift = MAX(shift, gs);                                         \
169         if (shift == gs)                                                \
170           u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v);  \
171         else                                                            \
172           {                                                             \
173             if (ts < bc)                                                \
174               shift = MAX(shift, u + 1);                                \
175             u = 0;                                                      \
176           }                                                             \
177       }                                                                 \
178       DPRINT(("tre_fast_match: shifting %u characters\n", shift));      \
179       j = !fg->reversed ? j + shift : j - shift;                        \
180   }
181
182 /*
183  * Normal Quick Search would require a shift based on the position the
184  * next character after the comparison is within the pattern.  With
185  * wildcards, the position of the last dot effects the maximum shift
186  * distance.
187  * The closer to the end the wild card is the slower the search.
188  *
189  * Examples:
190  * Pattern    Max shift
191  * -------    ---------
192  * this               5
193  * .his               4
194  * t.is               3
195  * th.s               2
196  * thi.               1
197  */
198
199 /*
200  * Fills in the bad character shift array for SB/MB strings.
201  */
202 #define FILL_QSBC                                                       \
203   if (fg->reversed)                                                     \
204     {                                                                   \
205       _FILL_QSBC_REVERSED                                               \
206     }                                                                   \
207   else                                                                  \
208     {                                                                   \
209       _FILL_QSBC                                                        \
210     }
211
212 #define _FILL_QSBC                                                      \
213   for (unsigned int i = 0; i <= UCHAR_MAX; i++)                         \
214     fg->qsBc[i] = fg->len - hasdot;                                     \
215   for (unsigned int i = hasdot + 1; i < fg->len; i++)                   \
216     {                                                                   \
217       fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i;            \
218       DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i],          \
219              fg->len - i));                                             \
220       if (fg->icase)                                                    \
221         {                                                               \
222           char c = islower((unsigned char)fg->pattern[i]) ?             \
223                    toupper((unsigned char)fg->pattern[i]) :             \
224                    tolower((unsigned char)fg->pattern[i]);              \
225           fg->qsBc[(unsigned char)c] = fg->len - i;                     \
226           DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i));    \
227         }                                                               \
228     }
229
230 #define _FILL_QSBC_REVERSED                                             \
231   for (unsigned int i = 0; i <= UCHAR_MAX; i++)                         \
232     fg->qsBc[i] = firstdot + 1;                                         \
233   for (int i = firstdot - 1; i >= 0; i--)                               \
234     {                                                                   \
235       fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1;                  \
236       DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i],   \
237              i + 1));                                                   \
238       if (fg->icase)                                                    \
239         {                                                               \
240           char c = islower((unsigned char)fg->pattern[i]) ?             \
241                    toupper((unsigned char)fg->pattern[i]) :             \
242                    tolower((unsigned char)fg->pattern[i]);              \
243           fg->qsBc[(unsigned char)c] = i + 1;                           \
244           DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1));   \
245         }                                                               \
246     }
247
248 /*
249  * Fills in the bad character shifts into a hastable for wide strings.
250  * With wide characters it is not possible any more to use a normal
251  * array because there are too many characters and we could not
252  * provide enough memory. Fortunately, we only have to store distinct
253  * values for so many characters as the number of distinct characters
254  * in the pattern, so we can store them in a hashtable and store a
255  * default shift value for the rest.
256  */
257 #define FILL_QSBC_WIDE                                                  \
258   if (fg->reversed)                                                     \
259     {                                                                   \
260       _FILL_QSBC_WIDE_REVERSED                                          \
261     }                                                                   \
262   else                                                                  \
263     {                                                                   \
264       _FILL_QSBC_WIDE                                                   \
265     }
266
267 #define _FILL_QSBC_WIDE                                                 \
268   /* Adjust the shift based on location of the last dot ('.'). */       \
269   fg->defBc = fg->wlen - whasdot;                                       \
270                                                                         \
271   /* Preprocess pattern. */                                             \
272   fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),       \
273                                   sizeof(tre_char_t), sizeof(int));     \
274   if (!fg->qsBc_table)                                                  \
275     FAIL_COMP(REG_ESPACE);                                              \
276   for (unsigned int i = whasdot + 1; i < fg->wlen; i++)                 \
277     {                                                                   \
278       int k = fg->wlen - i;                                             \
279       int r;                                                            \
280                                                                         \
281       r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);          \
282       if ((r == HASH_FAIL) || (r == HASH_FULL))                         \
283         FAIL_COMP(REG_ESPACE);                                          \
284       DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
285              k));                                                       \
286       if (fg->icase)                                                    \
287         {                                                               \
288           tre_char_t wc = iswlower(fg->wpattern[i]) ?                   \
289             towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);      \
290           r = hashtable_put(fg->qsBc_table, &wc, &k);                   \
291           if ((r == HASH_FAIL) || (r == HASH_FULL))                     \
292             FAIL_COMP(REG_ESPACE);                                      \
293           DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k));    \
294         }                                                               \
295     }
296
297 #define _FILL_QSBC_WIDE_REVERSED                                        \
298   /* Adjust the shift based on location of the last dot ('.'). */       \
299   fg->defBc = (size_t)wfirstdot;                                        \
300                                                                         \
301   /* Preprocess pattern. */                                             \
302   fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),       \
303                                   sizeof(tre_char_t), sizeof(int));     \
304   if (!fg->qsBc_table)                                                  \
305     FAIL_COMP(REG_ESPACE);                                              \
306   for (int i = wfirstdot - 1; i >= 0; i--)                              \
307     {                                                                   \
308       int k = i + 1;                                                    \
309       int r;                                                            \
310                                                                         \
311       r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);          \
312       if ((r == HASH_FAIL) || (r == HASH_FULL))                         \
313         FAIL_COMP(REG_ESPACE);                                          \
314       DPRINT(("Reverse BC shift for wide char " CHF " is %d\n",         \
315              fg->wpattern[i], k));                                      \
316       if (fg->icase)                                                    \
317         {                                                               \
318           tre_char_t wc = iswlower(fg->wpattern[i]) ?                   \
319             towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);      \
320           r = hashtable_put(fg->qsBc_table, &wc, &k);                   \
321           if ((r == HASH_FAIL) || (r == HASH_FULL))                     \
322             FAIL_COMP(REG_ESPACE);                                      \
323           DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \
324                  k));                                                   \
325         }                                                               \
326     }
327
328 #ifdef _GREP_DEBUG
329 #define DPRINT_BMGS(len, fmt_str, sh)                                   \
330   for (unsigned int i = 0; i < len; i++)                                \
331     DPRINT((fmt_str, i, sh[i]));
332 #else
333 #define DPRINT_BMGS(len, fmt_str, sh)                                   \
334   do { } while(/*CONSTCOND*/0)
335 #endif
336
337 /*
338  * Fills in the good suffix table for SB/MB strings.
339  */
340 #define FILL_BMGS                                                       \
341   if (!fg->hasdot)                                                      \
342     {                                                                   \
343       fg->sbmGs = xmalloc(fg->len * sizeof(int));                       \
344       if (!fg->sbmGs)                                                   \
345         return REG_ESPACE;                                              \
346       if (fg->len == 1)                                                 \
347         fg->sbmGs[0] = 1;                                               \
348       else                                                              \
349         _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);             \
350       DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs);   \
351     }
352
353 /*
354  * Fills in the good suffix table for wide strings.
355  */
356 #define FILL_BMGS_WIDE                                                  \
357   if (!fg->hasdot)                                                      \
358     {                                                                   \
359       fg->bmGs = xmalloc(fg->wlen * sizeof(int));                       \
360       if (!fg->bmGs)                                                    \
361         return REG_ESPACE;                                              \
362       if (fg->wlen == 1)                                                \
363         fg->bmGs[0] = 1;                                                \
364       else                                                              \
365         _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);             \
366       DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n",       \
367                   fg->bmGs);                                            \
368     }
369
370 #define _FILL_BMGS(arr, pat, plen, wide)                                \
371   {                                                                     \
372     char *p;                                                            \
373     tre_char_t *wp;                                                     \
374                                                                         \
375     if (wide)                                                           \
376       {                                                                 \
377         if (fg->icase)                                                  \
378           {                                                             \
379             wp = xmalloc(plen * sizeof(tre_char_t));                    \
380             if (wp == NULL)                                             \
381               return REG_ESPACE;                                        \
382             for (unsigned int i = 0; i < plen; i++)                     \
383               wp[i] = towlower(pat[i]);                                 \
384             _CALC_BMGS(arr, wp, plen);                                  \
385             xfree(wp);                                                  \
386           }                                                             \
387         else                                                            \
388           _CALC_BMGS(arr, pat, plen);                                   \
389       }                                                                 \
390     else                                                                \
391       {                                                                 \
392         if (fg->icase)                                                  \
393           {                                                             \
394             p = xmalloc(plen);                                          \
395             if (p == NULL)                                              \
396               return REG_ESPACE;                                        \
397             for (unsigned int i = 0; i < plen; i++)                     \
398               p[i] = tolower((unsigned char)pat[i]);                    \
399             _CALC_BMGS(arr, p, plen);                                   \
400             xfree(p);                                                   \
401           }                                                             \
402         else                                                            \
403           _CALC_BMGS(arr, pat, plen);                                   \
404       }                                                                 \
405   }
406
407 #define _CALC_BMGS(arr, pat, plen)                                      \
408   {                                                                     \
409     int f = 0, g;                                                       \
410                                                                         \
411     int *suff = xmalloc(plen * sizeof(int));                            \
412     if (suff == NULL)                                                   \
413       return REG_ESPACE;                                                \
414                                                                         \
415     suff[plen - 1] = plen;                                              \
416     g = plen - 1;                                                       \
417     for (int i = plen - 2; i >= 0; i--)                                 \
418       {                                                                 \
419         if (i > g && suff[i + plen - 1 - f] < i - g)                    \
420           suff[i] = suff[i + plen - 1 - f];                             \
421         else                                                            \
422           {                                                             \
423             if (i < g)                                                  \
424               g = i;                                                    \
425             f = i;                                                      \
426             while (g >= 0 && pat[g] == pat[g + plen - 1 - f])           \
427               g--;                                                      \
428             suff[i] = f - g;                                            \
429           }                                                             \
430       }                                                                 \
431                                                                         \
432     for (unsigned int i = 0; i < plen; i++)                             \
433       arr[i] = plen;                                                    \
434     g = 0;                                                              \
435     for (int i = plen - 1; i >= 0; i--)                                 \
436       if (suff[i] == i + 1)                                             \
437         for(; (unsigned long)g < plen - 1 - i; g++)                     \
438           if (arr[g] == plen)                                           \
439             arr[g] = plen - 1 - i;                                      \
440     for (unsigned int i = 0; i <= plen - 2; i++)                        \
441       arr[plen - 1 - suff[i]] = plen - 1 - i;                           \
442                                                                         \
443     xfree(suff);                                                        \
444   }
445
446 /*
447  * Copies the pattern pat having lenght n to p and stores
448  * the size in l.
449  */
450 #define SAVE_PATTERN(src, srclen, dst, dstlen)                          \
451   dstlen = srclen;                                                      \
452   dst = xmalloc((dstlen + 1) * sizeof(tre_char_t));                     \
453   if (dst == NULL)                                                      \
454     return REG_ESPACE;                                                  \
455   if (dstlen > 0)                                                       \
456     memcpy(dst, src, dstlen * sizeof(tre_char_t));                      \
457   dst[dstlen] = TRE_CHAR('\0');
458
459 /*
460  * Initializes pattern compiling.
461  */
462 #define INIT_COMP                                                       \
463   /* Initialize. */                                                     \
464   memset(fg, 0, sizeof(*fg));                                           \
465   fg->icase = (cflags & REG_ICASE);                                     \
466   fg->word = (cflags & REG_WORD);                                       \
467   fg->newline = (cflags & REG_NEWLINE);                                 \
468   fg->nosub = (cflags & REG_NOSUB);                                     \
469                                                                         \
470   /* Cannot handle REG_ICASE with MB string */                          \
471   if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0)                       \
472     {                                                                   \
473       DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n"));     \
474       return REG_BADPAT;                                                \
475     }
476
477 /*
478  * Checks whether we have a 0-length pattern that will match
479  * anything. If literal is set to false, the EOL anchor is also
480  * taken into account.
481  */
482 #define CHECK_MATCHALL(literal)                                         \
483   if (!literal && n == 1 && pat[0] == TRE_CHAR('$'))                    \
484     {                                                                   \
485       n--;                                                              \
486       fg->eol = true;                                                   \
487     }                                                                   \
488                                                                         \
489   if (n == 0)                                                           \
490     {                                                                   \
491       fg->matchall = true;                                              \
492       fg->pattern = xmalloc(sizeof(char));                              \
493       if (!fg->pattern)                                                 \
494         FAIL_COMP(REG_ESPACE);                                          \
495       fg->pattern[0] = '\0';                                            \
496       fg->wpattern = xmalloc(sizeof(tre_char_t));                       \
497       if (!fg->wpattern)                                                \
498         FAIL_COMP(REG_ESPACE);                                          \
499       fg->wpattern[0] = TRE_CHAR('\0');                                 \
500       DPRINT(("Matching every input\n"));                               \
501       return REG_OK;                                                    \
502     }
503
504 /*
505  * Returns: REG_OK on success, error code otherwise
506  */
507 int
508 tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
509                     int cflags)
510 {
511   size_t hasdot = 0, whasdot = 0;
512   ssize_t firstdot = -1, wfirstdot = -1;
513
514   INIT_COMP;
515
516   CHECK_MATCHALL(true);
517
518   /* Cannot handle word boundaries with MB string */
519   if (fg->word && (TRE_MB_CUR_MAX > 1))
520     return REG_BADPAT;
521
522 #ifdef TRE_WCHAR
523   SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
524   STORE_MBS_PAT;
525 #else
526   SAVE_PATTERN(pat, n, fg->pattern, fg->len);
527 #endif
528
529   DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
530          "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
531          fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
532
533   FILL_QSBC;
534   FILL_BMGS;
535 #ifdef TRE_WCHAR
536   FILL_QSBC_WIDE;
537   FILL_BMGS_WIDE;
538 #endif
539
540   return REG_OK;
541 }
542
543 /*
544  * Returns: REG_OK on success, error code otherwise
545  */
546 int
547 tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
548                  int cflags)
549 {
550   tre_char_t *tmp;
551   size_t pos = 0, hasdot = 0, whasdot = 0;
552   ssize_t firstdot = -1, wfirstdot = -1;
553   bool escaped = false;
554   bool *_escmap = NULL;
555
556   INIT_COMP;
557
558   /* Remove beginning-of-line character ('^'). */
559   if (pat[0] == TRE_CHAR('^'))
560     {
561       fg->bol = true;
562       n--;
563       pat++;
564     }
565
566   CHECK_MATCHALL(false);
567
568   /* Handle word-boundary matching when GNU extensions are enabled */
569   if ((cflags & REG_GNU) && (n >= 14) &&
570       (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
571       (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"),
572               7 * sizeof(tre_char_t)) == 0))
573     {
574       n -= 14;
575       pat += 7;
576       fg->word = true;
577     }
578
579   /* Cannot handle word boundaries with MB string */
580   if (fg->word && (TRE_MB_CUR_MAX > 1))
581     return REG_BADPAT;
582
583   tmp = xmalloc((n + 1) * sizeof(tre_char_t));
584   if (tmp == NULL)
585     return REG_ESPACE;
586
587 /* Copies the char into the stored pattern and skips to the next char. */
588 #define STORE_CHAR                                                      \
589   do                                                                    \
590     {                                                                   \
591       tmp[pos++] = pat[i];                                              \
592       escaped = false;                                                  \
593       continue;                                                         \
594     } while (0)
595
596   /* Traverse the input pattern for processing */
597   for (unsigned int i = 0; i < n; i++)
598     {
599       switch (pat[i])
600         {
601           case TRE_CHAR('\\'):
602             if (escaped)
603               STORE_CHAR;
604             else if (i == n - 1)
605               goto badpat;
606             else
607               escaped = true;
608             continue;
609           case TRE_CHAR('['):
610             if (escaped)
611               STORE_CHAR;
612             else
613               goto badpat;
614             continue;
615           case TRE_CHAR('*'):
616             if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
617               STORE_CHAR;
618             else
619               goto badpat;
620             continue;
621           case TRE_CHAR('+'):
622           case TRE_CHAR('?'):
623             if ((cflags & REG_EXTENDED) && (i == 0))
624               continue;
625             else if ((cflags & REG_EXTENDED) ^ !escaped)
626               STORE_CHAR;
627             else
628               goto badpat;
629             continue;
630           case TRE_CHAR('.'):
631             if (escaped)
632               {
633                 if (!_escmap)
634                   _escmap = xmalloc(n * sizeof(bool));
635                 if (!_escmap)
636                   {
637                     xfree(tmp);
638                     return REG_ESPACE;
639                   }
640                 _escmap[i] = true;
641                 STORE_CHAR;
642               }
643             else
644               {
645                 whasdot = i;
646                 if (wfirstdot == -1)
647                         wfirstdot = i;
648                 STORE_CHAR;
649               }
650             continue;
651           case TRE_CHAR('^'):
652             STORE_CHAR;
653             continue;
654           case TRE_CHAR('$'):
655             if (!escaped && (i == n - 1))
656               fg->eol = true;
657             else
658               STORE_CHAR;
659             continue;
660           case TRE_CHAR('('):
661             if ((cflags & REG_EXTENDED) ^ escaped)
662               goto badpat;
663             else
664               STORE_CHAR;
665             continue;
666           case TRE_CHAR('{'):
667             if (!(cflags & REG_EXTENDED) ^ escaped)
668               STORE_CHAR;
669             else if (!(cflags & REG_EXTENDED) && (i == 0))
670               STORE_CHAR;
671             else if ((cflags & REG_EXTENDED) && (i == 0))
672               continue;
673             else
674               goto badpat;
675             continue;
676           case TRE_CHAR('|'):
677             if ((cflags & REG_EXTENDED) ^ escaped)
678               goto badpat;
679             else
680               STORE_CHAR;
681             continue;
682           default:
683             if (escaped)
684               goto badpat;
685             else
686               STORE_CHAR;
687             continue;
688         }
689       continue;
690 badpat:
691       xfree(tmp);
692       DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
693               "back to NFA\n"));
694       return REG_BADPAT;
695     }
696
697   fg->hasdot = wfirstdot > -1;
698
699   /*
700    * The pattern has been processed and copied to tmp as a literal string
701    * with escapes, anchors (^$) and the word boundary match character
702    * classes stripped out.
703    */
704 #ifdef TRE_WCHAR
705   SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
706   fg->wescmap = _escmap;
707   STORE_MBS_PAT;
708
709   /*
710    * The position of dots and escaped dots is different in the MB string
711    * than in to the wide string so traverse the converted string, as well,
712    * to store these positions.
713    */
714   if (fg->hasdot || (fg->wescmap != NULL))
715     {
716       if (fg->wescmap != NULL)
717         {
718           fg->escmap = xmalloc(fg->len * sizeof(bool));
719           if (!fg->escmap)
720             {
721               tre_free_fast(fg);
722               return REG_ESPACE;
723             }
724         }
725
726       escaped = false;
727       for (unsigned int i = 0; i < fg->len; i++)
728         if (fg->pattern[i] == '\\')
729           escaped = !escaped;
730         else if (fg->pattern[i] == '.' && escaped)
731           {
732             fg->escmap[i] = true;
733             escaped = false;
734           }
735         else if (fg->pattern[i] == '.' && !escaped)
736           {
737             hasdot = i;
738             if (firstdot == -1)
739               firstdot = i;
740           }
741         else
742           escaped = false;
743     }
744 #else
745   SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
746   fg->escmap = _escmap;
747 #endif
748
749   xfree(tmp);
750
751   DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
752          "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
753          fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
754          fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
755          fg->newline ? 'y' : 'n'));
756
757   /* Check whether reverse QS algorithm is more efficient */
758   if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
759       fg->nosub)
760     {
761       fg->reversed = true;
762       DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
763     }
764
765   FILL_QSBC;
766   FILL_BMGS;
767 #ifdef TRE_WCHAR
768   FILL_QSBC_WIDE;
769   FILL_BMGS_WIDE;
770 #endif
771
772   return REG_OK;
773 }
774
775 #define _SHIFT_ONE                                                      \
776   {                                                                     \
777     shift = 1;                                                          \
778     j = !fg->reversed ? j + shift : j - shift;                          \
779     continue;                                                           \
780   }
781
782 #define _BBOUND_COND                                                    \
783   ((type == STR_WIDE) ?                                                 \
784     ((j == 0) || !(tre_isalnum(str_wide[j - 1]) ||                      \
785       (str_wide[j - 1] == TRE_CHAR('_')))) :                            \
786     ((j == 0) || !(tre_isalnum(str_byte[j - 1]) ||                      \
787       (str_byte[j - 1] == '_'))))
788
789 #define _EBOUND_COND                                                    \
790   ((type == STR_WIDE) ?                                                 \
791     ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) ||  \
792       (str_wide[j + fg->wlen] == TRE_CHAR('_')))) :                     \
793     ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) ||    \
794       (str_byte[j + fg->len] == '_'))))
795
796 /*
797  * Condition to check whether the match on position j is on a
798  * word boundary.
799  */
800 #define IS_ON_WORD_BOUNDARY                                             \
801   (_BBOUND_COND && _EBOUND_COND)
802
803 /*
804  * Checks word boundary and shifts one if match is not on a
805  * boundary.
806  */
807 #define CHECK_WORD_BOUNDARY                                             \
808     if (!IS_ON_WORD_BOUNDARY)                                           \
809       _SHIFT_ONE;
810
811 #define _BOL_COND                                                       \
812   ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\
813                                    : (str_byte[j - 1] == '\n')))
814
815 /*
816  * Checks BOL anchor and shifts one if match is not on a
817  * boundary.
818  */
819 #define CHECK_BOL_ANCHOR                                                \
820     if (!_BOL_COND)                                                     \
821       _SHIFT_ONE;
822
823 #define _EOL_COND                                                       \
824   ((type == STR_WIDE)                                                   \
825     ? ((j + fg->wlen == len) ||                                         \
826                 (str_wide[j + fg->wlen] == TRE_CHAR('\n')))             \
827     : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n')))
828
829 /*
830  * Checks EOL anchor and shifts one if match is not on a
831  * boundary.
832  */
833 #define CHECK_EOL_ANCHOR                                                \
834     if (!_EOL_COND)                                                     \
835       _SHIFT_ONE;
836
837 /*
838  * Executes matching of the precompiled pattern on the input string.
839  * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
840  */
841 int
842 tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
843     tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
844 {
845   unsigned int shift, u = 0, v = 0;
846   ssize_t j = 0;
847   int ret = REG_NOMATCH;
848   int mismatch;
849   const char *str_byte = data;
850   const void *startptr = NULL;
851   const tre_char_t *str_wide = data;
852
853   /* Calculate length if unspecified. */
854   if (len == (size_t)-1)
855     switch (type)
856       {
857         case STR_WIDE:
858           len = tre_strlen(str_wide);
859           break;
860         default:
861           len = strlen(str_byte);
862           break;
863       }
864
865   /* Shortcut for empty pattern */
866   if (fg->matchall)
867     {
868       if (!fg->nosub && nmatch >= 1)
869         {
870           pmatch[0].rm_so = 0;
871           pmatch[0].rm_eo = len;
872         }
873       if (fg->bol && fg->eol)
874         return (len == 0) ? REG_OK : REG_NOMATCH;
875       else
876         return REG_OK;
877     }
878
879   /* No point in going farther if we do not have enough data. */
880   switch (type)
881     {
882       case STR_WIDE:
883         if (len < fg->wlen)
884           return ret;
885         shift = fg->wlen;
886         break;
887       default:
888         if (len < fg->len)
889           return ret;
890         shift = fg->len;
891     }
892
893   /*
894    * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we
895    * can shift one because there can't be a match at the beginning.
896    */
897   if (fg->bol && (eflags & REG_NOTBOL))
898     j = 1;
899
900   /*
901    * Like above, we cannot have a match at the very end when anchoring to
902    * the end and REG_NOTEOL is specified.
903    */
904   if (fg->eol && (eflags & REG_NOTEOL))
905     len--;
906
907   if (fg->reversed)
908     j = len - (type == STR_WIDE ? fg->wlen : fg->len);
909
910
911   /* Only try once at the beginning or ending of the line. */
912   if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
913       !(eflags & REG_NOTEOL))
914     {
915       /* Simple text comparison. */
916       if (!((fg->bol && fg->eol) &&
917           (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len))))
918         {
919           /* Determine where in data to start search at. */
920           j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
921           SKIP_CHARS(j);
922           mismatch = fastcmp(fg, startptr, type);
923           if (mismatch == REG_OK)
924             {
925               if (fg->word && !IS_ON_WORD_BOUNDARY)
926                 return ret;
927               if (!fg->nosub && nmatch >= 1)
928                 {
929                   pmatch[0].rm_so = j;
930                   pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
931                 }
932               return REG_OK;
933             }
934         }
935     }
936   else
937     {
938       /* Quick Search / Turbo Boyer-Moore algorithm. */
939       do
940         {
941           SKIP_CHARS(j);
942           mismatch = fastcmp(fg, startptr, type);
943           if (mismatch == REG_OK)
944             {
945               if (fg->word)
946                 CHECK_WORD_BOUNDARY;
947               if (fg->bol)
948                 CHECK_BOL_ANCHOR;
949               if (fg->eol)
950                 CHECK_EOL_ANCHOR;
951               if (!fg->nosub && nmatch >= 1)
952                 {
953                   pmatch[0].rm_so = j;
954                   pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
955                 }
956               return REG_OK;
957             }
958           else if (mismatch > 0)
959             return mismatch;
960           mismatch = -mismatch - 1;
961           SHIFT;
962         } while (!IS_OUT_OF_BOUNDS);
963     }
964     return ret;
965 }
966
967 /*
968  * Frees the resources that were allocated when the pattern was compiled.
969  */
970 void
971 tre_free_fast(fastmatch_t *fg)
972 {
973
974   DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
975          fg->pattern));
976
977 #ifdef TRE_WCHAR
978   hashtable_free(fg->qsBc_table);
979   if (!fg->hasdot)
980     xfree(fg->bmGs);
981   if (fg->wescmap)
982     xfree(fg->wescmap);
983   xfree(fg->wpattern);
984 #endif
985   if (!fg->hasdot)
986     xfree(fg->sbmGs);
987   if (fg->escmap)
988     xfree(fg->escmap);
989   xfree(fg->pattern);
990 }
991
992 /*
993  * Returns:     -(i + 1) on failure (position that it failed with minus sign)
994  *              error code on error
995  *              REG_OK on success
996  */
997 static inline int
998 fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
999 {
1000   const char *str_byte = data;
1001   const char *pat_byte = fg->pattern;
1002   const tre_char_t *str_wide = data;
1003   const tre_char_t *pat_wide = fg->wpattern;
1004   const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
1005   size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
1006   int ret = REG_OK;
1007
1008   /* Compare the pattern and the input char-by-char from the last position. */
1009   for (int i = len - 1; i >= 0; i--) {
1010     switch (type)
1011       {
1012         case STR_WIDE:
1013
1014           /* Check dot */
1015           if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
1016               (!escmap || !escmap[i]) &&
1017               (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
1018             continue;
1019
1020           /* Compare */
1021           if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
1022                     : (pat_wide[i] == str_wide[i]))
1023             continue;
1024           break;
1025         default:
1026           /* Check dot */
1027           if (fg->hasdot && pat_byte[i] == '.' &&
1028               (!escmap || !escmap[i]) &&
1029               (!fg->newline || (str_byte[i] != '\n')))
1030             continue;
1031
1032           /* Compare */
1033           if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i]))
1034                     : (pat_byte[i] == str_byte[i]))
1035           continue;
1036       }
1037     DPRINT(("fastcmp: mismatch at position %d\n", i));
1038     ret = -(i + 1);
1039     break;
1040   }
1041   return ret;
1042 }