]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - usr.bin/grep/regex/tre-fastmatch.c
Fix IPv6 remote denial of service.
[FreeBSD/FreeBSD.git] / usr.bin / grep / regex / tre-fastmatch.c
1 /* $FreeBSD$ */
2
3 /*-
4  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
5  *
6  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
7  * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31
32 #include "glue.h"
33
34 #include <ctype.h>
35 #include <limits.h>
36 #include <regex.h>
37 #include <stdbool.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #ifdef TRE_WCHAR
41 #include <wchar.h>
42 #include <wctype.h>
43 #endif
44
45 #include "hashtable.h"
46 #include "tre-fastmatch.h"
47
48 static int      fastcmp(const fastmatch_t *fg, const void *data,
49                         tre_str_type_t type);
50
51 /*
52  * Clean up if pattern compilation fails.
53  */
54 #define FAIL_COMP(errcode)                                              \
55   {                                                                     \
56     if (fg->pattern)                                                    \
57       free(fg->pattern);                                                \
58     if (fg->wpattern)                                                   \
59       free(fg->wpattern);                                               \
60     if (fg->qsBc_table)                                                 \
61       hashtable_free(fg->qsBc_table);                                   \
62     fg = NULL;                                                          \
63     return errcode;                                                     \
64   }
65
66 /*
67  * Skips n characters in the input string and assigns the start
68  * address to startptr. Note: as per IEEE Std 1003.1-2008
69  * matching is based on bit pattern not character representations
70  * so we can handle MB strings as byte sequences just like
71  * SB strings.
72  */
73 #define SKIP_CHARS(n)                                                   \
74   switch (type)                                                         \
75     {                                                                   \
76       case STR_WIDE:                                                    \
77         startptr = str_wide + n;                                        \
78         break;                                                          \
79       default:                                                          \
80         startptr = str_byte + n;                                        \
81     }
82
83 /*
84  * Converts the wide string pattern to SB/MB string and stores
85  * it in fg->pattern. Sets fg->len to the byte length of the
86  * converted string.
87  */
88 #define STORE_MBS_PAT                                                   \
89   {                                                                     \
90     size_t siz;                                                         \
91                                                                         \
92     siz = wcstombs(NULL, fg->wpattern, 0);                              \
93     if (siz == (size_t)-1)                                              \
94       return REG_BADPAT;                                                \
95     fg->len = siz;                                                      \
96     fg->pattern = malloc(siz + 1);                                      \
97     if (fg->pattern == NULL)                                            \
98       return REG_ESPACE;                                                \
99     wcstombs(fg->pattern, fg->wpattern, siz);                           \
100     fg->pattern[siz] = '\0';                                            \
101   }                                                                     \
102
103 #define CONV_MBS_PAT(src, dest, destsz)                                 \
104   {                                                                     \
105     destsz = wcstombs(NULL, src, 0);                                    \
106     if (destsz == (size_t)-1)                                           \
107       return REG_BADPAT;                                                \
108     dest = malloc(destsz + 1);                                          \
109     if (dest == NULL)                                                   \
110       return REG_ESPACE;                                                \
111     wcstombs(dest, src, destsz);                                        \
112     dest[destsz] = '\0';                                                \
113   }                                                                     \
114
115 #define IS_OUT_OF_BOUNDS                                                \
116   ((!fg->reversed                                                       \
117     ? ((type == STR_WIDE) ? ((j + fg->wlen) > len)                      \
118                           : ((j + fg->len) > len))                      \
119     : (j < 0)))
120
121 /*
122  * Checks whether the new position after shifting in the input string
123  * is out of the bounds and break out from the loop if so.
124  */
125 #define CHECKBOUNDS                                                     \
126   if (IS_OUT_OF_BOUNDS)                                                 \
127     break;                                                              \
128
129 /*
130  * Shifts in the input string after a mismatch. The position of the
131  * mismatch is stored in the mismatch variable.
132  */
133 #define SHIFT                                                           \
134   CHECKBOUNDS;                                                          \
135                                                                         \
136   {                                                                     \
137     int r = -1;                                                         \
138     unsigned int bc = 0, gs = 0, ts;                                    \
139                                                                         \
140     switch (type)                                                       \
141       {                                                                 \
142         case STR_WIDE:                                                  \
143           if (!fg->hasdot)                                              \
144             {                                                           \
145               if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \
146                 mismatch -= u;                                          \
147               v = fg->wlen - 1 - mismatch;                              \
148               r = hashtable_get(fg->qsBc_table,                         \
149                 &str_wide[!fg->reversed ? (size_t)j + fg->wlen          \
150                           : (size_t)j - 1], &bc);                       \
151               gs = fg->bmGs[mismatch];                                  \
152             }                                                           \
153             bc = (r == HASH_OK) ? bc : fg->defBc;                       \
154             DPRINT(("tre_fast_match: mismatch on character" CHF ", "    \
155                     "BC %d, GS %d\n",                                   \
156                     ((const tre_char_t *)startptr)[mismatch + 1],       \
157                     bc, gs));                                           \
158             break;                                                      \
159         default:                                                        \
160           if (!fg->hasdot)                                              \
161             {                                                           \
162               if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift)  \
163                 mismatch -= u;                                          \
164               v = fg->len - 1 - mismatch;                               \
165               gs = fg->sbmGs[mismatch];                                 \
166             }                                                           \
167           bc = fg->qsBc[((const unsigned char *)str_byte)               \
168                         [!fg->reversed ? (size_t)j + fg->len            \
169                          : (size_t)j - 1]];                             \
170           DPRINT(("tre_fast_match: mismatch on character %c, "          \
171                  "BC %d, GS %d\n",                                      \
172                  ((const unsigned char *)startptr)[mismatch + 1],       \
173                  bc, gs));                                              \
174       }                                                                 \
175     if (fg->hasdot)                                                     \
176       shift = bc;                                                       \
177     else                                                                \
178       {                                                                 \
179         ts = (u >= v) ? (u - v) : 0;                                    \
180         shift = MAX(ts, bc);                                            \
181         shift = MAX(shift, gs);                                         \
182         if (shift == gs)                                                \
183           u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v);  \
184         else                                                            \
185           {                                                             \
186             if (ts < bc)                                                \
187               shift = MAX(shift, u + 1);                                \
188             u = 0;                                                      \
189           }                                                             \
190       }                                                                 \
191       DPRINT(("tre_fast_match: shifting %u characters\n", shift));      \
192       j = !fg->reversed ? j + shift : j - shift;                        \
193   }
194
195 /*
196  * Normal Quick Search would require a shift based on the position the
197  * next character after the comparison is within the pattern.  With
198  * wildcards, the position of the last dot effects the maximum shift
199  * distance.
200  * The closer to the end the wild card is the slower the search.
201  *
202  * Examples:
203  * Pattern    Max shift
204  * -------    ---------
205  * this               5
206  * .his               4
207  * t.is               3
208  * th.s               2
209  * thi.               1
210  */
211
212 /*
213  * Fills in the bad character shift array for SB/MB strings.
214  */
215 #define FILL_QSBC                                                       \
216   if (fg->reversed)                                                     \
217     {                                                                   \
218       _FILL_QSBC_REVERSED                                               \
219     }                                                                   \
220   else                                                                  \
221     {                                                                   \
222       _FILL_QSBC                                                        \
223     }
224
225 #define _FILL_QSBC                                                      \
226   for (unsigned int i = 0; i <= UCHAR_MAX; i++)                         \
227     fg->qsBc[i] = fg->len - hasdot;                                     \
228   for (unsigned int i = hasdot + 1; i < fg->len; i++)                   \
229     {                                                                   \
230       fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i;            \
231       DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i],          \
232              fg->len - i));                                             \
233       if (fg->icase)                                                    \
234         {                                                               \
235           char c = islower((unsigned char)fg->pattern[i]) ?             \
236                    toupper((unsigned char)fg->pattern[i]) :             \
237                    tolower((unsigned char)fg->pattern[i]);              \
238           fg->qsBc[(unsigned char)c] = fg->len - i;                     \
239           DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i));    \
240         }                                                               \
241     }
242
243 #define _FILL_QSBC_REVERSED                                             \
244   for (unsigned int i = 0; i <= UCHAR_MAX; i++)                         \
245     fg->qsBc[i] = firstdot + 1;                                         \
246   for (int i = firstdot - 1; i >= 0; i--)                               \
247     {                                                                   \
248       fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1;                  \
249       DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i],   \
250              i + 1));                                                   \
251       if (fg->icase)                                                    \
252         {                                                               \
253           char c = islower((unsigned char)fg->pattern[i]) ?             \
254                    toupper((unsigned char)fg->pattern[i]) :             \
255                    tolower((unsigned char)fg->pattern[i]);              \
256           fg->qsBc[(unsigned char)c] = i + 1;                           \
257           DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1));   \
258         }                                                               \
259     }
260
261 /*
262  * Fills in the bad character shifts into a hastable for wide strings.
263  * With wide characters it is not possible any more to use a normal
264  * array because there are too many characters and we could not
265  * provide enough memory. Fortunately, we only have to store distinct
266  * values for so many characters as the number of distinct characters
267  * in the pattern, so we can store them in a hashtable and store a
268  * default shift value for the rest.
269  */
270 #define FILL_QSBC_WIDE                                                  \
271   if (fg->reversed)                                                     \
272     {                                                                   \
273       _FILL_QSBC_WIDE_REVERSED                                          \
274     }                                                                   \
275   else                                                                  \
276     {                                                                   \
277       _FILL_QSBC_WIDE                                                   \
278     }
279
280 #define _FILL_QSBC_WIDE                                                 \
281   /* Adjust the shift based on location of the last dot ('.'). */       \
282   fg->defBc = fg->wlen - whasdot;                                       \
283                                                                         \
284   /* Preprocess pattern. */                                             \
285   fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),       \
286                                   sizeof(tre_char_t), sizeof(int));     \
287   if (!fg->qsBc_table)                                                  \
288     FAIL_COMP(REG_ESPACE);                                              \
289   for (unsigned int i = whasdot + 1; i < fg->wlen; i++)                 \
290     {                                                                   \
291       int k = fg->wlen - i;                                             \
292       int r;                                                            \
293                                                                         \
294       r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);          \
295       if ((r == HASH_FAIL) || (r == HASH_FULL))                         \
296         FAIL_COMP(REG_ESPACE);                                          \
297       DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
298              k));                                                       \
299       if (fg->icase)                                                    \
300         {                                                               \
301           tre_char_t wc = iswlower(fg->wpattern[i]) ?                   \
302             towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);      \
303           r = hashtable_put(fg->qsBc_table, &wc, &k);                   \
304           if ((r == HASH_FAIL) || (r == HASH_FULL))                     \
305             FAIL_COMP(REG_ESPACE);                                      \
306           DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k));    \
307         }                                                               \
308     }
309
310 #define _FILL_QSBC_WIDE_REVERSED                                        \
311   /* Adjust the shift based on location of the last dot ('.'). */       \
312   fg->defBc = (size_t)wfirstdot;                                        \
313                                                                         \
314   /* Preprocess pattern. */                                             \
315   fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),       \
316                                   sizeof(tre_char_t), sizeof(int));     \
317   if (!fg->qsBc_table)                                                  \
318     FAIL_COMP(REG_ESPACE);                                              \
319   for (int i = wfirstdot - 1; i >= 0; i--)                              \
320     {                                                                   \
321       int k = i + 1;                                                    \
322       int r;                                                            \
323                                                                         \
324       r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);          \
325       if ((r == HASH_FAIL) || (r == HASH_FULL))                         \
326         FAIL_COMP(REG_ESPACE);                                          \
327       DPRINT(("Reverse BC shift for wide char " CHF " is %d\n",         \
328              fg->wpattern[i], k));                                      \
329       if (fg->icase)                                                    \
330         {                                                               \
331           tre_char_t wc = iswlower(fg->wpattern[i]) ?                   \
332             towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);      \
333           r = hashtable_put(fg->qsBc_table, &wc, &k);                   \
334           if ((r == HASH_FAIL) || (r == HASH_FULL))                     \
335             FAIL_COMP(REG_ESPACE);                                      \
336           DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \
337                  k));                                                   \
338         }                                                               \
339     }
340
341 #ifdef _GREP_DEBUG
342 #define DPRINT_BMGS(len, fmt_str, sh)                                   \
343   for (unsigned int i = 0; i < len; i++)                                \
344     DPRINT((fmt_str, i, sh[i]));
345 #else
346 #define DPRINT_BMGS(len, fmt_str, sh)                                   \
347   do { } while(/*CONSTCOND*/0)
348 #endif
349
350 /*
351  * Fills in the good suffix table for SB/MB strings.
352  */
353 #define FILL_BMGS                                                       \
354   if (fg->len > 0 && !fg->hasdot)                                       \
355     {                                                                   \
356       fg->sbmGs = malloc(fg->len * sizeof(*fg->sbmGs));                 \
357       if (!fg->sbmGs)                                                   \
358         return REG_ESPACE;                                              \
359       if (fg->len == 1)                                                 \
360         fg->sbmGs[0] = 1;                                               \
361       else                                                              \
362         _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);             \
363       DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs);   \
364     }
365
366 /*
367  * Fills in the good suffix table for wide strings.
368  */
369 #define FILL_BMGS_WIDE                                                  \
370   if (fg->wlen > 0 && !fg->hasdot)                                      \
371     {                                                                   \
372       fg->bmGs = malloc(fg->wlen * sizeof(*fg->bmGs));                  \
373       if (!fg->bmGs)                                                    \
374         return REG_ESPACE;                                              \
375       if (fg->wlen == 1)                                                \
376         fg->bmGs[0] = 1;                                                \
377       else                                                              \
378         _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);             \
379       DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n",       \
380                   fg->bmGs);                                            \
381     }
382
383 #define _FILL_BMGS(arr, pat, plen, wide)                                \
384   {                                                                     \
385     char *p;                                                            \
386     tre_char_t *wp;                                                     \
387                                                                         \
388     if (wide)                                                           \
389       {                                                                 \
390         if (fg->icase)                                                  \
391           {                                                             \
392             wp = malloc(plen * sizeof(tre_char_t));                     \
393             if (wp == NULL)                                             \
394               return REG_ESPACE;                                        \
395             for (unsigned int i = 0; i < plen; i++)                     \
396               wp[i] = towlower(pat[i]);                                 \
397             _CALC_BMGS(arr, wp, plen);                                  \
398             free(wp);                                                   \
399           }                                                             \
400         else                                                            \
401           _CALC_BMGS(arr, pat, plen);                                   \
402       }                                                                 \
403     else                                                                \
404       {                                                                 \
405         if (fg->icase)                                                  \
406           {                                                             \
407             p = malloc(plen);                                           \
408             if (p == NULL)                                              \
409               return REG_ESPACE;                                        \
410             for (unsigned int i = 0; i < plen; i++)                     \
411               p[i] = tolower((unsigned char)pat[i]);                    \
412             _CALC_BMGS(arr, p, plen);                                   \
413             free(p);                                                    \
414           }                                                             \
415         else                                                            \
416           _CALC_BMGS(arr, pat, plen);                                   \
417       }                                                                 \
418   }
419
420 #define _CALC_BMGS(arr, pat, plen)                                      \
421   {                                                                     \
422     int f = 0, g;                                                       \
423                                                                         \
424     int *suff = malloc(plen * sizeof(int));                             \
425     if (suff == NULL)                                                   \
426       return REG_ESPACE;                                                \
427                                                                         \
428     suff[plen - 1] = plen;                                              \
429     g = plen - 1;                                                       \
430     for (int i = plen - 2; i >= 0; i--)                                 \
431       {                                                                 \
432         if (i > g && suff[i + plen - 1 - f] < i - g)                    \
433           suff[i] = suff[i + plen - 1 - f];                             \
434         else                                                            \
435           {                                                             \
436             if (i < g)                                                  \
437               g = i;                                                    \
438             f = i;                                                      \
439             while (g >= 0 && pat[g] == pat[g + plen - 1 - f])           \
440               g--;                                                      \
441             suff[i] = f - g;                                            \
442           }                                                             \
443       }                                                                 \
444                                                                         \
445     for (unsigned int i = 0; i < plen; i++)                             \
446       arr[i] = plen;                                                    \
447     g = 0;                                                              \
448     for (int i = plen - 1; i >= 0; i--)                                 \
449       if (suff[i] == i + 1)                                             \
450         for(; (unsigned long)g < plen - 1 - i; g++)                     \
451           if (arr[g] == plen)                                           \
452             arr[g] = plen - 1 - i;                                      \
453     for (unsigned int i = 0; i <= plen - 2; i++)                        \
454       arr[plen - 1 - suff[i]] = plen - 1 - i;                           \
455                                                                         \
456     free(suff);                                                 \
457   }
458
459 /*
460  * Copies the pattern pat having length n to p and stores
461  * the size in l.
462  */
463 #define SAVE_PATTERN(src, srclen, dst, dstlen)                          \
464   dstlen = srclen;                                                      \
465   dst = malloc((dstlen + 1) * sizeof(tre_char_t));                      \
466   if (dst == NULL)                                                      \
467     return REG_ESPACE;                                                  \
468   if (dstlen > 0)                                                       \
469     memcpy(dst, src, dstlen * sizeof(tre_char_t));                      \
470   dst[dstlen] = TRE_CHAR('\0');
471
472 /*
473  * Initializes pattern compiling.
474  */
475 #define INIT_COMP                                                       \
476   /* Initialize. */                                                     \
477   memset(fg, 0, sizeof(*fg));                                           \
478   fg->icase = (cflags & REG_ICASE);                                     \
479   fg->word = (cflags & REG_WORD);                                       \
480   fg->newline = (cflags & REG_NEWLINE);                                 \
481   fg->nosub = (cflags & REG_NOSUB);                                     \
482                                                                         \
483   /* Cannot handle REG_ICASE with MB string */                          \
484   if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0)                       \
485     {                                                                   \
486       DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n"));     \
487       return REG_BADPAT;                                                \
488     }
489
490 /*
491  * Checks whether we have a 0-length pattern that will match
492  * anything. If literal is set to false, the EOL anchor is also
493  * taken into account.
494  */
495 #define CHECK_MATCHALL(literal)                                         \
496   if (!literal && n == 1 && pat[0] == TRE_CHAR('$'))                    \
497     {                                                                   \
498       n--;                                                              \
499       fg->eol = true;                                                   \
500     }                                                                   \
501                                                                         \
502   if (n == 0)                                                           \
503     {                                                                   \
504       fg->matchall = true;                                              \
505       fg->pattern = malloc(sizeof(char));                               \
506       if (!fg->pattern)                                                 \
507         FAIL_COMP(REG_ESPACE);                                          \
508       fg->pattern[0] = '\0';                                            \
509       fg->wpattern = malloc(sizeof(tre_char_t));                        \
510       if (!fg->wpattern)                                                \
511         FAIL_COMP(REG_ESPACE);                                          \
512       fg->wpattern[0] = TRE_CHAR('\0');                                 \
513       DPRINT(("Matching every input\n"));                               \
514       return REG_OK;                                                    \
515     }
516
517 /*
518  * Returns: REG_OK on success, error code otherwise
519  */
520 int
521 tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
522                     int cflags)
523 {
524   size_t hasdot = 0, whasdot = 0;
525   ssize_t firstdot = -1, wfirstdot = -1;
526
527   INIT_COMP;
528
529   CHECK_MATCHALL(true);
530
531   /* Cannot handle word boundaries with MB string */
532   if (fg->word && (TRE_MB_CUR_MAX > 1))
533     return REG_BADPAT;
534
535 #ifdef TRE_WCHAR
536   SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
537   STORE_MBS_PAT;
538 #else
539   SAVE_PATTERN(pat, n, fg->pattern, fg->len);
540 #endif
541
542   DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
543          "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
544          fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
545
546   FILL_QSBC;
547   FILL_BMGS;
548 #ifdef TRE_WCHAR
549   FILL_QSBC_WIDE;
550   FILL_BMGS_WIDE;
551 #endif
552
553   return REG_OK;
554 }
555
556 /*
557  * Returns: REG_OK on success, error code otherwise
558  */
559 int
560 tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
561                  int cflags)
562 {
563   tre_char_t *tmp;
564   size_t pos = 0, hasdot = 0, whasdot = 0;
565   ssize_t firstdot = -1, wfirstdot = -1;
566   bool escaped = false;
567   bool *_escmap = NULL;
568
569   INIT_COMP;
570
571   /* Remove beginning-of-line character ('^'). */
572   if (pat[0] == TRE_CHAR('^'))
573     {
574       fg->bol = true;
575       n--;
576       pat++;
577     }
578
579   CHECK_MATCHALL(false);
580
581   /* Handle word-boundary matching when GNU extensions are enabled */
582   if ((cflags & REG_GNU) && (n >= 14) &&
583       (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
584       (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"),
585               7 * sizeof(tre_char_t)) == 0))
586     {
587       n -= 14;
588       pat += 7;
589       fg->word = true;
590     }
591
592   /* Cannot handle word boundaries with MB string */
593   if (fg->word && (TRE_MB_CUR_MAX > 1))
594     return REG_BADPAT;
595
596   tmp = malloc((n + 1) * sizeof(tre_char_t));
597   if (tmp == NULL)
598     return REG_ESPACE;
599
600 /* Copies the char into the stored pattern and skips to the next char. */
601 #define STORE_CHAR                                                      \
602   do                                                                    \
603     {                                                                   \
604       tmp[pos++] = pat[i];                                              \
605       escaped = false;                                                  \
606       continue;                                                         \
607     } while (0)
608
609   /* Traverse the input pattern for processing */
610   for (unsigned int i = 0; i < n; i++)
611     {
612       switch (pat[i])
613         {
614           case TRE_CHAR('\\'):
615             if (escaped)
616               STORE_CHAR;
617             else if (i == n - 1)
618               goto badpat;
619             else
620               escaped = true;
621             continue;
622           case TRE_CHAR('['):
623             if (escaped)
624               STORE_CHAR;
625             else
626               goto badpat;
627             continue;
628           case TRE_CHAR('*'):
629             if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
630               STORE_CHAR;
631             else
632               goto badpat;
633             continue;
634           case TRE_CHAR('+'):
635           case TRE_CHAR('?'):
636             if ((cflags & REG_EXTENDED) && (i == 0))
637               goto badpat;
638             else if ((cflags & REG_EXTENDED) ^ !escaped)
639               STORE_CHAR;
640             else
641               goto badpat;
642             continue;
643           case TRE_CHAR('.'):
644             if (escaped)
645               {
646                 if (!_escmap)
647                   _escmap = calloc(n, sizeof(bool));
648                 if (!_escmap)
649                   {
650                     free(tmp);
651                     return REG_ESPACE;
652                   }
653                 _escmap[i] = true;
654                 STORE_CHAR;
655               }
656             else
657               {
658                 whasdot = i;
659                 if (wfirstdot == -1)
660                         wfirstdot = i;
661                 STORE_CHAR;
662               }
663             continue;
664           case TRE_CHAR('^'):
665             STORE_CHAR;
666             continue;
667           case TRE_CHAR('$'):
668             if (!escaped && (i == n - 1))
669               fg->eol = true;
670             else
671               STORE_CHAR;
672             continue;
673           case TRE_CHAR('('):
674             if ((cflags & REG_EXTENDED) ^ escaped)
675               goto badpat;
676             else
677               STORE_CHAR;
678             continue;
679           case TRE_CHAR('{'):
680             if (!(cflags & REG_EXTENDED) ^ escaped)
681               STORE_CHAR;
682             else if (!(cflags & REG_EXTENDED) && (i == 0))
683               STORE_CHAR;
684             else if ((cflags & REG_EXTENDED) && (i == 0))
685               continue;
686             else
687               goto badpat;
688             continue;
689           case TRE_CHAR('|'):
690             if ((cflags & REG_EXTENDED) ^ escaped)
691               goto badpat;
692             else
693               STORE_CHAR;
694             continue;
695           default:
696             if (escaped)
697               goto badpat;
698             else
699               STORE_CHAR;
700             continue;
701         }
702       continue;
703 badpat:
704       free(tmp);
705       DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
706               "back to NFA\n"));
707       return REG_BADPAT;
708     }
709
710   fg->hasdot = wfirstdot > -1;
711
712   /*
713    * The pattern has been processed and copied to tmp as a literal string
714    * with escapes, anchors (^$) and the word boundary match character
715    * classes stripped out.
716    */
717 #ifdef TRE_WCHAR
718   SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
719   fg->wescmap = _escmap;
720   STORE_MBS_PAT;
721
722   /*
723    * The position of dots and escaped dots is different in the MB string
724    * than in to the wide string so traverse the converted string, as well,
725    * to store these positions.
726    */
727   if (fg->hasdot || (fg->wescmap != NULL))
728     {
729       if (fg->wescmap != NULL)
730         {
731           fg->escmap = calloc(fg->len, sizeof(bool));
732           if (fg->escmap == NULL)
733             {
734               tre_free_fast(fg);
735               return REG_ESPACE;
736             }
737         }
738
739       escaped = false;
740       char *_checkpat = NULL;
741       size_t _checklen = 0;
742       unsigned int escofs = 0;
743       /*
744        * Make a copy here of the original pattern, because fg->pattern has
745        * already been stripped of all escape sequences in the above processing.
746        * This is necessary if we wish to later treat fg->escmap as an actual,
747        * functional replacement of fg->wescmap.
748        */
749       CONV_MBS_PAT(pat, _checkpat, _checklen);
750       for (unsigned int i = 0; i < n; i++)
751         if (_checkpat[i] == '\\')
752           {
753             escaped = !escaped;
754             if (escaped)
755               ++escofs;
756           }
757         else if (_checkpat[i] == '.' && fg->escmap != NULL && escaped)
758           {
759             fg->escmap[i - escofs] = true;
760             escaped = false;
761           }
762         else if (_checkpat[i] == '.' && !escaped)
763           {
764             hasdot = i;
765             if (firstdot == -1)
766               firstdot = i;
767           }
768         else
769           escaped = false;
770       free(_checkpat);
771     }
772 #else
773   SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
774   fg->escmap = _escmap;
775 #endif
776
777   free(tmp);
778
779   DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
780          "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
781          fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
782          fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
783          fg->newline ? 'y' : 'n'));
784
785   /* Check whether reverse QS algorithm is more efficient */
786   if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
787       fg->nosub)
788     {
789       fg->reversed = true;
790       DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
791     }
792
793   FILL_QSBC;
794   FILL_BMGS;
795 #ifdef TRE_WCHAR
796   FILL_QSBC_WIDE;
797   FILL_BMGS_WIDE;
798 #endif
799
800   return REG_OK;
801 }
802
803 #define _SHIFT_ONE                                                      \
804   {                                                                     \
805     shift = 1;                                                          \
806     j = !fg->reversed ? j + shift : j - shift;                          \
807     continue;                                                           \
808   }
809
810 #define _BBOUND_COND                                                    \
811   ((type == STR_WIDE) ?                                                 \
812     ((j == 0) || !(tre_isalnum(str_wide[j - 1]) ||                      \
813       (str_wide[j - 1] == TRE_CHAR('_')))) :                            \
814     ((j == 0) || !(tre_isalnum(str_byte[j - 1]) ||                      \
815       (str_byte[j - 1] == '_'))))
816
817 #define _EBOUND_COND                                                    \
818   ((type == STR_WIDE) ?                                                 \
819     ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) ||  \
820       (str_wide[j + fg->wlen] == TRE_CHAR('_')))) :                     \
821     ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) ||    \
822       (str_byte[j + fg->len] == '_'))))
823
824 /*
825  * Condition to check whether the match on position j is on a
826  * word boundary.
827  */
828 #define IS_ON_WORD_BOUNDARY                                             \
829   (_BBOUND_COND && _EBOUND_COND)
830
831 /*
832  * Checks word boundary and shifts one if match is not on a
833  * boundary.
834  */
835 #define CHECK_WORD_BOUNDARY                                             \
836     if (!IS_ON_WORD_BOUNDARY)                                           \
837       _SHIFT_ONE;
838
839 #define _BOL_COND                                                       \
840   ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\
841                                    : (str_byte[j - 1] == '\n')))
842
843 /*
844  * Checks BOL anchor and shifts one if match is not on a
845  * boundary.
846  */
847 #define CHECK_BOL_ANCHOR                                                \
848     if (!_BOL_COND)                                                     \
849       _SHIFT_ONE;
850
851 #define _EOL_COND                                                       \
852   ((type == STR_WIDE)                                                   \
853     ? ((j + fg->wlen == len) ||                                         \
854                 (str_wide[j + fg->wlen] == TRE_CHAR('\n')))             \
855     : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n')))
856
857 /*
858  * Checks EOL anchor and shifts one if match is not on a
859  * boundary.
860  */
861 #define CHECK_EOL_ANCHOR                                                \
862     if (!_EOL_COND)                                                     \
863       _SHIFT_ONE;
864
865 /*
866  * Executes matching of the precompiled pattern on the input string.
867  * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
868  */
869 int
870 tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
871     tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
872 {
873   unsigned int shift, u = 0, v = 0;
874   ssize_t j = 0;
875   int ret = REG_NOMATCH;
876   int mismatch;
877   const char *str_byte = data;
878   const void *startptr = NULL;
879   const tre_char_t *str_wide = data;
880
881   /* Calculate length if unspecified. */
882   if (len == (size_t)-1)
883     switch (type)
884       {
885         case STR_WIDE:
886           len = tre_strlen(str_wide);
887           break;
888         default:
889           len = strlen(str_byte);
890           break;
891       }
892
893   /* Shortcut for empty pattern */
894   if (fg->matchall)
895     {
896       if (!fg->nosub && nmatch >= 1)
897         {
898           pmatch[0].rm_so = 0;
899           pmatch[0].rm_eo = len;
900         }
901       if (fg->bol && fg->eol)
902         return (len == 0) ? REG_OK : REG_NOMATCH;
903       else
904         return REG_OK;
905     }
906
907   /* No point in going farther if we do not have enough data. */
908   switch (type)
909     {
910       case STR_WIDE:
911         if (len < fg->wlen)
912           return ret;
913         shift = fg->wlen;
914         break;
915       default:
916         if (len < fg->len)
917           return ret;
918         shift = fg->len;
919     }
920
921   /*
922    * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we
923    * can shift one because there can't be a match at the beginning.
924    */
925   if (fg->bol && (eflags & REG_NOTBOL))
926     j = 1;
927
928   /*
929    * Like above, we cannot have a match at the very end when anchoring to
930    * the end and REG_NOTEOL is specified.
931    */
932   if (fg->eol && (eflags & REG_NOTEOL))
933     len--;
934
935   if (fg->reversed)
936     j = len - (type == STR_WIDE ? fg->wlen : fg->len);
937
938
939   /* Only try once at the beginning or ending of the line. */
940   if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
941       !(eflags & REG_NOTEOL))
942     {
943       /* Simple text comparison. */
944       if (!((fg->bol && fg->eol) &&
945           (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len))))
946         {
947           /* Determine where in data to start search at. */
948           j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
949           SKIP_CHARS(j);
950           mismatch = fastcmp(fg, startptr, type);
951           if (mismatch == REG_OK)
952             {
953               if (fg->word && !IS_ON_WORD_BOUNDARY)
954                 return ret;
955               if (!fg->nosub && nmatch >= 1)
956                 {
957                   pmatch[0].rm_so = j;
958                   pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
959                 }
960               return REG_OK;
961             }
962         }
963     }
964   else
965     {
966       /* Quick Search / Turbo Boyer-Moore algorithm. */
967       do
968         {
969           SKIP_CHARS(j);
970           mismatch = fastcmp(fg, startptr, type);
971           if (mismatch == REG_OK)
972             {
973               if (fg->word)
974                 CHECK_WORD_BOUNDARY;
975               if (fg->bol)
976                 CHECK_BOL_ANCHOR;
977               if (fg->eol)
978                 CHECK_EOL_ANCHOR;
979               if (!fg->nosub && nmatch >= 1)
980                 {
981                   pmatch[0].rm_so = j;
982                   pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
983                 }
984               return REG_OK;
985             }
986           else if (mismatch > 0)
987             return mismatch;
988           mismatch = -mismatch - 1;
989           SHIFT;
990         } while (!IS_OUT_OF_BOUNDS);
991     }
992     return ret;
993 }
994
995 /*
996  * Frees the resources that were allocated when the pattern was compiled.
997  */
998 void
999 tre_free_fast(fastmatch_t *fg)
1000 {
1001
1002   DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
1003          fg->pattern));
1004
1005 #ifdef TRE_WCHAR
1006   hashtable_free(fg->qsBc_table);
1007   if (!fg->hasdot)
1008     free(fg->bmGs);
1009   if (fg->wescmap)
1010     free(fg->wescmap);
1011   free(fg->wpattern);
1012 #endif
1013   if (!fg->hasdot)
1014     free(fg->sbmGs);
1015   if (fg->escmap)
1016     free(fg->escmap);
1017   free(fg->pattern);
1018 }
1019
1020 /*
1021  * Returns:     -(i + 1) on failure (position that it failed with minus sign)
1022  *              error code on error
1023  *              REG_OK on success
1024  */
1025 static inline int
1026 fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
1027 {
1028   const char *str_byte = data;
1029   const char *pat_byte = fg->pattern;
1030   const tre_char_t *str_wide = data;
1031   const tre_char_t *pat_wide = fg->wpattern;
1032   const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
1033   size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
1034   int ret = REG_OK;
1035
1036   /* Compare the pattern and the input char-by-char from the last position. */
1037   for (int i = len - 1; i >= 0; i--) {
1038     switch (type)
1039       {
1040         case STR_WIDE:
1041
1042           /* Check dot */
1043           if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
1044               (!escmap || !escmap[i]) &&
1045               (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
1046             continue;
1047
1048           /* Compare */
1049           if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
1050                     : (pat_wide[i] == str_wide[i]))
1051             continue;
1052           break;
1053         default:
1054           /* Check dot */
1055           if (fg->hasdot && pat_byte[i] == '.' &&
1056               (!escmap || !escmap[i]) &&
1057               (!fg->newline || (str_byte[i] != '\n')))
1058             continue;
1059
1060           /* Compare */
1061           if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i]))
1062                     : (pat_byte[i] == str_byte[i]))
1063           continue;
1064       }
1065     DPRINT(("fastcmp: mismatch at position %d\n", i));
1066     ret = -(i + 1);
1067     break;
1068   }
1069   return ret;
1070 }