gnu/usr.bin/grep/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
  17
  18 /* Written June, 1988 by Mike Haertel
  19    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  20
  21 /* $FreeBSD$ */
  22
  23 #ifdef HAVE_CONFIG_H
  24 #include <config.h>
  25 #endif
  26
  27 #include <assert.h>
  28 #include <ctype.h>
  29 #include <stdio.h>
  30
  31 #include <sys/types.h>
  32 #ifdef STDC_HEADERS
  33 #include <stdlib.h>
  34 #else
  35 extern char *calloc(), *malloc(), *realloc();
  36 extern void free();
  37 #endif
  38
  39 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
  40 #include <string.h>
  41 #else
  42 #include <strings.h>
  43 #endif
  44
  45 #if HAVE_SETLOCALE
  46 # include <locale.h>
  47 #endif
  48
  49 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
  50 /* We can handle multibyte string.  */
  51 # define MBS_SUPPORT
  52 #endif
  53
  54 #ifdef MBS_SUPPORT
  55 # include <wchar.h>
  56 # include <wctype.h>
  57 #endif
  58
  59 #ifndef DEBUG   /* use the same approach as regex.c */
  60 #undef assert
  61 #define assert(e)
  62 #endif /* DEBUG */
  63
  64 #ifndef isgraph
  65 #define isgraph(C) (isprint(C) && !isspace(C))
  66 #endif
  67
  68 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
  69 #define ISALPHA(C) isalpha(C)
  70 #define ISUPPER(C) isupper(C)
  71 #define ISLOWER(C) islower(C)
  72 #define ISDIGIT(C) isdigit(C)
  73 #define ISXDIGIT(C) isxdigit(C)
  74 #define ISSPACE(C) isspace(C)
  75 #define ISPUNCT(C) ispunct(C)
  76 #define ISALNUM(C) isalnum(C)
  77 #define ISPRINT(C) isprint(C)
  78 #define ISGRAPH(C) isgraph(C)
  79 #define ISCNTRL(C) iscntrl(C)
  80 #else
  81 #define ISALPHA(C) (isascii(C) && isalpha(C))
  82 #define ISUPPER(C) (isascii(C) && isupper(C))
  83 #define ISLOWER(C) (isascii(C) && islower(C))
  84 #define ISDIGIT(C) (isascii(C) && isdigit(C))
  85 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
  86 #define ISSPACE(C) (isascii(C) && isspace(C))
  87 #define ISPUNCT(C) (isascii(C) && ispunct(C))
  88 #define ISALNUM(C) (isascii(C) && isalnum(C))
  89 #define ISPRINT(C) (isascii(C) && isprint(C))
  90 #define ISGRAPH(C) (isascii(C) && isgraph(C))
  91 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
  92 #endif
  93
  94 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
  95    - Its arg may be any int or unsigned int; it need not be an unsigned char.
  96    - It's guaranteed to evaluate its argument exactly once.
  97    - It's typically faster.
  98    Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
  99    only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
 100    it's important to use the locale's definition of `digit' even when the
 101    host does not conform to Posix.  */
 102 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
 103
 104 /* If we (don't) have I18N.  */
 105 /* glibc defines _ */
 106 #ifndef _
 107 # ifdef HAVE_LIBINTL_H
 108 #  include <libintl.h>
 109 #  ifndef _
 110 #   define _(Str) gettext (Str)
 111 #  endif
 112 # else
 113 #  define _(Str) (Str)
 114 # endif
 115 #endif
 116
 117 #include "regex.h"
 118 #include "dfa.h"
 119 #include "hard-locale.h"
 120
 121 /* HPUX, define those as macros in sys/param.h */
 122 #ifdef setbit
 123 # undef setbit
 124 #endif
 125 #ifdef clrbit
 126 # undef clrbit
 127 #endif
 128
 129 static void dfamust PARAMS ((struct dfa *dfa));
 130 static void regexp PARAMS ((int toplevel));
 131
 132 static ptr_t
 133 xcalloc (size_t n, size_t s)
 134 {
 135   ptr_t r = calloc(n, s);
 136
 137   if (!r)
 138     dfaerror(_("Memory exhausted"));
 139   return r;
 140 }
 141
 142 static ptr_t
 143 xmalloc (size_t n)
 144 {
 145   ptr_t r = malloc(n);
 146
 147   assert(n != 0);
 148   if (!r)
 149     dfaerror(_("Memory exhausted"));
 150   return r;
 151 }
 152
 153 static ptr_t
 154 xrealloc (ptr_t p, size_t n)
 155 {
 156   ptr_t r = realloc(p, n);
 157
 158   assert(n != 0);
 159   if (!r)
 160     dfaerror(_("Memory exhausted"));
 161   return r;
 162 }
 163
 164 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
 165 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
 166 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
 167
 168 /* Reallocate an array of type t if nalloc is too small for index. */
 169 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
 170   if ((index) >= (nalloc))                        \
 171     {                                             \
 172       do                                          \
 173         (nalloc) *= 2;                            \
 174       while ((index) >= (nalloc));                \
 175       REALLOC(p, t, nalloc);                      \
 176     }
 177
 178 #ifdef DEBUG
 179
 180 static void
 181 prtok (token t)
 182 {
 183   char const *s;
 184
 185   if (t < 0)
 186     fprintf(stderr, "END");
 187   else if (t < NOTCHAR)
 188     fprintf(stderr, "%c", t);
 189   else
 190     {
 191       switch (t)
 192         {
 193         case EMPTY: s = "EMPTY"; break;
 194         case BACKREF: s = "BACKREF"; break;
 195         case BEGLINE: s = "BEGLINE"; break;
 196         case ENDLINE: s = "ENDLINE"; break;
 197         case BEGWORD: s = "BEGWORD"; break;
 198         case ENDWORD: s = "ENDWORD"; break;
 199         case LIMWORD: s = "LIMWORD"; break;
 200         case NOTLIMWORD: s = "NOTLIMWORD"; break;
 201         case QMARK: s = "QMARK"; break;
 202         case STAR: s = "STAR"; break;
 203         case PLUS: s = "PLUS"; break;
 204         case CAT: s = "CAT"; break;
 205         case OR: s = "OR"; break;
 206         case ORTOP: s = "ORTOP"; break;
 207         case LPAREN: s = "LPAREN"; break;
 208         case RPAREN: s = "RPAREN"; break;
 209         case CRANGE: s = "CRANGE"; break;
 210 #ifdef MBS_SUPPORT
 211         case ANYCHAR: s = "ANYCHAR"; break;
 212         case MBCSET: s = "MBCSET"; break;
 213 #endif /* MBS_SUPPORT */
 214         default: s = "CSET"; break;
 215         }
 216       fprintf(stderr, "%s", s);
 217     }
 218 }
 219 #endif /* DEBUG */
 220
 221 /* Stuff pertaining to charclasses. */
 222
 223 static int
 224 tstbit (unsigned b, charclass c)
 225 {
 226   return c[b / INTBITS] & 1 << b % INTBITS;
 227 }
 228
 229 static void
 230 setbit (unsigned b, charclass c)
 231 {
 232   c[b / INTBITS] |= 1 << b % INTBITS;
 233 }
 234
 235 static void
 236 clrbit (unsigned b, charclass c)
 237 {
 238   c[b / INTBITS] &= ~(1 << b % INTBITS);
 239 }
 240
 241 static void
 242 copyset (charclass src, charclass dst)
 243 {
 244   memcpy (dst, src, sizeof (charclass));
 245 }
 246
 247 static void
 248 zeroset (charclass s)
 249 {
 250   memset (s, 0, sizeof (charclass));
 251 }
 252
 253 static void
 254 notset (charclass s)
 255 {
 256   int i;
 257
 258   for (i = 0; i < CHARCLASS_INTS; ++i)
 259     s[i] = ~s[i];
 260 }
 261
 262 static int
 263 equal (charclass s1, charclass s2)
 264 {
 265   return memcmp (s1, s2, sizeof (charclass)) == 0;
 266 }
 267
 268 /* A pointer to the current dfa is kept here during parsing. */
 269 static struct dfa *dfa;
 270
 271 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
 272 static int
 273 charclass_index (charclass s)
 274 {
 275   int i;
 276
 277   for (i = 0; i < dfa->cindex; ++i)
 278     if (equal(s, dfa->charclasses[i]))
 279       return i;
 280   REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
 281   ++dfa->cindex;
 282   copyset(s, dfa->charclasses[i]);
 283   return i;
 284 }
 285
 286 /* Syntax bits controlling the behavior of the lexical analyzer. */
 287 static reg_syntax_t syntax_bits, syntax_bits_set;
 288
 289 /* Flag for case-folding letters into sets. */
 290 static int case_fold;
 291
 292 /* End-of-line byte in data.  */
 293 static unsigned char eolbyte;
 294
 295 /* Entry point to set syntax options. */
 296 void
 297 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
 298 {
 299   syntax_bits_set = 1;
 300   syntax_bits = bits;
 301   case_fold = fold;
 302   eolbyte = eol;
 303 }
 304
 305 /* Like setbit, but if case is folded, set both cases of a letter.  */
 306 static void
 307 setbit_case_fold (unsigned b, charclass c)
 308 {
 309   setbit (b, c);
 310   if (case_fold)
 311     {
 312       if (ISUPPER (b))
 313         setbit (tolower (b), c);
 314       else if (ISLOWER (b))
 315         setbit (toupper (b), c);
 316     }
 317 }
 318
 319 /* Lexical analyzer.  All the dross that deals with the obnoxious
 320    GNU Regex syntax bits is located here.  The poor, suffering
 321    reader is referred to the GNU Regex documentation for the
 322    meaning of the @#%!@#%^!@ syntax bits. */
 323
 324 static char const *lexstart;    /* Pointer to beginning of input string. */
 325 static char const *lexptr;      /* Pointer to next input character. */
 326 static int lexleft;             /* Number of characters remaining. */
 327 static token lasttok;           /* Previous token returned; initially END. */
 328 static int laststart;           /* True if we're separated from beginning or (, |
 329                                    only by zero-width characters. */
 330 static int parens;              /* Count of outstanding left parens. */
 331 static int minrep, maxrep;      /* Repeat counts for {m,n}. */
 332 static int hard_LC_COLLATE;     /* Nonzero if LC_COLLATE is hard.  */
 333
 334 #ifdef MBS_SUPPORT
 335 /* These variables are used only if (MB_CUR_MAX > 1).  */
 336 static mbstate_t mbs;           /* Mbstate for mbrlen().  */
 337 static ssize_t cur_mb_len;      /* Byte length of the current scanning
 338                                    multibyte character.  Must also handle
 339                                    negative result from mbrlen().  */
 340 static ssize_t cur_mb_index;    /* Byte index of the current scanning multibyte
 341                                    character.
 342
 343                                    singlebyte character : cur_mb_index = 0
 344                                    multibyte character
 345                                        1st byte : cur_mb_index = 1
 346                                        2nd byte : cur_mb_index = 2
 347                                          ...
 348                                        nth byte : cur_mb_index = n  */
 349 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
 350                                   Each element store the amount of remain
 351                                   byte of corresponding multibyte character
 352                                   in the input string.  A element's value
 353                                   is 0 if corresponding character is a
 354                                   singlebyte chracter.
 355                                   e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
 356                                    mblen_buf :   0,       3,       2,       1
 357                                */
 358 static wchar_t *inputwcs;       /* Wide character representation of input
 359                                    string in dfaexec().
 360                                    The length of this array is same as
 361                                    the length of input string(char array).
 362                                    inputstring[i] is a single-byte char,
 363                                    or 1st byte of a multibyte char.
 364                                    And inputwcs[i] is the codepoint.  */
 365 static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
 366 static unsigned char const *buf_end;    /* refference to end in dfaexec().  */
 367 #endif /* MBS_SUPPORT  */
 368
 369 #ifdef MBS_SUPPORT
 370 /* This function update cur_mb_len, and cur_mb_index.
 371    p points current lexptr, len is the remaining buffer length.  */
 372 static void
 373 update_mb_len_index (unsigned char const *p, size_t len)
 374 {
 375   /* If last character is a part of a multibyte character,
 376      we update cur_mb_index.  */
 377   if (cur_mb_index)
 378     cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
 379                         : cur_mb_index + 1;
 380
 381   /* If last character is a single byte character, or the
 382      last portion of a multibyte character, we check whether
 383      next character is a multibyte character or not.  */
 384   if (! cur_mb_index)
 385     {
 386       cur_mb_len = mbrlen(p, len, &mbs);
 387       if (cur_mb_len > 1)
 388         /* It is a multibyte character.
 389            cur_mb_len was already set by mbrlen().  */
 390         cur_mb_index = 1;
 391       else if (cur_mb_len < 1)
 392         /* Invalid sequence.  We treat it as a singlebyte character.
 393            cur_mb_index is aleady 0.  */
 394         cur_mb_len = 1;
 395       /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
 396          cur_mb_index is aleady 0.  */
 397     }
 398 }
 399 #endif /* MBS_SUPPORT */
 400
 401 #ifdef MBS_SUPPORT
 402 /* Note that characters become unsigned here. */
 403 # define FETCH(c, eoferr)                       \
 404   {                                             \
 405     if (! lexleft)                              \
 406      {                                          \
 407         if (eoferr != 0)                        \
 408           dfaerror (eoferr);                    \
 409         else                                    \
 410           return lasttok = END;                 \
 411       }                                         \
 412     if (MB_CUR_MAX > 1)                         \
 413       update_mb_len_index(lexptr, lexleft);     \
 414     (c) = (unsigned char) *lexptr++;            \
 415     --lexleft;                                  \
 416   }
 417
 418 /* This function fetch a wide character, and update cur_mb_len,
 419    used only if the current locale is a multibyte environment.  */
 420 static wint_t
 421 fetch_wc (char const *eoferr)
 422 {
 423   wchar_t wc;
 424   if (! lexleft)
 425     {
 426       if (eoferr != 0)
 427         dfaerror (eoferr);
 428       else
 429         return WEOF;
 430     }
 431
 432   cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
 433   if (cur_mb_len <= 0)
 434    {
 435       cur_mb_len = 1;
 436       wc = *lexptr;
 437     }
 438   lexptr += cur_mb_len;
 439   lexleft -= cur_mb_len;
 440   return wc;
 441 }
 442 #else
 443 /* Note that characters become unsigned here. */
 444 # define FETCH(c, eoferr)             \
 445   {                                   \
 446     if (! lexleft)                    \
 447       {                               \
 448         if (eoferr != 0)              \
 449           dfaerror (eoferr);          \
 450         else                          \
 451           return lasttok = END;       \
 452       }                               \
 453     (c) = (unsigned char) *lexptr++;  \
 454     --lexleft;                        \
 455   }
 456 #endif /* MBS_SUPPORT */
 457
 458 #ifdef MBS_SUPPORT
 459 /* Multibyte character handling sub-routin for lex.
 460    This function  parse a bracket expression and build a struct
 461    mb_char_classes.  */
 462 static void
 463 parse_bracket_exp_mb ()
 464 {
 465   wint_t wc, wc1, wc2;
 466
 467   /* Work area to build a mb_char_classes.  */
 468   struct mb_char_classes *work_mbc;
 469   int chars_al, range_sts_al, range_ends_al, ch_classes_al,
 470     equivs_al, coll_elems_al;
 471
 472   REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
 473                        dfa->mbcsets_alloc, dfa->nmbcsets + 1);
 474   /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
 475      We will update dfa->multibyte_prop in addtok(), because we can't
 476      decide the index in dfa->tokens[].  */
 477
 478   /* Initialize work are */
 479   work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
 480
 481   chars_al = 1;
 482   range_sts_al = range_ends_al = 0;
 483   ch_classes_al = equivs_al = coll_elems_al = 0;
 484   MALLOC(work_mbc->chars, wchar_t, chars_al);
 485
 486   work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
 487   work_mbc->nequivs = work_mbc->ncoll_elems = 0;
 488   work_mbc->chars = work_mbc->ch_classes = NULL;
 489   work_mbc->range_sts = work_mbc->range_ends = NULL;
 490   work_mbc->equivs = work_mbc->coll_elems = NULL;
 491
 492   wc = fetch_wc(_("Unbalanced ["));
 493   if (wc == L'^')
 494     {
 495       wc = fetch_wc(_("Unbalanced ["));
 496       work_mbc->invert = 1;
 497     }
 498   else
 499     work_mbc->invert = 0;
 500   do
 501     {
 502       wc1 = WEOF; /* mark wc1 is not initialized".  */
 503
 504       /* Note that if we're looking at some other [:...:] construct,
 505          we just treat it as a bunch of ordinary characters.  We can do
 506          this because we assume regex has checked for syntax errors before
 507          dfa is ever called. */
 508       if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
 509         {
 510 #define BRACKET_BUFFER_SIZE 128
 511           char str[BRACKET_BUFFER_SIZE];
 512           wc1 = wc;
 513           wc = fetch_wc(_("Unbalanced ["));
 514
 515           /* If pattern contains `[[:', `[[.', or `[[='.  */
 516           if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
 517             {
 518               unsigned char c;
 519               unsigned char delim = (unsigned char)wc;
 520               int len = 0;
 521               for (;;)
 522                 {
 523                   if (! lexleft)
 524                     dfaerror (_("Unbalanced ["));
 525                   c = (unsigned char) *lexptr++;
 526                   --lexleft;
 527
 528                   if ((c == delim && *lexptr == ']') || lexleft == 0)
 529                     break;
 530                   if (len < BRACKET_BUFFER_SIZE)
 531                     str[len++] = c;
 532                   else
 533                     /* This is in any case an invalid class name.  */
 534                     str[0] = '\0';
 535                 }
 536               str[len] = '\0';
 537
 538               if (lexleft == 0)
 539                 {
 540                   REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
 541                                        work_mbc->nchars + 2);
 542                   work_mbc->chars[work_mbc->nchars++] = L'[';
 543                   work_mbc->chars[work_mbc->nchars++] = delim;
 544                   break;
 545                 }
 546
 547               if (--lexleft, *lexptr++ != ']')
 548                 dfaerror (_("Unbalanced ["));
 549               if (delim == ':')
 550                 /* build character class.  */
 551                 {
 552                   wctype_t wt;
 553                   /* Query the character class as wctype_t.  */
 554                   wt = wctype (str);
 555
 556                   if (ch_classes_al == 0)
 557                     MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al);
 558                   REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
 559                                        ch_classes_al,
 560                                        work_mbc->nch_classes + 1);
 561                   work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
 562
 563                 }
 564               else if (delim == '=' || delim == '.')
 565                 {
 566                   char *elem;
 567                   MALLOC(elem, char, len + 1);
 568                   strncpy(elem, str, len + 1);
 569
 570                   if (delim == '=')
 571                     /* build equivalent class.  */
 572                     {
 573                       if (equivs_al == 0)
 574                         MALLOC(work_mbc->equivs, char*, ++equivs_al);
 575                       REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
 576                                            equivs_al,
 577                                            work_mbc->nequivs + 1);
 578                       work_mbc->equivs[work_mbc->nequivs++] = elem;
 579                     }
 580
 581                   if (delim == '.')
 582                     /* build collating element.  */
 583                     {
 584                       if (coll_elems_al == 0)
 585                         MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
 586                       REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
 587                                            coll_elems_al,
 588                                            work_mbc->ncoll_elems + 1);
 589                       work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
 590                     }
 591                 }
 592               wc1 = wc = WEOF;
 593             }
 594           else
 595             /* We treat '[' as a normal character here.  */
 596             {
 597               wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
 598             }
 599         }
 600       else
 601         {
 602           if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
 603             wc = fetch_wc(("Unbalanced ["));
 604         }
 605
 606       if (wc1 == WEOF)
 607         wc1 = fetch_wc(_("Unbalanced ["));
 608
 609       if (wc1 == L'-')
 610         /* build range characters.  */
 611         {
 612           wc2 = fetch_wc(_("Unbalanced ["));
 613           if (wc2 == L']')
 614             {
 615               /* In the case [x-], the - is an ordinary hyphen,
 616                  which is left in c1, the lookahead character. */
 617               lexptr -= cur_mb_len;
 618               lexleft += cur_mb_len;
 619               wc2 = wc;
 620             }
 621           else
 622             {
 623               if (wc2 == L'\\'
 624                   && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
 625                 wc2 = fetch_wc(_("Unbalanced ["));
 626               wc1 = fetch_wc(_("Unbalanced ["));
 627             }
 628
 629           if (range_sts_al == 0)
 630             {
 631               MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
 632               MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
 633             }
 634           REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
 635                                range_sts_al, work_mbc->nranges + 1);
 636           work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
 637           REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
 638                                range_ends_al, work_mbc->nranges + 1);
 639           work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
 640         }
 641       else if (wc != WEOF)
 642         /* build normal characters.  */
 643         {
 644           REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
 645                                work_mbc->nchars + 1);
 646           work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
 647         }
 648     }
 649   while ((wc = wc1) != L']');
 650 }
 651 #endif /* MBS_SUPPORT */
 652
 653 #ifdef __STDC__
 654 #define FUNC(F, P) static int F(int c) { return P(c); }
 655 #else
 656 #define FUNC(F, P) static int F(c) int c; { return P(c); }
 657 #endif
 658
 659 FUNC(is_alpha, ISALPHA)
 660 FUNC(is_upper, ISUPPER)
 661 FUNC(is_lower, ISLOWER)
 662 FUNC(is_digit, ISDIGIT)
 663 FUNC(is_xdigit, ISXDIGIT)
 664 FUNC(is_space, ISSPACE)
 665 FUNC(is_punct, ISPUNCT)
 666 FUNC(is_alnum, ISALNUM)
 667 FUNC(is_print, ISPRINT)
 668 FUNC(is_graph, ISGRAPH)
 669 FUNC(is_cntrl, ISCNTRL)
 670
 671 static int
 672 is_blank (int c)
 673 {
 674    return (c == ' ' || c == '\t');
 675 }
 676
 677 /* The following list maps the names of the Posix named character classes
 678    to predicate functions that determine whether a given character is in
 679    the class.  The leading [ has already been eaten by the lexical analyzer. */
 680 static struct {
 681   const char *name;
 682   int (*pred) PARAMS ((int));
 683 } const prednames[] = {
 684   { ":alpha:]", is_alpha },
 685   { ":upper:]", is_upper },
 686   { ":lower:]", is_lower },
 687   { ":digit:]", is_digit },
 688   { ":xdigit:]", is_xdigit },
 689   { ":space:]", is_space },
 690   { ":punct:]", is_punct },
 691   { ":alnum:]", is_alnum },
 692   { ":print:]", is_print },
 693   { ":graph:]", is_graph },
 694   { ":cntrl:]", is_cntrl },
 695   { ":blank:]", is_blank },
 696   { 0 }
 697 };
 698
 699 /* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
 700 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
 701
 702 static int
 703 looking_at (char const *s)
 704 {
 705   size_t len;
 706
 707   len = strlen(s);
 708   if (lexleft < len)
 709     return 0;
 710   return strncmp(s, lexptr, len) == 0;
 711 }
 712
 713 static token
 714 lex (void)
 715 {
 716   unsigned c, c1, c2;
 717   int backslash = 0, invert;
 718   charclass ccl;
 719   int i;
 720
 721   /* Basic plan: We fetch a character.  If it's a backslash,
 722      we set the backslash flag and go through the loop again.
 723      On the plus side, this avoids having a duplicate of the
 724      main switch inside the backslash case.  On the minus side,
 725      it means that just about every case begins with
 726      "if (backslash) ...".  */
 727   for (i = 0; i < 2; ++i)
 728     {
 729       FETCH(c, 0);
 730 #ifdef MBS_SUPPORT
 731       if (MB_CUR_MAX > 1 && cur_mb_index)
 732         /* If this is a part of a multi-byte character, we must treat
 733            this byte data as a normal character.
 734            e.g. In case of SJIS encoding, some character contains '\',
 735                 but they must not be backslash.  */
 736         goto normal_char;
 737 #endif /* MBS_SUPPORT  */
 738       switch (c)
 739         {
 740         case '\\':
 741           if (backslash)
 742             goto normal_char;
 743           if (lexleft == 0)
 744             dfaerror(_("Unfinished \\ escape"));
 745           backslash = 1;
 746           break;
 747
 748         case '^':
 749           if (backslash)
 750             goto normal_char;
 751           if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
 752               || lasttok == END
 753               || lasttok == LPAREN
 754               || lasttok == OR)
 755             return lasttok = BEGLINE;
 756           goto normal_char;
 757
 758         case '$':
 759           if (backslash)
 760             goto normal_char;
 761           if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
 762               || lexleft == 0
 763               || (syntax_bits & RE_NO_BK_PARENS
 764                   ? lexleft > 0 && *lexptr == ')'
 765                   : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
 766               || (syntax_bits & RE_NO_BK_VBAR
 767                   ? lexleft > 0 && *lexptr == '|'
 768                   : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
 769               || ((syntax_bits & RE_NEWLINE_ALT)
 770                   && lexleft > 0 && *lexptr == '\n'))
 771             return lasttok = ENDLINE;
 772           goto normal_char;
 773
 774         case '1':
 775         case '2':
 776         case '3':
 777         case '4':
 778         case '5':
 779         case '6':
 780         case '7':
 781         case '8':
 782         case '9':
 783           if (backslash && !(syntax_bits & RE_NO_BK_REFS))
 784             {
 785               laststart = 0;
 786               return lasttok = BACKREF;
 787             }
 788           goto normal_char;
 789
 790         case '`':
 791           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 792             return lasttok = BEGLINE;   /* FIXME: should be beginning of string */
 793           goto normal_char;
 794
 795         case '\'':
 796           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 797             return lasttok = ENDLINE;   /* FIXME: should be end of string */
 798           goto normal_char;
 799
 800         case '<':
 801           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 802             return lasttok = BEGWORD;
 803           goto normal_char;
 804
 805         case '>':
 806           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 807             return lasttok = ENDWORD;
 808           goto normal_char;
 809
 810         case 'b':
 811           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 812             return lasttok = LIMWORD;
 813           goto normal_char;
 814
 815         case 'B':
 816           if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
 817             return lasttok = NOTLIMWORD;
 818           goto normal_char;
 819
 820         case '?':
 821           if (syntax_bits & RE_LIMITED_OPS)
 822             goto normal_char;
 823           if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
 824             goto normal_char;
 825           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 826             goto normal_char;
 827           return lasttok = QMARK;
 828
 829         case '*':
 830           if (backslash)
 831             goto normal_char;
 832           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 833             goto normal_char;
 834           return lasttok = STAR;
 835
 836         case '+':
 837           if (syntax_bits & RE_LIMITED_OPS)
 838             goto normal_char;
 839           if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
 840             goto normal_char;
 841           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 842             goto normal_char;
 843           return lasttok = PLUS;
 844
 845         case '{':
 846           if (!(syntax_bits & RE_INTERVALS))
 847             goto normal_char;
 848           if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
 849             goto normal_char;
 850           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
 851             goto normal_char;
 852
 853           if (syntax_bits & RE_NO_BK_BRACES)
 854             {
 855               /* Scan ahead for a valid interval; if it's not valid,
 856                  treat it as a literal '{'.  */
 857               int lo = -1, hi = -1;
 858               char const *p = lexptr;
 859               char const *lim = p + lexleft;
 860               for (;  p != lim && ISASCIIDIGIT (*p);  p++)
 861                 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
 862               if (p != lim && *p == ',')
 863                 while (++p != lim && ISASCIIDIGIT (*p))
 864                   hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
 865               else
 866                 hi = lo;
 867               if (p == lim || *p != '}'
 868                   || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
 869                 goto normal_char;
 870             }
 871
 872           minrep = 0;
 873           /* Cases:
 874              {M} - exact count
 875              {M,} - minimum count, maximum is infinity
 876              {M,N} - M through N */
 877           FETCH(c, _("unfinished repeat count"));
 878           if (ISASCIIDIGIT (c))
 879             {
 880               minrep = c - '0';
 881               for (;;)
 882                 {
 883                   FETCH(c, _("unfinished repeat count"));
 884                   if (! ISASCIIDIGIT (c))
 885                     break;
 886                   minrep = 10 * minrep + c - '0';
 887                 }
 888             }
 889           else
 890             dfaerror(_("malformed repeat count"));
 891           if (c == ',')
 892             {
 893               FETCH (c, _("unfinished repeat count"));
 894               if (! ISASCIIDIGIT (c))
 895                 maxrep = -1;
 896               else
 897                 {
 898                   maxrep = c - '0';
 899                   for (;;)
 900                     {
 901                       FETCH (c, _("unfinished repeat count"));
 902                       if (! ISASCIIDIGIT (c))
 903                         break;
 904                       maxrep = 10 * maxrep + c - '0';
 905                     }
 906                   if (0 <= maxrep && maxrep < minrep)
 907                     dfaerror (_("malformed repeat count"));
 908                 }
 909             }
 910           else
 911             maxrep = minrep;
 912           if (!(syntax_bits & RE_NO_BK_BRACES))
 913             {
 914               if (c != '\\')
 915                 dfaerror(_("malformed repeat count"));
 916               FETCH(c, _("unfinished repeat count"));
 917             }
 918           if (c != '}')
 919             dfaerror(_("malformed repeat count"));
 920           laststart = 0;
 921           return lasttok = REPMN;
 922
 923         case '|':
 924           if (syntax_bits & RE_LIMITED_OPS)
 925             goto normal_char;
 926           if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
 927             goto normal_char;
 928           laststart = 1;
 929           return lasttok = OR;
 930
 931         case '\n':
 932           if (syntax_bits & RE_LIMITED_OPS
 933               || backslash
 934               || !(syntax_bits & RE_NEWLINE_ALT))
 935             goto normal_char;
 936           laststart = 1;
 937           return lasttok = OR;
 938
 939         case '(':
 940           if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
 941             goto normal_char;
 942           ++parens;
 943           laststart = 1;
 944           return lasttok = LPAREN;
 945
 946         case ')':
 947           if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
 948             goto normal_char;
 949           if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
 950             goto normal_char;
 951           --parens;
 952           laststart = 0;
 953           return lasttok = RPAREN;
 954
 955         case '.':
 956           if (backslash)
 957             goto normal_char;
 958 #ifdef MBS_SUPPORT
 959           if (MB_CUR_MAX > 1)
 960             {
 961               /* In multibyte environment period must match with a single
 962                  character not a byte.  So we use ANYCHAR.  */
 963               laststart = 0;
 964               return lasttok = ANYCHAR;
 965             }
 966 #endif /* MBS_SUPPORT */
 967           zeroset(ccl);
 968           notset(ccl);
 969           if (!(syntax_bits & RE_DOT_NEWLINE))
 970             clrbit(eolbyte, ccl);
 971           if (syntax_bits & RE_DOT_NOT_NULL)
 972             clrbit('\0', ccl);
 973           laststart = 0;
 974           return lasttok = CSET + charclass_index(ccl);
 975
 976         case 'w':
 977         case 'W':
 978           if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
 979             goto normal_char;
 980           zeroset(ccl);
 981           for (c2 = 0; c2 < NOTCHAR; ++c2)
 982             if (IS_WORD_CONSTITUENT(c2))
 983               setbit(c2, ccl);
 984           if (c == 'W')
 985             notset(ccl);
 986           laststart = 0;
 987           return lasttok = CSET + charclass_index(ccl);
 988
 989         case '[':
 990           if (backslash)
 991             goto normal_char;
 992           laststart = 0;
 993 #ifdef MBS_SUPPORT
 994           if (MB_CUR_MAX > 1)
 995             {
 996               /* In multibyte environment a bracket expression may contain
 997                  multibyte characters, which must be treated as characters
 998                  (not bytes).  So we parse it by parse_bracket_exp_mb().  */
 999               parse_bracket_exp_mb();
1000               return lasttok = MBCSET;
1001             }
1002 #endif
1003           zeroset(ccl);
1004           FETCH(c, _("Unbalanced ["));
1005           if (c == '^')
1006             {
1007               FETCH(c, _("Unbalanced ["));
1008               invert = 1;
1009             }
1010           else
1011             invert = 0;
1012           do
1013             {
1014               /* Nobody ever said this had to be fast. :-)
1015                  Note that if we're looking at some other [:...:]
1016                  construct, we just treat it as a bunch of ordinary
1017                  characters.  We can do this because we assume
1018                  regex has checked for syntax errors before
1019                  dfa is ever called. */
1020               if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1021                 for (c1 = 0; prednames[c1].name; ++c1)
1022                   if (looking_at(prednames[c1].name))
1023                     {
1024                       int (*pred) PARAMS ((int)) = prednames[c1].pred;
1025
1026                       for (c2 = 0; c2 < NOTCHAR; ++c2)
1027                         if ((*pred)(c2))
1028                           setbit_case_fold (c2, ccl);
1029                       lexptr += strlen(prednames[c1].name);
1030                       lexleft -= strlen(prednames[c1].name);
1031                       FETCH(c1, _("Unbalanced ["));
1032                       goto skip;
1033                     }
1034               if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1035                 FETCH(c, _("Unbalanced ["));
1036               FETCH(c1, _("Unbalanced ["));
1037               if (c1 == '-')
1038                 {
1039                   FETCH(c2, _("Unbalanced ["));
1040                   if (c2 == ']')
1041                     {
1042                       /* In the case [x-], the - is an ordinary hyphen,
1043                          which is left in c1, the lookahead character. */
1044                       --lexptr;
1045                       ++lexleft;
1046                     }
1047                   else
1048                     {
1049                       if (c2 == '\\'
1050                           && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1051                         FETCH(c2, _("Unbalanced ["));
1052                       FETCH(c1, _("Unbalanced ["));
1053                       if (!hard_LC_COLLATE) {
1054                         for (; c <= c2; c++)
1055                           setbit_case_fold (c, ccl);
1056                       } else {
1057                         /* POSIX locales are painful - leave the decision to libc */
1058                         char expr[6] = { '[', c, '-', c2, ']', '\0' };
1059                         regex_t re;
1060                         if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1061                           for (c = 0; c < NOTCHAR; ++c) {
1062                             char buf[2] = { c, '\0' };
1063                             regmatch_t mat;
1064                             if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1065                                && mat.rm_so == 0 && mat.rm_eo == 1)
1066                               setbit_case_fold (c, ccl);
1067                           }
1068                           regfree (&re);
1069                         }
1070                       }
1071                       continue;
1072                     }
1073                 }
1074
1075               setbit_case_fold (c, ccl);
1076
1077             skip:
1078               ;
1079             }
1080           while ((c = c1) != ']');
1081           if (invert)
1082             {
1083               notset(ccl);
1084               if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1085                 clrbit(eolbyte, ccl);
1086             }
1087           return lasttok = CSET + charclass_index(ccl);
1088
1089         default:
1090         normal_char:
1091           laststart = 0;
1092           if (case_fold && ISALPHA(c))
1093             {
1094               zeroset(ccl);
1095               setbit_case_fold (c, ccl);
1096               return lasttok = CSET + charclass_index(ccl);
1097             }
1098           return c;
1099         }
1100     }
1101
1102   /* The above loop should consume at most a backslash
1103      and some other character. */
1104   abort();
1105   return END;   /* keeps pedantic compilers happy. */
1106 }
1107
1108 /* Recursive descent parser for regular expressions. */
1109
1110 static token tok;               /* Lookahead token. */
1111 static int depth;               /* Current depth of a hypothetical stack
1112                                    holding deferred productions.  This is
1113                                    used to determine the depth that will be
1114                                    required of the real stack later on in
1115                                    dfaanalyze(). */
1116
1117 /* Add the given token to the parse tree, maintaining the depth count and
1118    updating the maximum depth if necessary. */
1119 static void
1120 addtok (token t)
1121 {
1122 #ifdef MBS_SUPPORT
1123   if (MB_CUR_MAX > 1)
1124     {
1125       REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1126                            dfa->tindex);
1127       /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
1128       if (t == MBCSET)
1129         dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1130       else if (t < NOTCHAR)
1131         dfa->multibyte_prop[dfa->tindex]
1132           = (cur_mb_len == 1)? 3 /* single-byte char */
1133           : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1134              + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1135       else
1136         /* It may be unnecesssary, but it is safer to treat other
1137            symbols as singlebyte characters.  */
1138         dfa->multibyte_prop[dfa->tindex] = 3;
1139     }
1140 #endif
1141
1142   REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1143   dfa->tokens[dfa->tindex++] = t;
1144
1145   switch (t)
1146     {
1147     case QMARK:
1148     case STAR:
1149     case PLUS:
1150       break;
1151
1152     case CAT:
1153     case OR:
1154     case ORTOP:
1155       --depth;
1156       break;
1157
1158     default:
1159       ++dfa->nleaves;
1160     case EMPTY:
1161       ++depth;
1162       break;
1163     }
1164   if (depth > dfa->depth)
1165     dfa->depth = depth;
1166 }
1167
1168 /* The grammar understood by the parser is as follows.
1169
1170    regexp:
1171      regexp OR branch
1172      branch
1173
1174    branch:
1175      branch closure
1176      closure
1177
1178    closure:
1179      closure QMARK
1180      closure STAR
1181      closure PLUS
1182      closure REPMN
1183      atom
1184
1185    atom:
1186      <normal character>
1187      <multibyte character>
1188      ANYCHAR
1189      MBCSET
1190      CSET
1191      BACKREF
1192      BEGLINE
1193      ENDLINE
1194      BEGWORD
1195      ENDWORD
1196      LIMWORD
1197      NOTLIMWORD
1198      CRANGE
1199      LPAREN regexp RPAREN
1200      <empty>
1201
1202    The parser builds a parse tree in postfix form in an array of tokens. */
1203
1204 static void
1205 atom (void)
1206 {
1207   if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1208       || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1209 #ifdef MBS_SUPPORT
1210       || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1211 #endif /* MBS_SUPPORT */
1212       || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1213     {
1214       addtok(tok);
1215       tok = lex();
1216 #ifdef MBS_SUPPORT
1217       /* We treat a multibyte character as a single atom, so that DFA
1218          can treat a multibyte character as a single expression.
1219
1220          e.g. We construct following tree from "<mb1><mb2>".
1221               <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1222               <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1223       */
1224       if (MB_CUR_MAX > 1)
1225         {
1226           while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1227             {
1228               addtok(tok);
1229               addtok(CAT);
1230               tok = lex();
1231             }
1232         }
1233 #endif /* MBS_SUPPORT  */
1234     }
1235   else if (tok == CRANGE)
1236     {
1237       /* A character range like "[a-z]" in a locale other than "C" or
1238          "POSIX".  This range might any sequence of one or more
1239          characters.  Unfortunately the POSIX locale primitives give
1240          us no practical way to find what character sequences might be
1241          matched.  Treat this approximately like "(.\1)" -- i.e. match
1242          one character, and then punt to the full matcher.  */
1243       charclass ccl;
1244       zeroset (ccl);
1245       notset (ccl);
1246       addtok (CSET + charclass_index (ccl));
1247       addtok (BACKREF);
1248       addtok (CAT);
1249       tok = lex ();
1250     }
1251   else if (tok == LPAREN)
1252     {
1253       tok = lex();
1254       regexp(0);
1255       if (tok != RPAREN)
1256         dfaerror(_("Unbalanced ("));
1257       tok = lex();
1258     }
1259   else
1260     addtok(EMPTY);
1261 }
1262
1263 /* Return the number of tokens in the given subexpression. */
1264 static int
1265 nsubtoks (int tindex)
1266 {
1267   int ntoks1;
1268
1269   switch (dfa->tokens[tindex - 1])
1270     {
1271     default:
1272       return 1;
1273     case QMARK:
1274     case STAR:
1275     case PLUS:
1276       return 1 + nsubtoks(tindex - 1);
1277     case CAT:
1278     case OR:
1279     case ORTOP:
1280       ntoks1 = nsubtoks(tindex - 1);
1281       return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1282     }
1283 }
1284
1285 /* Copy the given subexpression to the top of the tree. */
1286 static void
1287 copytoks (int tindex, int ntokens)
1288 {
1289   int i;
1290
1291   for (i = 0; i < ntokens; ++i)
1292     addtok(dfa->tokens[tindex + i]);
1293 }
1294
1295 static void
1296 closure (void)
1297 {
1298   int tindex, ntokens, i;
1299
1300   atom();
1301   while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1302     if (tok == REPMN)
1303       {
1304         ntokens = nsubtoks(dfa->tindex);
1305         tindex = dfa->tindex - ntokens;
1306         if (maxrep < 0)
1307           addtok(PLUS);
1308         if (minrep == 0)
1309           addtok(QMARK);
1310         for (i = 1; i < minrep; ++i)
1311           {
1312             copytoks(tindex, ntokens);
1313             addtok(CAT);
1314           }
1315         for (; i < maxrep; ++i)
1316           {
1317             copytoks(tindex, ntokens);
1318             addtok(QMARK);
1319             addtok(CAT);
1320           }
1321         tok = lex();
1322       }
1323     else
1324       {
1325         addtok(tok);
1326         tok = lex();
1327       }
1328 }
1329
1330 static void
1331 branch (void)
1332 {
1333   closure();
1334   while (tok != RPAREN && tok != OR && tok >= 0)
1335     {
1336       closure();
1337       addtok(CAT);
1338     }
1339 }
1340
1341 static void
1342 regexp (int toplevel)
1343 {
1344   branch();
1345   while (tok == OR)
1346     {
1347       tok = lex();
1348       branch();
1349       if (toplevel)
1350         addtok(ORTOP);
1351       else
1352         addtok(OR);
1353     }
1354 }
1355
1356 /* Main entry point for the parser.  S is a string to be parsed, len is the
1357    length of the string, so s can include NUL characters.  D is a pointer to
1358    the struct dfa to parse into. */
1359 void
1360 dfaparse (char const *s, size_t len, struct dfa *d)
1361 {
1362   dfa = d;
1363   lexstart = lexptr = s;
1364   lexleft = len;
1365   lasttok = END;
1366   laststart = 1;
1367   parens = 0;
1368   hard_LC_COLLATE = hard_locale (LC_COLLATE);
1369 #ifdef MBS_SUPPORT
1370   if (MB_CUR_MAX > 1)
1371     {
1372       cur_mb_index = 0;
1373       cur_mb_len = 0;
1374       memset(&mbs, 0, sizeof(mbstate_t));
1375     }
1376 #endif /* MBS_SUPPORT  */
1377
1378   if (! syntax_bits_set)
1379     dfaerror(_("No syntax specified"));
1380
1381   tok = lex();
1382   depth = d->depth;
1383
1384   regexp(1);
1385
1386   if (tok != END)
1387     dfaerror(_("Unbalanced )"));
1388
1389   addtok(END - d->nregexps);
1390   addtok(CAT);
1391
1392   if (d->nregexps)
1393     addtok(ORTOP);
1394
1395   ++d->nregexps;
1396 }
1397
1398 /* Some primitives for operating on sets of positions. */
1399
1400 /* Copy one set to another; the destination must be large enough. */
1401 static void
1402 copy (position_set const *src, position_set *dst)
1403 {
1404   int i;
1405
1406   for (i = 0; i < src->nelem; ++i)
1407     dst->elems[i] = src->elems[i];
1408   dst->nelem = src->nelem;
1409 }
1410
1411 /* Insert a position in a set.  Position sets are maintained in sorted
1412    order according to index.  If position already exists in the set with
1413    the same index then their constraints are logically or'd together.
1414    S->elems must point to an array large enough to hold the resulting set. */
1415 static void
1416 insert (position p, position_set *s)
1417 {
1418   int i;
1419   position t1, t2;
1420
1421   for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1422     continue;
1423   if (i < s->nelem && p.index == s->elems[i].index)
1424     s->elems[i].constraint |= p.constraint;
1425   else
1426     {
1427       t1 = p;
1428       ++s->nelem;
1429       while (i < s->nelem)
1430         {
1431           t2 = s->elems[i];
1432           s->elems[i++] = t1;
1433           t1 = t2;
1434         }
1435     }
1436 }
1437
1438 /* Merge two sets of positions into a third.  The result is exactly as if
1439    the positions of both sets were inserted into an initially empty set. */
1440 static void
1441 merge (position_set const *s1, position_set const *s2, position_set *m)
1442 {
1443   int i = 0, j = 0;
1444
1445   m->nelem = 0;
1446   while (i < s1->nelem && j < s2->nelem)
1447     if (s1->elems[i].index > s2->elems[j].index)
1448       m->elems[m->nelem++] = s1->elems[i++];
1449     else if (s1->elems[i].index < s2->elems[j].index)
1450       m->elems[m->nelem++] = s2->elems[j++];
1451     else
1452       {
1453         m->elems[m->nelem] = s1->elems[i++];
1454         m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1455       }
1456   while (i < s1->nelem)
1457     m->elems[m->nelem++] = s1->elems[i++];
1458   while (j < s2->nelem)
1459     m->elems[m->nelem++] = s2->elems[j++];
1460 }
1461
1462 /* Delete a position from a set. */
1463 static void
1464 delete (position p, position_set *s)
1465 {
1466   int i;
1467
1468   for (i = 0; i < s->nelem; ++i)
1469     if (p.index == s->elems[i].index)
1470       break;
1471   if (i < s->nelem)
1472     for (--s->nelem; i < s->nelem; ++i)
1473       s->elems[i] = s->elems[i + 1];
1474 }
1475
1476 /* Find the index of the state corresponding to the given position set with
1477    the given preceding context, or create a new state if there is no such
1478    state.  Newline and letter tell whether we got here on a newline or
1479    letter, respectively. */
1480 static int
1481 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1482 {
1483   int hash = 0;
1484   int constraint;
1485   int i, j;
1486
1487   newline = newline ? 1 : 0;
1488   letter = letter ? 1 : 0;
1489
1490   for (i = 0; i < s->nelem; ++i)
1491     hash ^= s->elems[i].index + s->elems[i].constraint;
1492
1493   /* Try to find a state that exactly matches the proposed one. */
1494   for (i = 0; i < d->sindex; ++i)
1495     {
1496       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1497           || newline != d->states[i].newline || letter != d->states[i].letter)
1498         continue;
1499       for (j = 0; j < s->nelem; ++j)
1500         if (s->elems[j].constraint
1501             != d->states[i].elems.elems[j].constraint
1502             || s->elems[j].index != d->states[i].elems.elems[j].index)
1503           break;
1504       if (j == s->nelem)
1505         return i;
1506     }
1507
1508   /* We'll have to create a new state. */
1509   REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1510   d->states[i].hash = hash;
1511   MALLOC(d->states[i].elems.elems, position, s->nelem);
1512   copy(s, &d->states[i].elems);
1513   d->states[i].newline = newline;
1514   d->states[i].letter = letter;
1515   d->states[i].backref = 0;
1516   d->states[i].constraint = 0;
1517   d->states[i].first_end = 0;
1518 #ifdef MBS_SUPPORT
1519   if (MB_CUR_MAX > 1)
1520     d->states[i].mbps.nelem = 0;
1521 #endif
1522   for (j = 0; j < s->nelem; ++j)
1523     if (d->tokens[s->elems[j].index] < 0)
1524       {
1525         constraint = s->elems[j].constraint;
1526         if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1527             || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1528             || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1529             || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1530           d->states[i].constraint |= constraint;
1531         if (! d->states[i].first_end)
1532           d->states[i].first_end = d->tokens[s->elems[j].index];
1533       }
1534     else if (d->tokens[s->elems[j].index] == BACKREF)
1535       {
1536         d->states[i].constraint = NO_CONSTRAINT;
1537         d->states[i].backref = 1;
1538       }
1539
1540   ++d->sindex;
1541
1542   return i;
1543 }
1544
1545 /* Find the epsilon closure of a set of positions.  If any position of the set
1546    contains a symbol that matches the empty string in some context, replace
1547    that position with the elements of its follow labeled with an appropriate
1548    constraint.  Repeat exhaustively until no funny positions are left.
1549    S->elems must be large enough to hold the result. */
1550 static void
1551 epsclosure (position_set *s, struct dfa const *d)
1552 {
1553   int i, j;
1554   int *visited;
1555   position p, old;
1556
1557   MALLOC(visited, int, d->tindex);
1558   for (i = 0; i < d->tindex; ++i)
1559     visited[i] = 0;
1560
1561   for (i = 0; i < s->nelem; ++i)
1562     if (d->tokens[s->elems[i].index] >= NOTCHAR
1563         && d->tokens[s->elems[i].index] != BACKREF
1564 #ifdef MBS_SUPPORT
1565         && d->tokens[s->elems[i].index] != ANYCHAR
1566         && d->tokens[s->elems[i].index] != MBCSET
1567 #endif
1568         && d->tokens[s->elems[i].index] < CSET)
1569       {
1570         old = s->elems[i];
1571         p.constraint = old.constraint;
1572         delete(s->elems[i], s);
1573         if (visited[old.index])
1574           {
1575             --i;
1576             continue;
1577           }
1578         visited[old.index] = 1;
1579         switch (d->tokens[old.index])
1580           {
1581           case BEGLINE:
1582             p.constraint &= BEGLINE_CONSTRAINT;
1583             break;
1584           case ENDLINE:
1585             p.constraint &= ENDLINE_CONSTRAINT;
1586             break;
1587           case BEGWORD:
1588             p.constraint &= BEGWORD_CONSTRAINT;
1589             break;
1590           case ENDWORD:
1591             p.constraint &= ENDWORD_CONSTRAINT;
1592             break;
1593           case LIMWORD:
1594             p.constraint &= LIMWORD_CONSTRAINT;
1595             break;
1596           case NOTLIMWORD:
1597             p.constraint &= NOTLIMWORD_CONSTRAINT;
1598             break;
1599           default:
1600             break;
1601           }
1602         for (j = 0; j < d->follows[old.index].nelem; ++j)
1603           {
1604             p.index = d->follows[old.index].elems[j].index;
1605             insert(p, s);
1606           }
1607         /* Force rescan to start at the beginning. */
1608         i = -1;
1609       }
1610
1611   free(visited);
1612 }
1613
1614 /* Perform bottom-up analysis on the parse tree, computing various functions.
1615    Note that at this point, we're pretending constructs like \< are real
1616    characters rather than constraints on what can follow them.
1617
1618    Nullable:  A node is nullable if it is at the root of a regexp that can
1619    match the empty string.
1620    *  EMPTY leaves are nullable.
1621    * No other leaf is nullable.
1622    * A QMARK or STAR node is nullable.
1623    * A PLUS node is nullable if its argument is nullable.
1624    * A CAT node is nullable if both its arguments are nullable.
1625    * An OR node is nullable if either argument is nullable.
1626
1627    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1628    that could correspond to the first character of a string matching the
1629    regexp rooted at the given node.
1630    * EMPTY leaves have empty firstpos.
1631    * The firstpos of a nonempty leaf is that leaf itself.
1632    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1633      argument.
1634    * The firstpos of a CAT node is the firstpos of the left argument, union
1635      the firstpos of the right if the left argument is nullable.
1636    * The firstpos of an OR node is the union of firstpos of each argument.
1637
1638    Lastpos:  The lastpos of a node is the set of positions that could
1639    correspond to the last character of a string matching the regexp at
1640    the given node.
1641    * EMPTY leaves have empty lastpos.
1642    * The lastpos of a nonempty leaf is that leaf itself.
1643    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1644      argument.
1645    * The lastpos of a CAT node is the lastpos of its right argument, union
1646      the lastpos of the left if the right argument is nullable.
1647    * The lastpos of an OR node is the union of the lastpos of each argument.
1648
1649    Follow:  The follow of a position is the set of positions that could
1650    correspond to the character following a character matching the node in
1651    a string matching the regexp.  At this point we consider special symbols
1652    that match the empty string in some context to be just normal characters.
1653    Later, if we find that a special symbol is in a follow set, we will
1654    replace it with the elements of its follow, labeled with an appropriate
1655    constraint.
1656    * Every node in the firstpos of the argument of a STAR or PLUS node is in
1657      the follow of every node in the lastpos.
1658    * Every node in the firstpos of the second argument of a CAT node is in
1659      the follow of every node in the lastpos of the first argument.
1660
1661    Because of the postfix representation of the parse tree, the depth-first
1662    analysis is conveniently done by a linear scan with the aid of a stack.
1663    Sets are stored as arrays of the elements, obeying a stack-like allocation
1664    scheme; the number of elements in each set deeper in the stack can be
1665    used to determine the address of a particular set's array. */
1666 void
1667 dfaanalyze (struct dfa *d, int searchflag)
1668 {
1669   int *nullable;                /* Nullable stack. */
1670   int *nfirstpos;               /* Element count stack for firstpos sets. */
1671   position *firstpos;           /* Array where firstpos elements are stored. */
1672   int *nlastpos;                /* Element count stack for lastpos sets. */
1673   position *lastpos;            /* Array where lastpos elements are stored. */
1674   int *nalloc;                  /* Sizes of arrays allocated to follow sets. */
1675   position_set tmp;             /* Temporary set for merging sets. */
1676   position_set merged;          /* Result of merging sets. */
1677   int wants_newline;            /* True if some position wants newline info. */
1678   int *o_nullable;
1679   int *o_nfirst, *o_nlast;
1680   position *o_firstpos, *o_lastpos;
1681   int i, j;
1682   position *pos;
1683
1684 #ifdef DEBUG
1685   fprintf(stderr, "dfaanalyze:\n");
1686   for (i = 0; i < d->tindex; ++i)
1687     {
1688       fprintf(stderr, " %d:", i);
1689       prtok(d->tokens[i]);
1690     }
1691   putc('\n', stderr);
1692 #endif
1693
1694   d->searchflag = searchflag;
1695
1696   MALLOC(nullable, int, d->depth);
1697   o_nullable = nullable;
1698   MALLOC(nfirstpos, int, d->depth);
1699   o_nfirst = nfirstpos;
1700   MALLOC(firstpos, position, d->nleaves);
1701   o_firstpos = firstpos, firstpos += d->nleaves;
1702   MALLOC(nlastpos, int, d->depth);
1703   o_nlast = nlastpos;
1704   MALLOC(lastpos, position, d->nleaves);
1705   o_lastpos = lastpos, lastpos += d->nleaves;
1706   MALLOC(nalloc, int, d->tindex);
1707   for (i = 0; i < d->tindex; ++i)
1708     nalloc[i] = 0;
1709   MALLOC(merged.elems, position, d->nleaves);
1710
1711   CALLOC(d->follows, position_set, d->tindex);
1712
1713   for (i = 0; i < d->tindex; ++i)
1714 #ifdef DEBUG
1715     {                           /* Nonsyntactic #ifdef goo... */
1716 #endif
1717     switch (d->tokens[i])
1718       {
1719       case EMPTY:
1720         /* The empty set is nullable. */
1721         *nullable++ = 1;
1722
1723         /* The firstpos and lastpos of the empty leaf are both empty. */
1724         *nfirstpos++ = *nlastpos++ = 0;
1725         break;
1726
1727       case STAR:
1728       case PLUS:
1729         /* Every element in the firstpos of the argument is in the follow
1730            of every element in the lastpos. */
1731         tmp.nelem = nfirstpos[-1];
1732         tmp.elems = firstpos;
1733         pos = lastpos;
1734         for (j = 0; j < nlastpos[-1]; ++j)
1735           {
1736             merge(&tmp, &d->follows[pos[j].index], &merged);
1737             REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1738                                  nalloc[pos[j].index], merged.nelem - 1);
1739             copy(&merged, &d->follows[pos[j].index]);
1740           }
1741
1742       case QMARK:
1743         /* A QMARK or STAR node is automatically nullable. */
1744         if (d->tokens[i] != PLUS)
1745           nullable[-1] = 1;
1746         break;
1747
1748       case CAT:
1749         /* Every element in the firstpos of the second argument is in the
1750            follow of every element in the lastpos of the first argument. */
1751         tmp.nelem = nfirstpos[-1];
1752         tmp.elems = firstpos;
1753         pos = lastpos + nlastpos[-1];
1754         for (j = 0; j < nlastpos[-2]; ++j)
1755           {
1756             merge(&tmp, &d->follows[pos[j].index], &merged);
1757             REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1758                                  nalloc[pos[j].index], merged.nelem - 1);
1759             copy(&merged, &d->follows[pos[j].index]);
1760           }
1761
1762         /* The firstpos of a CAT node is the firstpos of the first argument,
1763            union that of the second argument if the first is nullable. */
1764         if (nullable[-2])
1765           nfirstpos[-2] += nfirstpos[-1];
1766         else
1767           firstpos += nfirstpos[-1];
1768         --nfirstpos;
1769
1770         /* The lastpos of a CAT node is the lastpos of the second argument,
1771            union that of the first argument if the second is nullable. */
1772         if (nullable[-1])
1773           nlastpos[-2] += nlastpos[-1];
1774         else
1775           {
1776             pos = lastpos + nlastpos[-2];
1777             for (j = nlastpos[-1] - 1; j >= 0; --j)
1778               pos[j] = lastpos[j];
1779             lastpos += nlastpos[-2];
1780             nlastpos[-2] = nlastpos[-1];
1781           }
1782         --nlastpos;
1783
1784         /* A CAT node is nullable if both arguments are nullable. */
1785         nullable[-2] = nullable[-1] && nullable[-2];
1786         --nullable;
1787         break;
1788
1789       case OR:
1790       case ORTOP:
1791         /* The firstpos is the union of the firstpos of each argument. */
1792         nfirstpos[-2] += nfirstpos[-1];
1793         --nfirstpos;
1794
1795         /* The lastpos is the union of the lastpos of each argument. */
1796         nlastpos[-2] += nlastpos[-1];
1797         --nlastpos;
1798
1799         /* An OR node is nullable if either argument is nullable. */
1800         nullable[-2] = nullable[-1] || nullable[-2];
1801         --nullable;
1802         break;
1803
1804       default:
1805         /* Anything else is a nonempty position.  (Note that special
1806            constructs like \< are treated as nonempty strings here;
1807            an "epsilon closure" effectively makes them nullable later.
1808            Backreferences have to get a real position so we can detect
1809            transitions on them later.  But they are nullable. */
1810         *nullable++ = d->tokens[i] == BACKREF;
1811
1812         /* This position is in its own firstpos and lastpos. */
1813         *nfirstpos++ = *nlastpos++ = 1;
1814         --firstpos, --lastpos;
1815         firstpos->index = lastpos->index = i;
1816         firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1817
1818         /* Allocate the follow set for this position. */
1819         nalloc[i] = 1;
1820         MALLOC(d->follows[i].elems, position, nalloc[i]);
1821         break;
1822       }
1823 #ifdef DEBUG
1824     /* ... balance the above nonsyntactic #ifdef goo... */
1825       fprintf(stderr, "node %d:", i);
1826       prtok(d->tokens[i]);
1827       putc('\n', stderr);
1828       fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1829       fprintf(stderr, " firstpos:");
1830       for (j = nfirstpos[-1] - 1; j >= 0; --j)
1831         {
1832           fprintf(stderr, " %d:", firstpos[j].index);
1833           prtok(d->tokens[firstpos[j].index]);
1834         }
1835       fprintf(stderr, "\n lastpos:");
1836       for (j = nlastpos[-1] - 1; j >= 0; --j)
1837         {
1838           fprintf(stderr, " %d:", lastpos[j].index);
1839           prtok(d->tokens[lastpos[j].index]);
1840         }
1841       putc('\n', stderr);
1842     }
1843 #endif
1844
1845   /* For each follow set that is the follow set of a real position, replace
1846      it with its epsilon closure. */
1847   for (i = 0; i < d->tindex; ++i)
1848     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1849 #ifdef MBS_SUPPORT
1850         || d->tokens[i] == ANYCHAR
1851         || d->tokens[i] == MBCSET
1852 #endif
1853         || d->tokens[i] >= CSET)
1854       {
1855 #ifdef DEBUG
1856         fprintf(stderr, "follows(%d:", i);
1857         prtok(d->tokens[i]);
1858         fprintf(stderr, "):");
1859         for (j = d->follows[i].nelem - 1; j >= 0; --j)
1860           {
1861             fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1862             prtok(d->tokens[d->follows[i].elems[j].index]);
1863           }
1864         putc('\n', stderr);
1865 #endif
1866         copy(&d->follows[i], &merged);
1867         epsclosure(&merged, d);
1868         if (d->follows[i].nelem < merged.nelem)
1869           REALLOC(d->follows[i].elems, position, merged.nelem);
1870         copy(&merged, &d->follows[i]);
1871       }
1872
1873   /* Get the epsilon closure of the firstpos of the regexp.  The result will
1874      be the set of positions of state 0. */
1875   merged.nelem = 0;
1876   for (i = 0; i < nfirstpos[-1]; ++i)
1877     insert(firstpos[i], &merged);
1878   epsclosure(&merged, d);
1879
1880   /* Check if any of the positions of state 0 will want newline context. */
1881   wants_newline = 0;
1882   for (i = 0; i < merged.nelem; ++i)
1883     if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1884       wants_newline = 1;
1885
1886   /* Build the initial state. */
1887   d->salloc = 1;
1888   d->sindex = 0;
1889   MALLOC(d->states, dfa_state, d->salloc);
1890   state_index(d, &merged, wants_newline, 0);
1891
1892   free(o_nullable);
1893   free(o_nfirst);
1894   free(o_firstpos);
1895   free(o_nlast);
1896   free(o_lastpos);
1897   free(nalloc);
1898   free(merged.elems);
1899 }
1900
1901 /* Find, for each character, the transition out of state s of d, and store
1902    it in the appropriate slot of trans.
1903
1904    We divide the positions of s into groups (positions can appear in more
1905    than one group).  Each group is labeled with a set of characters that
1906    every position in the group matches (taking into account, if necessary,
1907    preceding context information of s).  For each group, find the union
1908    of the its elements' follows.  This set is the set of positions of the
1909    new state.  For each character in the group's label, set the transition
1910    on this character to be to a state corresponding to the set's positions,
1911    and its associated backward context information, if necessary.
1912
1913    If we are building a searching matcher, we include the positions of state
1914    0 in every state.
1915
1916    The collection of groups is constructed by building an equivalence-class
1917    partition of the positions of s.
1918
1919    For each position, find the set of characters C that it matches.  Eliminate
1920    any characters from C that fail on grounds of backward context.
1921
1922    Search through the groups, looking for a group whose label L has nonempty
1923    intersection with C.  If L - C is nonempty, create a new group labeled
1924    L - C and having the same positions as the current group, and set L to
1925    the intersection of L and C.  Insert the position in this group, set
1926    C = C - L, and resume scanning.
1927
1928    If after comparing with every group there are characters remaining in C,
1929    create a new group labeled with the characters of C and insert this
1930    position in that group. */
1931 void
1932 dfastate (int s, struct dfa *d, int trans[])
1933 {
1934   position_set grps[NOTCHAR];   /* As many as will ever be needed. */
1935   charclass labels[NOTCHAR];    /* Labels corresponding to the groups. */
1936   int ngrps = 0;                /* Number of groups actually used. */
1937   position pos;                 /* Current position being considered. */
1938   charclass matches;            /* Set of matching characters. */
1939   int matchesf;                 /* True if matches is nonempty. */
1940   charclass intersect;          /* Intersection with some label set. */
1941   int intersectf;               /* True if intersect is nonempty. */
1942   charclass leftovers;          /* Stuff in the label that didn't match. */
1943   int leftoversf;               /* True if leftovers is nonempty. */
1944   static charclass letters;     /* Set of characters considered letters. */
1945   static charclass newline;     /* Set of characters that aren't newline. */
1946   position_set follows;         /* Union of the follows of some group. */
1947   position_set tmp;             /* Temporary space for merging sets. */
1948   int state;                    /* New state. */
1949   int wants_newline;            /* New state wants to know newline context. */
1950   int state_newline;            /* New state on a newline transition. */
1951   int wants_letter;             /* New state wants to know letter context. */
1952   int state_letter;             /* New state on a letter transition. */
1953   static int initialized;       /* Flag for static initialization. */
1954 #ifdef MBS_SUPPORT
1955   int next_isnt_1st_byte = 0;   /* Flag If we can't add state0.  */
1956 #endif
1957   int i, j, k;
1958
1959   /* Initialize the set of letters, if necessary. */
1960   if (! initialized)
1961     {
1962       initialized = 1;
1963       for (i = 0; i < NOTCHAR; ++i)
1964         if (IS_WORD_CONSTITUENT(i))
1965           setbit(i, letters);
1966       setbit(eolbyte, newline);
1967     }
1968
1969   zeroset(matches);
1970
1971   for (i = 0; i < d->states[s].elems.nelem; ++i)
1972     {
1973       pos = d->states[s].elems.elems[i];
1974       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1975         setbit(d->tokens[pos.index], matches);
1976       else if (d->tokens[pos.index] >= CSET)
1977         copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1978 #ifdef MBS_SUPPORT
1979       else if (d->tokens[pos.index] == ANYCHAR
1980                || d->tokens[pos.index] == MBCSET)
1981       /* MB_CUR_MAX > 1  */
1982         {
1983           /* ANYCHAR and MBCSET must match with a single character, so we
1984              must put it to d->states[s].mbps, which contains the positions
1985              which can match with a single character not a byte.  */
1986           if (d->states[s].mbps.nelem == 0)
1987             {
1988               MALLOC(d->states[s].mbps.elems, position,
1989                      d->states[s].elems.nelem);
1990             }
1991           insert(pos, &(d->states[s].mbps));
1992           continue;
1993         }
1994 #endif /* MBS_SUPPORT */
1995       else
1996         continue;
1997
1998       /* Some characters may need to be eliminated from matches because
1999          they fail in the current context. */
2000       if (pos.constraint != 0xFF)
2001         {
2002           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2003                                          d->states[s].newline, 1))
2004             clrbit(eolbyte, matches);
2005           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2006                                          d->states[s].newline, 0))
2007             for (j = 0; j < CHARCLASS_INTS; ++j)
2008               matches[j] &= newline[j];
2009           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2010                                         d->states[s].letter, 1))
2011             for (j = 0; j < CHARCLASS_INTS; ++j)
2012               matches[j] &= ~letters[j];
2013           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2014                                         d->states[s].letter, 0))
2015             for (j = 0; j < CHARCLASS_INTS; ++j)
2016               matches[j] &= letters[j];
2017
2018           /* If there are no characters left, there's no point in going on. */
2019           for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2020             continue;
2021           if (j == CHARCLASS_INTS)
2022             continue;
2023         }
2024
2025       for (j = 0; j < ngrps; ++j)
2026         {
2027           /* If matches contains a single character only, and the current
2028              group's label doesn't contain that character, go on to the
2029              next group. */
2030           if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2031               && !tstbit(d->tokens[pos.index], labels[j]))
2032             continue;
2033
2034           /* Check if this group's label has a nonempty intersection with
2035              matches. */
2036           intersectf = 0;
2037           for (k = 0; k < CHARCLASS_INTS; ++k)
2038             (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2039           if (! intersectf)
2040             continue;
2041
2042           /* It does; now find the set differences both ways. */
2043           leftoversf = matchesf = 0;
2044           for (k = 0; k < CHARCLASS_INTS; ++k)
2045             {
2046               /* Even an optimizing compiler can't know this for sure. */
2047               int match = matches[k], label = labels[j][k];
2048
2049               (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2050               (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2051             }
2052
2053           /* If there were leftovers, create a new group labeled with them. */
2054           if (leftoversf)
2055             {
2056               copyset(leftovers, labels[ngrps]);
2057               copyset(intersect, labels[j]);
2058               MALLOC(grps[ngrps].elems, position, d->nleaves);
2059               copy(&grps[j], &grps[ngrps]);
2060               ++ngrps;
2061             }
2062
2063           /* Put the position in the current group.  Note that there is no
2064              reason to call insert() here. */
2065           grps[j].elems[grps[j].nelem++] = pos;
2066
2067           /* If every character matching the current position has been
2068              accounted for, we're done. */
2069           if (! matchesf)
2070             break;
2071         }
2072
2073       /* If we've passed the last group, and there are still characters
2074          unaccounted for, then we'll have to create a new group. */
2075       if (j == ngrps)
2076         {
2077           copyset(matches, labels[ngrps]);
2078           zeroset(matches);
2079           MALLOC(grps[ngrps].elems, position, d->nleaves);
2080           grps[ngrps].nelem = 1;
2081           grps[ngrps].elems[0] = pos;
2082           ++ngrps;
2083         }
2084     }
2085
2086   MALLOC(follows.elems, position, d->nleaves);
2087   MALLOC(tmp.elems, position, d->nleaves);
2088
2089   /* If we are a searching matcher, the default transition is to a state
2090      containing the positions of state 0, otherwise the default transition
2091      is to fail miserably. */
2092   if (d->searchflag)
2093     {
2094       wants_newline = 0;
2095       wants_letter = 0;
2096       for (i = 0; i < d->states[0].elems.nelem; ++i)
2097         {
2098           if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2099             wants_newline = 1;
2100           if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2101             wants_letter = 1;
2102         }
2103       copy(&d->states[0].elems, &follows);
2104       state = state_index(d, &follows, 0, 0);
2105       if (wants_newline)
2106         state_newline = state_index(d, &follows, 1, 0);
2107       else
2108         state_newline = state;
2109       if (wants_letter)
2110         state_letter = state_index(d, &follows, 0, 1);
2111       else
2112         state_letter = state;
2113       for (i = 0; i < NOTCHAR; ++i)
2114         trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2115       trans[eolbyte] = state_newline;
2116     }
2117   else
2118     for (i = 0; i < NOTCHAR; ++i)
2119       trans[i] = -1;
2120
2121   for (i = 0; i < ngrps; ++i)
2122     {
2123       follows.nelem = 0;
2124
2125       /* Find the union of the follows of the positions of the group.
2126          This is a hideously inefficient loop.  Fix it someday. */
2127       for (j = 0; j < grps[i].nelem; ++j)
2128         for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2129           insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2130
2131 #ifdef MBS_SUPPORT
2132       if (MB_CUR_MAX > 1)
2133         {
2134           /* If a token in follows.elems is not 1st byte of a multibyte
2135              character, or the states of follows must accept the bytes
2136              which are not 1st byte of the multibyte character.
2137              Then, if a state of follows encounter a byte, it must not be
2138              a 1st byte of a multibyte character nor singlebyte character.
2139              We cansel to add state[0].follows to next state, because
2140              state[0] must accept 1st-byte
2141
2142              For example, we assume <sb a> is a certain singlebyte
2143              character, <mb A> is a certain multibyte character, and the
2144              codepoint of <sb a> equals the 2nd byte of the codepoint of
2145              <mb A>.
2146              When state[0] accepts <sb a>, state[i] transit to state[i+1]
2147              by accepting accepts 1st byte of <mb A>, and state[i+1]
2148              accepts 2nd byte of <mb A>, if state[i+1] encounter the
2149              codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2150              <mb A>, so we can not add state[0].  */
2151
2152           next_isnt_1st_byte = 0;
2153           for (j = 0; j < follows.nelem; ++j)
2154             {
2155               if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2156                 {
2157                   next_isnt_1st_byte = 1;
2158                   break;
2159                 }
2160             }
2161         }
2162 #endif
2163
2164       /* If we are building a searching matcher, throw in the positions
2165          of state 0 as well. */
2166 #ifdef MBS_SUPPORT
2167       if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2168 #else
2169       if (d->searchflag)
2170 #endif
2171         for (j = 0; j < d->states[0].elems.nelem; ++j)
2172           insert(d->states[0].elems.elems[j], &follows);
2173
2174       /* Find out if the new state will want any context information. */
2175       wants_newline = 0;
2176       if (tstbit(eolbyte, labels[i]))
2177         for (j = 0; j < follows.nelem; ++j)
2178           if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2179             wants_newline = 1;
2180
2181       wants_letter = 0;
2182       for (j = 0; j < CHARCLASS_INTS; ++j)
2183         if (labels[i][j] & letters[j])
2184           break;
2185       if (j < CHARCLASS_INTS)
2186         for (j = 0; j < follows.nelem; ++j)
2187           if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2188             wants_letter = 1;
2189
2190       /* Find the state(s) corresponding to the union of the follows. */
2191       state = state_index(d, &follows, 0, 0);
2192       if (wants_newline)
2193         state_newline = state_index(d, &follows, 1, 0);
2194       else
2195         state_newline = state;
2196       if (wants_letter)
2197         state_letter = state_index(d, &follows, 0, 1);
2198       else
2199         state_letter = state;
2200
2201       /* Set the transitions for each character in the current label. */
2202       for (j = 0; j < CHARCLASS_INTS; ++j)
2203         for (k = 0; k < INTBITS; ++k)
2204           if (labels[i][j] & 1 << k)
2205             {
2206               int c = j * INTBITS + k;
2207
2208               if (c == eolbyte)
2209                 trans[c] = state_newline;
2210               else if (IS_WORD_CONSTITUENT(c))
2211                 trans[c] = state_letter;
2212               else if (c < NOTCHAR)
2213                 trans[c] = state;
2214             }
2215     }
2216
2217   for (i = 0; i < ngrps; ++i)
2218     free(grps[i].elems);
2219   free(follows.elems);
2220   free(tmp.elems);
2221 }
2222
2223 /* Some routines for manipulating a compiled dfa's transition tables.
2224    Each state may or may not have a transition table; if it does, and it
2225    is a non-accepting state, then d->trans[state] points to its table.
2226    If it is an accepting state then d->fails[state] points to its table.
2227    If it has no table at all, then d->trans[state] is NULL.
2228    TODO: Improve this comment, get rid of the unnecessary redundancy. */
2229
2230 static void
2231 build_state (int s, struct dfa *d)
2232 {
2233   int *trans;                   /* The new transition table. */
2234   int i;
2235
2236   /* Set an upper limit on the number of transition tables that will ever
2237      exist at once.  1024 is arbitrary.  The idea is that the frequently
2238      used transition tables will be quickly rebuilt, whereas the ones that
2239      were only needed once or twice will be cleared away. */
2240   if (d->trcount >= 1024)
2241     {
2242       for (i = 0; i < d->tralloc; ++i)
2243         if (d->trans[i])
2244           {
2245             free((ptr_t) d->trans[i]);
2246             d->trans[i] = NULL;
2247           }
2248         else if (d->fails[i])
2249           {
2250             free((ptr_t) d->fails[i]);
2251             d->fails[i] = NULL;
2252           }
2253       d->trcount = 0;
2254     }
2255
2256   ++d->trcount;
2257
2258   /* Set up the success bits for this state. */
2259   d->success[s] = 0;
2260   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2261       s, *d))
2262     d->success[s] |= 4;
2263   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2264       s, *d))
2265     d->success[s] |= 2;
2266   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2267       s, *d))
2268     d->success[s] |= 1;
2269
2270   MALLOC(trans, int, NOTCHAR);
2271   dfastate(s, d, trans);
2272
2273   /* Now go through the new transition table, and make sure that the trans
2274      and fail arrays are allocated large enough to hold a pointer for the
2275      largest state mentioned in the table. */
2276   for (i = 0; i < NOTCHAR; ++i)
2277     if (trans[i] >= d->tralloc)
2278       {
2279         int oldalloc = d->tralloc;
2280
2281         while (trans[i] >= d->tralloc)
2282           d->tralloc *= 2;
2283         REALLOC(d->realtrans, int *, d->tralloc + 1);
2284         d->trans = d->realtrans + 1;
2285         REALLOC(d->fails, int *, d->tralloc);
2286         REALLOC(d->success, int, d->tralloc);
2287         while (oldalloc < d->tralloc)
2288           {
2289             d->trans[oldalloc] = NULL;
2290             d->fails[oldalloc++] = NULL;
2291           }
2292       }
2293
2294   /* Newline is a sentinel.  */
2295   trans[eolbyte] = -1;
2296
2297   if (ACCEPTING(s, *d))
2298     d->fails[s] = trans;
2299   else
2300     d->trans[s] = trans;
2301 }
2302
2303 static void
2304 build_state_zero (struct dfa *d)
2305 {
2306   d->tralloc = 1;
2307   d->trcount = 0;
2308   CALLOC(d->realtrans, int *, d->tralloc + 1);
2309   d->trans = d->realtrans + 1;
2310   CALLOC(d->fails, int *, d->tralloc);
2311   MALLOC(d->success, int, d->tralloc);
2312   build_state(0, d);
2313 }
2314
2315 #ifdef MBS_SUPPORT
2316 /* Multibyte character handling sub-routins for dfaexec.  */
2317
2318 /* Initial state may encounter the byte which is not a singlebyte character
2319    nor 1st byte of a multibyte character.  But it is incorrect for initial
2320    state to accept such a byte.
2321    For example, in sjis encoding the regular expression like "\\" accepts
2322    the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2323    0x815c. Then Initial state must skip the bytes which are not a singlebyte
2324    character nor 1st byte of a multibyte character.  */
2325 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)          \
2326   if (s == 0)                                           \
2327     {                                                   \
2328       while (inputwcs[p - buf_begin] == 0               \
2329             && mblen_buf[p - buf_begin] > 0             \
2330             && p < buf_end)                             \
2331         ++p;                                            \
2332       if (p >= end)                                     \
2333         {                                               \
2334           free(mblen_buf);                              \
2335           free(inputwcs);                               \
2336           return (size_t) -1;                           \
2337         }                                               \
2338     }
2339
2340 static void
2341 realloc_trans_if_necessary(struct dfa *d, int new_state)
2342 {
2343   /* Make sure that the trans and fail arrays are allocated large enough
2344      to hold a pointer for the new state. */
2345   if (new_state >= d->tralloc)
2346     {
2347       int oldalloc = d->tralloc;
2348
2349       while (new_state >= d->tralloc)
2350         d->tralloc *= 2;
2351       REALLOC(d->realtrans, int *, d->tralloc + 1);
2352       d->trans = d->realtrans + 1;
2353       REALLOC(d->fails, int *, d->tralloc);
2354       REALLOC(d->success, int, d->tralloc);
2355       while (oldalloc < d->tralloc)
2356         {
2357           d->trans[oldalloc] = NULL;
2358           d->fails[oldalloc++] = NULL;
2359         }
2360     }
2361 }
2362
2363 /* Return values of transit_state_singlebyte(), and
2364    transit_state_consume_1char.  */
2365 typedef enum
2366 {
2367   TRANSIT_STATE_IN_PROGRESS,    /* State transition has not finished.  */
2368   TRANSIT_STATE_DONE,           /* State transition has finished.  */
2369   TRANSIT_STATE_END_BUFFER      /* Reach the end of the buffer.  */
2370 } status_transit_state;
2371
2372 /* Consume a single byte and transit state from 's' to '*next_state'.
2373    This function is almost same as the state transition routin in dfaexec().
2374    But state transition is done just once, otherwise matching succeed or
2375    reach the end of the buffer.  */
2376 static status_transit_state
2377 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2378                                   int *next_state)
2379 {
2380   int *t;
2381   int works = s;
2382
2383   status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2384
2385   while (rval == TRANSIT_STATE_IN_PROGRESS)
2386     {
2387       if ((t = d->trans[works]) != NULL)
2388         {
2389           works = t[*p];
2390           rval = TRANSIT_STATE_DONE;
2391           if (works < 0)
2392             works = 0;
2393         }
2394       else if (works < 0)
2395         {
2396           if (p == buf_end)
2397             /* At the moment, it must not happen.  */
2398             return TRANSIT_STATE_END_BUFFER;
2399           works = 0;
2400         }
2401       else if (d->fails[works])
2402         {
2403           works = d->fails[works][*p];
2404           rval = TRANSIT_STATE_DONE;
2405         }
2406       else
2407         {
2408           build_state(works, d);
2409         }
2410     }
2411   *next_state = works;
2412   return rval;
2413 }
2414
2415 /* Check whether period can match or not in the current context.  If it can,
2416    return the amount of the bytes with which period can match, otherwise
2417    return 0.
2418    `pos' is the position of the period.  `index' is the index from the
2419    buf_begin, and it is the current position in the buffer.  */
2420 static int
2421 match_anychar (struct dfa *d, int s, position pos, int index)
2422 {
2423   int newline = 0;
2424   int letter = 0;
2425   wchar_t wc;
2426   int mbclen;
2427
2428   wc = inputwcs[index];
2429   mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2430
2431   /* Check context.  */
2432   if (wc == (wchar_t)eolbyte)
2433     {
2434       if (!(syntax_bits & RE_DOT_NEWLINE))
2435         return 0;
2436       newline = 1;
2437     }
2438   else if (wc == (wchar_t)'\0')
2439     {
2440       if (syntax_bits & RE_DOT_NOT_NULL)
2441         return 0;
2442       newline = 1;
2443     }
2444
2445   if (iswalnum(wc) || wc == L'_')
2446     letter = 1;
2447
2448   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2449                            newline, d->states[s].letter, letter))
2450     return 0;
2451
2452   return mbclen;
2453 }
2454
2455 /* Check whether bracket expression can match or not in the current context.
2456    If it can, return the amount of the bytes with which expression can match,
2457    otherwise return 0.
2458    `pos' is the position of the bracket expression.  `index' is the index
2459    from the buf_begin, and it is the current position in the buffer.  */
2460 int
2461 match_mb_charset (struct dfa *d, int s, position pos, int index)
2462 {
2463   int i;
2464   int match;            /* Flag which represent that matching succeed.  */
2465   int match_len;        /* Length of the character (or collating element)
2466                            with which this operator match.  */
2467   size_t op_len;        /* Length of the operator.  */
2468   char buffer[128];
2469   wchar_t wcbuf[6];
2470
2471   /* Pointer to the structure to which we are currently reffering.  */
2472   struct mb_char_classes *work_mbc;
2473
2474   int newline = 0;
2475   int letter = 0;
2476   wchar_t wc;           /* Current reffering character.  */
2477
2478   wc = inputwcs[index];
2479
2480   /* Check context.  */
2481   if (wc == (wchar_t)eolbyte)
2482     {
2483       if (!(syntax_bits & RE_DOT_NEWLINE))
2484         return 0;
2485       newline = 1;
2486     }
2487   else if (wc == (wchar_t)'\0')
2488     {
2489       if (syntax_bits & RE_DOT_NOT_NULL)
2490         return 0;
2491       newline = 1;
2492     }
2493   if (iswalnum(wc) || wc == L'_')
2494     letter = 1;
2495   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2496                            newline, d->states[s].letter, letter))
2497     return 0;
2498
2499   /* Assign the current reffering operator to work_mbc.  */
2500   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2501   match = !work_mbc->invert;
2502   match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2503
2504   /* match with a character class?  */
2505   for (i = 0; i<work_mbc->nch_classes; i++)
2506     {
2507       if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2508         goto charset_matched;
2509     }
2510
2511   strncpy(buffer, buf_begin + index, match_len);
2512   buffer[match_len] = '\0';
2513
2514   /* match with an equivalent class?  */
2515   for (i = 0; i<work_mbc->nequivs; i++)
2516     {
2517       op_len = strlen(work_mbc->equivs[i]);
2518       strncpy(buffer, buf_begin + index, op_len);
2519       buffer[op_len] = '\0';
2520       if (strcoll(work_mbc->equivs[i], buffer) == 0)
2521         {
2522           match_len = op_len;
2523           goto charset_matched;
2524         }
2525     }
2526
2527   /* match with a collating element?  */
2528   for (i = 0; i<work_mbc->ncoll_elems; i++)
2529     {
2530       op_len = strlen(work_mbc->coll_elems[i]);
2531       strncpy(buffer, buf_begin + index, op_len);
2532       buffer[op_len] = '\0';
2533
2534       if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2535         {
2536           match_len = op_len;
2537           goto charset_matched;
2538         }
2539     }
2540
2541   wcbuf[0] = wc;
2542   wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2543
2544   /* match with a range?  */
2545   for (i = 0; i<work_mbc->nranges; i++)
2546     {
2547       wcbuf[2] = work_mbc->range_sts[i];
2548       wcbuf[4] = work_mbc->range_ends[i];
2549
2550       if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2551           wcscoll(wcbuf+4, wcbuf) >= 0)
2552         goto charset_matched;
2553     }
2554
2555   /* match with a character?  */
2556   if (case_fold)
2557     wc = towlower (wc);
2558   for (i = 0; i<work_mbc->nchars; i++)
2559     {
2560       if (wc == work_mbc->chars[i])
2561         goto charset_matched;
2562     }
2563
2564   match = !match;
2565
2566  charset_matched:
2567   return match ? match_len : 0;
2568 }
2569
2570 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2571    array which corresponds to `d->states[s].mbps.elem' and each element of
2572    the array contains the amount of the bytes with which the element can
2573    match.
2574    `index' is the index from the buf_begin, and it is the current position
2575    in the buffer.
2576    Caller MUST free the array which this function return.  */
2577 static int*
2578 check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2579 {
2580   int i;
2581   int* rarray;
2582
2583   MALLOC(rarray, int, d->states[s].mbps.nelem);
2584   for (i = 0; i < d->states[s].mbps.nelem; ++i)
2585     {
2586       position pos = d->states[s].mbps.elems[i];
2587       switch(d->tokens[pos.index])
2588         {
2589         case ANYCHAR:
2590           rarray[i] = match_anychar(d, s, pos, index);
2591           break;
2592         case MBCSET:
2593           rarray[i] = match_mb_charset(d, s, pos, index);
2594           break;
2595         default:
2596           break; /* can not happen.  */
2597         }
2598     }
2599   return rarray;
2600 }
2601
2602 /* Consume a single character and enumerate all of the positions which can
2603    be next position from the state `s'.
2604    `match_lens' is the input. It can be NULL, but it can also be the output
2605    of check_matching_with_multibyte_ops() for optimization.
2606    `mbclen' and `pps' are the output.  `mbclen' is the length of the
2607    character consumed, and `pps' is the set this function enumerate.  */
2608 static status_transit_state
2609 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2610                              int *match_lens, int *mbclen, position_set *pps)
2611 {
2612   int i, j;
2613   int s1, s2;
2614   int* work_mbls;
2615   status_transit_state rs = TRANSIT_STATE_DONE;
2616
2617   /* Calculate the length of the (single/multi byte) character
2618      to which p points.  */
2619   *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2620     : mblen_buf[*pp - buf_begin];
2621
2622   /* Calculate the state which can be reached from the state `s' by
2623      consuming `*mbclen' single bytes from the buffer.  */
2624   s1 = s;
2625   for (i = 0; i < *mbclen; i++)
2626     {
2627       s2 = s1;
2628       rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2629     }
2630   /* Copy the positions contained by `s1' to the set `pps'.  */
2631   copy(&(d->states[s1].elems), pps);
2632
2633   /* Check (inputed)match_lens, and initialize if it is NULL.  */
2634   if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2635     work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2636   else
2637     work_mbls = match_lens;
2638
2639   /* Add all of the positions which can be reached from `s' by consuming
2640      a single character.  */
2641   for (i = 0; i < d->states[s].mbps.nelem ; i++)
2642    {
2643       if (work_mbls[i] == *mbclen)
2644         for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2645              j++)
2646           insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2647                  pps);
2648     }
2649
2650   if (match_lens == NULL && work_mbls != NULL)
2651     free(work_mbls);
2652   return rs;
2653 }
2654
2655 /* Transit state from s, then return new state and update the pointer of the
2656    buffer.  This function is for some operator which can match with a multi-
2657    byte character or a collating element(which may be multi characters).  */
2658 static int
2659 transit_state (struct dfa *d, int s, unsigned char const **pp)
2660 {
2661   int s1;
2662   int mbclen;           /* The length of current input multibyte character. */
2663   int maxlen = 0;
2664   int i, j;
2665   int *match_lens = NULL;
2666   int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
2667   position_set follows;
2668   unsigned char const *p1 = *pp;
2669   status_transit_state rs;
2670   wchar_t wc;
2671
2672   if (nelem > 0)
2673     /* This state has (a) multibyte operator(s).
2674        We check whether each of them can match or not.  */
2675     {
2676       /* Note: caller must free the return value of this function.  */
2677       match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2678
2679       for (i = 0; i < nelem; i++)
2680         /* Search the operator which match the longest string,
2681            in this state.  */
2682         {
2683           if (match_lens[i] > maxlen)
2684             maxlen = match_lens[i];
2685         }
2686     }
2687
2688   if (nelem == 0 || maxlen == 0)
2689     /* This state has no multibyte operator which can match.
2690        We need to  check only one singlebyte character.  */
2691     {
2692       status_transit_state rs;
2693       rs = transit_state_singlebyte(d, s, *pp, &s1);
2694
2695       /* We must update the pointer if state transition succeeded.  */
2696       if (rs == TRANSIT_STATE_DONE)
2697         ++*pp;
2698
2699       if (match_lens != NULL)
2700         free(match_lens);
2701       return s1;
2702     }
2703
2704   /* This state has some operators which can match a multibyte character.  */
2705   follows.nelem = 0;
2706   MALLOC(follows.elems, position, d->nleaves);
2707
2708   /* `maxlen' may be longer than the length of a character, because it may
2709      not be a character but a (multi character) collating element.
2710      We enumerate all of the positions which `s' can reach by consuming
2711      `maxlen' bytes.  */
2712   rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2713
2714   wc = inputwcs[*pp - mbclen - buf_begin];
2715   s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2716   realloc_trans_if_necessary(d, s1);
2717
2718   while (*pp - p1 < maxlen)
2719     {
2720       follows.nelem = 0;
2721       rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2722
2723       for (i = 0; i < nelem ; i++)
2724         {
2725           if (match_lens[i] == *pp - p1)
2726             for (j = 0;
2727                  j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2728               insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2729                      &follows);
2730         }
2731
2732       wc = inputwcs[*pp - mbclen - buf_begin];
2733       s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2734       realloc_trans_if_necessary(d, s1);
2735     }
2736   free(match_lens);
2737   free(follows.elems);
2738   return s1;
2739 }
2740
2741 #endif
2742
2743 /* Search through a buffer looking for a match to the given struct dfa.
2744    Find the first occurrence of a string matching the regexp in the buffer,
2745    and the shortest possible version thereof.  Return the offset of the first
2746    character after the match, or (size_t) -1 if none is found.  BEGIN points to
2747    the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
2748    is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
2749    where we're supposed to store a 1 if backreferencing happened and the
2750    match needs to be verified by a backtracking matcher.  Otherwise
2751    we store a 0 in *backref. */
2752 size_t
2753 dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2754 {
2755   register int s;       /* Current state. */
2756   register unsigned char const *p; /* Current input character. */
2757   register unsigned char const *end; /* One past the last input character.  */
2758   register int **trans, *t;     /* Copy of d->trans so it can be optimized
2759                                    into a register. */
2760   register unsigned char eol = eolbyte; /* Likewise for eolbyte.  */
2761   static int sbit[NOTCHAR];     /* Table for anding with d->success. */
2762   static int sbit_init;
2763
2764   if (! sbit_init)
2765     {
2766       int i;
2767
2768       sbit_init = 1;
2769       for (i = 0; i < NOTCHAR; ++i)
2770         sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2771       sbit[eol] = 4;
2772     }
2773
2774   if (! d->tralloc)
2775     build_state_zero(d);
2776
2777   s = 0;
2778   p = (unsigned char const *) begin;
2779   end = p + size;
2780   trans = d->trans;
2781
2782 #ifdef MBS_SUPPORT
2783   if (MB_CUR_MAX > 1)
2784     {
2785       int remain_bytes, i;
2786       buf_begin = begin;
2787       buf_end = end;
2788
2789       /* initialize mblen_buf, and inputwcs.  */
2790       MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2791       MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2792       memset(&mbs, 0, sizeof(mbstate_t));
2793       remain_bytes = 0;
2794       for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2795         {
2796           if (remain_bytes == 0)
2797             {
2798               remain_bytes
2799                 = mbrtowc(inputwcs + i, begin + i,
2800                           end - (unsigned char const *)begin - i + 1, &mbs);
2801               if (remain_bytes <= 1)
2802                 {
2803                   remain_bytes = 0;
2804                   inputwcs[i] = (wchar_t)begin[i];
2805                   mblen_buf[i] = 0;
2806                 }
2807               else
2808                 {
2809                   mblen_buf[i] = remain_bytes;
2810                   remain_bytes--;
2811                 }
2812             }
2813           else
2814             {
2815               mblen_buf[i] = remain_bytes;
2816               inputwcs[i] = 0;
2817               remain_bytes--;
2818             }
2819         }
2820       mblen_buf[i] = 0;
2821       inputwcs[i] = 0; /* sentinel */
2822     }
2823 #endif /* MBS_SUPPORT */
2824
2825   for (;;)
2826     {
2827 #ifdef MBS_SUPPORT
2828       if (MB_CUR_MAX > 1)
2829         while ((t = trans[s]))
2830           {
2831             if (d->states[s].mbps.nelem != 0)
2832               {
2833                 /* Can match with a multibyte character( and multi character
2834                    collating element).  */
2835                 unsigned char const *nextp;
2836
2837                 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2838
2839                 nextp = p;
2840                 s = transit_state(d, s, &nextp);
2841                 p = nextp;
2842
2843                 /* Trans table might be updated.  */
2844                 trans = d->trans;
2845               }
2846             else
2847               {
2848                 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2849                 s = t[*p++];
2850               }
2851           }
2852       else
2853 #endif /* MBS_SUPPORT */
2854         while ((t = trans[s]))
2855           s = t[*p++];
2856
2857       if (s < 0)
2858         {
2859           if (p == end)
2860             {
2861 #ifdef MBS_SUPPORT
2862               if (MB_CUR_MAX > 1)
2863                 {
2864                   free(mblen_buf);
2865                   free(inputwcs);
2866                 }
2867 #endif /* MBS_SUPPORT */
2868               return (size_t) -1;
2869             }
2870           s = 0;
2871         }
2872       else if ((t = d->fails[s]))
2873         {
2874           if (d->success[s] & sbit[*p])
2875             {
2876               if (backref)
2877                 *backref = (d->states[s].backref != 0);
2878 #ifdef MBS_SUPPORT
2879               if (MB_CUR_MAX > 1)
2880                 {
2881                   free(mblen_buf);
2882                   free(inputwcs);
2883                 }
2884 #endif /* MBS_SUPPORT */
2885               return (char const *) p - begin;
2886             }
2887
2888 #ifdef MBS_SUPPORT
2889           if (MB_CUR_MAX > 1)
2890             {
2891                 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2892                 if (d->states[s].mbps.nelem != 0)
2893                   {
2894                     /* Can match with a multibyte character( and multi
2895                        character collating element).  */
2896                     unsigned char const *nextp;
2897                     nextp = p;
2898                     s = transit_state(d, s, &nextp);
2899                     p = nextp;
2900
2901                     /* Trans table might be updated.  */
2902                     trans = d->trans;
2903                   }
2904                 else
2905                 s = t[*p++];
2906             }
2907           else
2908 #endif /* MBS_SUPPORT */
2909           s = t[*p++];
2910         }
2911       else
2912         {
2913           build_state(s, d);
2914           trans = d->trans;
2915         }
2916     }
2917 }
2918
2919 /* Initialize the components of a dfa that the other routines don't
2920    initialize for themselves. */
2921 void
2922 dfainit (struct dfa *d)
2923 {
2924   d->calloc = 1;
2925   MALLOC(d->charclasses, charclass, d->calloc);
2926   d->cindex = 0;
2927
2928   d->talloc = 1;
2929   MALLOC(d->tokens, token, d->talloc);
2930   d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2931 #ifdef MBS_SUPPORT
2932   if (MB_CUR_MAX > 1)
2933     {
2934       d->nmultibyte_prop = 1;
2935       MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2936       d->nmbcsets = 0;
2937       d->mbcsets_alloc = 1;
2938       MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2939     }
2940 #endif
2941
2942   d->searchflag = 0;
2943   d->tralloc = 0;
2944
2945   d->musts = 0;
2946 }
2947
2948 /* Parse and analyze a single string of the given length. */
2949 void
2950 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2951 {
2952   if (case_fold)        /* dummy folding in service of dfamust() */
2953     {
2954       char *lcopy;
2955       int i;
2956
2957       lcopy = malloc(len);
2958       if (!lcopy)
2959         dfaerror(_("out of memory"));
2960
2961       /* This is a kludge. */
2962       case_fold = 0;
2963       for (i = 0; i < len; ++i)
2964         if (ISUPPER ((unsigned char) s[i]))
2965           lcopy[i] = tolower ((unsigned char) s[i]);
2966         else
2967           lcopy[i] = s[i];
2968
2969       dfainit(d);
2970       dfaparse(lcopy, len, d);
2971       free(lcopy);
2972       dfamust(d);
2973       d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2974       case_fold = 1;
2975       dfaparse(s, len, d);
2976       dfaanalyze(d, searchflag);
2977     }
2978   else
2979     {
2980         dfainit(d);
2981         dfaparse(s, len, d);
2982         dfamust(d);
2983         dfaanalyze(d, searchflag);
2984     }
2985 }
2986
2987 /* Free the storage held by the components of a dfa. */
2988 void
2989 dfafree (struct dfa *d)
2990 {
2991   int i;
2992   struct dfamust *dm, *ndm;
2993
2994   free((ptr_t) d->charclasses);
2995   free((ptr_t) d->tokens);
2996
2997 #ifdef MBS_SUPPORT
2998   if (MB_CUR_MAX > 1)
2999     {
3000       free((ptr_t) d->multibyte_prop);
3001       for (i = 0; i < d->nmbcsets; ++i)
3002         {
3003           int j;
3004           struct mb_char_classes *p = &(d->mbcsets[i]);
3005           if (p->chars != NULL)
3006             free(p->chars);
3007           if (p->ch_classes != NULL)
3008             free(p->ch_classes);
3009           if (p->range_sts != NULL)
3010             free(p->range_sts);
3011           if (p->range_ends != NULL)
3012             free(p->range_ends);
3013
3014           for (j = 0; j < p->nequivs; ++j)
3015             free(p->equivs[j]);
3016           if (p->equivs != NULL)
3017             free(p->equivs);
3018
3019           for (j = 0; j < p->ncoll_elems; ++j)
3020             free(p->coll_elems[j]);
3021           if (p->coll_elems != NULL)
3022             free(p->coll_elems);
3023         }
3024       free((ptr_t) d->mbcsets);
3025     }
3026 #endif /* MBS_SUPPORT */
3027
3028   for (i = 0; i < d->sindex; ++i)
3029     free((ptr_t) d->states[i].elems.elems);
3030   free((ptr_t) d->states);
3031   for (i = 0; i < d->tindex; ++i)
3032     if (d->follows[i].elems)
3033       free((ptr_t) d->follows[i].elems);
3034   free((ptr_t) d->follows);
3035   for (i = 0; i < d->tralloc; ++i)
3036     if (d->trans[i])
3037       free((ptr_t) d->trans[i]);
3038     else if (d->fails[i])
3039       free((ptr_t) d->fails[i]);
3040   if (d->realtrans) free((ptr_t) d->realtrans);
3041   if (d->fails) free((ptr_t) d->fails);
3042   if (d->success) free((ptr_t) d->success);
3043   for (dm = d->musts; dm; dm = ndm)
3044     {
3045       ndm = dm->next;
3046       free(dm->must);
3047       free((ptr_t) dm);
3048     }
3049 }
3050
3051 /* Having found the postfix representation of the regular expression,
3052    try to find a long sequence of characters that must appear in any line
3053    containing the r.e.
3054    Finding a "longest" sequence is beyond the scope here;
3055    we take an easy way out and hope for the best.
3056    (Take "(ab|a)b"--please.)
3057
3058    We do a bottom-up calculation of sequences of characters that must appear
3059    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3060    representation:
3061         sequences that must appear at the left of the match ("left")
3062         sequences that must appear at the right of the match ("right")
3063         lists of sequences that must appear somewhere in the match ("in")
3064         sequences that must constitute the match ("is")
3065
3066    When we get to the root of the tree, we use one of the longest of its
3067    calculated "in" sequences as our answer.  The sequence we find is returned in
3068    d->must (where "d" is the single argument passed to "dfamust");
3069    the length of the sequence is returned in d->mustn.
3070
3071    The sequences calculated for the various types of node (in pseudo ANSI c)
3072    are shown below.  "p" is the operand of unary operators (and the left-hand
3073    operand of binary operators); "q" is the right-hand operand of binary
3074    operators.
3075
3076    "ZERO" means "a zero-length sequence" below.
3077
3078         Type    left            right           is              in
3079         ----    ----            -----           --              --
3080         char c  # c             # c             # c             # c
3081
3082         ANYCHAR ZERO            ZERO            ZERO            ZERO
3083
3084         MBCSET  ZERO            ZERO            ZERO            ZERO
3085
3086         CSET    ZERO            ZERO            ZERO            ZERO
3087
3088         STAR    ZERO            ZERO            ZERO            ZERO
3089
3090         QMARK   ZERO            ZERO            ZERO            ZERO
3091
3092         PLUS    p->left         p->right        ZERO            p->in
3093
3094         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3095                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3096                 p->is##q->left  p->right##q->is p->is##q->is :  p->right##q->left
3097                                                 ZERO
3098
3099         OR      longest common  longest common  (do p->is and   substrings common to
3100                 leading         trailing        q->is have same p->in and q->in
3101                 (sub)sequence   (sub)sequence   length and
3102                 of p->left      of p->right     content) ?
3103                 and q->left     and q->right    p->is : NULL
3104
3105    If there's anything else we recognize in the tree, all four sequences get set
3106    to zero-length sequences.  If there's something we don't recognize in the tree,
3107    we just return a zero-length sequence.
3108
3109    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3110    'aaa')?
3111
3112    And. . .is it here or someplace that we might ponder "optimizations" such as
3113         egrep 'psi|epsilon'     ->      egrep 'psi'
3114         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3115                                         (Yes, we now find "epsi" as a "string
3116                                         that must occur", but we might also
3117                                         simplify the *entire* r.e. being sought)
3118         grep '[c]'              ->      grep 'c'
3119         grep '(ab|a)b'          ->      grep 'ab'
3120         grep 'ab*'              ->      grep 'a'
3121         grep 'a*b'              ->      grep 'b'
3122
3123    There are several issues:
3124
3125    Is optimization easy (enough)?
3126
3127    Does optimization actually accomplish anything,
3128    or is the automaton you get from "psi|epsilon" (for example)
3129    the same as the one you get from "psi" (for example)?
3130
3131    Are optimizable r.e.'s likely to be used in real-life situations
3132    (something like 'ab*' is probably unlikely; something like is
3133    'psi|epsilon' is likelier)? */
3134
3135 static char *
3136 icatalloc (char *old, char *new)
3137 {
3138   char *result;
3139   size_t oldsize, newsize;
3140
3141   newsize = (new == NULL) ? 0 : strlen(new);
3142   if (old == NULL)
3143     oldsize = 0;
3144   else if (newsize == 0)
3145     return old;
3146   else  oldsize = strlen(old);
3147   if (old == NULL)
3148     result = (char *) malloc(newsize + 1);
3149   else
3150     result = (char *) realloc((void *) old, oldsize + newsize + 1);
3151   if (result != NULL && new != NULL)
3152     (void) strcpy(result + oldsize, new);
3153   return result;
3154 }
3155
3156 static char *
3157 icpyalloc (char *string)
3158 {
3159   return icatalloc((char *) NULL, string);
3160 }
3161
3162 static char *
3163 istrstr (char *lookin, char *lookfor)
3164 {
3165   char *cp;
3166   size_t len;
3167
3168   len = strlen(lookfor);
3169   for (cp = lookin; *cp != '\0'; ++cp)
3170     if (strncmp(cp, lookfor, len) == 0)
3171       return cp;
3172   return NULL;
3173 }
3174
3175 static void
3176 ifree (char *cp)
3177 {
3178   if (cp != NULL)
3179     free(cp);
3180 }
3181
3182 static void
3183 freelist (char **cpp)
3184 {
3185   int i;
3186
3187   if (cpp == NULL)
3188     return;
3189   for (i = 0; cpp[i] != NULL; ++i)
3190     {
3191       free(cpp[i]);
3192       cpp[i] = NULL;
3193     }
3194 }
3195
3196 static char **
3197 enlist (char **cpp, char *new, size_t len)
3198 {
3199   int i, j;
3200
3201   if (cpp == NULL)
3202     return NULL;
3203   if ((new = icpyalloc(new)) == NULL)
3204     {
3205       freelist(cpp);
3206       return NULL;
3207     }
3208   new[len] = '\0';
3209   /* Is there already something in the list that's new (or longer)? */
3210   for (i = 0; cpp[i] != NULL; ++i)
3211     if (istrstr(cpp[i], new) != NULL)
3212       {
3213         free(new);
3214         return cpp;
3215       }
3216   /* Eliminate any obsoleted strings. */
3217   j = 0;
3218   while (cpp[j] != NULL)
3219     if (istrstr(new, cpp[j]) == NULL)
3220       ++j;
3221     else
3222       {
3223         free(cpp[j]);
3224         if (--i == j)
3225           break;
3226         cpp[j] = cpp[i];
3227         cpp[i] = NULL;
3228       }
3229   /* Add the new string. */
3230   cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3231   if (cpp == NULL)
3232     return NULL;
3233   cpp[i] = new;
3234   cpp[i + 1] = NULL;
3235   return cpp;
3236 }
3237
3238 /* Given pointers to two strings, return a pointer to an allocated
3239    list of their distinct common substrings. Return NULL if something
3240    seems wild. */
3241 static char **
3242 comsubs (char *left, char *right)
3243 {
3244   char **cpp;
3245   char *lcp;
3246   char *rcp;
3247   size_t i, len;
3248
3249   if (left == NULL || right == NULL)
3250     return NULL;
3251   cpp = (char **) malloc(sizeof *cpp);
3252   if (cpp == NULL)
3253     return NULL;
3254   cpp[0] = NULL;
3255   for (lcp = left; *lcp != '\0'; ++lcp)
3256     {
3257       len = 0;
3258       rcp = strchr (right, *lcp);
3259       while (rcp != NULL)
3260         {
3261           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3262             continue;
3263           if (i > len)
3264             len = i;
3265           rcp = strchr (rcp + 1, *lcp);
3266         }
3267       if (len == 0)
3268         continue;
3269       if ((cpp = enlist(cpp, lcp, len)) == NULL)
3270         break;
3271     }
3272   return cpp;
3273 }
3274
3275 static char **
3276 addlists (char **old, char **new)
3277 {
3278   int i;
3279
3280   if (old == NULL || new == NULL)
3281     return NULL;
3282   for (i = 0; new[i] != NULL; ++i)
3283     {
3284       old = enlist(old, new[i], strlen(new[i]));
3285       if (old == NULL)
3286         break;
3287     }
3288   return old;
3289 }
3290
3291 /* Given two lists of substrings, return a new list giving substrings
3292    common to both. */
3293 static char **
3294 inboth (char **left, char **right)
3295 {
3296   char **both;
3297   char **temp;
3298   int lnum, rnum;
3299
3300   if (left == NULL || right == NULL)
3301     return NULL;
3302   both = (char **) malloc(sizeof *both);
3303   if (both == NULL)
3304     return NULL;
3305   both[0] = NULL;
3306   for (lnum = 0; left[lnum] != NULL; ++lnum)
3307     {
3308       for (rnum = 0; right[rnum] != NULL; ++rnum)
3309         {
3310           temp = comsubs(left[lnum], right[rnum]);
3311           if (temp == NULL)
3312             {
3313               freelist(both);
3314               return NULL;
3315             }
3316           both = addlists(both, temp);
3317           freelist(temp);
3318           free(temp);
3319           if (both == NULL)
3320             return NULL;
3321         }
3322     }
3323   return both;
3324 }
3325
3326 typedef struct
3327 {
3328   char **in;
3329   char *left;
3330   char *right;
3331   char *is;
3332 } must;
3333
3334 static void
3335 resetmust (must *mp)
3336 {
3337   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3338   freelist(mp->in);
3339 }
3340
3341 static void
3342 dfamust (struct dfa *dfa)
3343 {
3344   must *musts;
3345   must *mp;
3346   char *result;
3347   int ri;
3348   int i;
3349   int exact;
3350   token t;
3351   static must must0;
3352   struct dfamust *dm;
3353   static char empty_string[] = "";
3354
3355   result = empty_string;
3356   exact = 0;
3357   musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3358   if (musts == NULL)
3359     return;
3360   mp = musts;
3361   for (i = 0; i <= dfa->tindex; ++i)
3362     mp[i] = must0;
3363   for (i = 0; i <= dfa->tindex; ++i)
3364     {
3365       mp[i].in = (char **) malloc(sizeof *mp[i].in);
3366       mp[i].left = malloc(2);
3367       mp[i].right = malloc(2);
3368       mp[i].is = malloc(2);
3369       if (mp[i].in == NULL || mp[i].left == NULL ||
3370           mp[i].right == NULL || mp[i].is == NULL)
3371         goto done;
3372       mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3373       mp[i].in[0] = NULL;
3374     }
3375 #ifdef DEBUG
3376   fprintf(stderr, "dfamust:\n");
3377   for (i = 0; i < dfa->tindex; ++i)
3378     {
3379       fprintf(stderr, " %d:", i);
3380       prtok(dfa->tokens[i]);
3381     }
3382   putc('\n', stderr);
3383 #endif
3384   for (ri = 0; ri < dfa->tindex; ++ri)
3385     {
3386       switch (t = dfa->tokens[ri])
3387         {
3388         case LPAREN:
3389         case RPAREN:
3390           goto done;            /* "cannot happen" */
3391         case EMPTY:
3392         case BEGLINE:
3393         case ENDLINE:
3394         case BEGWORD:
3395         case ENDWORD:
3396         case LIMWORD:
3397         case NOTLIMWORD:
3398         case BACKREF:
3399           resetmust(mp);
3400           break;
3401         case STAR:
3402         case QMARK:
3403           if (mp <= musts)
3404             goto done;          /* "cannot happen" */
3405           --mp;
3406           resetmust(mp);
3407           break;
3408         case OR:
3409         case ORTOP:
3410           if (mp < &musts[2])
3411             goto done;          /* "cannot happen" */
3412           {
3413             char **new;
3414             must *lmp;
3415             must *rmp;
3416             int j, ln, rn, n;
3417
3418             rmp = --mp;
3419             lmp = --mp;
3420             /* Guaranteed to be.  Unlikely, but. . . */
3421             if (strcmp(lmp->is, rmp->is) != 0)
3422               lmp->is[0] = '\0';
3423             /* Left side--easy */
3424             i = 0;
3425             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3426               ++i;
3427             lmp->left[i] = '\0';
3428             /* Right side */
3429             ln = strlen(lmp->right);
3430             rn = strlen(rmp->right);
3431             n = ln;
3432             if (n > rn)
3433               n = rn;
3434             for (i = 0; i < n; ++i)
3435               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3436                 break;
3437             for (j = 0; j < i; ++j)
3438               lmp->right[j] = lmp->right[(ln - i) + j];
3439             lmp->right[j] = '\0';
3440             new = inboth(lmp->in, rmp->in);
3441             if (new == NULL)
3442               goto done;
3443             freelist(lmp->in);
3444             free((char *) lmp->in);
3445             lmp->in = new;
3446           }
3447           break;
3448         case PLUS:
3449           if (mp <= musts)
3450             goto done;          /* "cannot happen" */
3451           --mp;
3452           mp->is[0] = '\0';
3453           break;
3454         case END:
3455           if (mp != &musts[1])
3456             goto done;          /* "cannot happen" */
3457           for (i = 0; musts[0].in[i] != NULL; ++i)
3458             if (strlen(musts[0].in[i]) > strlen(result))
3459               result = musts[0].in[i];
3460           if (strcmp(result, musts[0].is) == 0)
3461             exact = 1;
3462           goto done;
3463         case CAT:
3464           if (mp < &musts[2])
3465             goto done;          /* "cannot happen" */
3466           {
3467             must *lmp;
3468             must *rmp;
3469
3470             rmp = --mp;
3471             lmp = --mp;
3472             /* In.  Everything in left, plus everything in
3473                right, plus catenation of
3474                left's right and right's left. */
3475             lmp->in = addlists(lmp->in, rmp->in);
3476             if (lmp->in == NULL)
3477               goto done;
3478             if (lmp->right[0] != '\0' &&
3479                 rmp->left[0] != '\0')
3480               {
3481                 char *tp;
3482
3483                 tp = icpyalloc(lmp->right);
3484                 if (tp == NULL)
3485                   goto done;
3486                 tp = icatalloc(tp, rmp->left);
3487                 if (tp == NULL)
3488                   goto done;
3489                 lmp->in = enlist(lmp->in, tp,
3490                                  strlen(tp));
3491                 free(tp);
3492                 if (lmp->in == NULL)
3493                   goto done;
3494               }
3495             /* Left-hand */
3496             if (lmp->is[0] != '\0')
3497               {
3498                 lmp->left = icatalloc(lmp->left,
3499                                       rmp->left);
3500                 if (lmp->left == NULL)
3501                   goto done;
3502               }
3503             /* Right-hand */
3504             if (rmp->is[0] == '\0')
3505               lmp->right[0] = '\0';
3506             lmp->right = icatalloc(lmp->right, rmp->right);
3507             if (lmp->right == NULL)
3508               goto done;
3509             /* Guaranteed to be */
3510             if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3511               {
3512                 lmp->is = icatalloc(lmp->is, rmp->is);
3513                 if (lmp->is == NULL)
3514                   goto done;
3515               }
3516             else
3517               lmp->is[0] = '\0';
3518           }
3519           break;
3520         default:
3521           if (t < END)
3522             {
3523               /* "cannot happen" */
3524               goto done;
3525             }
3526           else if (t == '\0')
3527             {
3528               /* not on *my* shift */
3529               goto done;
3530             }
3531           else if (t >= CSET
3532 #ifdef MBS_SUPPORT
3533                    || t == ANYCHAR
3534                    || t == MBCSET
3535 #endif /* MBS_SUPPORT */
3536                    )
3537             {
3538               /* easy enough */
3539               resetmust(mp);
3540             }
3541           else
3542             {
3543               /* plain character */
3544               resetmust(mp);
3545               mp->is[0] = mp->left[0] = mp->right[0] = t;
3546               mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3547               mp->in = enlist(mp->in, mp->is, (size_t)1);
3548               if (mp->in == NULL)
3549                 goto done;
3550             }
3551           break;
3552         }
3553 #ifdef DEBUG
3554       fprintf(stderr, " node: %d:", ri);
3555       prtok(dfa->tokens[ri]);
3556       fprintf(stderr, "\n  in:");
3557       for (i = 0; mp->in[i]; ++i)
3558         fprintf(stderr, " \"%s\"", mp->in[i]);
3559       fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
3560       fprintf(stderr, "  left: \"%s\"\n", mp->left);
3561       fprintf(stderr, "  right: \"%s\"\n", mp->right);
3562 #endif
3563       ++mp;
3564     }
3565  done:
3566   if (strlen(result))
3567     {
3568       dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3569       dm->exact = exact;
3570       dm->must = malloc(strlen(result) + 1);
3571       strcpy(dm->must, result);
3572       dm->next = dfa->musts;
3573       dfa->musts = dm;
3574     }
3575   mp = musts;
3576   for (i = 0; i <= dfa->tindex; ++i)
3577     {
3578       freelist(mp[i].in);
3579       ifree((char *) mp[i].in);
3580       ifree(mp[i].left);
3581       ifree(mp[i].right);
3582       ifree(mp[i].is);
3583     }
3584   free((char *) mp);
3585 }
3586 /* vim:set shiftwidth=2: */