contrib/llvm/lib/Support/regcomp.c

   1 /*-
   2  * This code is derived from OpenBSD's libc/regex, original license follows:
   3  *
   4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   5  * Copyright (c) 1992, 1993, 1994
   6  *      The Regents of the University of California.  All rights reserved.
   7  *
   8  * This code is derived from software contributed to Berkeley by
   9  * Henry Spencer.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 3. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  *
  35  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  36  */
  37
  38 #include <sys/types.h>
  39 #include <stdio.h>
  40 #include <string.h>
  41 #include <ctype.h>
  42 #include <limits.h>
  43 #include <stdlib.h>
  44 #include "regex_impl.h"
  45
  46 #include "regutils.h"
  47 #include "regex2.h"
  48
  49 #include "llvm/Config/config.h"
  50 #if HAVE_STDINT_H
  51 #include <stdint.h>
  52 #else
  53 /* Pessimistically bound memory use */
  54 #define SIZE_MAX UINT_MAX
  55 #endif
  56
  57 /* character-class table */
  58 static struct cclass {
  59         const char *name;
  60         const char *chars;
  61         const char *multis;
  62 } cclasses[] = {
  63         { "alnum",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  64 0123456789",                            ""} ,
  65         { "alpha",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
  66                                         ""} ,
  67         { "blank",      " \t",          ""} ,
  68         { "cntrl",      "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
  69 \25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
  70         { "digit",      "0123456789",   ""} ,
  71         { "graph",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  72 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  73                                         ""} ,
  74         { "lower",      "abcdefghijklmnopqrstuvwxyz",
  75                                         ""} ,
  76         { "print",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  77 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
  78                                         ""} ,
  79         { "punct",      "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  80                                         ""} ,
  81         { "space",      "\t\n\v\f\r ",  ""} ,
  82         { "upper",      "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
  83                                         ""} ,
  84         { "xdigit",     "0123456789ABCDEFabcdef",
  85                                         ""} ,
  86         { NULL,         0,              "" }
  87 };
  88
  89 /* character-name table */
  90 static struct cname {
  91         const char *name;
  92         char code;
  93 } cnames[] = {
  94         { "NUL",                        '\0' },
  95         { "SOH",                        '\001' },
  96         { "STX",                        '\002' },
  97         { "ETX",                        '\003' },
  98         { "EOT",                        '\004' },
  99         { "ENQ",                        '\005' },
 100         { "ACK",                        '\006' },
 101         { "BEL",                        '\007' },
 102         { "alert",                      '\007' },
 103         { "BS",                         '\010' },
 104         { "backspace",                  '\b' },
 105         { "HT",                         '\011' },
 106         { "tab",                        '\t' },
 107         { "LF",                         '\012' },
 108         { "newline",                    '\n' },
 109         { "VT",                         '\013' },
 110         { "vertical-tab",               '\v' },
 111         { "FF",                         '\014' },
 112         { "form-feed",                  '\f' },
 113         { "CR",                         '\015' },
 114         { "carriage-return",            '\r' },
 115         { "SO",                         '\016' },
 116         { "SI",                         '\017' },
 117         { "DLE",                        '\020' },
 118         { "DC1",                        '\021' },
 119         { "DC2",                        '\022' },
 120         { "DC3",                        '\023' },
 121         { "DC4",                        '\024' },
 122         { "NAK",                        '\025' },
 123         { "SYN",                        '\026' },
 124         { "ETB",                        '\027' },
 125         { "CAN",                        '\030' },
 126         { "EM",                         '\031' },
 127         { "SUB",                        '\032' },
 128         { "ESC",                        '\033' },
 129         { "IS4",                        '\034' },
 130         { "FS",                         '\034' },
 131         { "IS3",                        '\035' },
 132         { "GS",                         '\035' },
 133         { "IS2",                        '\036' },
 134         { "RS",                         '\036' },
 135         { "IS1",                        '\037' },
 136         { "US",                         '\037' },
 137         { "space",                      ' ' },
 138         { "exclamation-mark",           '!' },
 139         { "quotation-mark",             '"' },
 140         { "number-sign",                '#' },
 141         { "dollar-sign",                '$' },
 142         { "percent-sign",               '%' },
 143         { "ampersand",                  '&' },
 144         { "apostrophe",                 '\'' },
 145         { "left-parenthesis",           '(' },
 146         { "right-parenthesis",          ')' },
 147         { "asterisk",                   '*' },
 148         { "plus-sign",                  '+' },
 149         { "comma",                      ',' },
 150         { "hyphen",                     '-' },
 151         { "hyphen-minus",               '-' },
 152         { "period",                     '.' },
 153         { "full-stop",                  '.' },
 154         { "slash",                      '/' },
 155         { "solidus",                    '/' },
 156         { "zero",                       '0' },
 157         { "one",                        '1' },
 158         { "two",                        '2' },
 159         { "three",                      '3' },
 160         { "four",                       '4' },
 161         { "five",                       '5' },
 162         { "six",                        '6' },
 163         { "seven",                      '7' },
 164         { "eight",                      '8' },
 165         { "nine",                       '9' },
 166         { "colon",                      ':' },
 167         { "semicolon",                  ';' },
 168         { "less-than-sign",             '<' },
 169         { "equals-sign",                '=' },
 170         { "greater-than-sign",          '>' },
 171         { "question-mark",              '?' },
 172         { "commercial-at",              '@' },
 173         { "left-square-bracket",        '[' },
 174         { "backslash",                  '\\' },
 175         { "reverse-solidus",            '\\' },
 176         { "right-square-bracket",       ']' },
 177         { "circumflex",                 '^' },
 178         { "circumflex-accent",          '^' },
 179         { "underscore",                 '_' },
 180         { "low-line",                   '_' },
 181         { "grave-accent",               '`' },
 182         { "left-brace",                 '{' },
 183         { "left-curly-bracket",         '{' },
 184         { "vertical-line",              '|' },
 185         { "right-brace",                '}' },
 186         { "right-curly-bracket",        '}' },
 187         { "tilde",                      '~' },
 188         { "DEL",                        '\177' },
 189         { NULL,                         0 }
 190 };
 191
 192 /*
 193  * parse structure, passed up and down to avoid global variables and
 194  * other clumsinesses
 195  */
 196 struct parse {
 197         char *next;             /* next character in RE */
 198         char *end;              /* end of string (-> NUL normally) */
 199         int error;              /* has an error been seen? */
 200         sop *strip;             /* malloced strip */
 201         sopno ssize;            /* malloced strip size (allocated) */
 202         sopno slen;             /* malloced strip length (used) */
 203         int ncsalloc;           /* number of csets allocated */
 204         struct re_guts *g;
 205 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
 206         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
 207         sopno pend[NPAREN];     /* -> ) ([0] unused) */
 208 };
 209
 210 static void p_ere(struct parse *, int);
 211 static void p_ere_exp(struct parse *);
 212 static void p_str(struct parse *);
 213 static void p_bre(struct parse *, int, int);
 214 static int p_simp_re(struct parse *, int);
 215 static int p_count(struct parse *);
 216 static void p_bracket(struct parse *);
 217 static void p_b_term(struct parse *, cset *);
 218 static void p_b_cclass(struct parse *, cset *);
 219 static void p_b_eclass(struct parse *, cset *);
 220 static char p_b_symbol(struct parse *);
 221 static char p_b_coll_elem(struct parse *, int);
 222 static char othercase(int);
 223 static void bothcases(struct parse *, int);
 224 static void ordinary(struct parse *, int);
 225 static void nonnewline(struct parse *);
 226 static void repeat(struct parse *, sopno, int, int);
 227 static int seterr(struct parse *, int);
 228 static cset *allocset(struct parse *);
 229 static void freeset(struct parse *, cset *);
 230 static int freezeset(struct parse *, cset *);
 231 static int firstch(struct parse *, cset *);
 232 static int nch(struct parse *, cset *);
 233 static void mcadd(struct parse *, cset *, const char *);
 234 static void mcinvert(struct parse *, cset *);
 235 static void mccase(struct parse *, cset *);
 236 static int isinsets(struct re_guts *, int);
 237 static int samesets(struct re_guts *, int, int);
 238 static void categorize(struct parse *, struct re_guts *);
 239 static sopno dupl(struct parse *, sopno, sopno);
 240 static void doemit(struct parse *, sop, size_t);
 241 static void doinsert(struct parse *, sop, size_t, sopno);
 242 static void dofwd(struct parse *, sopno, sop);
 243 static void enlarge(struct parse *, sopno);
 244 static void stripsnug(struct parse *, struct re_guts *);
 245 static void findmust(struct parse *, struct re_guts *);
 246 static sopno pluscount(struct parse *, struct re_guts *);
 247
 248 static char nuls[10];           /* place to point scanner in event of error */
 249
 250 /*
 251  * macros for use with parse structure
 252  * BEWARE:  these know that the parse structure is named `p' !!!
 253  */
 254 #define PEEK()  (*p->next)
 255 #define PEEK2() (*(p->next+1))
 256 #define MORE()  (p->next < p->end)
 257 #define MORE2() (p->next+1 < p->end)
 258 #define SEE(c)  (MORE() && PEEK() == (c))
 259 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
 260 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 261 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 262 #define NEXT()  (p->next++)
 263 #define NEXT2() (p->next += 2)
 264 #define NEXTn(n)        (p->next += (n))
 265 #define GETNEXT()       (*p->next++)
 266 #define SETERROR(e)     seterr(p, (e))
 267 #define REQUIRE(co, e)  (void)((co) || SETERROR(e))
 268 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
 269 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
 270 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 271 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
 272 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 273 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 274 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 275 #define HERE()          (p->slen)
 276 #define THERE()         (p->slen - 1)
 277 #define THERETHERE()    (p->slen - 2)
 278 #define DROP(n) (p->slen -= (n))
 279
 280 #ifdef  _POSIX2_RE_DUP_MAX
 281 #define DUPMAX  _POSIX2_RE_DUP_MAX
 282 #else
 283 #define DUPMAX  255
 284 #endif
 285 #define INFINITY        (DUPMAX + 1)
 286
 287 #ifndef NDEBUG
 288 static int never = 0;           /* for use in asserts; shuts lint up */
 289 #else
 290 #define never   0               /* some <assert.h>s have bugs too */
 291 #endif
 292
 293 /*
 294  - llvm_regcomp - interface for parser and compilation
 295  */
 296 int                             /* 0 success, otherwise REG_something */
 297 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
 298 {
 299         struct parse pa;
 300         struct re_guts *g;
 301         struct parse *p = &pa;
 302         int i;
 303         size_t len;
 304 #ifdef REDEBUG
 305 #       define  GOODFLAGS(f)    (f)
 306 #else
 307 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 308 #endif
 309
 310         cflags = GOODFLAGS(cflags);
 311         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 312                 return(REG_INVARG);
 313
 314         if (cflags&REG_PEND) {
 315                 if (preg->re_endp < pattern)
 316                         return(REG_INVARG);
 317                 len = preg->re_endp - pattern;
 318         } else
 319                 len = strlen((const char *)pattern);
 320
 321         /* do the mallocs early so failure handling is easy */
 322         g = (struct re_guts *)malloc(sizeof(struct re_guts) +
 323                                                         (NC-1)*sizeof(cat_t));
 324         if (g == NULL)
 325                 return(REG_ESPACE);
 326         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 327         p->strip = (sop *)calloc(p->ssize, sizeof(sop));
 328         p->slen = 0;
 329         if (p->strip == NULL) {
 330                 free((char *)g);
 331                 return(REG_ESPACE);
 332         }
 333
 334         /* set things up */
 335         p->g = g;
 336         p->next = (char *)pattern;      /* convenience; we do not modify it */
 337         p->end = p->next + len;
 338         p->error = 0;
 339         p->ncsalloc = 0;
 340         for (i = 0; i < NPAREN; i++) {
 341                 p->pbegin[i] = 0;
 342                 p->pend[i] = 0;
 343         }
 344         g->csetsize = NC;
 345         g->sets = NULL;
 346         g->setbits = NULL;
 347         g->ncsets = 0;
 348         g->cflags = cflags;
 349         g->iflags = 0;
 350         g->nbol = 0;
 351         g->neol = 0;
 352         g->must = NULL;
 353         g->mlen = 0;
 354         g->nsub = 0;
 355         g->ncategories = 1;     /* category 0 is "everything else" */
 356         g->categories = &g->catspace[-(CHAR_MIN)];
 357         (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 358         g->backrefs = 0;
 359
 360         /* do it */
 361         EMIT(OEND, 0);
 362         g->firststate = THERE();
 363         if (cflags&REG_EXTENDED)
 364                 p_ere(p, OUT);
 365         else if (cflags&REG_NOSPEC)
 366                 p_str(p);
 367         else
 368                 p_bre(p, OUT, OUT);
 369         EMIT(OEND, 0);
 370         g->laststate = THERE();
 371
 372         /* tidy up loose ends and fill things in */
 373         categorize(p, g);
 374         stripsnug(p, g);
 375         findmust(p, g);
 376         g->nplus = pluscount(p, g);
 377         g->magic = MAGIC2;
 378         preg->re_nsub = g->nsub;
 379         preg->re_g = g;
 380         preg->re_magic = MAGIC1;
 381 #ifndef REDEBUG
 382         /* not debugging, so can't rely on the assert() in llvm_regexec() */
 383         if (g->iflags&REGEX_BAD)
 384                 SETERROR(REG_ASSERT);
 385 #endif
 386
 387         /* win or lose, we're done */
 388         if (p->error != 0)      /* lose */
 389                 llvm_regfree(preg);
 390         return(p->error);
 391 }
 392
 393 /*
 394  - p_ere - ERE parser top level, concatenation and alternation
 395  */
 396 static void
 397 p_ere(struct parse *p, int stop)        /* character this ERE should end at */
 398 {
 399         char c;
 400         sopno prevback = 0;
 401         sopno prevfwd = 0;
 402         sopno conc;
 403         int first = 1;          /* is this the first alternative? */
 404
 405         for (;;) {
 406                 /* do a bunch of concatenated expressions */
 407                 conc = HERE();
 408                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 409                         p_ere_exp(p);
 410                 REQUIRE(HERE() != conc, REG_EMPTY);     /* require nonempty */
 411
 412                 if (!EAT('|'))
 413                         break;          /* NOTE BREAK OUT */
 414
 415                 if (first) {
 416                         INSERT(OCH_, conc);     /* offset is wrong */
 417                         prevfwd = conc;
 418                         prevback = conc;
 419                         first = 0;
 420                 }
 421                 ASTERN(OOR1, prevback);
 422                 prevback = THERE();
 423                 AHEAD(prevfwd);                 /* fix previous offset */
 424                 prevfwd = HERE();
 425                 EMIT(OOR2, 0);                  /* offset is very wrong */
 426         }
 427
 428         if (!first) {           /* tail-end fixups */
 429                 AHEAD(prevfwd);
 430                 ASTERN(O_CH, prevback);
 431         }
 432
 433         assert(!MORE() || SEE(stop));
 434 }
 435
 436 /*
 437  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 438  */
 439 static void
 440 p_ere_exp(struct parse *p)
 441 {
 442         char c;
 443         sopno pos;
 444         int count;
 445         int count2;
 446         int backrefnum;
 447         sopno subno;
 448         int wascaret = 0;
 449
 450         assert(MORE());         /* caller should have ensured this */
 451         c = GETNEXT();
 452
 453         pos = HERE();
 454         switch (c) {
 455         case '(':
 456                 REQUIRE(MORE(), REG_EPAREN);
 457                 p->g->nsub++;
 458                 subno = p->g->nsub;
 459                 if (subno < NPAREN)
 460                         p->pbegin[subno] = HERE();
 461                 EMIT(OLPAREN, subno);
 462                 if (!SEE(')'))
 463                         p_ere(p, ')');
 464                 if (subno < NPAREN) {
 465                         p->pend[subno] = HERE();
 466                         assert(p->pend[subno] != 0);
 467                 }
 468                 EMIT(ORPAREN, subno);
 469                 MUSTEAT(')', REG_EPAREN);
 470                 break;
 471 #ifndef POSIX_MISTAKE
 472         case ')':               /* happens only if no current unmatched ( */
 473                 /*
 474                  * You may ask, why the ifndef?  Because I didn't notice
 475                  * this until slightly too late for 1003.2, and none of the
 476                  * other 1003.2 regular-expression reviewers noticed it at
 477                  * all.  So an unmatched ) is legal POSIX, at least until
 478                  * we can get it fixed.
 479                  */
 480                 SETERROR(REG_EPAREN);
 481                 break;
 482 #endif
 483         case '^':
 484                 EMIT(OBOL, 0);
 485                 p->g->iflags |= USEBOL;
 486                 p->g->nbol++;
 487                 wascaret = 1;
 488                 break;
 489         case '$':
 490                 EMIT(OEOL, 0);
 491                 p->g->iflags |= USEEOL;
 492                 p->g->neol++;
 493                 break;
 494         case '|':
 495                 SETERROR(REG_EMPTY);
 496                 break;
 497         case '*':
 498         case '+':
 499         case '?':
 500                 SETERROR(REG_BADRPT);
 501                 break;
 502         case '.':
 503                 if (p->g->cflags&REG_NEWLINE)
 504                         nonnewline(p);
 505                 else
 506                         EMIT(OANY, 0);
 507                 break;
 508         case '[':
 509                 p_bracket(p);
 510                 break;
 511         case '\\':
 512                 REQUIRE(MORE(), REG_EESCAPE);
 513                 c = GETNEXT();
 514                 if (c >= '1' && c <= '9') {
 515                         /* \[0-9] is taken to be a back-reference to a previously specified
 516                          * matching group. backrefnum will hold the number. The matching
 517                          * group must exist (i.e. if \4 is found there must have been at
 518                          * least 4 matching groups specified in the pattern previously).
 519                          */
 520                         backrefnum = c - '0';
 521                         if (p->pend[backrefnum] == 0) {
 522                                 SETERROR(REG_ESUBREG);
 523                                 break;
 524                         }
 525
 526                         /* Make sure everything checks out and emit the sequence
 527                          * that marks a back-reference to the parse structure.
 528                          */
 529                         assert(backrefnum <= p->g->nsub);
 530                         EMIT(OBACK_, backrefnum);
 531                         assert(p->pbegin[backrefnum] != 0);
 532                         assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
 533                         assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
 534                         (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
 535                         EMIT(O_BACK, backrefnum);
 536                         p->g->backrefs = 1;
 537                 } else {
 538                         /* Other chars are simply themselves when escaped with a backslash.
 539                          */
 540                         ordinary(p, c);
 541                 }
 542                 break;
 543         case '{':               /* okay as ordinary except if digit follows */
 544                 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
 545                 /* FALLTHROUGH */
 546         default:
 547                 ordinary(p, c);
 548                 break;
 549         }
 550
 551         if (!MORE())
 552                 return;
 553         c = PEEK();
 554         /* we call { a repetition if followed by a digit */
 555         if (!( c == '*' || c == '+' || c == '?' ||
 556                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
 557                 return;         /* no repetition, we're done */
 558         NEXT();
 559
 560         REQUIRE(!wascaret, REG_BADRPT);
 561         switch (c) {
 562         case '*':       /* implemented as +? */
 563                 /* this case does not require the (y|) trick, noKLUDGE */
 564                 INSERT(OPLUS_, pos);
 565                 ASTERN(O_PLUS, pos);
 566                 INSERT(OQUEST_, pos);
 567                 ASTERN(O_QUEST, pos);
 568                 break;
 569         case '+':
 570                 INSERT(OPLUS_, pos);
 571                 ASTERN(O_PLUS, pos);
 572                 break;
 573         case '?':
 574                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 575                 INSERT(OCH_, pos);              /* offset slightly wrong */
 576                 ASTERN(OOR1, pos);              /* this one's right */
 577                 AHEAD(pos);                     /* fix the OCH_ */
 578                 EMIT(OOR2, 0);                  /* offset very wrong... */
 579                 AHEAD(THERE());                 /* ...so fix it */
 580                 ASTERN(O_CH, THERETHERE());
 581                 break;
 582         case '{':
 583                 count = p_count(p);
 584                 if (EAT(',')) {
 585                         if (isdigit((uch)PEEK())) {
 586                                 count2 = p_count(p);
 587                                 REQUIRE(count <= count2, REG_BADBR);
 588                         } else          /* single number with comma */
 589                                 count2 = INFINITY;
 590                 } else          /* just a single number */
 591                         count2 = count;
 592                 repeat(p, pos, count, count2);
 593                 if (!EAT('}')) {        /* error heuristics */
 594                         while (MORE() && PEEK() != '}')
 595                                 NEXT();
 596                         REQUIRE(MORE(), REG_EBRACE);
 597                         SETERROR(REG_BADBR);
 598                 }
 599                 break;
 600         }
 601
 602         if (!MORE())
 603                 return;
 604         c = PEEK();
 605         if (!( c == '*' || c == '+' || c == '?' ||
 606                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
 607                 return;
 608         SETERROR(REG_BADRPT);
 609 }
 610
 611 /*
 612  - p_str - string (no metacharacters) "parser"
 613  */
 614 static void
 615 p_str(struct parse *p)
 616 {
 617         REQUIRE(MORE(), REG_EMPTY);
 618         while (MORE())
 619                 ordinary(p, GETNEXT());
 620 }
 621
 622 /*
 623  - p_bre - BRE parser top level, anchoring and concatenation
 624  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 625  *
 626  * This implementation is a bit of a kludge, in that a trailing $ is first
 627  * taken as an ordinary character and then revised to be an anchor.  The
 628  * only undesirable side effect is that '$' gets included as a character
 629  * category in such cases.  This is fairly harmless; not worth fixing.
 630  * The amount of lookahead needed to avoid this kludge is excessive.
 631  */
 632 static void
 633 p_bre(struct parse *p,
 634     int end1,           /* first terminating character */
 635     int end2)           /* second terminating character */
 636 {
 637         sopno start = HERE();
 638         int first = 1;                  /* first subexpression? */
 639         int wasdollar = 0;
 640
 641         if (EAT('^')) {
 642                 EMIT(OBOL, 0);
 643                 p->g->iflags |= USEBOL;
 644                 p->g->nbol++;
 645         }
 646         while (MORE() && !SEETWO(end1, end2)) {
 647                 wasdollar = p_simp_re(p, first);
 648                 first = 0;
 649         }
 650         if (wasdollar) {        /* oops, that was a trailing anchor */
 651                 DROP(1);
 652                 EMIT(OEOL, 0);
 653                 p->g->iflags |= USEEOL;
 654                 p->g->neol++;
 655         }
 656
 657         REQUIRE(HERE() != start, REG_EMPTY);    /* require nonempty */
 658 }
 659
 660 /*
 661  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 662  */
 663 static int                      /* was the simple RE an unbackslashed $? */
 664 p_simp_re(struct parse *p,
 665     int starordinary)           /* is a leading * an ordinary character? */
 666 {
 667         int c;
 668         int count;
 669         int count2;
 670         sopno pos;
 671         int i;
 672         sopno subno;
 673 #       define  BACKSL  (1<<CHAR_BIT)
 674
 675         pos = HERE(); /* repetition op, if any, covers from here */
 676
 677         assert(MORE()); /* caller should have ensured this */
 678         c = GETNEXT();
 679         if (c == '\\') {
 680                 REQUIRE(MORE(), REG_EESCAPE);
 681                 c = BACKSL | GETNEXT();
 682         }
 683         switch (c) {
 684         case '.':
 685                 if (p->g->cflags&REG_NEWLINE)
 686                         nonnewline(p);
 687                 else
 688                         EMIT(OANY, 0);
 689                 break;
 690         case '[':
 691                 p_bracket(p);
 692                 break;
 693         case BACKSL|'{':
 694                 SETERROR(REG_BADRPT);
 695                 break;
 696         case BACKSL|'(':
 697                 p->g->nsub++;
 698                 subno = p->g->nsub;
 699                 if (subno < NPAREN)
 700                         p->pbegin[subno] = HERE();
 701                 EMIT(OLPAREN, subno);
 702                 /* the MORE here is an error heuristic */
 703                 if (MORE() && !SEETWO('\\', ')'))
 704                         p_bre(p, '\\', ')');
 705                 if (subno < NPAREN) {
 706                         p->pend[subno] = HERE();
 707                         assert(p->pend[subno] != 0);
 708                 }
 709                 EMIT(ORPAREN, subno);
 710                 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 711                 break;
 712         case BACKSL|')':        /* should not get here -- must be user */
 713         case BACKSL|'}':
 714                 SETERROR(REG_EPAREN);
 715                 break;
 716         case BACKSL|'1':
 717         case BACKSL|'2':
 718         case BACKSL|'3':
 719         case BACKSL|'4':
 720         case BACKSL|'5':
 721         case BACKSL|'6':
 722         case BACKSL|'7':
 723         case BACKSL|'8':
 724         case BACKSL|'9':
 725                 i = (c&~BACKSL) - '0';
 726                 assert(i < NPAREN);
 727                 if (p->pend[i] != 0) {
 728                         assert(i <= p->g->nsub);
 729                         EMIT(OBACK_, i);
 730                         assert(p->pbegin[i] != 0);
 731                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 732                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 733                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 734                         EMIT(O_BACK, i);
 735                 } else
 736                         SETERROR(REG_ESUBREG);
 737                 p->g->backrefs = 1;
 738                 break;
 739         case '*':
 740                 REQUIRE(starordinary, REG_BADRPT);
 741                 /* FALLTHROUGH */
 742         default:
 743                 ordinary(p, (char)c);
 744                 break;
 745         }
 746
 747         if (EAT('*')) {         /* implemented as +? */
 748                 /* this case does not require the (y|) trick, noKLUDGE */
 749                 INSERT(OPLUS_, pos);
 750                 ASTERN(O_PLUS, pos);
 751                 INSERT(OQUEST_, pos);
 752                 ASTERN(O_QUEST, pos);
 753         } else if (EATTWO('\\', '{')) {
 754                 count = p_count(p);
 755                 if (EAT(',')) {
 756                         if (MORE() && isdigit((uch)PEEK())) {
 757                                 count2 = p_count(p);
 758                                 REQUIRE(count <= count2, REG_BADBR);
 759                         } else          /* single number with comma */
 760                                 count2 = INFINITY;
 761                 } else          /* just a single number */
 762                         count2 = count;
 763                 repeat(p, pos, count, count2);
 764                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 765                         while (MORE() && !SEETWO('\\', '}'))
 766                                 NEXT();
 767                         REQUIRE(MORE(), REG_EBRACE);
 768                         SETERROR(REG_BADBR);
 769                 }
 770         } else if (c == '$')    /* $ (but not \$) ends it */
 771                 return(1);
 772
 773         return(0);
 774 }
 775
 776 /*
 777  - p_count - parse a repetition count
 778  */
 779 static int                      /* the value */
 780 p_count(struct parse *p)
 781 {
 782         int count = 0;
 783         int ndigits = 0;
 784
 785         while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
 786                 count = count*10 + (GETNEXT() - '0');
 787                 ndigits++;
 788         }
 789
 790         REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 791         return(count);
 792 }
 793
 794 /*
 795  - p_bracket - parse a bracketed character list
 796  *
 797  * Note a significant property of this code:  if the allocset() did SETERROR,
 798  * no set operations are done.
 799  */
 800 static void
 801 p_bracket(struct parse *p)
 802 {
 803         cset *cs;
 804         int invert = 0;
 805
 806         /* Dept of Truly Sickening Special-Case Kludges */
 807         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
 808                 EMIT(OBOW, 0);
 809                 NEXTn(6);
 810                 return;
 811         }
 812         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
 813                 EMIT(OEOW, 0);
 814                 NEXTn(6);
 815                 return;
 816         }
 817
 818         if ((cs = allocset(p)) == NULL) {
 819                 /* allocset did set error status in p */
 820                 return;
 821         }
 822
 823         if (EAT('^'))
 824                 invert++;       /* make note to invert set at end */
 825         if (EAT(']'))
 826                 CHadd(cs, ']');
 827         else if (EAT('-'))
 828                 CHadd(cs, '-');
 829         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 830                 p_b_term(p, cs);
 831         if (EAT('-'))
 832                 CHadd(cs, '-');
 833         MUSTEAT(']', REG_EBRACK);
 834
 835         if (p->error != 0) {    /* don't mess things up further */
 836                 freeset(p, cs);
 837                 return;
 838         }
 839
 840         if (p->g->cflags&REG_ICASE) {
 841                 int i;
 842                 int ci;
 843
 844                 for (i = p->g->csetsize - 1; i >= 0; i--)
 845                         if (CHIN(cs, i) && isalpha(i)) {
 846                                 ci = othercase(i);
 847                                 if (ci != i)
 848                                         CHadd(cs, ci);
 849                         }
 850                 if (cs->multis != NULL)
 851                         mccase(p, cs);
 852         }
 853         if (invert) {
 854                 int i;
 855
 856                 for (i = p->g->csetsize - 1; i >= 0; i--)
 857                         if (CHIN(cs, i))
 858                                 CHsub(cs, i);
 859                         else
 860                                 CHadd(cs, i);
 861                 if (p->g->cflags&REG_NEWLINE)
 862                         CHsub(cs, '\n');
 863                 if (cs->multis != NULL)
 864                         mcinvert(p, cs);
 865         }
 866
 867         assert(cs->multis == NULL);             /* xxx */
 868
 869         if (nch(p, cs) == 1) {          /* optimize singleton sets */
 870                 ordinary(p, firstch(p, cs));
 871                 freeset(p, cs);
 872         } else
 873                 EMIT(OANYOF, freezeset(p, cs));
 874 }
 875
 876 /*
 877  - p_b_term - parse one term of a bracketed character list
 878  */
 879 static void
 880 p_b_term(struct parse *p, cset *cs)
 881 {
 882         char c;
 883         char start, finish;
 884         int i;
 885
 886         /* classify what we've got */
 887         switch ((MORE()) ? PEEK() : '\0') {
 888         case '[':
 889                 c = (MORE2()) ? PEEK2() : '\0';
 890                 break;
 891         case '-':
 892                 SETERROR(REG_ERANGE);
 893                 return;                 /* NOTE RETURN */
 894                 break;
 895         default:
 896                 c = '\0';
 897                 break;
 898         }
 899
 900         switch (c) {
 901         case ':':               /* character class */
 902                 NEXT2();
 903                 REQUIRE(MORE(), REG_EBRACK);
 904                 c = PEEK();
 905                 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 906                 p_b_cclass(p, cs);
 907                 REQUIRE(MORE(), REG_EBRACK);
 908                 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 909                 break;
 910         case '=':               /* equivalence class */
 911                 NEXT2();
 912                 REQUIRE(MORE(), REG_EBRACK);
 913                 c = PEEK();
 914                 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 915                 p_b_eclass(p, cs);
 916                 REQUIRE(MORE(), REG_EBRACK);
 917                 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 918                 break;
 919         default:                /* symbol, ordinary character, or range */
 920 /* xxx revision needed for multichar stuff */
 921                 start = p_b_symbol(p);
 922                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 923                         /* range */
 924                         NEXT();
 925                         if (EAT('-'))
 926                                 finish = '-';
 927                         else
 928                                 finish = p_b_symbol(p);
 929                 } else
 930                         finish = start;
 931 /* xxx what about signed chars here... */
 932                 REQUIRE(start <= finish, REG_ERANGE);
 933                 for (i = start; i <= finish; i++)
 934                         CHadd(cs, i);
 935                 break;
 936         }
 937 }
 938
 939 /*
 940  - p_b_cclass - parse a character-class name and deal with it
 941  */
 942 static void
 943 p_b_cclass(struct parse *p, cset *cs)
 944 {
 945         char *sp = p->next;
 946         struct cclass *cp;
 947         size_t len;
 948         const char *u;
 949         char c;
 950
 951         while (MORE() && isalpha((uch)PEEK()))
 952                 NEXT();
 953         len = p->next - sp;
 954         for (cp = cclasses; cp->name != NULL; cp++)
 955                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 956                         break;
 957         if (cp->name == NULL) {
 958                 /* oops, didn't find it */
 959                 SETERROR(REG_ECTYPE);
 960                 return;
 961         }
 962
 963         u = cp->chars;
 964         while ((c = *u++) != '\0')
 965                 CHadd(cs, c);
 966         for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
 967                 MCadd(p, cs, u);
 968 }
 969
 970 /*
 971  - p_b_eclass - parse an equivalence-class name and deal with it
 972  *
 973  * This implementation is incomplete. xxx
 974  */
 975 static void
 976 p_b_eclass(struct parse *p, cset *cs)
 977 {
 978         char c;
 979
 980         c = p_b_coll_elem(p, '=');
 981         CHadd(cs, c);
 982 }
 983
 984 /*
 985  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 986  */
 987 static char                     /* value of symbol */
 988 p_b_symbol(struct parse *p)
 989 {
 990         char value;
 991
 992         REQUIRE(MORE(), REG_EBRACK);
 993         if (!EATTWO('[', '.'))
 994                 return(GETNEXT());
 995
 996         /* collating symbol */
 997         value = p_b_coll_elem(p, '.');
 998         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
 999         return(value);
1000 }
1001
1002 /*
1003  - p_b_coll_elem - parse a collating-element name and look it up
1004  */
1005 static char                     /* value of collating element */
1006 p_b_coll_elem(struct parse *p,
1007     int endc)                   /* name ended by endc,']' */
1008 {
1009         char *sp = p->next;
1010         struct cname *cp;
1011         size_t len;
1012
1013         while (MORE() && !SEETWO(endc, ']'))
1014                 NEXT();
1015         if (!MORE()) {
1016                 SETERROR(REG_EBRACK);
1017                 return(0);
1018         }
1019         len = p->next - sp;
1020         for (cp = cnames; cp->name != NULL; cp++)
1021                 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1022                         return(cp->code);       /* known name */
1023         if (len == 1)
1024                 return(*sp);    /* single character */
1025         SETERROR(REG_ECOLLATE);                 /* neither */
1026         return(0);
1027 }
1028
1029 /*
1030  - othercase - return the case counterpart of an alphabetic
1031  */
1032 static char                     /* if no counterpart, return ch */
1033 othercase(int ch)
1034 {
1035         ch = (uch)ch;
1036         assert(isalpha(ch));
1037         if (isupper(ch))
1038                 return ((uch)tolower(ch));
1039         else if (islower(ch))
1040                 return ((uch)toupper(ch));
1041         else                    /* peculiar, but could happen */
1042                 return(ch);
1043 }
1044
1045 /*
1046  - bothcases - emit a dualcase version of a two-case character
1047  *
1048  * Boy, is this implementation ever a kludge...
1049  */
1050 static void
1051 bothcases(struct parse *p, int ch)
1052 {
1053         char *oldnext = p->next;
1054         char *oldend = p->end;
1055         char bracket[3];
1056
1057         ch = (uch)ch;
1058         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
1059         p->next = bracket;
1060         p->end = bracket+2;
1061         bracket[0] = ch;
1062         bracket[1] = ']';
1063         bracket[2] = '\0';
1064         p_bracket(p);
1065         assert(p->next == bracket+2);
1066         p->next = oldnext;
1067         p->end = oldend;
1068 }
1069
1070 /*
1071  - ordinary - emit an ordinary character
1072  */
1073 static void
1074 ordinary(struct parse *p, int ch)
1075 {
1076         cat_t *cap = p->g->categories;
1077
1078         if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1079                 bothcases(p, ch);
1080         else {
1081                 EMIT(OCHAR, (uch)ch);
1082                 if (cap[ch] == 0)
1083                         cap[ch] = p->g->ncategories++;
1084         }
1085 }
1086
1087 /*
1088  - nonnewline - emit REG_NEWLINE version of OANY
1089  *
1090  * Boy, is this implementation ever a kludge...
1091  */
1092 static void
1093 nonnewline(struct parse *p)
1094 {
1095         char *oldnext = p->next;
1096         char *oldend = p->end;
1097         char bracket[4];
1098
1099         p->next = bracket;
1100         p->end = bracket+3;
1101         bracket[0] = '^';
1102         bracket[1] = '\n';
1103         bracket[2] = ']';
1104         bracket[3] = '\0';
1105         p_bracket(p);
1106         assert(p->next == bracket+3);
1107         p->next = oldnext;
1108         p->end = oldend;
1109 }
1110
1111 /*
1112  - repeat - generate code for a bounded repetition, recursively if needed
1113  */
1114 static void
1115 repeat(struct parse *p,
1116     sopno start,                /* operand from here to end of strip */
1117     int from,                   /* repeated from this number */
1118     int to)                     /* to this number of times (maybe INFINITY) */
1119 {
1120         sopno finish = HERE();
1121 #       define  N       2
1122 #       define  INF     3
1123 #       define  REP(f, t)       ((f)*8 + (t))
1124 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1125         sopno copy;
1126
1127         if (p->error != 0)      /* head off possible runaway recursion */
1128                 return;
1129
1130         assert(from <= to);
1131
1132         switch (REP(MAP(from), MAP(to))) {
1133         case REP(0, 0):                 /* must be user doing this */
1134                 DROP(finish-start);     /* drop the operand */
1135                 break;
1136         case REP(0, 1):                 /* as x{1,1}? */
1137         case REP(0, N):                 /* as x{1,n}? */
1138         case REP(0, INF):               /* as x{1,}? */
1139                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1140                 INSERT(OCH_, start);            /* offset is wrong... */
1141                 repeat(p, start+1, 1, to);
1142                 ASTERN(OOR1, start);
1143                 AHEAD(start);                   /* ... fix it */
1144                 EMIT(OOR2, 0);
1145                 AHEAD(THERE());
1146                 ASTERN(O_CH, THERETHERE());
1147                 break;
1148         case REP(1, 1):                 /* trivial case */
1149                 /* done */
1150                 break;
1151         case REP(1, N):                 /* as x?x{1,n-1} */
1152                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1153                 INSERT(OCH_, start);
1154                 ASTERN(OOR1, start);
1155                 AHEAD(start);
1156                 EMIT(OOR2, 0);                  /* offset very wrong... */
1157                 AHEAD(THERE());                 /* ...so fix it */
1158                 ASTERN(O_CH, THERETHERE());
1159                 copy = dupl(p, start+1, finish+1);
1160                 assert(copy == finish+4);
1161                 repeat(p, copy, 1, to-1);
1162                 break;
1163         case REP(1, INF):               /* as x+ */
1164                 INSERT(OPLUS_, start);
1165                 ASTERN(O_PLUS, start);
1166                 break;
1167         case REP(N, N):                 /* as xx{m-1,n-1} */
1168                 copy = dupl(p, start, finish);
1169                 repeat(p, copy, from-1, to-1);
1170                 break;
1171         case REP(N, INF):               /* as xx{n-1,INF} */
1172                 copy = dupl(p, start, finish);
1173                 repeat(p, copy, from-1, to);
1174                 break;
1175         default:                        /* "can't happen" */
1176                 SETERROR(REG_ASSERT);   /* just in case */
1177                 break;
1178         }
1179 }
1180
1181 /*
1182  - seterr - set an error condition
1183  */
1184 static int                      /* useless but makes type checking happy */
1185 seterr(struct parse *p, int e)
1186 {
1187         if (p->error == 0)      /* keep earliest error condition */
1188                 p->error = e;
1189         p->next = nuls;         /* try to bring things to a halt */
1190         p->end = nuls;
1191         return(0);              /* make the return value well-defined */
1192 }
1193
1194 /*
1195  - allocset - allocate a set of characters for []
1196  */
1197 static cset *
1198 allocset(struct parse *p)
1199 {
1200         int no = p->g->ncsets++;
1201         size_t nc;
1202         size_t nbytes;
1203         cset *cs;
1204         size_t css = (size_t)p->g->csetsize;
1205         int i;
1206
1207         if (no >= p->ncsalloc) {        /* need another column of space */
1208                 void *ptr;
1209
1210                 p->ncsalloc += CHAR_BIT;
1211                 nc = p->ncsalloc;
1212                 if (nc > SIZE_MAX / sizeof(cset))
1213                         goto nomem;
1214                 assert(nc % CHAR_BIT == 0);
1215                 nbytes = nc / CHAR_BIT * css;
1216
1217                 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1218                 if (ptr == NULL)
1219                         goto nomem;
1220                 p->g->sets = ptr;
1221
1222                 ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1223                 if (ptr == NULL)
1224                         goto nomem;
1225                 p->g->setbits = ptr;
1226
1227                 for (i = 0; i < no; i++)
1228                         p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1229
1230                 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1231         }
1232         /* XXX should not happen */
1233         if (p->g->sets == NULL || p->g->setbits == NULL)
1234                 goto nomem;
1235
1236         cs = &p->g->sets[no];
1237         cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1238         cs->mask = 1 << ((no) % CHAR_BIT);
1239         cs->hash = 0;
1240         cs->smultis = 0;
1241         cs->multis = NULL;
1242
1243         return(cs);
1244 nomem:
1245         free(p->g->sets);
1246         p->g->sets = NULL;
1247         free(p->g->setbits);
1248         p->g->setbits = NULL;
1249
1250         SETERROR(REG_ESPACE);
1251         /* caller's responsibility not to do set ops */
1252         return(NULL);
1253 }
1254
1255 /*
1256  - freeset - free a now-unused set
1257  */
1258 static void
1259 freeset(struct parse *p, cset *cs)
1260 {
1261         size_t i;
1262         cset *top = &p->g->sets[p->g->ncsets];
1263         size_t css = (size_t)p->g->csetsize;
1264
1265         for (i = 0; i < css; i++)
1266                 CHsub(cs, i);
1267         if (cs == top-1)        /* recover only the easy case */
1268                 p->g->ncsets--;
1269 }
1270
1271 /*
1272  - freezeset - final processing on a set of characters
1273  *
1274  * The main task here is merging identical sets.  This is usually a waste
1275  * of time (although the hash code minimizes the overhead), but can win
1276  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1277  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1278  * the same value!
1279  */
1280 static int                      /* set number */
1281 freezeset(struct parse *p, cset *cs)
1282 {
1283         uch h = cs->hash;
1284         size_t i;
1285         cset *top = &p->g->sets[p->g->ncsets];
1286         cset *cs2;
1287         size_t css = (size_t)p->g->csetsize;
1288
1289         /* look for an earlier one which is the same */
1290         for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1291                 if (cs2->hash == h && cs2 != cs) {
1292                         /* maybe */
1293                         for (i = 0; i < css; i++)
1294                                 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1295                                         break;          /* no */
1296                         if (i == css)
1297                                 break;                  /* yes */
1298                 }
1299
1300         if (cs2 < top) {        /* found one */
1301                 freeset(p, cs);
1302                 cs = cs2;
1303         }
1304
1305         return((int)(cs - p->g->sets));
1306 }
1307
1308 /*
1309  - firstch - return first character in a set (which must have at least one)
1310  */
1311 static int                      /* character; there is no "none" value */
1312 firstch(struct parse *p, cset *cs)
1313 {
1314         size_t i;
1315         size_t css = (size_t)p->g->csetsize;
1316
1317         for (i = 0; i < css; i++)
1318                 if (CHIN(cs, i))
1319                         return((char)i);
1320         assert(never);
1321         return(0);              /* arbitrary */
1322 }
1323
1324 /*
1325  - nch - number of characters in a set
1326  */
1327 static int
1328 nch(struct parse *p, cset *cs)
1329 {
1330         size_t i;
1331         size_t css = (size_t)p->g->csetsize;
1332         int n = 0;
1333
1334         for (i = 0; i < css; i++)
1335                 if (CHIN(cs, i))
1336                         n++;
1337         return(n);
1338 }
1339
1340 /*
1341  - mcadd - add a collating element to a cset
1342  */
1343 static void
1344 mcadd( struct parse *p, cset *cs, const char *cp)
1345 {
1346         size_t oldend = cs->smultis;
1347         void *np;
1348
1349         cs->smultis += strlen(cp) + 1;
1350         np = realloc(cs->multis, cs->smultis);
1351         if (np == NULL) {
1352                 if (cs->multis)
1353                         free(cs->multis);
1354                 cs->multis = NULL;
1355                 SETERROR(REG_ESPACE);
1356                 return;
1357         }
1358         cs->multis = np;
1359
1360         llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1361 }
1362
1363 /*
1364  - mcinvert - invert the list of collating elements in a cset
1365  *
1366  * This would have to know the set of possibilities.  Implementation
1367  * is deferred.
1368  */
1369 /* ARGSUSED */
1370 static void
1371 mcinvert(struct parse *p, cset *cs)
1372 {
1373         assert(cs->multis == NULL);     /* xxx */
1374 }
1375
1376 /*
1377  - mccase - add case counterparts of the list of collating elements in a cset
1378  *
1379  * This would have to know the set of possibilities.  Implementation
1380  * is deferred.
1381  */
1382 /* ARGSUSED */
1383 static void
1384 mccase(struct parse *p, cset *cs)
1385 {
1386         assert(cs->multis == NULL);     /* xxx */
1387 }
1388
1389 /*
1390  - isinsets - is this character in any sets?
1391  */
1392 static int                      /* predicate */
1393 isinsets(struct re_guts *g, int c)
1394 {
1395         uch *col;
1396         int i;
1397         int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1398         unsigned uc = (uch)c;
1399
1400         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1401                 if (col[uc] != 0)
1402                         return(1);
1403         return(0);
1404 }
1405
1406 /*
1407  - samesets - are these two characters in exactly the same sets?
1408  */
1409 static int                      /* predicate */
1410 samesets(struct re_guts *g, int c1, int c2)
1411 {
1412         uch *col;
1413         int i;
1414         int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1415         unsigned uc1 = (uch)c1;
1416         unsigned uc2 = (uch)c2;
1417
1418         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1419                 if (col[uc1] != col[uc2])
1420                         return(0);
1421         return(1);
1422 }
1423
1424 /*
1425  - categorize - sort out character categories
1426  */
1427 static void
1428 categorize(struct parse *p, struct re_guts *g)
1429 {
1430         cat_t *cats = g->categories;
1431         int c;
1432         int c2;
1433         cat_t cat;
1434
1435         /* avoid making error situations worse */
1436         if (p->error != 0)
1437                 return;
1438
1439         for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1440                 if (cats[c] == 0 && isinsets(g, c)) {
1441                         cat = g->ncategories++;
1442                         cats[c] = cat;
1443                         for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1444                                 if (cats[c2] == 0 && samesets(g, c, c2))
1445                                         cats[c2] = cat;
1446                 }
1447 }
1448
1449 /*
1450  - dupl - emit a duplicate of a bunch of sops
1451  */
1452 static sopno                    /* start of duplicate */
1453 dupl(struct parse *p,
1454     sopno start,                /* from here */
1455     sopno finish)               /* to this less one */
1456 {
1457         sopno ret = HERE();
1458         sopno len = finish - start;
1459
1460         assert(finish >= start);
1461         if (len == 0)
1462                 return(ret);
1463         enlarge(p, p->ssize + len);     /* this many unexpected additions */
1464         assert(p->ssize >= p->slen + len);
1465         (void) memmove((char *)(p->strip + p->slen),
1466                 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1467         p->slen += len;
1468         return(ret);
1469 }
1470
1471 /*
1472  - doemit - emit a strip operator
1473  *
1474  * It might seem better to implement this as a macro with a function as
1475  * hard-case backup, but it's just too big and messy unless there are
1476  * some changes to the data structures.  Maybe later.
1477  */
1478 static void
1479 doemit(struct parse *p, sop op, size_t opnd)
1480 {
1481         /* avoid making error situations worse */
1482         if (p->error != 0)
1483                 return;
1484
1485         /* deal with oversize operands ("can't happen", more or less) */
1486         assert(opnd < 1<<OPSHIFT);
1487
1488         /* deal with undersized strip */
1489         if (p->slen >= p->ssize)
1490                 enlarge(p, (p->ssize+1) / 2 * 3);       /* +50% */
1491         assert(p->slen < p->ssize);
1492
1493         /* finally, it's all reduced to the easy case */
1494         p->strip[p->slen++] = SOP(op, opnd);
1495 }
1496
1497 /*
1498  - doinsert - insert a sop into the strip
1499  */
1500 static void
1501 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1502 {
1503         sopno sn;
1504         sop s;
1505         int i;
1506
1507         /* avoid making error situations worse */
1508         if (p->error != 0)
1509                 return;
1510
1511         sn = HERE();
1512         EMIT(op, opnd);         /* do checks, ensure space */
1513         assert(HERE() == sn+1);
1514         s = p->strip[sn];
1515
1516         /* adjust paren pointers */
1517         assert(pos > 0);
1518         for (i = 1; i < NPAREN; i++) {
1519                 if (p->pbegin[i] >= pos) {
1520                         p->pbegin[i]++;
1521                 }
1522                 if (p->pend[i] >= pos) {
1523                         p->pend[i]++;
1524                 }
1525         }
1526
1527         memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1528                                                 (HERE()-pos-1)*sizeof(sop));
1529         p->strip[pos] = s;
1530 }
1531
1532 /*
1533  - dofwd - complete a forward reference
1534  */
1535 static void
1536 dofwd(struct parse *p, sopno pos, sop value)
1537 {
1538         /* avoid making error situations worse */
1539         if (p->error != 0)
1540                 return;
1541
1542         assert(value < 1<<OPSHIFT);
1543         p->strip[pos] = OP(p->strip[pos]) | value;
1544 }
1545
1546 /*
1547  - enlarge - enlarge the strip
1548  */
1549 static void
1550 enlarge(struct parse *p, sopno size)
1551 {
1552         sop *sp;
1553
1554         if (p->ssize >= size)
1555                 return;
1556
1557         if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
1558                 SETERROR(REG_ESPACE);
1559                 return;
1560         }
1561
1562         sp = (sop *)realloc(p->strip, size*sizeof(sop));
1563         if (sp == NULL) {
1564                 SETERROR(REG_ESPACE);
1565                 return;
1566         }
1567         p->strip = sp;
1568         p->ssize = size;
1569 }
1570
1571 /*
1572  - stripsnug - compact the strip
1573  */
1574 static void
1575 stripsnug(struct parse *p, struct re_guts *g)
1576 {
1577         g->nstates = p->slen;
1578         if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
1579                 g->strip = p->strip;
1580                 SETERROR(REG_ESPACE);
1581                 return;
1582         }
1583
1584         g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1585         if (g->strip == NULL) {
1586                 SETERROR(REG_ESPACE);
1587                 g->strip = p->strip;
1588         }
1589 }
1590
1591 /*
1592  - findmust - fill in must and mlen with longest mandatory literal string
1593  *
1594  * This algorithm could do fancy things like analyzing the operands of |
1595  * for common subsequences.  Someday.  This code is simple and finds most
1596  * of the interesting cases.
1597  *
1598  * Note that must and mlen got initialized during setup.
1599  */
1600 static void
1601 findmust(struct parse *p, struct re_guts *g)
1602 {
1603         sop *scan;
1604         sop *start = 0; /* start initialized in the default case, after that */
1605         sop *newstart = 0; /* newstart was initialized in the OCHAR case */
1606         sopno newlen;
1607         sop s;
1608         char *cp;
1609         sopno i;
1610
1611         /* avoid making error situations worse */
1612         if (p->error != 0)
1613                 return;
1614
1615         /* find the longest OCHAR sequence in strip */
1616         newlen = 0;
1617         scan = g->strip + 1;
1618         do {
1619                 s = *scan++;
1620                 switch (OP(s)) {
1621                 case OCHAR:             /* sequence member */
1622                         if (newlen == 0)                /* new sequence */
1623                                 newstart = scan - 1;
1624                         newlen++;
1625                         break;
1626                 case OPLUS_:            /* things that don't break one */
1627                 case OLPAREN:
1628                 case ORPAREN:
1629                         break;
1630                 case OQUEST_:           /* things that must be skipped */
1631                 case OCH_:
1632                         scan--;
1633                         do {
1634                                 scan += OPND(s);
1635                                 s = *scan;
1636                                 /* assert() interferes w debug printouts */
1637                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1638                                                         OP(s) != OOR2) {
1639                                         g->iflags |= REGEX_BAD;
1640                                         return;
1641                                 }
1642                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
1643                         /* fallthrough */
1644                 default:                /* things that break a sequence */
1645                         if (newlen > g->mlen) {         /* ends one */
1646                                 start = newstart;
1647                                 g->mlen = newlen;
1648                         }
1649                         newlen = 0;
1650                         break;
1651                 }
1652         } while (OP(s) != OEND);
1653
1654         if (g->mlen == 0)               /* there isn't one */
1655                 return;
1656
1657         /* turn it into a character string */
1658         g->must = malloc((size_t)g->mlen + 1);
1659         if (g->must == NULL) {          /* argh; just forget it */
1660                 g->mlen = 0;
1661                 return;
1662         }
1663         cp = g->must;
1664         scan = start;
1665         for (i = g->mlen; i > 0; i--) {
1666                 while (OP(s = *scan++) != OCHAR)
1667                         continue;
1668                 assert(cp < g->must + g->mlen);
1669                 *cp++ = (char)OPND(s);
1670         }
1671         assert(cp == g->must + g->mlen);
1672         *cp++ = '\0';           /* just on general principles */
1673 }
1674
1675 /*
1676  - pluscount - count + nesting
1677  */
1678 static sopno                    /* nesting depth */
1679 pluscount(struct parse *p, struct re_guts *g)
1680 {
1681         sop *scan;
1682         sop s;
1683         sopno plusnest = 0;
1684         sopno maxnest = 0;
1685
1686         if (p->error != 0)
1687                 return(0);      /* there may not be an OEND */
1688
1689         scan = g->strip + 1;
1690         do {
1691                 s = *scan++;
1692                 switch (OP(s)) {
1693                 case OPLUS_:
1694                         plusnest++;
1695                         break;
1696                 case O_PLUS:
1697                         if (plusnest > maxnest)
1698                                 maxnest = plusnest;
1699                         plusnest--;
1700                         break;
1701                 }
1702         } while (OP(s) != OEND);
1703         if (plusnest != 0)
1704                 g->iflags |= REGEX_BAD;
1705         return(maxnest);
1706 }