contrib/llvm-project/llvm/lib/Support/regcomp.c

   1 /*-
   2  * This code is derived from OpenBSD's libc/regex, original license follows:
   3  *
   4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   5  * Copyright (c) 1992, 1993, 1994
   6  *      The Regents of the University of California.  All rights reserved.
   7  *
   8  * This code is derived from software contributed to Berkeley by
   9  * Henry Spencer.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 3. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  *
  35  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  36  */
  37
  38 #include <sys/types.h>
  39 #include <stdint.h>
  40 #include <stdio.h>
  41 #include <string.h>
  42 #include <ctype.h>
  43 #include <limits.h>
  44 #include <stdlib.h>
  45 #include "regex_impl.h"
  46
  47 #include "regutils.h"
  48 #include "regex2.h"
  49
  50 #include "llvm/Config/config.h"
  51 #include "llvm/Support/Compiler.h"
  52
  53 /* character-class table */
  54 static struct cclass {
  55         const char *name;
  56         const char *chars;
  57         const char *multis;
  58 } cclasses[] = {
  59         { "alnum",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  60 0123456789",                            ""} ,
  61         { "alpha",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
  62                                         ""} ,
  63         { "blank",      " \t",          ""} ,
  64         { "cntrl",      "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
  65 \25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
  66         { "digit",      "0123456789",   ""} ,
  67         { "graph",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  68 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  69                                         ""} ,
  70         { "lower",      "abcdefghijklmnopqrstuvwxyz",
  71                                         ""} ,
  72         { "print",      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  73 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
  74                                         ""} ,
  75         { "punct",      "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  76                                         ""} ,
  77         { "space",      "\t\n\v\f\r ",  ""} ,
  78         { "upper",      "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
  79                                         ""} ,
  80         { "xdigit",     "0123456789ABCDEFabcdef",
  81                                         ""} ,
  82         { NULL,         0,              "" }
  83 };
  84
  85 /* character-name table */
  86 static struct cname {
  87         const char *name;
  88         char code;
  89 } cnames[] = {
  90         { "NUL",                        '\0' },
  91         { "SOH",                        '\001' },
  92         { "STX",                        '\002' },
  93         { "ETX",                        '\003' },
  94         { "EOT",                        '\004' },
  95         { "ENQ",                        '\005' },
  96         { "ACK",                        '\006' },
  97         { "BEL",                        '\007' },
  98         { "alert",                      '\007' },
  99         { "BS",                         '\010' },
 100         { "backspace",                  '\b' },
 101         { "HT",                         '\011' },
 102         { "tab",                        '\t' },
 103         { "LF",                         '\012' },
 104         { "newline",                    '\n' },
 105         { "VT",                         '\013' },
 106         { "vertical-tab",               '\v' },
 107         { "FF",                         '\014' },
 108         { "form-feed",                  '\f' },
 109         { "CR",                         '\015' },
 110         { "carriage-return",            '\r' },
 111         { "SO",                         '\016' },
 112         { "SI",                         '\017' },
 113         { "DLE",                        '\020' },
 114         { "DC1",                        '\021' },
 115         { "DC2",                        '\022' },
 116         { "DC3",                        '\023' },
 117         { "DC4",                        '\024' },
 118         { "NAK",                        '\025' },
 119         { "SYN",                        '\026' },
 120         { "ETB",                        '\027' },
 121         { "CAN",                        '\030' },
 122         { "EM",                         '\031' },
 123         { "SUB",                        '\032' },
 124         { "ESC",                        '\033' },
 125         { "IS4",                        '\034' },
 126         { "FS",                         '\034' },
 127         { "IS3",                        '\035' },
 128         { "GS",                         '\035' },
 129         { "IS2",                        '\036' },
 130         { "RS",                         '\036' },
 131         { "IS1",                        '\037' },
 132         { "US",                         '\037' },
 133         { "space",                      ' ' },
 134         { "exclamation-mark",           '!' },
 135         { "quotation-mark",             '"' },
 136         { "number-sign",                '#' },
 137         { "dollar-sign",                '$' },
 138         { "percent-sign",               '%' },
 139         { "ampersand",                  '&' },
 140         { "apostrophe",                 '\'' },
 141         { "left-parenthesis",           '(' },
 142         { "right-parenthesis",          ')' },
 143         { "asterisk",                   '*' },
 144         { "plus-sign",                  '+' },
 145         { "comma",                      ',' },
 146         { "hyphen",                     '-' },
 147         { "hyphen-minus",               '-' },
 148         { "period",                     '.' },
 149         { "full-stop",                  '.' },
 150         { "slash",                      '/' },
 151         { "solidus",                    '/' },
 152         { "zero",                       '0' },
 153         { "one",                        '1' },
 154         { "two",                        '2' },
 155         { "three",                      '3' },
 156         { "four",                       '4' },
 157         { "five",                       '5' },
 158         { "six",                        '6' },
 159         { "seven",                      '7' },
 160         { "eight",                      '8' },
 161         { "nine",                       '9' },
 162         { "colon",                      ':' },
 163         { "semicolon",                  ';' },
 164         { "less-than-sign",             '<' },
 165         { "equals-sign",                '=' },
 166         { "greater-than-sign",          '>' },
 167         { "question-mark",              '?' },
 168         { "commercial-at",              '@' },
 169         { "left-square-bracket",        '[' },
 170         { "backslash",                  '\\' },
 171         { "reverse-solidus",            '\\' },
 172         { "right-square-bracket",       ']' },
 173         { "circumflex",                 '^' },
 174         { "circumflex-accent",          '^' },
 175         { "underscore",                 '_' },
 176         { "low-line",                   '_' },
 177         { "grave-accent",               '`' },
 178         { "left-brace",                 '{' },
 179         { "left-curly-bracket",         '{' },
 180         { "vertical-line",              '|' },
 181         { "right-brace",                '}' },
 182         { "right-curly-bracket",        '}' },
 183         { "tilde",                      '~' },
 184         { "DEL",                        '\177' },
 185         { NULL,                         0 }
 186 };
 187
 188 /*
 189  * parse structure, passed up and down to avoid global variables and
 190  * other clumsinesses
 191  */
 192 struct parse {
 193         char *next;             /* next character in RE */
 194         char *end;              /* end of string (-> NUL normally) */
 195         int error;              /* has an error been seen? */
 196         sop *strip;             /* malloced strip */
 197         sopno ssize;            /* malloced strip size (allocated) */
 198         sopno slen;             /* malloced strip length (used) */
 199         int ncsalloc;           /* number of csets allocated */
 200         struct re_guts *g;
 201 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
 202         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
 203         sopno pend[NPAREN];     /* -> ) ([0] unused) */
 204 };
 205
 206 static void p_ere(struct parse *, int);
 207 static void p_ere_exp(struct parse *);
 208 static void p_str(struct parse *);
 209 static void p_bre(struct parse *, int, int);
 210 static int p_simp_re(struct parse *, int);
 211 static int p_count(struct parse *);
 212 static void p_bracket(struct parse *);
 213 static void p_b_term(struct parse *, cset *);
 214 static void p_b_cclass(struct parse *, cset *);
 215 static void p_b_eclass(struct parse *, cset *);
 216 static char p_b_symbol(struct parse *);
 217 static char p_b_coll_elem(struct parse *, int);
 218 static char othercase(int);
 219 static void bothcases(struct parse *, int);
 220 static void ordinary(struct parse *, int);
 221 static void nonnewline(struct parse *);
 222 static void repeat(struct parse *, sopno, int, int);
 223 static int seterr(struct parse *, int);
 224 static cset *allocset(struct parse *);
 225 static void freeset(struct parse *, cset *);
 226 static int freezeset(struct parse *, cset *);
 227 static int firstch(struct parse *, cset *);
 228 static int nch(struct parse *, cset *);
 229 static void mcadd(struct parse *, cset *, const char *);
 230 static void mcinvert(struct parse *, cset *);
 231 static void mccase(struct parse *, cset *);
 232 static int isinsets(struct re_guts *, int);
 233 static int samesets(struct re_guts *, int, int);
 234 static void categorize(struct parse *, struct re_guts *);
 235 static sopno dupl(struct parse *, sopno, sopno);
 236 static void doemit(struct parse *, sop, size_t);
 237 static void doinsert(struct parse *, sop, size_t, sopno);
 238 static void dofwd(struct parse *, sopno, sop);
 239 static void enlarge(struct parse *, sopno);
 240 static void stripsnug(struct parse *, struct re_guts *);
 241 static void findmust(struct parse *, struct re_guts *);
 242 static sopno pluscount(struct parse *, struct re_guts *);
 243
 244 static char nuls[10];           /* place to point scanner in event of error */
 245
 246 /*
 247  * macros for use with parse structure
 248  * BEWARE:  these know that the parse structure is named `p' !!!
 249  */
 250 #define PEEK()  (*p->next)
 251 #define PEEK2() (*(p->next+1))
 252 #define MORE()  (p->end - p->next > 0)
 253 #define MORE2() (p->end - p->next > 1)
 254 #define SEE(c)  (MORE() && PEEK() == (c))
 255 #define SEETWO(a, b)    (MORE2() && PEEK() == (a) && PEEK2() == (b))
 256 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 257 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 258 #define NEXT()  (p->next++)
 259 #define NEXT2() (p->next += 2)
 260 #define NEXTn(n)        (p->next += (n))
 261 #define GETNEXT()       (*p->next++)
 262 #define SETERROR(e)     seterr(p, (e))
 263 #define REQUIRE(co, e)  (void)((co) || SETERROR(e))
 264 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
 265 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
 266 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 267 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
 268 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 269 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 270 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 271 #define HERE()          (p->slen)
 272 #define THERE()         (p->slen - 1)
 273 #define THERETHERE()    (p->slen - 2)
 274 #define DROP(n) (p->slen -= (n))
 275
 276 #ifdef  _POSIX2_RE_DUP_MAX
 277 #define DUPMAX  _POSIX2_RE_DUP_MAX
 278 #else
 279 #define DUPMAX  255
 280 #endif
 281 #define INFINITY        (DUPMAX + 1)
 282
 283 #ifndef NDEBUG
 284 static int never = 0;           /* for use in asserts; shuts lint up */
 285 #else
 286 #define never   0               /* some <assert.h>s have bugs too */
 287 #endif
 288
 289 /*
 290  - llvm_regcomp - interface for parser and compilation
 291  */
 292 int                             /* 0 success, otherwise REG_something */
 293 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
 294 {
 295         struct parse pa;
 296         struct re_guts *g;
 297         struct parse *p = &pa;
 298         int i;
 299         size_t len;
 300 #ifdef REDEBUG
 301 #       define  GOODFLAGS(f)    (f)
 302 #else
 303 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 304 #endif
 305
 306         cflags = GOODFLAGS(cflags);
 307         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 308                 return(REG_INVARG);
 309
 310         if (cflags&REG_PEND) {
 311                 if (preg->re_endp < pattern)
 312                         return(REG_INVARG);
 313                 len = preg->re_endp - pattern;
 314         } else
 315                 len = strlen((const char *)pattern);
 316
 317         /* do the mallocs early so failure handling is easy */
 318         g = (struct re_guts *)malloc(sizeof(struct re_guts) +
 319                                                         (NC-1)*sizeof(cat_t));
 320         if (g == NULL)
 321                 return(REG_ESPACE);
 322         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 323         p->strip = (sop *)calloc(p->ssize, sizeof(sop));
 324         p->slen = 0;
 325         if (p->strip == NULL) {
 326                 free((char *)g);
 327                 return(REG_ESPACE);
 328         }
 329
 330         /* set things up */
 331         p->g = g;
 332         /* suppress warning from the following explicit cast. */
 333 #ifdef __GNUC__
 334 #pragma GCC diagnostic push
 335 #pragma GCC diagnostic ignored "-Wcast-qual"
 336 #endif /* __GNUC__ */
 337         p->next = (char *)pattern;      /* convenience; we do not modify it */
 338 #ifdef __GNUC__
 339 #pragma GCC diagnostic pop
 340 #endif /* __GNUC__ */
 341         p->end = p->next + len;
 342         p->error = 0;
 343         p->ncsalloc = 0;
 344         for (i = 0; i < NPAREN; i++) {
 345                 p->pbegin[i] = 0;
 346                 p->pend[i] = 0;
 347         }
 348         g->csetsize = NC;
 349         g->sets = NULL;
 350         g->setbits = NULL;
 351         g->ncsets = 0;
 352         g->cflags = cflags;
 353         g->iflags = 0;
 354         g->nbol = 0;
 355         g->neol = 0;
 356         g->must = NULL;
 357         g->mlen = 0;
 358         g->nsub = 0;
 359         g->ncategories = 1;     /* category 0 is "everything else" */
 360         g->categories = &g->catspace[-(CHAR_MIN)];
 361         (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 362         g->backrefs = 0;
 363
 364         /* do it */
 365         EMIT(OEND, 0);
 366         g->firststate = THERE();
 367         if (cflags&REG_EXTENDED)
 368                 p_ere(p, OUT);
 369         else if (cflags&REG_NOSPEC)
 370                 p_str(p);
 371         else
 372                 p_bre(p, OUT, OUT);
 373         EMIT(OEND, 0);
 374         g->laststate = THERE();
 375
 376         /* tidy up loose ends and fill things in */
 377         categorize(p, g);
 378         stripsnug(p, g);
 379         findmust(p, g);
 380         g->nplus = pluscount(p, g);
 381         g->magic = MAGIC2;
 382         preg->re_nsub = g->nsub;
 383         preg->re_g = g;
 384         preg->re_magic = MAGIC1;
 385 #ifndef REDEBUG
 386         /* not debugging, so can't rely on the assert() in llvm_regexec() */
 387         if (g->iflags&REGEX_BAD)
 388                 SETERROR(REG_ASSERT);
 389 #endif
 390
 391         /* win or lose, we're done */
 392         if (p->error != 0)      /* lose */
 393                 llvm_regfree(preg);
 394         return(p->error);
 395 }
 396
 397 /*
 398  - p_ere - ERE parser top level, concatenation and alternation
 399  */
 400 static void
 401 p_ere(struct parse *p, int stop)        /* character this ERE should end at */
 402 {
 403         char c;
 404         sopno prevback = 0;
 405         sopno prevfwd = 0;
 406         sopno conc;
 407         int first = 1;          /* is this the first alternative? */
 408
 409         for (;;) {
 410                 /* do a bunch of concatenated expressions */
 411                 conc = HERE();
 412                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 413                         p_ere_exp(p);
 414                 REQUIRE(HERE() != conc, REG_EMPTY);     /* require nonempty */
 415
 416                 if (!EAT('|'))
 417                         break;          /* NOTE BREAK OUT */
 418
 419                 if (first) {
 420                         INSERT(OCH_, conc);     /* offset is wrong */
 421                         prevfwd = conc;
 422                         prevback = conc;
 423                         first = 0;
 424                 }
 425                 ASTERN(OOR1, prevback);
 426                 prevback = THERE();
 427                 AHEAD(prevfwd);                 /* fix previous offset */
 428                 prevfwd = HERE();
 429                 EMIT(OOR2, 0);                  /* offset is very wrong */
 430         }
 431
 432         if (!first) {           /* tail-end fixups */
 433                 AHEAD(prevfwd);
 434                 ASTERN(O_CH, prevback);
 435         }
 436
 437         assert(!MORE() || SEE(stop));
 438 }
 439
 440 /*
 441  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 442  */
 443 static void
 444 p_ere_exp(struct parse *p)
 445 {
 446         char c;
 447         sopno pos;
 448         int count;
 449         int count2;
 450         int backrefnum;
 451         sopno subno;
 452         int wascaret = 0;
 453
 454         assert(MORE());         /* caller should have ensured this */
 455         c = GETNEXT();
 456
 457         pos = HERE();
 458         switch (c) {
 459         case '(':
 460                 REQUIRE(MORE(), REG_EPAREN);
 461                 p->g->nsub++;
 462                 subno = p->g->nsub;
 463                 if (subno < NPAREN)
 464                         p->pbegin[subno] = HERE();
 465                 EMIT(OLPAREN, subno);
 466                 if (!SEE(')'))
 467                         p_ere(p, ')');
 468                 if (subno < NPAREN) {
 469                         p->pend[subno] = HERE();
 470                         assert(p->pend[subno] != 0);
 471                 }
 472                 EMIT(ORPAREN, subno);
 473                 MUSTEAT(')', REG_EPAREN);
 474                 break;
 475 #ifndef POSIX_MISTAKE
 476         case ')':               /* happens only if no current unmatched ( */
 477                 /*
 478                  * You may ask, why the ifndef?  Because I didn't notice
 479                  * this until slightly too late for 1003.2, and none of the
 480                  * other 1003.2 regular-expression reviewers noticed it at
 481                  * all.  So an unmatched ) is legal POSIX, at least until
 482                  * we can get it fixed.
 483                  */
 484                 SETERROR(REG_EPAREN);
 485                 break;
 486 #endif
 487         case '^':
 488                 EMIT(OBOL, 0);
 489                 p->g->iflags |= USEBOL;
 490                 p->g->nbol++;
 491                 wascaret = 1;
 492                 break;
 493         case '$':
 494                 EMIT(OEOL, 0);
 495                 p->g->iflags |= USEEOL;
 496                 p->g->neol++;
 497                 break;
 498         case '|':
 499                 SETERROR(REG_EMPTY);
 500                 break;
 501         case '*':
 502         case '+':
 503         case '?':
 504                 SETERROR(REG_BADRPT);
 505                 break;
 506         case '.':
 507                 if (p->g->cflags&REG_NEWLINE)
 508                         nonnewline(p);
 509                 else
 510                         EMIT(OANY, 0);
 511                 break;
 512         case '[':
 513                 p_bracket(p);
 514                 break;
 515         case '\\':
 516                 REQUIRE(MORE(), REG_EESCAPE);
 517                 c = GETNEXT();
 518                 if (c >= '1' && c <= '9') {
 519                         /* \[0-9] is taken to be a back-reference to a previously specified
 520                          * matching group. backrefnum will hold the number. The matching
 521                          * group must exist (i.e. if \4 is found there must have been at
 522                          * least 4 matching groups specified in the pattern previously).
 523                          */
 524                         backrefnum = c - '0';
 525                         if (p->pend[backrefnum] == 0) {
 526                                 SETERROR(REG_ESUBREG);
 527                                 break;
 528                         }
 529
 530                         /* Make sure everything checks out and emit the sequence
 531                          * that marks a back-reference to the parse structure.
 532                          */
 533                         assert(backrefnum <= p->g->nsub);
 534                         EMIT(OBACK_, backrefnum);
 535                         assert(p->pbegin[backrefnum] != 0);
 536                         assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
 537                         assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
 538                         (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
 539                         EMIT(O_BACK, backrefnum);
 540                         p->g->backrefs = 1;
 541                 } else {
 542                         /* Other chars are simply themselves when escaped with a backslash.
 543                          */
 544                         ordinary(p, c);
 545                 }
 546                 break;
 547         case '{':               /* okay as ordinary except if digit follows */
 548                 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
 549                 LLVM_FALLTHROUGH;
 550         default:
 551                 ordinary(p, c);
 552                 break;
 553         }
 554
 555         if (!MORE())
 556                 return;
 557         c = PEEK();
 558         /* we call { a repetition if followed by a digit */
 559         if (!( c == '*' || c == '+' || c == '?' ||
 560                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
 561                 return;         /* no repetition, we're done */
 562         NEXT();
 563
 564         REQUIRE(!wascaret, REG_BADRPT);
 565         switch (c) {
 566         case '*':       /* implemented as +? */
 567                 /* this case does not require the (y|) trick, noKLUDGE */
 568                 INSERT(OPLUS_, pos);
 569                 ASTERN(O_PLUS, pos);
 570                 INSERT(OQUEST_, pos);
 571                 ASTERN(O_QUEST, pos);
 572                 break;
 573         case '+':
 574                 INSERT(OPLUS_, pos);
 575                 ASTERN(O_PLUS, pos);
 576                 break;
 577         case '?':
 578                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 579                 INSERT(OCH_, pos);              /* offset slightly wrong */
 580                 ASTERN(OOR1, pos);              /* this one's right */
 581                 AHEAD(pos);                     /* fix the OCH_ */
 582                 EMIT(OOR2, 0);                  /* offset very wrong... */
 583                 AHEAD(THERE());                 /* ...so fix it */
 584                 ASTERN(O_CH, THERETHERE());
 585                 break;
 586         case '{':
 587                 count = p_count(p);
 588                 if (EAT(',')) {
 589                         if (isdigit((uch)PEEK())) {
 590                                 count2 = p_count(p);
 591                                 REQUIRE(count <= count2, REG_BADBR);
 592                         } else          /* single number with comma */
 593                                 count2 = INFINITY;
 594                 } else          /* just a single number */
 595                         count2 = count;
 596                 repeat(p, pos, count, count2);
 597                 if (!EAT('}')) {        /* error heuristics */
 598                         while (MORE() && PEEK() != '}')
 599                                 NEXT();
 600                         REQUIRE(MORE(), REG_EBRACE);
 601                         SETERROR(REG_BADBR);
 602                 }
 603                 break;
 604         }
 605
 606         if (!MORE())
 607                 return;
 608         c = PEEK();
 609         if (!( c == '*' || c == '+' || c == '?' ||
 610                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
 611                 return;
 612         SETERROR(REG_BADRPT);
 613 }
 614
 615 /*
 616  - p_str - string (no metacharacters) "parser"
 617  */
 618 static void
 619 p_str(struct parse *p)
 620 {
 621         REQUIRE(MORE(), REG_EMPTY);
 622         while (MORE())
 623                 ordinary(p, GETNEXT());
 624 }
 625
 626 /*
 627  - p_bre - BRE parser top level, anchoring and concatenation
 628  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 629  *
 630  * This implementation is a bit of a kludge, in that a trailing $ is first
 631  * taken as an ordinary character and then revised to be an anchor.  The
 632  * only undesirable side effect is that '$' gets included as a character
 633  * category in such cases.  This is fairly harmless; not worth fixing.
 634  * The amount of lookahead needed to avoid this kludge is excessive.
 635  */
 636 static void
 637 p_bre(struct parse *p,
 638     int end1,           /* first terminating character */
 639     int end2)           /* second terminating character */
 640 {
 641         sopno start = HERE();
 642         int first = 1;                  /* first subexpression? */
 643         int wasdollar = 0;
 644
 645         if (EAT('^')) {
 646                 EMIT(OBOL, 0);
 647                 p->g->iflags |= USEBOL;
 648                 p->g->nbol++;
 649         }
 650         while (MORE() && !SEETWO(end1, end2)) {
 651                 wasdollar = p_simp_re(p, first);
 652                 first = 0;
 653         }
 654         if (wasdollar) {        /* oops, that was a trailing anchor */
 655                 DROP(1);
 656                 EMIT(OEOL, 0);
 657                 p->g->iflags |= USEEOL;
 658                 p->g->neol++;
 659         }
 660
 661         REQUIRE(HERE() != start, REG_EMPTY);    /* require nonempty */
 662 }
 663
 664 /*
 665  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 666  */
 667 static int                      /* was the simple RE an unbackslashed $? */
 668 p_simp_re(struct parse *p,
 669     int starordinary)           /* is a leading * an ordinary character? */
 670 {
 671         int c;
 672         int count;
 673         int count2;
 674         sopno pos;
 675         int i;
 676         sopno subno;
 677 #       define  BACKSL  (1<<CHAR_BIT)
 678
 679         pos = HERE(); /* repetition op, if any, covers from here */
 680
 681         assert(MORE()); /* caller should have ensured this */
 682         c = GETNEXT();
 683         if (c == '\\') {
 684                 REQUIRE(MORE(), REG_EESCAPE);
 685                 c = BACKSL | GETNEXT();
 686         }
 687         switch (c) {
 688         case '.':
 689                 if (p->g->cflags&REG_NEWLINE)
 690                         nonnewline(p);
 691                 else
 692                         EMIT(OANY, 0);
 693                 break;
 694         case '[':
 695                 p_bracket(p);
 696                 break;
 697         case BACKSL|'{':
 698                 SETERROR(REG_BADRPT);
 699                 break;
 700         case BACKSL|'(':
 701                 p->g->nsub++;
 702                 subno = p->g->nsub;
 703                 if (subno < NPAREN)
 704                         p->pbegin[subno] = HERE();
 705                 EMIT(OLPAREN, subno);
 706                 /* the MORE here is an error heuristic */
 707                 if (MORE() && !SEETWO('\\', ')'))
 708                         p_bre(p, '\\', ')');
 709                 if (subno < NPAREN) {
 710                         p->pend[subno] = HERE();
 711                         assert(p->pend[subno] != 0);
 712                 }
 713                 EMIT(ORPAREN, subno);
 714                 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 715                 break;
 716         case BACKSL|')':        /* should not get here -- must be user */
 717         case BACKSL|'}':
 718                 SETERROR(REG_EPAREN);
 719                 break;
 720         case BACKSL|'1':
 721         case BACKSL|'2':
 722         case BACKSL|'3':
 723         case BACKSL|'4':
 724         case BACKSL|'5':
 725         case BACKSL|'6':
 726         case BACKSL|'7':
 727         case BACKSL|'8':
 728         case BACKSL|'9':
 729                 i = (c&~BACKSL) - '0';
 730                 assert(i < NPAREN);
 731                 if (p->pend[i] != 0) {
 732                         assert(i <= p->g->nsub);
 733                         EMIT(OBACK_, i);
 734                         assert(p->pbegin[i] != 0);
 735                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 736                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 737                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 738                         EMIT(O_BACK, i);
 739                 } else
 740                         SETERROR(REG_ESUBREG);
 741                 p->g->backrefs = 1;
 742                 break;
 743         case '*':
 744                 REQUIRE(starordinary, REG_BADRPT);
 745                 LLVM_FALLTHROUGH;
 746         default:
 747                 ordinary(p, (char)c);
 748                 break;
 749         }
 750
 751         if (EAT('*')) {         /* implemented as +? */
 752                 /* this case does not require the (y|) trick, noKLUDGE */
 753                 INSERT(OPLUS_, pos);
 754                 ASTERN(O_PLUS, pos);
 755                 INSERT(OQUEST_, pos);
 756                 ASTERN(O_QUEST, pos);
 757         } else if (EATTWO('\\', '{')) {
 758                 count = p_count(p);
 759                 if (EAT(',')) {
 760                         if (MORE() && isdigit((uch)PEEK())) {
 761                                 count2 = p_count(p);
 762                                 REQUIRE(count <= count2, REG_BADBR);
 763                         } else          /* single number with comma */
 764                                 count2 = INFINITY;
 765                 } else          /* just a single number */
 766                         count2 = count;
 767                 repeat(p, pos, count, count2);
 768                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 769                         while (MORE() && !SEETWO('\\', '}'))
 770                                 NEXT();
 771                         REQUIRE(MORE(), REG_EBRACE);
 772                         SETERROR(REG_BADBR);
 773                 }
 774         } else if (c == '$')    /* $ (but not \$) ends it */
 775                 return(1);
 776
 777         return(0);
 778 }
 779
 780 /*
 781  - p_count - parse a repetition count
 782  */
 783 static int                      /* the value */
 784 p_count(struct parse *p)
 785 {
 786         int count = 0;
 787         int ndigits = 0;
 788
 789         while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
 790                 count = count*10 + (GETNEXT() - '0');
 791                 ndigits++;
 792         }
 793
 794         REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 795         return(count);
 796 }
 797
 798 /*
 799  - p_bracket - parse a bracketed character list
 800  *
 801  * Note a significant property of this code:  if the allocset() did SETERROR,
 802  * no set operations are done.
 803  */
 804 static void
 805 p_bracket(struct parse *p)
 806 {
 807         cset *cs;
 808         int invert = 0;
 809
 810         /* Dept of Truly Sickening Special-Case Kludges */
 811         if (p->end - p->next > 5) {
 812                 if (strncmp(p->next, "[:<:]]", 6) == 0) {
 813                         EMIT(OBOW, 0);
 814                         NEXTn(6);
 815                         return;
 816                 }
 817                 if (strncmp(p->next, "[:>:]]", 6) == 0) {
 818                         EMIT(OEOW, 0);
 819                         NEXTn(6);
 820                         return;
 821                 }
 822         }
 823
 824         if ((cs = allocset(p)) == NULL) {
 825                 /* allocset did set error status in p */
 826                 return;
 827         }
 828
 829         if (EAT('^'))
 830                 invert++;       /* make note to invert set at end */
 831         if (EAT(']'))
 832                 CHadd(cs, ']');
 833         else if (EAT('-'))
 834                 CHadd(cs, '-');
 835         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 836                 p_b_term(p, cs);
 837         if (EAT('-'))
 838                 CHadd(cs, '-');
 839         MUSTEAT(']', REG_EBRACK);
 840
 841         if (p->error != 0) {    /* don't mess things up further */
 842                 freeset(p, cs);
 843                 return;
 844         }
 845
 846         if (p->g->cflags&REG_ICASE) {
 847                 int i;
 848                 int ci;
 849
 850                 for (i = p->g->csetsize - 1; i >= 0; i--)
 851                         if (CHIN(cs, i) && isalpha(i)) {
 852                                 ci = othercase(i);
 853                                 if (ci != i)
 854                                         CHadd(cs, ci);
 855                         }
 856                 if (cs->multis != NULL)
 857                         mccase(p, cs);
 858         }
 859         if (invert) {
 860                 int i;
 861
 862                 for (i = p->g->csetsize - 1; i >= 0; i--)
 863                         if (CHIN(cs, i))
 864                                 CHsub(cs, i);
 865                         else
 866                                 CHadd(cs, i);
 867                 if (p->g->cflags&REG_NEWLINE)
 868                         CHsub(cs, '\n');
 869                 if (cs->multis != NULL)
 870                         mcinvert(p, cs);
 871         }
 872
 873         assert(cs->multis == NULL);             /* xxx */
 874
 875         if (nch(p, cs) == 1) {          /* optimize singleton sets */
 876                 ordinary(p, firstch(p, cs));
 877                 freeset(p, cs);
 878         } else
 879                 EMIT(OANYOF, freezeset(p, cs));
 880 }
 881
 882 /*
 883  - p_b_term - parse one term of a bracketed character list
 884  */
 885 static void
 886 p_b_term(struct parse *p, cset *cs)
 887 {
 888         char c;
 889         char start, finish;
 890         int i;
 891
 892         /* classify what we've got */
 893         switch ((MORE()) ? PEEK() : '\0') {
 894         case '[':
 895                 c = (MORE2()) ? PEEK2() : '\0';
 896                 break;
 897         case '-':
 898                 SETERROR(REG_ERANGE);
 899                 return;                 /* NOTE RETURN */
 900                 break;
 901         default:
 902                 c = '\0';
 903                 break;
 904         }
 905
 906         switch (c) {
 907         case ':':               /* character class */
 908                 NEXT2();
 909                 REQUIRE(MORE(), REG_EBRACK);
 910                 c = PEEK();
 911                 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 912                 p_b_cclass(p, cs);
 913                 REQUIRE(MORE(), REG_EBRACK);
 914                 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 915                 break;
 916         case '=':               /* equivalence class */
 917                 NEXT2();
 918                 REQUIRE(MORE(), REG_EBRACK);
 919                 c = PEEK();
 920                 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 921                 p_b_eclass(p, cs);
 922                 REQUIRE(MORE(), REG_EBRACK);
 923                 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 924                 break;
 925         default:                /* symbol, ordinary character, or range */
 926 /* xxx revision needed for multichar stuff */
 927                 start = p_b_symbol(p);
 928                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 929                         /* range */
 930                         NEXT();
 931                         if (EAT('-'))
 932                                 finish = '-';
 933                         else
 934                                 finish = p_b_symbol(p);
 935                 } else
 936                         finish = start;
 937 /* xxx what about signed chars here... */
 938                 REQUIRE(start <= finish, REG_ERANGE);
 939                 for (i = start; i <= finish; i++)
 940                         CHadd(cs, i);
 941                 break;
 942         }
 943 }
 944
 945 /*
 946  - p_b_cclass - parse a character-class name and deal with it
 947  */
 948 static void
 949 p_b_cclass(struct parse *p, cset *cs)
 950 {
 951         char *sp = p->next;
 952         struct cclass *cp;
 953         size_t len;
 954         const char *u;
 955         char c;
 956
 957         while (MORE() && isalpha((uch)PEEK()))
 958                 NEXT();
 959         len = p->next - sp;
 960         for (cp = cclasses; cp->name != NULL; cp++)
 961                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 962                         break;
 963         if (cp->name == NULL) {
 964                 /* oops, didn't find it */
 965                 SETERROR(REG_ECTYPE);
 966                 return;
 967         }
 968
 969         u = cp->chars;
 970         while ((c = *u++) != '\0')
 971                 CHadd(cs, c);
 972         for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
 973                 MCadd(p, cs, u);
 974 }
 975
 976 /*
 977  - p_b_eclass - parse an equivalence-class name and deal with it
 978  *
 979  * This implementation is incomplete. xxx
 980  */
 981 static void
 982 p_b_eclass(struct parse *p, cset *cs)
 983 {
 984         char c;
 985
 986         c = p_b_coll_elem(p, '=');
 987         CHadd(cs, c);
 988 }
 989
 990 /*
 991  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 992  */
 993 static char                     /* value of symbol */
 994 p_b_symbol(struct parse *p)
 995 {
 996         char value;
 997
 998         REQUIRE(MORE(), REG_EBRACK);
 999         if (!EATTWO('[', '.'))
1000                 return(GETNEXT());
1001
1002         /* collating symbol */
1003         value = p_b_coll_elem(p, '.');
1004         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1005         return(value);
1006 }
1007
1008 /*
1009  - p_b_coll_elem - parse a collating-element name and look it up
1010  */
1011 static char                     /* value of collating element */
1012 p_b_coll_elem(struct parse *p,
1013     int endc)                   /* name ended by endc,']' */
1014 {
1015         char *sp = p->next;
1016         struct cname *cp;
1017         size_t len;
1018
1019         while (MORE() && !SEETWO(endc, ']'))
1020                 NEXT();
1021         if (!MORE()) {
1022                 SETERROR(REG_EBRACK);
1023                 return(0);
1024         }
1025         len = p->next - sp;
1026         for (cp = cnames; cp->name != NULL; cp++)
1027                 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1028                         return(cp->code);       /* known name */
1029         if (len == 1)
1030                 return(*sp);    /* single character */
1031         SETERROR(REG_ECOLLATE);                 /* neither */
1032         return(0);
1033 }
1034
1035 /*
1036  - othercase - return the case counterpart of an alphabetic
1037  */
1038 static char                     /* if no counterpart, return ch */
1039 othercase(int ch)
1040 {
1041         ch = (uch)ch;
1042         assert(isalpha(ch));
1043         if (isupper(ch))
1044                 return ((uch)tolower(ch));
1045         else if (islower(ch))
1046                 return ((uch)toupper(ch));
1047         else                    /* peculiar, but could happen */
1048                 return(ch);
1049 }
1050
1051 /*
1052  - bothcases - emit a dualcase version of a two-case character
1053  *
1054  * Boy, is this implementation ever a kludge...
1055  */
1056 static void
1057 bothcases(struct parse *p, int ch)
1058 {
1059         char *oldnext = p->next;
1060         char *oldend = p->end;
1061         char bracket[3];
1062
1063         ch = (uch)ch;
1064         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
1065         p->next = bracket;
1066         p->end = bracket+2;
1067         bracket[0] = ch;
1068         bracket[1] = ']';
1069         bracket[2] = '\0';
1070         p_bracket(p);
1071         assert(p->next == bracket+2);
1072         p->next = oldnext;
1073         p->end = oldend;
1074 }
1075
1076 /*
1077  - ordinary - emit an ordinary character
1078  */
1079 static void
1080 ordinary(struct parse *p, int ch)
1081 {
1082         cat_t *cap = p->g->categories;
1083
1084         if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1085                 bothcases(p, ch);
1086         else {
1087                 EMIT(OCHAR, (uch)ch);
1088                 if (cap[ch] == 0)
1089                         cap[ch] = p->g->ncategories++;
1090         }
1091 }
1092
1093 /*
1094  - nonnewline - emit REG_NEWLINE version of OANY
1095  *
1096  * Boy, is this implementation ever a kludge...
1097  */
1098 static void
1099 nonnewline(struct parse *p)
1100 {
1101         char *oldnext = p->next;
1102         char *oldend = p->end;
1103         char bracket[4];
1104
1105         p->next = bracket;
1106         p->end = bracket+3;
1107         bracket[0] = '^';
1108         bracket[1] = '\n';
1109         bracket[2] = ']';
1110         bracket[3] = '\0';
1111         p_bracket(p);
1112         assert(p->next == bracket+3);
1113         p->next = oldnext;
1114         p->end = oldend;
1115 }
1116
1117 /*
1118  - repeat - generate code for a bounded repetition, recursively if needed
1119  */
1120 static void
1121 repeat(struct parse *p,
1122     sopno start,                /* operand from here to end of strip */
1123     int from,                   /* repeated from this number */
1124     int to)                     /* to this number of times (maybe INFINITY) */
1125 {
1126         sopno finish = HERE();
1127 #       define  N       2
1128 #       define  INF     3
1129 #       define  REP(f, t)       ((f)*8 + (t))
1130 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1131         sopno copy;
1132
1133         if (p->error != 0)      /* head off possible runaway recursion */
1134                 return;
1135
1136         assert(from <= to);
1137
1138         switch (REP(MAP(from), MAP(to))) {
1139         case REP(0, 0):                 /* must be user doing this */
1140                 DROP(finish-start);     /* drop the operand */
1141                 break;
1142         case REP(0, 1):                 /* as x{1,1}? */
1143         case REP(0, N):                 /* as x{1,n}? */
1144         case REP(0, INF):               /* as x{1,}? */
1145                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1146                 INSERT(OCH_, start);            /* offset is wrong... */
1147                 repeat(p, start+1, 1, to);
1148                 ASTERN(OOR1, start);
1149                 AHEAD(start);                   /* ... fix it */
1150                 EMIT(OOR2, 0);
1151                 AHEAD(THERE());
1152                 ASTERN(O_CH, THERETHERE());
1153                 break;
1154         case REP(1, 1):                 /* trivial case */
1155                 /* done */
1156                 break;
1157         case REP(1, N):                 /* as x?x{1,n-1} */
1158                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1159                 INSERT(OCH_, start);
1160                 ASTERN(OOR1, start);
1161                 AHEAD(start);
1162                 EMIT(OOR2, 0);                  /* offset very wrong... */
1163                 AHEAD(THERE());                 /* ...so fix it */
1164                 ASTERN(O_CH, THERETHERE());
1165                 copy = dupl(p, start+1, finish+1);
1166                 assert(copy == finish+4);
1167                 repeat(p, copy, 1, to-1);
1168                 break;
1169         case REP(1, INF):               /* as x+ */
1170                 INSERT(OPLUS_, start);
1171                 ASTERN(O_PLUS, start);
1172                 break;
1173         case REP(N, N):                 /* as xx{m-1,n-1} */
1174                 copy = dupl(p, start, finish);
1175                 repeat(p, copy, from-1, to-1);
1176                 break;
1177         case REP(N, INF):               /* as xx{n-1,INF} */
1178                 copy = dupl(p, start, finish);
1179                 repeat(p, copy, from-1, to);
1180                 break;
1181         default:                        /* "can't happen" */
1182                 SETERROR(REG_ASSERT);   /* just in case */
1183                 break;
1184         }
1185 }
1186
1187 /*
1188  - seterr - set an error condition
1189  */
1190 static int                      /* useless but makes type checking happy */
1191 seterr(struct parse *p, int e)
1192 {
1193         if (p->error == 0)      /* keep earliest error condition */
1194                 p->error = e;
1195         p->next = nuls;         /* try to bring things to a halt */
1196         p->end = nuls;
1197         return(0);              /* make the return value well-defined */
1198 }
1199
1200 /*
1201  - allocset - allocate a set of characters for []
1202  */
1203 static cset *
1204 allocset(struct parse *p)
1205 {
1206         int no = p->g->ncsets++;
1207         size_t nc;
1208         size_t nbytes;
1209         cset *cs;
1210         size_t css = (size_t)p->g->csetsize;
1211         int i;
1212
1213         if (no >= p->ncsalloc) {        /* need another column of space */
1214                 void *ptr;
1215
1216                 p->ncsalloc += CHAR_BIT;
1217                 nc = p->ncsalloc;
1218                 if (nc > SIZE_MAX / sizeof(cset))
1219                         goto nomem;
1220                 assert(nc % CHAR_BIT == 0);
1221                 nbytes = nc / CHAR_BIT * css;
1222
1223                 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1224                 if (ptr == NULL)
1225                         goto nomem;
1226                 p->g->sets = ptr;
1227
1228                 ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1229                 if (ptr == NULL)
1230                         goto nomem;
1231                 p->g->setbits = ptr;
1232
1233                 for (i = 0; i < no; i++)
1234                         p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1235
1236                 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1237         }
1238         /* XXX should not happen */
1239         if (p->g->sets == NULL || p->g->setbits == NULL)
1240                 goto nomem;
1241
1242         cs = &p->g->sets[no];
1243         cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1244         cs->mask = 1 << ((no) % CHAR_BIT);
1245         cs->hash = 0;
1246         cs->smultis = 0;
1247         cs->multis = NULL;
1248
1249         return(cs);
1250 nomem:
1251         free(p->g->sets);
1252         p->g->sets = NULL;
1253         free(p->g->setbits);
1254         p->g->setbits = NULL;
1255
1256         SETERROR(REG_ESPACE);
1257         /* caller's responsibility not to do set ops */
1258         return(NULL);
1259 }
1260
1261 /*
1262  - freeset - free a now-unused set
1263  */
1264 static void
1265 freeset(struct parse *p, cset *cs)
1266 {
1267         size_t i;
1268         cset *top = &p->g->sets[p->g->ncsets];
1269         size_t css = (size_t)p->g->csetsize;
1270
1271         for (i = 0; i < css; i++)
1272                 CHsub(cs, i);
1273         if (cs == top-1)        /* recover only the easy case */
1274                 p->g->ncsets--;
1275 }
1276
1277 /*
1278  - freezeset - final processing on a set of characters
1279  *
1280  * The main task here is merging identical sets.  This is usually a waste
1281  * of time (although the hash code minimizes the overhead), but can win
1282  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1283  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1284  * the same value!
1285  */
1286 static int                      /* set number */
1287 freezeset(struct parse *p, cset *cs)
1288 {
1289         uch h = cs->hash;
1290         size_t i;
1291         cset *top = &p->g->sets[p->g->ncsets];
1292         cset *cs2;
1293         size_t css = (size_t)p->g->csetsize;
1294
1295         /* look for an earlier one which is the same */
1296         for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1297                 if (cs2->hash == h && cs2 != cs) {
1298                         /* maybe */
1299                         for (i = 0; i < css; i++)
1300                                 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1301                                         break;          /* no */
1302                         if (i == css)
1303                                 break;                  /* yes */
1304                 }
1305
1306         if (cs2 < top) {        /* found one */
1307                 freeset(p, cs);
1308                 cs = cs2;
1309         }
1310
1311         return((int)(cs - p->g->sets));
1312 }
1313
1314 /*
1315  - firstch - return first character in a set (which must have at least one)
1316  */
1317 static int                      /* character; there is no "none" value */
1318 firstch(struct parse *p, cset *cs)
1319 {
1320         size_t i;
1321         size_t css = (size_t)p->g->csetsize;
1322
1323         for (i = 0; i < css; i++)
1324                 if (CHIN(cs, i))
1325                         return((char)i);
1326         assert(never);
1327         return(0);              /* arbitrary */
1328 }
1329
1330 /*
1331  - nch - number of characters in a set
1332  */
1333 static int
1334 nch(struct parse *p, cset *cs)
1335 {
1336         size_t i;
1337         size_t css = (size_t)p->g->csetsize;
1338         int n = 0;
1339
1340         for (i = 0; i < css; i++)
1341                 if (CHIN(cs, i))
1342                         n++;
1343         return(n);
1344 }
1345
1346 /*
1347  - mcadd - add a collating element to a cset
1348  */
1349 static void
1350 mcadd( struct parse *p, cset *cs, const char *cp)
1351 {
1352         size_t oldend = cs->smultis;
1353         void *np;
1354
1355         cs->smultis += strlen(cp) + 1;
1356         np = realloc(cs->multis, cs->smultis);
1357         if (np == NULL) {
1358                 if (cs->multis)
1359                         free(cs->multis);
1360                 cs->multis = NULL;
1361                 SETERROR(REG_ESPACE);
1362                 return;
1363         }
1364         cs->multis = np;
1365
1366         llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1367 }
1368
1369 /*
1370  - mcinvert - invert the list of collating elements in a cset
1371  *
1372  * This would have to know the set of possibilities.  Implementation
1373  * is deferred.
1374  */
1375 /* ARGSUSED */
1376 static void
1377 mcinvert(struct parse *p, cset *cs)
1378 {
1379         assert(cs->multis == NULL);     /* xxx */
1380 }
1381
1382 /*
1383  - mccase - add case counterparts of the list of collating elements in a cset
1384  *
1385  * This would have to know the set of possibilities.  Implementation
1386  * is deferred.
1387  */
1388 /* ARGSUSED */
1389 static void
1390 mccase(struct parse *p, cset *cs)
1391 {
1392         assert(cs->multis == NULL);     /* xxx */
1393 }
1394
1395 /*
1396  - isinsets - is this character in any sets?
1397  */
1398 static int                      /* predicate */
1399 isinsets(struct re_guts *g, int c)
1400 {
1401         uch *col;
1402         int i;
1403         int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1404         unsigned uc = (uch)c;
1405
1406         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1407                 if (col[uc] != 0)
1408                         return(1);
1409         return(0);
1410 }
1411
1412 /*
1413  - samesets - are these two characters in exactly the same sets?
1414  */
1415 static int                      /* predicate */
1416 samesets(struct re_guts *g, int c1, int c2)
1417 {
1418         uch *col;
1419         int i;
1420         int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1421         unsigned uc1 = (uch)c1;
1422         unsigned uc2 = (uch)c2;
1423
1424         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1425                 if (col[uc1] != col[uc2])
1426                         return(0);
1427         return(1);
1428 }
1429
1430 /*
1431  - categorize - sort out character categories
1432  */
1433 static void
1434 categorize(struct parse *p, struct re_guts *g)
1435 {
1436         cat_t *cats = g->categories;
1437         int c;
1438         int c2;
1439         cat_t cat;
1440
1441         /* avoid making error situations worse */
1442         if (p->error != 0)
1443                 return;
1444
1445         for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1446                 if (cats[c] == 0 && isinsets(g, c)) {
1447                         cat = g->ncategories++;
1448                         cats[c] = cat;
1449                         for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1450                                 if (cats[c2] == 0 && samesets(g, c, c2))
1451                                         cats[c2] = cat;
1452                 }
1453 }
1454
1455 /*
1456  - dupl - emit a duplicate of a bunch of sops
1457  */
1458 static sopno                    /* start of duplicate */
1459 dupl(struct parse *p,
1460     sopno start,                /* from here */
1461     sopno finish)               /* to this less one */
1462 {
1463         sopno ret = HERE();
1464         sopno len = finish - start;
1465
1466         assert(finish >= start);
1467         if (len == 0)
1468                 return(ret);
1469         enlarge(p, p->ssize + len);     /* this many unexpected additions */
1470         assert(p->ssize >= p->slen + len);
1471         (void) memmove((char *)(p->strip + p->slen),
1472                 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1473         p->slen += len;
1474         return(ret);
1475 }
1476
1477 /*
1478  - doemit - emit a strip operator
1479  *
1480  * It might seem better to implement this as a macro with a function as
1481  * hard-case backup, but it's just too big and messy unless there are
1482  * some changes to the data structures.  Maybe later.
1483  */
1484 static void
1485 doemit(struct parse *p, sop op, size_t opnd)
1486 {
1487         /* avoid making error situations worse */
1488         if (p->error != 0)
1489                 return;
1490
1491         /* deal with oversize operands ("can't happen", more or less) */
1492         assert(opnd < 1<<OPSHIFT);
1493
1494         /* deal with undersized strip */
1495         if (p->slen >= p->ssize)
1496                 enlarge(p, (p->ssize+1) / 2 * 3);       /* +50% */
1497         assert(p->slen < p->ssize);
1498
1499         /* finally, it's all reduced to the easy case */
1500         p->strip[p->slen++] = SOP(op, opnd);
1501 }
1502
1503 /*
1504  - doinsert - insert a sop into the strip
1505  */
1506 static void
1507 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1508 {
1509         sopno sn;
1510         sop s;
1511         int i;
1512
1513         /* avoid making error situations worse */
1514         if (p->error != 0)
1515                 return;
1516
1517         sn = HERE();
1518         EMIT(op, opnd);         /* do checks, ensure space */
1519         assert(HERE() == sn+1);
1520         s = p->strip[sn];
1521
1522         /* adjust paren pointers */
1523         assert(pos > 0);
1524         for (i = 1; i < NPAREN; i++) {
1525                 if (p->pbegin[i] >= pos) {
1526                         p->pbegin[i]++;
1527                 }
1528                 if (p->pend[i] >= pos) {
1529                         p->pend[i]++;
1530                 }
1531         }
1532
1533         memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1534                                                 (HERE()-pos-1)*sizeof(sop));
1535         p->strip[pos] = s;
1536 }
1537
1538 /*
1539  - dofwd - complete a forward reference
1540  */
1541 static void
1542 dofwd(struct parse *p, sopno pos, sop value)
1543 {
1544         /* avoid making error situations worse */
1545         if (p->error != 0)
1546                 return;
1547
1548         assert(value < 1<<OPSHIFT);
1549         p->strip[pos] = OP(p->strip[pos]) | value;
1550 }
1551
1552 /*
1553  - enlarge - enlarge the strip
1554  */
1555 static void
1556 enlarge(struct parse *p, sopno size)
1557 {
1558         sop *sp;
1559
1560         if (p->ssize >= size)
1561                 return;
1562
1563         if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
1564                 SETERROR(REG_ESPACE);
1565                 return;
1566         }
1567
1568         sp = (sop *)realloc(p->strip, size*sizeof(sop));
1569         if (sp == NULL) {
1570                 SETERROR(REG_ESPACE);
1571                 return;
1572         }
1573         p->strip = sp;
1574         p->ssize = size;
1575 }
1576
1577 /*
1578  - stripsnug - compact the strip
1579  */
1580 static void
1581 stripsnug(struct parse *p, struct re_guts *g)
1582 {
1583         g->nstates = p->slen;
1584         if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
1585                 g->strip = p->strip;
1586                 SETERROR(REG_ESPACE);
1587                 return;
1588         }
1589
1590         g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1591         if (g->strip == NULL) {
1592                 SETERROR(REG_ESPACE);
1593                 g->strip = p->strip;
1594         }
1595 }
1596
1597 /*
1598  - findmust - fill in must and mlen with longest mandatory literal string
1599  *
1600  * This algorithm could do fancy things like analyzing the operands of |
1601  * for common subsequences.  Someday.  This code is simple and finds most
1602  * of the interesting cases.
1603  *
1604  * Note that must and mlen got initialized during setup.
1605  */
1606 static void
1607 findmust(struct parse *p, struct re_guts *g)
1608 {
1609         sop *scan;
1610         sop *start = 0; /* start initialized in the default case, after that */
1611         sop *newstart = 0; /* newstart was initialized in the OCHAR case */
1612         sopno newlen;
1613         sop s;
1614         char *cp;
1615         sopno i;
1616
1617         /* avoid making error situations worse */
1618         if (p->error != 0)
1619                 return;
1620
1621         /* find the longest OCHAR sequence in strip */
1622         newlen = 0;
1623         scan = g->strip + 1;
1624         do {
1625                 s = *scan++;
1626                 switch (OP(s)) {
1627                 case OCHAR:             /* sequence member */
1628                         if (newlen == 0)                /* new sequence */
1629                                 newstart = scan - 1;
1630                         newlen++;
1631                         break;
1632                 case OPLUS_:            /* things that don't break one */
1633                 case OLPAREN:
1634                 case ORPAREN:
1635                         break;
1636                 case OQUEST_:           /* things that must be skipped */
1637                 case OCH_:
1638                         scan--;
1639                         do {
1640                                 scan += OPND(s);
1641                                 s = *scan;
1642                                 /* assert() interferes w debug printouts */
1643                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1644                                                         OP(s) != OOR2) {
1645                                         g->iflags |= REGEX_BAD;
1646                                         return;
1647                                 }
1648                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
1649                         LLVM_FALLTHROUGH;
1650                 default:                /* things that break a sequence */
1651                         if (newlen > g->mlen) {         /* ends one */
1652                                 start = newstart;
1653                                 g->mlen = newlen;
1654                         }
1655                         newlen = 0;
1656                         break;
1657                 }
1658         } while (OP(s) != OEND);
1659
1660         if (g->mlen == 0)               /* there isn't one */
1661                 return;
1662
1663         /* turn it into a character string */
1664         g->must = malloc((size_t)g->mlen + 1);
1665         if (g->must == NULL) {          /* argh; just forget it */
1666                 g->mlen = 0;
1667                 return;
1668         }
1669         cp = g->must;
1670         scan = start;
1671         for (i = g->mlen; i > 0; i--) {
1672                 while (OP(s = *scan++) != OCHAR)
1673                         continue;
1674                 assert(cp < g->must + g->mlen);
1675                 *cp++ = (char)OPND(s);
1676         }
1677         assert(cp == g->must + g->mlen);
1678         *cp++ = '\0';           /* just on general principles */
1679 }
1680
1681 /*
1682  - pluscount - count + nesting
1683  */
1684 static sopno                    /* nesting depth */
1685 pluscount(struct parse *p, struct re_guts *g)
1686 {
1687         sop *scan;
1688         sop s;
1689         sopno plusnest = 0;
1690         sopno maxnest = 0;
1691
1692         if (p->error != 0)
1693                 return(0);      /* there may not be an OEND */
1694
1695         scan = g->strip + 1;
1696         do {
1697                 s = *scan++;
1698                 switch (OP(s)) {
1699                 case OPLUS_:
1700                         plusnest++;
1701                         break;
1702                 case O_PLUS:
1703                         if (plusnest > maxnest)
1704                                 maxnest = plusnest;
1705                         plusnest--;
1706                         break;
1707                 }
1708         } while (OP(s) != OEND);
1709         if (plusnest != 0)
1710                 g->iflags |= REGEX_BAD;
1711         return(maxnest);
1712 }