lib/libc/regex/regcomp.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   5  * Copyright (c) 1992, 1993, 1994
   6  *      The Regents of the University of California.  All rights reserved.
   7  *
   8  * Copyright (c) 2011 The FreeBSD Foundation
   9  * All rights reserved.
  10  * Portions of this software were developed by David Chisnall
  11  * under sponsorship from the FreeBSD Foundation.
  12  *
  13  * This code is derived from software contributed to Berkeley by
  14  * Henry Spencer.
  15  *
  16  * Redistribution and use in source and binary forms, with or without
  17  * modification, are permitted provided that the following conditions
  18  * are met:
  19  * 1. Redistributions of source code must retain the above copyright
  20  *    notice, this list of conditions and the following disclaimer.
  21  * 2. Redistributions in binary form must reproduce the above copyright
  22  *    notice, this list of conditions and the following disclaimer in the
  23  *    documentation and/or other materials provided with the distribution.
  24  * 3. Neither the name of the University nor the names of its contributors
  25  *    may be used to endorse or promote products derived from this software
  26  *    without specific prior written permission.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  38  * SUCH DAMAGE.
  39  *
  40  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  41  */
  42
  43 #if defined(LIBC_SCCS) && !defined(lint)
  44 static char sccsid[] = "@(#)regcomp.c   8.5 (Berkeley) 3/20/94";
  45 #endif /* LIBC_SCCS and not lint */
  46 #include <sys/cdefs.h>
  47 __FBSDID("$FreeBSD$");
  48
  49 #include <sys/types.h>
  50 #include <stdio.h>
  51 #include <string.h>
  52 #include <ctype.h>
  53 #include <limits.h>
  54 #include <stdlib.h>
  55 #include <regex.h>
  56 #include <stdbool.h>
  57 #include <wchar.h>
  58 #include <wctype.h>
  59
  60 #ifndef LIBREGEX
  61 #include "collate.h"
  62 #endif
  63
  64 #include "utils.h"
  65 #include "regex2.h"
  66
  67 #include "cname.h"
  68
  69 /*
  70  * Branching context, used to keep track of branch state for all of the branch-
  71  * aware functions. In addition to keeping track of branch positions for the
  72  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
  73  * detection of whether ^ is acting as an anchor or being used erroneously and
  74  * also for whether we're in a sub-expression or not.
  75  */
  76 struct branchc {
  77         sopno start;
  78         sopno back;
  79         sopno fwd;
  80
  81         int nbranch;
  82         int nchain;
  83         bool outer;
  84         bool terminate;
  85 };
  86
  87 /*
  88  * parse structure, passed up and down to avoid global variables and
  89  * other clumsinesses
  90  */
  91 struct parse {
  92         const char *next;       /* next character in RE */
  93         const char *end;        /* end of string (-> NUL normally) */
  94         int error;              /* has an error been seen? */
  95         int gnuext;
  96         sop *strip;             /* malloced strip */
  97         sopno ssize;            /* malloced strip size (allocated) */
  98         sopno slen;             /* malloced strip length (used) */
  99         int ncsalloc;           /* number of csets allocated */
 100         struct re_guts *g;
 101 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
 102         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
 103         sopno pend[NPAREN];     /* -> ) ([0] unused) */
 104         bool allowbranch;       /* can this expression branch? */
 105         bool bre;               /* convenience; is this a BRE? */
 106         int pflags;             /* other parsing flags -- legacy escapes? */
 107         bool (*parse_expr)(struct parse *, struct branchc *);
 108         void (*pre_parse)(struct parse *, struct branchc *);
 109         void (*post_parse)(struct parse *, struct branchc *);
 110 };
 111
 112 #define PFLAG_LEGACY_ESC        0x00000001
 113
 114 /* ========= begin header generated by ./mkh ========= */
 115 #ifdef __cplusplus
 116 extern "C" {
 117 #endif
 118
 119 /* === regcomp.c === */
 120 static bool p_ere_exp(struct parse *p, struct branchc *bc);
 121 static void p_str(struct parse *p);
 122 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
 123 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
 124 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
 125 static bool p_branch_empty(struct parse *p, struct branchc *bc);
 126 static bool p_branch_do(struct parse *p, struct branchc *bc);
 127 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
 128 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
 129 static void p_re(struct parse *p, int end1, int end2);
 130 static bool p_simp_re(struct parse *p, struct branchc *bc);
 131 static int p_count(struct parse *p);
 132 static void p_bracket(struct parse *p);
 133 static int p_range_cmp(wchar_t c1, wchar_t c2);
 134 static void p_b_term(struct parse *p, cset *cs);
 135 static int p_b_pseudoclass(struct parse *p, char c);
 136 static void p_b_cclass(struct parse *p, cset *cs);
 137 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
 138 static void p_b_eclass(struct parse *p, cset *cs);
 139 static wint_t p_b_symbol(struct parse *p);
 140 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
 141 static bool may_escape(struct parse *p, const wint_t ch);
 142 static wint_t othercase(wint_t ch);
 143 static void bothcases(struct parse *p, wint_t ch);
 144 static void ordinary(struct parse *p, wint_t ch);
 145 static void nonnewline(struct parse *p);
 146 static void repeat(struct parse *p, sopno start, int from, int to);
 147 static int seterr(struct parse *p, int e);
 148 static cset *allocset(struct parse *p);
 149 static void freeset(struct parse *p, cset *cs);
 150 static void CHadd(struct parse *p, cset *cs, wint_t ch);
 151 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
 152 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
 153 static wint_t singleton(cset *cs);
 154 static sopno dupl(struct parse *p, sopno start, sopno finish);
 155 static void doemit(struct parse *p, sop op, size_t opnd);
 156 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
 157 static void dofwd(struct parse *p, sopno pos, sop value);
 158 static int enlarge(struct parse *p, sopno size);
 159 static void stripsnug(struct parse *p, struct re_guts *g);
 160 static void findmust(struct parse *p, struct re_guts *g);
 161 static int altoffset(sop *scan, int offset);
 162 static void computejumps(struct parse *p, struct re_guts *g);
 163 static void computematchjumps(struct parse *p, struct re_guts *g);
 164 static sopno pluscount(struct parse *p, struct re_guts *g);
 165 static wint_t wgetnext(struct parse *p);
 166
 167 #ifdef __cplusplus
 168 }
 169 #endif
 170 /* ========= end header generated by ./mkh ========= */
 171
 172 static char nuls[10];           /* place to point scanner in event of error */
 173
 174 /*
 175  * macros for use with parse structure
 176  * BEWARE:  these know that the parse structure is named `p' !!!
 177  */
 178 #define PEEK()  (*p->next)
 179 #define PEEK2() (*(p->next+1))
 180 #define MORE()  (p->end - p->next > 0)
 181 #define MORE2() (p->end - p->next > 1)
 182 #define SEE(c)  (MORE() && PEEK() == (c))
 183 #define SEETWO(a, b)    (MORE2() && PEEK() == (a) && PEEK2() == (b))
 184 #define SEESPEC(a)      (p->bre ? SEETWO('\\', a) : SEE(a))
 185 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 186 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 187 #define EATSPEC(a)      (p->bre ? EATTWO('\\', a) : EAT(a))
 188 #define NEXT()  (p->next++)
 189 #define NEXT2() (p->next += 2)
 190 #define NEXTn(n)        (p->next += (n))
 191 #define GETNEXT()       (*p->next++)
 192 #define WGETNEXT()      wgetnext(p)
 193 #define SETERROR(e)     seterr(p, (e))
 194 #define REQUIRE(co, e)  ((co) || SETERROR(e))
 195 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
 196 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
 197 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 198 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
 199 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 200 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 201 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 202 #define HERE()          (p->slen)
 203 #define THERE()         (p->slen - 1)
 204 #define THERETHERE()    (p->slen - 2)
 205 #define DROP(n) (p->slen -= (n))
 206
 207 /* Macro used by computejump()/computematchjump() */
 208 #define MIN(a,b)        ((a)<(b)?(a):(b))
 209
 210 static int                              /* 0 success, otherwise REG_something */
 211 regcomp_internal(regex_t * __restrict preg,
 212         const char * __restrict pattern,
 213         int cflags, int pflags)
 214 {
 215         struct parse pa;
 216         struct re_guts *g;
 217         struct parse *p = &pa;
 218         int i;
 219         size_t len;
 220         size_t maxlen;
 221 #ifdef REDEBUG
 222 #       define  GOODFLAGS(f)    (f)
 223 #else
 224 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 225 #endif
 226
 227         cflags = GOODFLAGS(cflags);
 228         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 229                 return(REG_INVARG);
 230
 231         if (cflags&REG_PEND) {
 232                 if (preg->re_endp < pattern)
 233                         return(REG_INVARG);
 234                 len = preg->re_endp - pattern;
 235         } else
 236                 len = strlen(pattern);
 237
 238         /* do the mallocs early so failure handling is easy */
 239         g = (struct re_guts *)malloc(sizeof(struct re_guts));
 240         if (g == NULL)
 241                 return(REG_ESPACE);
 242         /*
 243          * Limit the pattern space to avoid a 32-bit overflow on buffer
 244          * extension.  Also avoid any signed overflow in case of conversion
 245          * so make the real limit based on a 31-bit overflow.
 246          *
 247          * Likely not applicable on 64-bit systems but handle the case
 248          * generically (who are we to stop people from using ~715MB+
 249          * patterns?).
 250          */
 251         maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3;
 252         if (len >= maxlen) {
 253                 free((char *)g);
 254                 return(REG_ESPACE);
 255         }
 256         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 257         assert(p->ssize >= len);
 258
 259         p->strip = (sop *)malloc(p->ssize * sizeof(sop));
 260         p->slen = 0;
 261         if (p->strip == NULL) {
 262                 free((char *)g);
 263                 return(REG_ESPACE);
 264         }
 265
 266         /* set things up */
 267         p->g = g;
 268         p->next = pattern;      /* convenience; we do not modify it */
 269         p->end = p->next + len;
 270         p->error = 0;
 271         p->ncsalloc = 0;
 272         p->pflags = pflags;
 273         for (i = 0; i < NPAREN; i++) {
 274                 p->pbegin[i] = 0;
 275                 p->pend[i] = 0;
 276         }
 277 #ifdef LIBREGEX
 278         if (cflags&REG_POSIX) {
 279                 p->gnuext = false;
 280                 p->allowbranch = (cflags & REG_EXTENDED) != 0;
 281         } else
 282                 p->gnuext = p->allowbranch = true;
 283 #else
 284         p->gnuext = false;
 285         p->allowbranch = (cflags & REG_EXTENDED) != 0;
 286 #endif
 287         if (cflags & REG_EXTENDED) {
 288                 p->bre = false;
 289                 p->parse_expr = p_ere_exp;
 290                 p->pre_parse = NULL;
 291                 p->post_parse = NULL;
 292         } else {
 293                 p->bre = true;
 294                 p->parse_expr = p_simp_re;
 295                 p->pre_parse = p_bre_pre_parse;
 296                 p->post_parse = p_bre_post_parse;
 297         }
 298         g->sets = NULL;
 299         g->ncsets = 0;
 300         g->cflags = cflags;
 301         g->iflags = 0;
 302         g->nbol = 0;
 303         g->neol = 0;
 304         g->must = NULL;
 305         g->moffset = -1;
 306         g->charjump = NULL;
 307         g->matchjump = NULL;
 308         g->mlen = 0;
 309         g->nsub = 0;
 310         g->backrefs = 0;
 311
 312         /* do it */
 313         EMIT(OEND, 0);
 314         g->firststate = THERE();
 315         if (cflags & REG_NOSPEC)
 316                 p_str(p);
 317         else
 318                 p_re(p, OUT, OUT);
 319         EMIT(OEND, 0);
 320         g->laststate = THERE();
 321
 322         /* tidy up loose ends and fill things in */
 323         stripsnug(p, g);
 324         findmust(p, g);
 325         /* only use Boyer-Moore algorithm if the pattern is bigger
 326          * than three characters
 327          */
 328         if(g->mlen > 3) {
 329                 computejumps(p, g);
 330                 computematchjumps(p, g);
 331                 if(g->matchjump == NULL && g->charjump != NULL) {
 332                         free(g->charjump);
 333                         g->charjump = NULL;
 334                 }
 335         }
 336         g->nplus = pluscount(p, g);
 337         g->magic = MAGIC2;
 338         preg->re_nsub = g->nsub;
 339         preg->re_g = g;
 340         preg->re_magic = MAGIC1;
 341 #ifndef REDEBUG
 342         /* not debugging, so can't rely on the assert() in regexec() */
 343         if (g->iflags&BAD)
 344                 SETERROR(REG_ASSERT);
 345 #endif
 346
 347         /* win or lose, we're done */
 348         if (p->error != 0)      /* lose */
 349                 regfree(preg);
 350         return(p->error);
 351 }
 352
 353 /*
 354  - regcomp - interface for parser and compilation
 355  = extern int regcomp(regex_t *, const char *, int);
 356  = #define      REG_BASIC       0000
 357  = #define      REG_EXTENDED    0001
 358  = #define      REG_ICASE       0002
 359  = #define      REG_NOSUB       0004
 360  = #define      REG_NEWLINE     0010
 361  = #define      REG_NOSPEC      0020
 362  = #define      REG_PEND        0040
 363  = #define      REG_DUMP        0200
 364  */
 365 int                             /* 0 success, otherwise REG_something */
 366 regcomp(regex_t * __restrict preg,
 367         const char * __restrict pattern,
 368         int cflags)
 369 {
 370
 371         return (regcomp_internal(preg, pattern, cflags, 0));
 372 }
 373
 374 #ifndef LIBREGEX
 375 /*
 376  * Legacy interface that requires more lax escaping behavior.
 377  */
 378 int
 379 freebsd12_regcomp(regex_t * __restrict preg,
 380         const char * __restrict pattern,
 381         int cflags, int pflags)
 382 {
 383
 384         return (regcomp_internal(preg, pattern, cflags, PFLAG_LEGACY_ESC));
 385 }
 386
 387 __sym_compat(regcomp, freebsd12_regcomp, FBSD_1.0);
 388 #endif  /* !LIBREGEX */
 389
 390 /*
 391  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
 392  - return whether we should terminate or not
 393  == static bool p_ere_exp(struct parse *p);
 394  */
 395 static bool
 396 p_ere_exp(struct parse *p, struct branchc *bc)
 397 {
 398         char c;
 399         wint_t wc;
 400         sopno pos;
 401         int count;
 402         int count2;
 403 #ifdef LIBREGEX
 404         int i;
 405         int handled;
 406 #endif
 407         sopno subno;
 408         int wascaret = 0;
 409
 410         (void)bc;
 411         assert(MORE());         /* caller should have ensured this */
 412         c = GETNEXT();
 413
 414 #ifdef LIBREGEX
 415         handled = 0;
 416 #endif
 417         pos = HERE();
 418         switch (c) {
 419         case '(':
 420                 (void)REQUIRE(MORE(), REG_EPAREN);
 421                 p->g->nsub++;
 422                 subno = p->g->nsub;
 423                 if (subno < NPAREN)
 424                         p->pbegin[subno] = HERE();
 425                 EMIT(OLPAREN, subno);
 426                 if (!SEE(')'))
 427                         p_re(p, ')', IGN);
 428                 if (subno < NPAREN) {
 429                         p->pend[subno] = HERE();
 430                         assert(p->pend[subno] != 0);
 431                 }
 432                 EMIT(ORPAREN, subno);
 433                 (void)MUSTEAT(')', REG_EPAREN);
 434                 break;
 435 #ifndef POSIX_MISTAKE
 436         case ')':               /* happens only if no current unmatched ( */
 437                 /*
 438                  * You may ask, why the ifndef?  Because I didn't notice
 439                  * this until slightly too late for 1003.2, and none of the
 440                  * other 1003.2 regular-expression reviewers noticed it at
 441                  * all.  So an unmatched ) is legal POSIX, at least until
 442                  * we can get it fixed.
 443                  */
 444                 SETERROR(REG_EPAREN);
 445                 break;
 446 #endif
 447         case '^':
 448                 EMIT(OBOL, 0);
 449                 p->g->iflags |= USEBOL;
 450                 p->g->nbol++;
 451                 wascaret = 1;
 452                 break;
 453         case '$':
 454                 EMIT(OEOL, 0);
 455                 p->g->iflags |= USEEOL;
 456                 p->g->neol++;
 457                 break;
 458         case '|':
 459                 SETERROR(REG_EMPTY);
 460                 break;
 461         case '*':
 462         case '+':
 463         case '?':
 464         case '{':
 465                 SETERROR(REG_BADRPT);
 466                 break;
 467         case '.':
 468                 if (p->g->cflags&REG_NEWLINE)
 469                         nonnewline(p);
 470                 else
 471                         EMIT(OANY, 0);
 472                 break;
 473         case '[':
 474                 p_bracket(p);
 475                 break;
 476         case '\\':
 477                 (void)REQUIRE(MORE(), REG_EESCAPE);
 478                 wc = WGETNEXT();
 479 #ifdef LIBREGEX
 480                 if (p->gnuext) {
 481                         handled = 1;
 482                         switch (wc) {
 483                         case '`':
 484                                 EMIT(OBOS, 0);
 485                                 break;
 486                         case '\'':
 487                                 EMIT(OEOS, 0);
 488                                 break;
 489                         case 'B':
 490                                 EMIT(ONWBND, 0);
 491                                 break;
 492                         case 'b':
 493                                 EMIT(OWBND, 0);
 494                                 break;
 495                         case 'W':
 496                         case 'w':
 497                         case 'S':
 498                         case 's':
 499                                 p_b_pseudoclass(p, wc);
 500                                 break;
 501                         case '1':
 502                         case '2':
 503                         case '3':
 504                         case '4':
 505                         case '5':
 506                         case '6':
 507                         case '7':
 508                         case '8':
 509                         case '9':
 510                                 i = wc - '0';
 511                                 assert(i < NPAREN);
 512                                 if (p->pend[i] != 0) {
 513                                         assert(i <= p->g->nsub);
 514                                         EMIT(OBACK_, i);
 515                                         assert(p->pbegin[i] != 0);
 516                                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 517                                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 518                                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 519                                         EMIT(O_BACK, i);
 520                                 } else
 521                                         SETERROR(REG_ESUBREG);
 522                                 p->g->backrefs = 1;
 523                                 break;
 524                         default:
 525                                 handled = 0;
 526                         }
 527                         /* Don't proceed to the POSIX bits if we've already handled it */
 528                         if (handled)
 529                                 break;
 530                 }
 531 #endif
 532                 switch (wc) {
 533                 case '<':
 534                         EMIT(OBOW, 0);
 535                         break;
 536                 case '>':
 537                         EMIT(OEOW, 0);
 538                         break;
 539                 default:
 540                         if (may_escape(p, wc))
 541                                 ordinary(p, wc);
 542                         else
 543                                 SETERROR(REG_EESCAPE);
 544                         break;
 545                 }
 546                 break;
 547         default:
 548                 if (p->error != 0)
 549                         return (false);
 550                 p->next--;
 551                 wc = WGETNEXT();
 552                 ordinary(p, wc);
 553                 break;
 554         }
 555
 556         if (!MORE())
 557                 return (false);
 558         c = PEEK();
 559         /* we call { a repetition if followed by a digit */
 560         if (!( c == '*' || c == '+' || c == '?' || c == '{'))
 561                 return (false);         /* no repetition, we're done */
 562         else if (c == '{')
 563                 (void)REQUIRE(MORE2() && \
 564                     (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
 565         NEXT();
 566
 567         (void)REQUIRE(!wascaret, REG_BADRPT);
 568         switch (c) {
 569         case '*':       /* implemented as +? */
 570                 /* this case does not require the (y|) trick, noKLUDGE */
 571                 INSERT(OPLUS_, pos);
 572                 ASTERN(O_PLUS, pos);
 573                 INSERT(OQUEST_, pos);
 574                 ASTERN(O_QUEST, pos);
 575                 break;
 576         case '+':
 577                 INSERT(OPLUS_, pos);
 578                 ASTERN(O_PLUS, pos);
 579                 break;
 580         case '?':
 581                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 582                 INSERT(OCH_, pos);              /* offset slightly wrong */
 583                 ASTERN(OOR1, pos);              /* this one's right */
 584                 AHEAD(pos);                     /* fix the OCH_ */
 585                 EMIT(OOR2, 0);                  /* offset very wrong... */
 586                 AHEAD(THERE());                 /* ...so fix it */
 587                 ASTERN(O_CH, THERETHERE());
 588                 break;
 589         case '{':
 590                 count = p_count(p);
 591                 if (EAT(',')) {
 592                         if (isdigit((uch)PEEK())) {
 593                                 count2 = p_count(p);
 594                                 (void)REQUIRE(count <= count2, REG_BADBR);
 595                         } else          /* single number with comma */
 596                                 count2 = INFINITY;
 597                 } else          /* just a single number */
 598                         count2 = count;
 599                 repeat(p, pos, count, count2);
 600                 if (!EAT('}')) {        /* error heuristics */
 601                         while (MORE() && PEEK() != '}')
 602                                 NEXT();
 603                         (void)REQUIRE(MORE(), REG_EBRACE);
 604                         SETERROR(REG_BADBR);
 605                 }
 606                 break;
 607         }
 608
 609         if (!MORE())
 610                 return (false);
 611         c = PEEK();
 612         if (!( c == '*' || c == '+' || c == '?' ||
 613                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
 614                 return (false);
 615         SETERROR(REG_BADRPT);
 616         return (false);
 617 }
 618
 619 /*
 620  - p_str - string (no metacharacters) "parser"
 621  == static void p_str(struct parse *p);
 622  */
 623 static void
 624 p_str(struct parse *p)
 625 {
 626         (void)REQUIRE(MORE(), REG_EMPTY);
 627         while (MORE())
 628                 ordinary(p, WGETNEXT());
 629 }
 630
 631 /*
 632  * Eat consecutive branch delimiters for the kind of expression that we are
 633  * parsing, return the number of delimiters that we ate.
 634  */
 635 static int
 636 p_branch_eat_delim(struct parse *p, struct branchc *bc)
 637 {
 638         int nskip;
 639
 640         (void)bc;
 641         nskip = 0;
 642         while (EATSPEC('|'))
 643                 ++nskip;
 644         return (nskip);
 645 }
 646
 647 /*
 648  * Insert necessary branch book-keeping operations. This emits a
 649  * bogus 'next' offset, since we still have more to parse
 650  */
 651 static void
 652 p_branch_ins_offset(struct parse *p, struct branchc *bc)
 653 {
 654
 655         if (bc->nbranch == 0) {
 656                 INSERT(OCH_, bc->start);        /* offset is wrong */
 657                 bc->fwd = bc->start;
 658                 bc->back = bc->start;
 659         }
 660
 661         ASTERN(OOR1, bc->back);
 662         bc->back = THERE();
 663         AHEAD(bc->fwd);                 /* fix previous offset */
 664         bc->fwd = HERE();
 665         EMIT(OOR2, 0);                  /* offset is very wrong */
 666         ++bc->nbranch;
 667 }
 668
 669 /*
 670  * Fix the offset of the tail branch, if we actually had any branches.
 671  * This is to correct the bogus placeholder offset that we use.
 672  */
 673 static void
 674 p_branch_fix_tail(struct parse *p, struct branchc *bc)
 675 {
 676
 677         /* Fix bogus offset at the tail if we actually have branches */
 678         if (bc->nbranch > 0) {
 679                 AHEAD(bc->fwd);
 680                 ASTERN(O_CH, bc->back);
 681         }
 682 }
 683
 684 /*
 685  * Signal to the parser that an empty branch has been encountered; this will,
 686  * in the future, be used to allow for more permissive behavior with empty
 687  * branches. The return value should indicate whether parsing may continue
 688  * or not.
 689  */
 690 static bool
 691 p_branch_empty(struct parse *p, struct branchc *bc)
 692 {
 693
 694         (void)bc;
 695         SETERROR(REG_EMPTY);
 696         return (false);
 697 }
 698
 699 /*
 700  * Take care of any branching requirements. This includes inserting the
 701  * appropriate branching instructions as well as eating all of the branch
 702  * delimiters until we either run out of pattern or need to parse more pattern.
 703  */
 704 static bool
 705 p_branch_do(struct parse *p, struct branchc *bc)
 706 {
 707         int ate = 0;
 708
 709         ate = p_branch_eat_delim(p, bc);
 710         if (ate == 0)
 711                 return (false);
 712         else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
 713                 /*
 714                  * Halt parsing only if we have an empty branch and p_branch_empty
 715                  * indicates that we must not continue. In the future, this will not
 716                  * necessarily be an error.
 717                  */
 718                 return (false);
 719         p_branch_ins_offset(p, bc);
 720
 721         return (true);
 722 }
 723
 724 static void
 725 p_bre_pre_parse(struct parse *p, struct branchc *bc)
 726 {
 727
 728         (void) bc;
 729         /*
 730          * Does not move cleanly into expression parser because of
 731          * ordinary interpration of * at the beginning position of
 732          * an expression.
 733          */
 734         if (EAT('^')) {
 735                 EMIT(OBOL, 0);
 736                 p->g->iflags |= USEBOL;
 737                 p->g->nbol++;
 738         }
 739 }
 740
 741 static void
 742 p_bre_post_parse(struct parse *p, struct branchc *bc)
 743 {
 744
 745         /* Expression is terminating due to EOL token */
 746         if (bc->terminate) {
 747                 DROP(1);
 748                 EMIT(OEOL, 0);
 749                 p->g->iflags |= USEEOL;
 750                 p->g->neol++;
 751         }
 752 }
 753
 754 /*
 755  - p_re - Top level parser, concatenation and BRE anchoring
 756  == static void p_re(struct parse *p, int end1, int end2);
 757  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 758  *
 759  * This implementation is a bit of a kludge, in that a trailing $ is first
 760  * taken as an ordinary character and then revised to be an anchor.
 761  * The amount of lookahead needed to avoid this kludge is excessive.
 762  */
 763 static void
 764 p_re(struct parse *p,
 765         int end1,       /* first terminating character */
 766         int end2)       /* second terminating character; ignored for EREs */
 767 {
 768         struct branchc bc;
 769
 770         bc.nbranch = 0;
 771         if (end1 == OUT && end2 == OUT)
 772                 bc.outer = true;
 773         else
 774                 bc.outer = false;
 775 #define SEEEND()        (!p->bre ? SEE(end1) : SEETWO(end1, end2))
 776         for (;;) {
 777                 bc.start = HERE();
 778                 bc.nchain = 0;
 779                 bc.terminate = false;
 780                 if (p->pre_parse != NULL)
 781                         p->pre_parse(p, &bc);
 782                 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
 783                         bc.terminate = p->parse_expr(p, &bc);
 784                         ++bc.nchain;
 785                 }
 786                 if (p->post_parse != NULL)
 787                         p->post_parse(p, &bc);
 788                 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
 789 #ifdef LIBREGEX
 790                 if (HERE() == bc.start && !p_branch_empty(p, &bc))
 791                         break;
 792 #endif
 793                 if (!p->allowbranch)
 794                         break;
 795                 /*
 796                  * p_branch_do's return value indicates whether we should
 797                  * continue parsing or not. This is both for correctness and
 798                  * a slight optimization, because it will check if we've
 799                  * encountered an empty branch or the end of the string
 800                  * immediately following a branch delimiter.
 801                  */
 802                 if (!p_branch_do(p, &bc))
 803                         break;
 804         }
 805 #undef SEE_END
 806         if (p->allowbranch)
 807                 p_branch_fix_tail(p, &bc);
 808         assert(!MORE() || SEE(end1));
 809 }
 810
 811 /*
 812  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 813  == static bool p_simp_re(struct parse *p, struct branchc *bc);
 814  */
 815 static bool                     /* was the simple RE an unbackslashed $? */
 816 p_simp_re(struct parse *p, struct branchc *bc)
 817 {
 818         int c;
 819         int cc;                 /* convenient/control character */
 820         int count;
 821         int count2;
 822         sopno pos;
 823         bool handled;
 824         int i;
 825         wint_t wc;
 826         sopno subno;
 827 #       define  BACKSL  (1<<CHAR_BIT)
 828
 829         pos = HERE();           /* repetition op, if any, covers from here */
 830         handled = false;
 831
 832         assert(MORE());         /* caller should have ensured this */
 833         c = GETNEXT();
 834         if (c == '\\') {
 835                 (void)REQUIRE(MORE(), REG_EESCAPE);
 836                 cc = GETNEXT();
 837                 c = BACKSL | cc;
 838 #ifdef LIBREGEX
 839                 if (p->gnuext) {
 840                         handled = true;
 841                         switch (c) {
 842                         case BACKSL|'`':
 843                                 EMIT(OBOS, 0);
 844                                 break;
 845                         case BACKSL|'\'':
 846                                 EMIT(OEOS, 0);
 847                                 break;
 848                         case BACKSL|'B':
 849                                 EMIT(ONWBND, 0);
 850                                 break;
 851                         case BACKSL|'b':
 852                                 EMIT(OWBND, 0);
 853                                 break;
 854                         case BACKSL|'W':
 855                         case BACKSL|'w':
 856                         case BACKSL|'S':
 857                         case BACKSL|'s':
 858                                 p_b_pseudoclass(p, cc);
 859                                 break;
 860                         default:
 861                                 handled = false;
 862                         }
 863                 }
 864 #endif
 865         }
 866         if (!handled) {
 867                 switch (c) {
 868                 case '.':
 869                         if (p->g->cflags&REG_NEWLINE)
 870                                 nonnewline(p);
 871                         else
 872                                 EMIT(OANY, 0);
 873                         break;
 874                 case '[':
 875                         p_bracket(p);
 876                         break;
 877                 case BACKSL|'<':
 878                         EMIT(OBOW, 0);
 879                         break;
 880                 case BACKSL|'>':
 881                         EMIT(OEOW, 0);
 882                         break;
 883                 case BACKSL|'{':
 884                         SETERROR(REG_BADRPT);
 885                         break;
 886                 case BACKSL|'(':
 887                         p->g->nsub++;
 888                         subno = p->g->nsub;
 889                         if (subno < NPAREN)
 890                                 p->pbegin[subno] = HERE();
 891                         EMIT(OLPAREN, subno);
 892                         /* the MORE here is an error heuristic */
 893                         if (MORE() && !SEETWO('\\', ')'))
 894                                 p_re(p, '\\', ')');
 895                         if (subno < NPAREN) {
 896                                 p->pend[subno] = HERE();
 897                                 assert(p->pend[subno] != 0);
 898                         }
 899                         EMIT(ORPAREN, subno);
 900                         (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 901                         break;
 902                 case BACKSL|')':        /* should not get here -- must be user */
 903                         SETERROR(REG_EPAREN);
 904                         break;
 905                 case BACKSL|'1':
 906                 case BACKSL|'2':
 907                 case BACKSL|'3':
 908                 case BACKSL|'4':
 909                 case BACKSL|'5':
 910                 case BACKSL|'6':
 911                 case BACKSL|'7':
 912                 case BACKSL|'8':
 913                 case BACKSL|'9':
 914                         i = (c&~BACKSL) - '0';
 915                         assert(i < NPAREN);
 916                         if (p->pend[i] != 0) {
 917                                 assert(i <= p->g->nsub);
 918                                 EMIT(OBACK_, i);
 919                                 assert(p->pbegin[i] != 0);
 920                                 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 921                                 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 922                                 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 923                                 EMIT(O_BACK, i);
 924                         } else
 925                                 SETERROR(REG_ESUBREG);
 926                         p->g->backrefs = 1;
 927                         break;
 928                 case '*':
 929                         /*
 930                          * Ordinary if used as the first character beyond BOL anchor of
 931                          * a (sub-)expression, counts as a bad repetition operator if it
 932                          * appears otherwise.
 933                          */
 934                         (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
 935                         /* FALLTHROUGH */
 936                 default:
 937                         if (p->error != 0)
 938                                 return (false); /* Definitely not $... */
 939                         p->next--;
 940                         wc = WGETNEXT();
 941                         if ((c & BACKSL) == 0 || may_escape(p, wc))
 942                                 ordinary(p, wc);
 943                         else
 944                                 SETERROR(REG_EESCAPE);
 945                         break;
 946                 }
 947         }
 948
 949         if (EAT('*')) {         /* implemented as +? */
 950                 /* this case does not require the (y|) trick, noKLUDGE */
 951                 INSERT(OPLUS_, pos);
 952                 ASTERN(O_PLUS, pos);
 953                 INSERT(OQUEST_, pos);
 954                 ASTERN(O_QUEST, pos);
 955 #ifdef LIBREGEX
 956         } else if (p->gnuext && EATTWO('\\', '?')) {
 957                 INSERT(OQUEST_, pos);
 958                 ASTERN(O_QUEST, pos);
 959         } else if (p->gnuext && EATTWO('\\', '+')) {
 960                 INSERT(OPLUS_, pos);
 961                 ASTERN(O_PLUS, pos);
 962 #endif
 963         } else if (EATTWO('\\', '{')) {
 964                 count = p_count(p);
 965                 if (EAT(',')) {
 966                         if (MORE() && isdigit((uch)PEEK())) {
 967                                 count2 = p_count(p);
 968                                 (void)REQUIRE(count <= count2, REG_BADBR);
 969                         } else          /* single number with comma */
 970                                 count2 = INFINITY;
 971                 } else          /* just a single number */
 972                         count2 = count;
 973                 repeat(p, pos, count, count2);
 974                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 975                         while (MORE() && !SEETWO('\\', '}'))
 976                                 NEXT();
 977                         (void)REQUIRE(MORE(), REG_EBRACE);
 978                         SETERROR(REG_BADBR);
 979                 }
 980         } else if (c == '$')     /* $ (but not \$) ends it */
 981                 return (true);
 982
 983         return (false);
 984 }
 985
 986 /*
 987  - p_count - parse a repetition count
 988  == static int p_count(struct parse *p);
 989  */
 990 static int                      /* the value */
 991 p_count(struct parse *p)
 992 {
 993         int count = 0;
 994         int ndigits = 0;
 995
 996         while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
 997                 count = count*10 + (GETNEXT() - '0');
 998                 ndigits++;
 999         }
1000
1001         (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1002         return(count);
1003 }
1004
1005 /*
1006  - p_bracket - parse a bracketed character list
1007  == static void p_bracket(struct parse *p);
1008  */
1009 static void
1010 p_bracket(struct parse *p)
1011 {
1012         cset *cs;
1013         wint_t ch;
1014
1015         /* Dept of Truly Sickening Special-Case Kludges */
1016         if (p->end - p->next > 5) {
1017                 if (strncmp(p->next, "[:<:]]", 6) == 0) {
1018                         EMIT(OBOW, 0);
1019                         NEXTn(6);
1020                         return;
1021                 }
1022                 if (strncmp(p->next, "[:>:]]", 6) == 0) {
1023                         EMIT(OEOW, 0);
1024                         NEXTn(6);
1025                         return;
1026                 }
1027         }
1028
1029         if ((cs = allocset(p)) == NULL)
1030                 return;
1031
1032         if (p->g->cflags&REG_ICASE)
1033                 cs->icase = 1;
1034         if (EAT('^'))
1035                 cs->invert = 1;
1036         if (EAT(']'))
1037                 CHadd(p, cs, ']');
1038         else if (EAT('-'))
1039                 CHadd(p, cs, '-');
1040         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1041                 p_b_term(p, cs);
1042         if (EAT('-'))
1043                 CHadd(p, cs, '-');
1044         (void)MUSTEAT(']', REG_EBRACK);
1045
1046         if (p->error != 0)      /* don't mess things up further */
1047                 return;
1048
1049         if (cs->invert && p->g->cflags&REG_NEWLINE)
1050                 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1051
1052         if ((ch = singleton(cs)) != OUT) {      /* optimize singleton sets */
1053                 ordinary(p, ch);
1054                 freeset(p, cs);
1055         } else
1056                 EMIT(OANYOF, (int)(cs - p->g->sets));
1057 }
1058
1059 static int
1060 p_range_cmp(wchar_t c1, wchar_t c2)
1061 {
1062 #ifndef LIBREGEX
1063         return __wcollate_range_cmp(c1, c2);
1064 #else
1065         /* Copied from libc/collate __wcollate_range_cmp */
1066         wchar_t s1[2], s2[2];
1067
1068         s1[0] = c1;
1069         s1[1] = L'\0';
1070         s2[0] = c2;
1071         s2[1] = L'\0';
1072         return (wcscoll(s1, s2));
1073 #endif
1074 }
1075
1076 /*
1077  - p_b_term - parse one term of a bracketed character list
1078  == static void p_b_term(struct parse *p, cset *cs);
1079  */
1080 static void
1081 p_b_term(struct parse *p, cset *cs)
1082 {
1083         char c;
1084         wint_t start, finish;
1085         wint_t i;
1086 #ifndef LIBREGEX
1087         struct xlocale_collate *table =
1088                 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1089 #endif
1090         /* classify what we've got */
1091         switch ((MORE()) ? PEEK() : '\0') {
1092         case '[':
1093                 c = (MORE2()) ? PEEK2() : '\0';
1094                 break;
1095         case '-':
1096                 SETERROR(REG_ERANGE);
1097                 return;                 /* NOTE RETURN */
1098         default:
1099                 c = '\0';
1100                 break;
1101         }
1102
1103         switch (c) {
1104         case ':':               /* character class */
1105                 NEXT2();
1106                 (void)REQUIRE(MORE(), REG_EBRACK);
1107                 c = PEEK();
1108                 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1109                 p_b_cclass(p, cs);
1110                 (void)REQUIRE(MORE(), REG_EBRACK);
1111                 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1112                 break;
1113         case '=':               /* equivalence class */
1114                 NEXT2();
1115                 (void)REQUIRE(MORE(), REG_EBRACK);
1116                 c = PEEK();
1117                 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1118                 p_b_eclass(p, cs);
1119                 (void)REQUIRE(MORE(), REG_EBRACK);
1120                 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1121                 break;
1122         default:                /* symbol, ordinary character, or range */
1123                 start = p_b_symbol(p);
1124                 if (SEE('-') && MORE2() && PEEK2() != ']') {
1125                         /* range */
1126                         NEXT();
1127                         if (EAT('-'))
1128                                 finish = '-';
1129                         else
1130                                 finish = p_b_symbol(p);
1131                 } else
1132                         finish = start;
1133                 if (start == finish)
1134                         CHadd(p, cs, start);
1135                 else {
1136 #ifndef LIBREGEX
1137                         if (table->__collate_load_error || MB_CUR_MAX > 1) {
1138 #else
1139                         if (MB_CUR_MAX > 1) {
1140 #endif
1141                                 (void)REQUIRE(start <= finish, REG_ERANGE);
1142                                 CHaddrange(p, cs, start, finish);
1143                         } else {
1144                                 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1145                                 for (i = 0; i <= UCHAR_MAX; i++) {
1146                                         if (p_range_cmp(start, i) <= 0 &&
1147                                             p_range_cmp(i, finish) <= 0 )
1148                                                 CHadd(p, cs, i);
1149                                 }
1150                         }
1151                 }
1152                 break;
1153         }
1154 }
1155
1156 /*
1157  - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1158  == static int p_b_pseudoclass(struct parse *p, char c)
1159  */
1160 static int
1161 p_b_pseudoclass(struct parse *p, char c) {
1162         cset *cs;
1163
1164         if ((cs = allocset(p)) == NULL)
1165                 return(0);
1166
1167         if (p->g->cflags&REG_ICASE)
1168                 cs->icase = 1;
1169
1170         switch (c) {
1171         case 'W':
1172                 cs->invert = 1;
1173                 /* PASSTHROUGH */
1174         case 'w':
1175                 p_b_cclass_named(p, cs, "alnum");
1176                 break;
1177         case 'S':
1178                 cs->invert = 1;
1179                 /* PASSTHROUGH */
1180         case 's':
1181                 p_b_cclass_named(p, cs, "space");
1182                 break;
1183         default:
1184                 return(0);
1185         }
1186
1187         EMIT(OANYOF, (int)(cs - p->g->sets));
1188         return(1);
1189 }
1190
1191 /*
1192  - p_b_cclass - parse a character-class name and deal with it
1193  == static void p_b_cclass(struct parse *p, cset *cs);
1194  */
1195 static void
1196 p_b_cclass(struct parse *p, cset *cs)
1197 {
1198         const char *sp = p->next;
1199         size_t len;
1200         char clname[16];
1201
1202         while (MORE() && isalpha((uch)PEEK()))
1203                 NEXT();
1204         len = p->next - sp;
1205         if (len >= sizeof(clname) - 1) {
1206                 SETERROR(REG_ECTYPE);
1207                 return;
1208         }
1209         memcpy(clname, sp, len);
1210         clname[len] = '\0';
1211
1212         p_b_cclass_named(p, cs, clname);
1213 }
1214 /*
1215  - p_b_cclass_named - deal with a named character class
1216  == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1217  */
1218 static void
1219 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1220         wctype_t wct;
1221
1222         if ((wct = wctype(clname)) == 0) {
1223                 SETERROR(REG_ECTYPE);
1224                 return;
1225         }
1226         CHaddtype(p, cs, wct);
1227 }
1228
1229 /*
1230  - p_b_eclass - parse an equivalence-class name and deal with it
1231  == static void p_b_eclass(struct parse *p, cset *cs);
1232  *
1233  * This implementation is incomplete. xxx
1234  */
1235 static void
1236 p_b_eclass(struct parse *p, cset *cs)
1237 {
1238         wint_t c;
1239
1240         c = p_b_coll_elem(p, '=');
1241         CHadd(p, cs, c);
1242 }
1243
1244 /*
1245  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1246  == static wint_t p_b_symbol(struct parse *p);
1247  */
1248 static wint_t                   /* value of symbol */
1249 p_b_symbol(struct parse *p)
1250 {
1251         wint_t value;
1252
1253         (void)REQUIRE(MORE(), REG_EBRACK);
1254         if (!EATTWO('[', '.'))
1255                 return(WGETNEXT());
1256
1257         /* collating symbol */
1258         value = p_b_coll_elem(p, '.');
1259         (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1260         return(value);
1261 }
1262
1263 /*
1264  - p_b_coll_elem - parse a collating-element name and look it up
1265  == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1266  */
1267 static wint_t                   /* value of collating element */
1268 p_b_coll_elem(struct parse *p,
1269         wint_t endc)            /* name ended by endc,']' */
1270 {
1271         const char *sp = p->next;
1272         struct cname *cp;
1273         mbstate_t mbs;
1274         wchar_t wc;
1275         size_t clen, len;
1276
1277         while (MORE() && !SEETWO(endc, ']'))
1278                 NEXT();
1279         if (!MORE()) {
1280                 SETERROR(REG_EBRACK);
1281                 return(0);
1282         }
1283         len = p->next - sp;
1284         for (cp = cnames; cp->name != NULL; cp++)
1285                 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1286                         return(cp->code);       /* known name */
1287         memset(&mbs, 0, sizeof(mbs));
1288         if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1289                 return (wc);                    /* single character */
1290         else if (clen == (size_t)-1 || clen == (size_t)-2)
1291                 SETERROR(REG_ILLSEQ);
1292         else
1293                 SETERROR(REG_ECOLLATE);         /* neither */
1294         return(0);
1295 }
1296
1297 /*
1298  - may_escape - determine whether 'ch' is escape-able in the current context
1299  == static int may_escape(struct parse *p, const wint_t ch)
1300  */
1301 static bool
1302 may_escape(struct parse *p, const wint_t ch)
1303 {
1304
1305         if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1306                 return (true);
1307         if (isalpha(ch) || ch == '\'' || ch == '`')
1308                 return (false);
1309         return (true);
1310 #ifdef NOTYET
1311         /*
1312          * Build a whitelist of characters that may be escaped to produce an
1313          * ordinary in the current context. This assumes that these have not
1314          * been otherwise interpreted as a special character. Escaping an
1315          * ordinary character yields undefined results according to
1316          * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1317          * advantage of this and use escaped ordinary characters to provide
1318          * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1319          */
1320         switch(ch) {
1321         case '|':
1322         case '+':
1323         case '?':
1324                 /* The above characters may not be escaped in BREs */
1325                 if (!(p->g->cflags&REG_EXTENDED))
1326                         return (false);
1327                 /* Fallthrough */
1328         case '(':
1329         case ')':
1330         case '{':
1331         case '}':
1332         case '.':
1333         case '[':
1334         case ']':
1335         case '\\':
1336         case '*':
1337         case '^':
1338         case '$':
1339                 return (true);
1340         default:
1341                 return (false);
1342         }
1343 #endif
1344 }
1345
1346 /*
1347  - othercase - return the case counterpart of an alphabetic
1348  == static wint_t othercase(wint_t ch);
1349  */
1350 static wint_t                   /* if no counterpart, return ch */
1351 othercase(wint_t ch)
1352 {
1353         assert(iswalpha(ch));
1354         if (iswupper(ch))
1355                 return(towlower(ch));
1356         else if (iswlower(ch))
1357                 return(towupper(ch));
1358         else                    /* peculiar, but could happen */
1359                 return(ch);
1360 }
1361
1362 /*
1363  - bothcases - emit a dualcase version of a two-case character
1364  == static void bothcases(struct parse *p, wint_t ch);
1365  *
1366  * Boy, is this implementation ever a kludge...
1367  */
1368 static void
1369 bothcases(struct parse *p, wint_t ch)
1370 {
1371         const char *oldnext = p->next;
1372         const char *oldend = p->end;
1373         char bracket[3 + MB_LEN_MAX];
1374         size_t n;
1375         mbstate_t mbs;
1376
1377         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
1378         p->next = bracket;
1379         memset(&mbs, 0, sizeof(mbs));
1380         n = wcrtomb(bracket, ch, &mbs);
1381         assert(n != (size_t)-1);
1382         bracket[n] = ']';
1383         bracket[n + 1] = '\0';
1384         p->end = bracket+n+1;
1385         p_bracket(p);
1386         assert(p->next == p->end);
1387         p->next = oldnext;
1388         p->end = oldend;
1389 }
1390
1391 /*
1392  - ordinary - emit an ordinary character
1393  == static void ordinary(struct parse *p, wint_t ch);
1394  */
1395 static void
1396 ordinary(struct parse *p, wint_t ch)
1397 {
1398         cset *cs;
1399
1400         if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1401                 bothcases(p, ch);
1402         else if ((ch & OPDMASK) == ch)
1403                 EMIT(OCHAR, ch);
1404         else {
1405                 /*
1406                  * Kludge: character is too big to fit into an OCHAR operand.
1407                  * Emit a singleton set.
1408                  */
1409                 if ((cs = allocset(p)) == NULL)
1410                         return;
1411                 CHadd(p, cs, ch);
1412                 EMIT(OANYOF, (int)(cs - p->g->sets));
1413         }
1414 }
1415
1416 /*
1417  - nonnewline - emit REG_NEWLINE version of OANY
1418  == static void nonnewline(struct parse *p);
1419  *
1420  * Boy, is this implementation ever a kludge...
1421  */
1422 static void
1423 nonnewline(struct parse *p)
1424 {
1425         const char *oldnext = p->next;
1426         const char *oldend = p->end;
1427         char bracket[4];
1428
1429         p->next = bracket;
1430         p->end = bracket+3;
1431         bracket[0] = '^';
1432         bracket[1] = '\n';
1433         bracket[2] = ']';
1434         bracket[3] = '\0';
1435         p_bracket(p);
1436         assert(p->next == bracket+3);
1437         p->next = oldnext;
1438         p->end = oldend;
1439 }
1440
1441 /*
1442  - repeat - generate code for a bounded repetition, recursively if needed
1443  == static void repeat(struct parse *p, sopno start, int from, int to);
1444  */
1445 static void
1446 repeat(struct parse *p,
1447         sopno start,            /* operand from here to end of strip */
1448         int from,               /* repeated from this number */
1449         int to)                 /* to this number of times (maybe INFINITY) */
1450 {
1451         sopno finish = HERE();
1452 #       define  N       2
1453 #       define  INF     3
1454 #       define  REP(f, t)       ((f)*8 + (t))
1455 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1456         sopno copy;
1457
1458         if (p->error != 0)      /* head off possible runaway recursion */
1459                 return;
1460
1461         assert(from <= to);
1462
1463         switch (REP(MAP(from), MAP(to))) {
1464         case REP(0, 0):                 /* must be user doing this */
1465                 DROP(finish-start);     /* drop the operand */
1466                 break;
1467         case REP(0, 1):                 /* as x{1,1}? */
1468         case REP(0, N):                 /* as x{1,n}? */
1469         case REP(0, INF):               /* as x{1,}? */
1470                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1471                 INSERT(OCH_, start);            /* offset is wrong... */
1472                 repeat(p, start+1, 1, to);
1473                 ASTERN(OOR1, start);
1474                 AHEAD(start);                   /* ... fix it */
1475                 EMIT(OOR2, 0);
1476                 AHEAD(THERE());
1477                 ASTERN(O_CH, THERETHERE());
1478                 break;
1479         case REP(1, 1):                 /* trivial case */
1480                 /* done */
1481                 break;
1482         case REP(1, N):                 /* as x?x{1,n-1} */
1483                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1484                 INSERT(OCH_, start);
1485                 ASTERN(OOR1, start);
1486                 AHEAD(start);
1487                 EMIT(OOR2, 0);                  /* offset very wrong... */
1488                 AHEAD(THERE());                 /* ...so fix it */
1489                 ASTERN(O_CH, THERETHERE());
1490                 copy = dupl(p, start+1, finish+1);
1491                 assert(copy == finish+4);
1492                 repeat(p, copy, 1, to-1);
1493                 break;
1494         case REP(1, INF):               /* as x+ */
1495                 INSERT(OPLUS_, start);
1496                 ASTERN(O_PLUS, start);
1497                 break;
1498         case REP(N, N):                 /* as xx{m-1,n-1} */
1499                 copy = dupl(p, start, finish);
1500                 repeat(p, copy, from-1, to-1);
1501                 break;
1502         case REP(N, INF):               /* as xx{n-1,INF} */
1503                 copy = dupl(p, start, finish);
1504                 repeat(p, copy, from-1, to);
1505                 break;
1506         default:                        /* "can't happen" */
1507                 SETERROR(REG_ASSERT);   /* just in case */
1508                 break;
1509         }
1510 }
1511
1512 /*
1513  - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1514  - character from the parse struct, signals a REG_ILLSEQ error if the
1515  - character can't be converted. Returns the number of bytes consumed.
1516  */
1517 static wint_t
1518 wgetnext(struct parse *p)
1519 {
1520         mbstate_t mbs;
1521         wchar_t wc;
1522         size_t n;
1523
1524         memset(&mbs, 0, sizeof(mbs));
1525         n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1526         if (n == (size_t)-1 || n == (size_t)-2) {
1527                 SETERROR(REG_ILLSEQ);
1528                 return (0);
1529         }
1530         if (n == 0)
1531                 n = 1;
1532         p->next += n;
1533         return (wc);
1534 }
1535
1536 /*
1537  - seterr - set an error condition
1538  == static int seterr(struct parse *p, int e);
1539  */
1540 static int                      /* useless but makes type checking happy */
1541 seterr(struct parse *p, int e)
1542 {
1543         if (p->error == 0)      /* keep earliest error condition */
1544                 p->error = e;
1545         p->next = nuls;         /* try to bring things to a halt */
1546         p->end = nuls;
1547         return(0);              /* make the return value well-defined */
1548 }
1549
1550 /*
1551  - allocset - allocate a set of characters for []
1552  == static cset *allocset(struct parse *p);
1553  */
1554 static cset *
1555 allocset(struct parse *p)
1556 {
1557         cset *cs, *ncs;
1558
1559         ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1560         if (ncs == NULL) {
1561                 SETERROR(REG_ESPACE);
1562                 return (NULL);
1563         }
1564         p->g->sets = ncs;
1565         cs = &p->g->sets[p->g->ncsets++];
1566         memset(cs, 0, sizeof(*cs));
1567
1568         return(cs);
1569 }
1570
1571 /*
1572  - freeset - free a now-unused set
1573  == static void freeset(struct parse *p, cset *cs);
1574  */
1575 static void
1576 freeset(struct parse *p, cset *cs)
1577 {
1578         cset *top = &p->g->sets[p->g->ncsets];
1579
1580         free(cs->wides);
1581         free(cs->ranges);
1582         free(cs->types);
1583         memset(cs, 0, sizeof(*cs));
1584         if (cs == top-1)        /* recover only the easy case */
1585                 p->g->ncsets--;
1586 }
1587
1588 /*
1589  - singleton - Determine whether a set contains only one character,
1590  - returning it if so, otherwise returning OUT.
1591  */
1592 static wint_t
1593 singleton(cset *cs)
1594 {
1595         wint_t i, s, n;
1596
1597         for (i = n = 0; i < NC; i++)
1598                 if (CHIN(cs, i)) {
1599                         n++;
1600                         s = i;
1601                 }
1602         if (n == 1)
1603                 return (s);
1604         if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1605             cs->icase == 0)
1606                 return (cs->wides[0]);
1607         /* Don't bother handling the other cases. */
1608         return (OUT);
1609 }
1610
1611 /*
1612  - CHadd - add character to character set.
1613  */
1614 static void
1615 CHadd(struct parse *p, cset *cs, wint_t ch)
1616 {
1617         wint_t nch, *newwides;
1618         assert(ch >= 0);
1619         if (ch < NC)
1620                 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1621         else {
1622                 newwides = reallocarray(cs->wides, cs->nwides + 1,
1623                     sizeof(*cs->wides));
1624                 if (newwides == NULL) {
1625                         SETERROR(REG_ESPACE);
1626                         return;
1627                 }
1628                 cs->wides = newwides;
1629                 cs->wides[cs->nwides++] = ch;
1630         }
1631         if (cs->icase) {
1632                 if ((nch = towlower(ch)) < NC)
1633                         cs->bmp[nch >> 3] |= 1 << (nch & 7);
1634                 if ((nch = towupper(ch)) < NC)
1635                         cs->bmp[nch >> 3] |= 1 << (nch & 7);
1636         }
1637 }
1638
1639 /*
1640  - CHaddrange - add all characters in the range [min,max] to a character set.
1641  */
1642 static void
1643 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1644 {
1645         crange *newranges;
1646
1647         for (; min < NC && min <= max; min++)
1648                 CHadd(p, cs, min);
1649         if (min >= max)
1650                 return;
1651         newranges = reallocarray(cs->ranges, cs->nranges + 1,
1652             sizeof(*cs->ranges));
1653         if (newranges == NULL) {
1654                 SETERROR(REG_ESPACE);
1655                 return;
1656         }
1657         cs->ranges = newranges;
1658         cs->ranges[cs->nranges].min = min;
1659         cs->ranges[cs->nranges].max = max;
1660         cs->nranges++;
1661 }
1662
1663 /*
1664  - CHaddtype - add all characters of a certain type to a character set.
1665  */
1666 static void
1667 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1668 {
1669         wint_t i;
1670         wctype_t *newtypes;
1671
1672         for (i = 0; i < NC; i++)
1673                 if (iswctype(i, wct))
1674                         CHadd(p, cs, i);
1675         newtypes = reallocarray(cs->types, cs->ntypes + 1,
1676             sizeof(*cs->types));
1677         if (newtypes == NULL) {
1678                 SETERROR(REG_ESPACE);
1679                 return;
1680         }
1681         cs->types = newtypes;
1682         cs->types[cs->ntypes++] = wct;
1683 }
1684
1685 /*
1686  - dupl - emit a duplicate of a bunch of sops
1687  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1688  */
1689 static sopno                    /* start of duplicate */
1690 dupl(struct parse *p,
1691         sopno start,            /* from here */
1692         sopno finish)           /* to this less one */
1693 {
1694         sopno ret = HERE();
1695         sopno len = finish - start;
1696
1697         assert(finish >= start);
1698         if (len == 0)
1699                 return(ret);
1700         if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1701                 return(ret);
1702         (void) memcpy((char *)(p->strip + p->slen),
1703                 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1704         p->slen += len;
1705         return(ret);
1706 }
1707
1708 /*
1709  - doemit - emit a strip operator
1710  == static void doemit(struct parse *p, sop op, size_t opnd);
1711  *
1712  * It might seem better to implement this as a macro with a function as
1713  * hard-case backup, but it's just too big and messy unless there are
1714  * some changes to the data structures.  Maybe later.
1715  */
1716 static void
1717 doemit(struct parse *p, sop op, size_t opnd)
1718 {
1719         /* avoid making error situations worse */
1720         if (p->error != 0)
1721                 return;
1722
1723         /* deal with oversize operands ("can't happen", more or less) */
1724         assert(opnd < 1<<OPSHIFT);
1725
1726         /* deal with undersized strip */
1727         if (p->slen >= p->ssize)
1728                 if (!enlarge(p, (p->ssize+1) / 2 * 3))  /* +50% */
1729                         return;
1730
1731         /* finally, it's all reduced to the easy case */
1732         p->strip[p->slen++] = SOP(op, opnd);
1733 }
1734
1735 /*
1736  - doinsert - insert a sop into the strip
1737  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1738  */
1739 static void
1740 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1741 {
1742         sopno sn;
1743         sop s;
1744         int i;
1745
1746         /* avoid making error situations worse */
1747         if (p->error != 0)
1748                 return;
1749
1750         sn = HERE();
1751         EMIT(op, opnd);         /* do checks, ensure space */
1752         assert(HERE() == sn+1);
1753         s = p->strip[sn];
1754
1755         /* adjust paren pointers */
1756         assert(pos > 0);
1757         for (i = 1; i < NPAREN; i++) {
1758                 if (p->pbegin[i] >= pos) {
1759                         p->pbegin[i]++;
1760                 }
1761                 if (p->pend[i] >= pos) {
1762                         p->pend[i]++;
1763                 }
1764         }
1765
1766         memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1767                                                 (HERE()-pos-1)*sizeof(sop));
1768         p->strip[pos] = s;
1769 }
1770
1771 /*
1772  - dofwd - complete a forward reference
1773  == static void dofwd(struct parse *p, sopno pos, sop value);
1774  */
1775 static void
1776 dofwd(struct parse *p, sopno pos, sop value)
1777 {
1778         /* avoid making error situations worse */
1779         if (p->error != 0)
1780                 return;
1781
1782         assert(value < 1<<OPSHIFT);
1783         p->strip[pos] = OP(p->strip[pos]) | value;
1784 }
1785
1786 /*
1787  - enlarge - enlarge the strip
1788  == static int enlarge(struct parse *p, sopno size);
1789  */
1790 static int
1791 enlarge(struct parse *p, sopno size)
1792 {
1793         sop *sp;
1794
1795         if (p->ssize >= size)
1796                 return 1;
1797
1798         sp = reallocarray(p->strip, size, sizeof(sop));
1799         if (sp == NULL) {
1800                 SETERROR(REG_ESPACE);
1801                 return 0;
1802         }
1803         p->strip = sp;
1804         p->ssize = size;
1805         return 1;
1806 }
1807
1808 /*
1809  - stripsnug - compact the strip
1810  == static void stripsnug(struct parse *p, struct re_guts *g);
1811  */
1812 static void
1813 stripsnug(struct parse *p, struct re_guts *g)
1814 {
1815         g->nstates = p->slen;
1816         g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop));
1817         if (g->strip == NULL) {
1818                 SETERROR(REG_ESPACE);
1819                 g->strip = p->strip;
1820         }
1821 }
1822
1823 /*
1824  - findmust - fill in must and mlen with longest mandatory literal string
1825  == static void findmust(struct parse *p, struct re_guts *g);
1826  *
1827  * This algorithm could do fancy things like analyzing the operands of |
1828  * for common subsequences.  Someday.  This code is simple and finds most
1829  * of the interesting cases.
1830  *
1831  * Note that must and mlen got initialized during setup.
1832  */
1833 static void
1834 findmust(struct parse *p, struct re_guts *g)
1835 {
1836         sop *scan;
1837         sop *start = NULL;
1838         sop *newstart = NULL;
1839         sopno newlen;
1840         sop s;
1841         char *cp;
1842         int offset;
1843         char buf[MB_LEN_MAX];
1844         size_t clen;
1845         mbstate_t mbs;
1846
1847         /* avoid making error situations worse */
1848         if (p->error != 0)
1849                 return;
1850
1851         /*
1852          * It's not generally safe to do a ``char'' substring search on
1853          * multibyte character strings, but it's safe for at least
1854          * UTF-8 (see RFC 3629).
1855          */
1856         if (MB_CUR_MAX > 1 &&
1857             strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1858                 return;
1859
1860         /* find the longest OCHAR sequence in strip */
1861         newlen = 0;
1862         offset = 0;
1863         g->moffset = 0;
1864         scan = g->strip + 1;
1865         do {
1866                 s = *scan++;
1867                 switch (OP(s)) {
1868                 case OCHAR:             /* sequence member */
1869                         if (newlen == 0) {              /* new sequence */
1870                                 memset(&mbs, 0, sizeof(mbs));
1871                                 newstart = scan - 1;
1872                         }
1873                         clen = wcrtomb(buf, OPND(s), &mbs);
1874                         if (clen == (size_t)-1)
1875                                 goto toohard;
1876                         newlen += clen;
1877                         break;
1878                 case OPLUS_:            /* things that don't break one */
1879                 case OLPAREN:
1880                 case ORPAREN:
1881                         break;
1882                 case OQUEST_:           /* things that must be skipped */
1883                 case OCH_:
1884                         offset = altoffset(scan, offset);
1885                         scan--;
1886                         do {
1887                                 scan += OPND(s);
1888                                 s = *scan;
1889                                 /* assert() interferes w debug printouts */
1890                                 if (OP(s) != (sop)O_QUEST &&
1891                                     OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1892                                         g->iflags |= BAD;
1893                                         return;
1894                                 }
1895                         } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1896                         /* FALLTHROUGH */
1897                 case OBOW:              /* things that break a sequence */
1898                 case OEOW:
1899                 case OBOL:
1900                 case OEOL:
1901                 case OBOS:
1902                 case OEOS:
1903                 case OWBND:
1904                 case ONWBND:
1905                 case O_QUEST:
1906                 case O_CH:
1907                 case OEND:
1908                         if (newlen > (sopno)g->mlen) {          /* ends one */
1909                                 start = newstart;
1910                                 g->mlen = newlen;
1911                                 if (offset > -1) {
1912                                         g->moffset += offset;
1913                                         offset = newlen;
1914                                 } else
1915                                         g->moffset = offset;
1916                         } else {
1917                                 if (offset > -1)
1918                                         offset += newlen;
1919                         }
1920                         newlen = 0;
1921                         break;
1922                 case OANY:
1923                         if (newlen > (sopno)g->mlen) {          /* ends one */
1924                                 start = newstart;
1925                                 g->mlen = newlen;
1926                                 if (offset > -1) {
1927                                         g->moffset += offset;
1928                                         offset = newlen;
1929                                 } else
1930                                         g->moffset = offset;
1931                         } else {
1932                                 if (offset > -1)
1933                                         offset += newlen;
1934                         }
1935                         if (offset > -1)
1936                                 offset++;
1937                         newlen = 0;
1938                         break;
1939                 case OANYOF:            /* may or may not invalidate offset */
1940                         /* First, everything as OANY */
1941                         if (newlen > (sopno)g->mlen) {          /* ends one */
1942                                 start = newstart;
1943                                 g->mlen = newlen;
1944                                 if (offset > -1) {
1945                                         g->moffset += offset;
1946                                         offset = newlen;
1947                                 } else
1948                                         g->moffset = offset;
1949                         } else {
1950                                 if (offset > -1)
1951                                         offset += newlen;
1952                         }
1953                         if (offset > -1)
1954                                 offset++;
1955                         newlen = 0;
1956                         break;
1957                 toohard:
1958                 default:
1959                         /* Anything here makes it impossible or too hard
1960                          * to calculate the offset -- so we give up;
1961                          * save the last known good offset, in case the
1962                          * must sequence doesn't occur later.
1963                          */
1964                         if (newlen > (sopno)g->mlen) {          /* ends one */
1965                                 start = newstart;
1966                                 g->mlen = newlen;
1967                                 if (offset > -1)
1968                                         g->moffset += offset;
1969                                 else
1970                                         g->moffset = offset;
1971                         }
1972                         offset = -1;
1973                         newlen = 0;
1974                         break;
1975                 }
1976         } while (OP(s) != OEND);
1977
1978         if (g->mlen == 0) {             /* there isn't one */
1979                 g->moffset = -1;
1980                 return;
1981         }
1982
1983         /* turn it into a character string */
1984         g->must = malloc((size_t)g->mlen + 1);
1985         if (g->must == NULL) {          /* argh; just forget it */
1986                 g->mlen = 0;
1987                 g->moffset = -1;
1988                 return;
1989         }
1990         cp = g->must;
1991         scan = start;
1992         memset(&mbs, 0, sizeof(mbs));
1993         while (cp < g->must + g->mlen) {
1994                 while (OP(s = *scan++) != OCHAR)
1995                         continue;
1996                 clen = wcrtomb(cp, OPND(s), &mbs);
1997                 assert(clen != (size_t)-1);
1998                 cp += clen;
1999         }
2000         assert(cp == g->must + g->mlen);
2001         *cp++ = '\0';           /* just on general principles */
2002 }
2003
2004 /*
2005  - altoffset - choose biggest offset among multiple choices
2006  == static int altoffset(sop *scan, int offset);
2007  *
2008  * Compute, recursively if necessary, the largest offset among multiple
2009  * re paths.
2010  */
2011 static int
2012 altoffset(sop *scan, int offset)
2013 {
2014         int largest;
2015         int try;
2016         sop s;
2017
2018         /* If we gave up already on offsets, return */
2019         if (offset == -1)
2020                 return -1;
2021
2022         largest = 0;
2023         try = 0;
2024         s = *scan++;
2025         while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
2026                 switch (OP(s)) {
2027                 case OOR1:
2028                         if (try > largest)
2029                                 largest = try;
2030                         try = 0;
2031                         break;
2032                 case OQUEST_:
2033                 case OCH_:
2034                         try = altoffset(scan, try);
2035                         if (try == -1)
2036                                 return -1;
2037                         scan--;
2038                         do {
2039                                 scan += OPND(s);
2040                                 s = *scan;
2041                                 if (OP(s) != (sop)O_QUEST &&
2042                                     OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
2043                                         return -1;
2044                         } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
2045                         /* We must skip to the next position, or we'll
2046                          * leave altoffset() too early.
2047                          */
2048                         scan++;
2049                         break;
2050                 case OANYOF:
2051                 case OCHAR:
2052                 case OANY:
2053                         try++;
2054                 case OBOW:
2055                 case OEOW:
2056                 case OWBND:
2057                 case ONWBND:
2058                 case OLPAREN:
2059                 case ORPAREN:
2060                 case OOR2:
2061                         break;
2062                 default:
2063                         try = -1;
2064                         break;
2065                 }
2066                 if (try == -1)
2067                         return -1;
2068                 s = *scan++;
2069         }
2070
2071         if (try > largest)
2072                 largest = try;
2073
2074         return largest+offset;
2075 }
2076
2077 /*
2078  - computejumps - compute char jumps for BM scan
2079  == static void computejumps(struct parse *p, struct re_guts *g);
2080  *
2081  * This algorithm assumes g->must exists and is has size greater than
2082  * zero. It's based on the algorithm found on Computer Algorithms by
2083  * Sara Baase.
2084  *
2085  * A char jump is the number of characters one needs to jump based on
2086  * the value of the character from the text that was mismatched.
2087  */
2088 static void
2089 computejumps(struct parse *p, struct re_guts *g)
2090 {
2091         int ch;
2092         int mindex;
2093
2094         /* Avoid making errors worse */
2095         if (p->error != 0)
2096                 return;
2097
2098         g->charjump = (int *)malloc((NC_MAX + 1) * sizeof(int));
2099         if (g->charjump == NULL)        /* Not a fatal error */
2100                 return;
2101         /* Adjust for signed chars, if necessary */
2102         g->charjump = &g->charjump[-(CHAR_MIN)];
2103
2104         /* If the character does not exist in the pattern, the jump
2105          * is equal to the number of characters in the pattern.
2106          */
2107         for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2108                 g->charjump[ch] = g->mlen;
2109
2110         /* If the character does exist, compute the jump that would
2111          * take us to the last character in the pattern equal to it
2112          * (notice that we match right to left, so that last character
2113          * is the first one that would be matched).
2114          */
2115         for (mindex = 0; mindex < g->mlen; mindex++)
2116                 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2117 }
2118
2119 /*
2120  - computematchjumps - compute match jumps for BM scan
2121  == static void computematchjumps(struct parse *p, struct re_guts *g);
2122  *
2123  * This algorithm assumes g->must exists and is has size greater than
2124  * zero. It's based on the algorithm found on Computer Algorithms by
2125  * Sara Baase.
2126  *
2127  * A match jump is the number of characters one needs to advance based
2128  * on the already-matched suffix.
2129  * Notice that all values here are minus (g->mlen-1), because of the way
2130  * the search algorithm works.
2131  */
2132 static void
2133 computematchjumps(struct parse *p, struct re_guts *g)
2134 {
2135         int mindex;             /* General "must" iterator */
2136         int suffix;             /* Keeps track of matching suffix */
2137         int ssuffix;            /* Keeps track of suffixes' suffix */
2138         int* pmatches;          /* pmatches[k] points to the next i
2139                                  * such that i+1...mlen is a substring
2140                                  * of k+1...k+mlen-i-1
2141                                  */
2142
2143         /* Avoid making errors worse */
2144         if (p->error != 0)
2145                 return;
2146
2147         pmatches = (int*) malloc(g->mlen * sizeof(int));
2148         if (pmatches == NULL) {
2149                 g->matchjump = NULL;
2150                 return;
2151         }
2152
2153         g->matchjump = (int*) malloc(g->mlen * sizeof(int));
2154         if (g->matchjump == NULL) {     /* Not a fatal error */
2155                 free(pmatches);
2156                 return;
2157         }
2158
2159         /* Set maximum possible jump for each character in the pattern */
2160         for (mindex = 0; mindex < g->mlen; mindex++)
2161                 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
2162
2163         /* Compute pmatches[] */
2164         for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
2165             mindex--, suffix--) {
2166                 pmatches[mindex] = suffix;
2167
2168                 /* If a mismatch is found, interrupting the substring,
2169                  * compute the matchjump for that position. If no
2170                  * mismatch is found, then a text substring mismatched
2171                  * against the suffix will also mismatch against the
2172                  * substring.
2173                  */
2174                 while (suffix < g->mlen
2175                     && g->must[mindex] != g->must[suffix]) {
2176                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
2177                             g->mlen - mindex - 1);
2178                         suffix = pmatches[suffix];
2179                 }
2180         }
2181
2182         /* Compute the matchjump up to the last substring found to jump
2183          * to the beginning of the largest must pattern prefix matching
2184          * it's own suffix.
2185          */
2186         for (mindex = 0; mindex <= suffix; mindex++)
2187                 g->matchjump[mindex] = MIN(g->matchjump[mindex],
2188                     g->mlen + suffix - mindex);
2189
2190         ssuffix = pmatches[suffix];
2191         while (suffix < g->mlen) {
2192                 while (suffix <= ssuffix && suffix < g->mlen) {
2193                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
2194                             g->mlen + ssuffix - suffix);
2195                         suffix++;
2196                 }
2197                 if (suffix < g->mlen)
2198                         ssuffix = pmatches[ssuffix];
2199         }
2200
2201         free(pmatches);
2202 }
2203
2204 /*
2205  - pluscount - count + nesting
2206  == static sopno pluscount(struct parse *p, struct re_guts *g);
2207  */
2208 static sopno                    /* nesting depth */
2209 pluscount(struct parse *p, struct re_guts *g)
2210 {
2211         sop *scan;
2212         sop s;
2213         sopno plusnest = 0;
2214         sopno maxnest = 0;
2215
2216         if (p->error != 0)
2217                 return(0);      /* there may not be an OEND */
2218
2219         scan = g->strip + 1;
2220         do {
2221                 s = *scan++;
2222                 switch (OP(s)) {
2223                 case OPLUS_:
2224                         plusnest++;
2225                         break;
2226                 case O_PLUS:
2227                         if (plusnest > maxnest)
2228                                 maxnest = plusnest;
2229                         plusnest--;
2230                         break;
2231                 }
2232         } while (OP(s) != OEND);
2233         if (plusnest != 0)
2234                 g->iflags |= BAD;
2235         return(maxnest);
2236 }