lib/libc/regex/regcomp.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   5  * Copyright (c) 1992, 1993, 1994
   6  *      The Regents of the University of California.  All rights reserved.
   7  *
   8  * Copyright (c) 2011 The FreeBSD Foundation
   9  * All rights reserved.
  10  * Portions of this software were developed by David Chisnall
  11  * under sponsorship from the FreeBSD Foundation.
  12  *
  13  * This code is derived from software contributed to Berkeley by
  14  * Henry Spencer.
  15  *
  16  * Redistribution and use in source and binary forms, with or without
  17  * modification, are permitted provided that the following conditions
  18  * are met:
  19  * 1. Redistributions of source code must retain the above copyright
  20  *    notice, this list of conditions and the following disclaimer.
  21  * 2. Redistributions in binary form must reproduce the above copyright
  22  *    notice, this list of conditions and the following disclaimer in the
  23  *    documentation and/or other materials provided with the distribution.
  24  * 3. Neither the name of the University nor the names of its contributors
  25  *    may be used to endorse or promote products derived from this software
  26  *    without specific prior written permission.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  38  * SUCH DAMAGE.
  39  *
  40  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  41  */
  42
  43 #if defined(LIBC_SCCS) && !defined(lint)
  44 static char sccsid[] = "@(#)regcomp.c   8.5 (Berkeley) 3/20/94";
  45 #endif /* LIBC_SCCS and not lint */
  46 #include <sys/cdefs.h>
  47 __FBSDID("$FreeBSD$");
  48
  49 #include <sys/types.h>
  50 #include <stdio.h>
  51 #include <string.h>
  52 #include <ctype.h>
  53 #include <limits.h>
  54 #include <stdlib.h>
  55 #include <regex.h>
  56 #include <stdbool.h>
  57 #include <wchar.h>
  58 #include <wctype.h>
  59
  60 #ifndef LIBREGEX
  61 #include "collate.h"
  62 #endif
  63
  64 #include "utils.h"
  65 #include "regex2.h"
  66
  67 #include "cname.h"
  68
  69 /*
  70  * Branching context, used to keep track of branch state for all of the branch-
  71  * aware functions. In addition to keeping track of branch positions for the
  72  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
  73  * detection of whether ^ is acting as an anchor or being used erroneously and
  74  * also for whether we're in a sub-expression or not.
  75  */
  76 struct branchc {
  77         sopno start;
  78         sopno back;
  79         sopno fwd;
  80
  81         int nbranch;
  82         int nchain;
  83         bool outer;
  84         bool terminate;
  85 };
  86
  87 /*
  88  * parse structure, passed up and down to avoid global variables and
  89  * other clumsinesses
  90  */
  91 struct parse {
  92         const char *next;       /* next character in RE */
  93         const char *end;        /* end of string (-> NUL normally) */
  94         int error;              /* has an error been seen? */
  95         int gnuext;
  96         sop *strip;             /* malloced strip */
  97         sopno ssize;            /* malloced strip size (allocated) */
  98         sopno slen;             /* malloced strip length (used) */
  99         int ncsalloc;           /* number of csets allocated */
 100         struct re_guts *g;
 101 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
 102         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
 103         sopno pend[NPAREN];     /* -> ) ([0] unused) */
 104         bool allowbranch;       /* can this expression branch? */
 105         bool bre;               /* convenience; is this a BRE? */
 106         int pflags;             /* other parsing flags -- legacy escapes? */
 107         bool (*parse_expr)(struct parse *, struct branchc *);
 108         void (*pre_parse)(struct parse *, struct branchc *);
 109         void (*post_parse)(struct parse *, struct branchc *);
 110 };
 111
 112 #define PFLAG_LEGACY_ESC        0x00000001
 113
 114 /* ========= begin header generated by ./mkh ========= */
 115 #ifdef __cplusplus
 116 extern "C" {
 117 #endif
 118
 119 /* === regcomp.c === */
 120 static bool p_ere_exp(struct parse *p, struct branchc *bc);
 121 static void p_str(struct parse *p);
 122 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
 123 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
 124 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
 125 static bool p_branch_empty(struct parse *p, struct branchc *bc);
 126 static bool p_branch_do(struct parse *p, struct branchc *bc);
 127 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
 128 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
 129 static void p_re(struct parse *p, int end1, int end2);
 130 static bool p_simp_re(struct parse *p, struct branchc *bc);
 131 static int p_count(struct parse *p);
 132 static void p_bracket(struct parse *p);
 133 static int p_range_cmp(wchar_t c1, wchar_t c2);
 134 static void p_b_term(struct parse *p, cset *cs);
 135 static int p_b_pseudoclass(struct parse *p, char c);
 136 static void p_b_cclass(struct parse *p, cset *cs);
 137 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
 138 static void p_b_eclass(struct parse *p, cset *cs);
 139 static wint_t p_b_symbol(struct parse *p);
 140 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
 141 static bool may_escape(struct parse *p, const wint_t ch);
 142 static wint_t othercase(wint_t ch);
 143 static void bothcases(struct parse *p, wint_t ch);
 144 static void ordinary(struct parse *p, wint_t ch);
 145 static void nonnewline(struct parse *p);
 146 static void repeat(struct parse *p, sopno start, int from, int to);
 147 static int seterr(struct parse *p, int e);
 148 static cset *allocset(struct parse *p);
 149 static void freeset(struct parse *p, cset *cs);
 150 static void CHadd(struct parse *p, cset *cs, wint_t ch);
 151 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
 152 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
 153 static wint_t singleton(cset *cs);
 154 static sopno dupl(struct parse *p, sopno start, sopno finish);
 155 static void doemit(struct parse *p, sop op, size_t opnd);
 156 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
 157 static void dofwd(struct parse *p, sopno pos, sop value);
 158 static int enlarge(struct parse *p, sopno size);
 159 static void stripsnug(struct parse *p, struct re_guts *g);
 160 static void findmust(struct parse *p, struct re_guts *g);
 161 static int altoffset(sop *scan, int offset);
 162 static void computejumps(struct parse *p, struct re_guts *g);
 163 static void computematchjumps(struct parse *p, struct re_guts *g);
 164 static sopno pluscount(struct parse *p, struct re_guts *g);
 165 static wint_t wgetnext(struct parse *p);
 166
 167 #ifdef __cplusplus
 168 }
 169 #endif
 170 /* ========= end header generated by ./mkh ========= */
 171
 172 static char nuls[10];           /* place to point scanner in event of error */
 173
 174 /*
 175  * macros for use with parse structure
 176  * BEWARE:  these know that the parse structure is named `p' !!!
 177  */
 178 #define PEEK()  (*p->next)
 179 #define PEEK2() (*(p->next+1))
 180 #define MORE()  (p->next < p->end)
 181 #define MORE2() (p->next+1 < p->end)
 182 #define SEE(c)  (MORE() && PEEK() == (c))
 183 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
 184 #define SEESPEC(a)      (p->bre ? SEETWO('\\', a) : SEE(a))
 185 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 186 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 187 #define EATSPEC(a)      (p->bre ? EATTWO('\\', a) : EAT(a))
 188 #define NEXT()  (p->next++)
 189 #define NEXT2() (p->next += 2)
 190 #define NEXTn(n)        (p->next += (n))
 191 #define GETNEXT()       (*p->next++)
 192 #define WGETNEXT()      wgetnext(p)
 193 #define SETERROR(e)     seterr(p, (e))
 194 #define REQUIRE(co, e)  ((co) || SETERROR(e))
 195 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
 196 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
 197 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 198 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
 199 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 200 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 201 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 202 #define HERE()          (p->slen)
 203 #define THERE()         (p->slen - 1)
 204 #define THERETHERE()    (p->slen - 2)
 205 #define DROP(n) (p->slen -= (n))
 206
 207 /* Macro used by computejump()/computematchjump() */
 208 #define MIN(a,b)        ((a)<(b)?(a):(b))
 209
 210 static int                              /* 0 success, otherwise REG_something */
 211 regcomp_internal(regex_t * __restrict preg,
 212         const char * __restrict pattern,
 213         int cflags, int pflags)
 214 {
 215         struct parse pa;
 216         struct re_guts *g;
 217         struct parse *p = &pa;
 218         int i;
 219         size_t len;
 220         size_t maxlen;
 221 #ifdef REDEBUG
 222 #       define  GOODFLAGS(f)    (f)
 223 #else
 224 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 225 #endif
 226
 227         cflags = GOODFLAGS(cflags);
 228         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 229                 return(REG_INVARG);
 230
 231         if (cflags&REG_PEND) {
 232                 if (preg->re_endp < pattern)
 233                         return(REG_INVARG);
 234                 len = preg->re_endp - pattern;
 235         } else
 236                 len = strlen(pattern);
 237
 238         /* do the mallocs early so failure handling is easy */
 239         g = (struct re_guts *)malloc(sizeof(struct re_guts));
 240         if (g == NULL)
 241                 return(REG_ESPACE);
 242         /*
 243          * Limit the pattern space to avoid a 32-bit overflow on buffer
 244          * extension.  Also avoid any signed overflow in case of conversion
 245          * so make the real limit based on a 31-bit overflow.
 246          *
 247          * Likely not applicable on 64-bit systems but handle the case
 248          * generically (who are we to stop people from using ~715MB+
 249          * patterns?).
 250          */
 251         maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3;
 252         if (len >= maxlen) {
 253                 free((char *)g);
 254                 return(REG_ESPACE);
 255         }
 256         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 257         assert(p->ssize >= len);
 258
 259         p->strip = (sop *)malloc(p->ssize * sizeof(sop));
 260         p->slen = 0;
 261         if (p->strip == NULL) {
 262                 free((char *)g);
 263                 return(REG_ESPACE);
 264         }
 265
 266         /* set things up */
 267         p->g = g;
 268         p->next = pattern;      /* convenience; we do not modify it */
 269         p->end = p->next + len;
 270         p->error = 0;
 271         p->ncsalloc = 0;
 272         p->pflags = pflags;
 273         for (i = 0; i < NPAREN; i++) {
 274                 p->pbegin[i] = 0;
 275                 p->pend[i] = 0;
 276         }
 277 #ifdef LIBREGEX
 278         if (cflags&REG_POSIX) {
 279                 p->gnuext = false;
 280                 p->allowbranch = (cflags & REG_EXTENDED) != 0;
 281         } else
 282                 p->gnuext = p->allowbranch = true;
 283 #else
 284         p->gnuext = false;
 285         p->allowbranch = (cflags & REG_EXTENDED) != 0;
 286 #endif
 287         if (cflags & REG_EXTENDED) {
 288                 p->bre = false;
 289                 p->parse_expr = p_ere_exp;
 290                 p->pre_parse = NULL;
 291                 p->post_parse = NULL;
 292         } else {
 293                 p->bre = true;
 294                 p->parse_expr = p_simp_re;
 295                 p->pre_parse = p_bre_pre_parse;
 296                 p->post_parse = p_bre_post_parse;
 297         }
 298         g->sets = NULL;
 299         g->ncsets = 0;
 300         g->cflags = cflags;
 301         g->iflags = 0;
 302         g->nbol = 0;
 303         g->neol = 0;
 304         g->must = NULL;
 305         g->moffset = -1;
 306         g->charjump = NULL;
 307         g->matchjump = NULL;
 308         g->mlen = 0;
 309         g->nsub = 0;
 310         g->backrefs = 0;
 311
 312         /* do it */
 313         EMIT(OEND, 0);
 314         g->firststate = THERE();
 315         if (cflags & REG_NOSPEC)
 316                 p_str(p);
 317         else
 318                 p_re(p, OUT, OUT);
 319         EMIT(OEND, 0);
 320         g->laststate = THERE();
 321
 322         /* tidy up loose ends and fill things in */
 323         stripsnug(p, g);
 324         findmust(p, g);
 325         /* only use Boyer-Moore algorithm if the pattern is bigger
 326          * than three characters
 327          */
 328         if(g->mlen > 3) {
 329                 computejumps(p, g);
 330                 computematchjumps(p, g);
 331                 if(g->matchjump == NULL && g->charjump != NULL) {
 332                         free(g->charjump);
 333                         g->charjump = NULL;
 334                 }
 335         }
 336         g->nplus = pluscount(p, g);
 337         g->magic = MAGIC2;
 338         preg->re_nsub = g->nsub;
 339         preg->re_g = g;
 340         preg->re_magic = MAGIC1;
 341 #ifndef REDEBUG
 342         /* not debugging, so can't rely on the assert() in regexec() */
 343         if (g->iflags&BAD)
 344                 SETERROR(REG_ASSERT);
 345 #endif
 346
 347         /* win or lose, we're done */
 348         if (p->error != 0)      /* lose */
 349                 regfree(preg);
 350         return(p->error);
 351 }
 352
 353 /*
 354  - regcomp - interface for parser and compilation
 355  = extern int regcomp(regex_t *, const char *, int);
 356  = #define      REG_BASIC       0000
 357  = #define      REG_EXTENDED    0001
 358  = #define      REG_ICASE       0002
 359  = #define      REG_NOSUB       0004
 360  = #define      REG_NEWLINE     0010
 361  = #define      REG_NOSPEC      0020
 362  = #define      REG_PEND        0040
 363  = #define      REG_DUMP        0200
 364  */
 365 int                             /* 0 success, otherwise REG_something */
 366 regcomp(regex_t * __restrict preg,
 367         const char * __restrict pattern,
 368         int cflags)
 369 {
 370
 371         return (regcomp_internal(preg, pattern, cflags, 0));
 372 }
 373
 374 #ifndef LIBREGEX
 375 /*
 376  * Legacy interface that requires more lax escaping behavior.
 377  */
 378 int
 379 freebsd12_regcomp(regex_t * __restrict preg,
 380         const char * __restrict pattern,
 381         int cflags, int pflags)
 382 {
 383
 384         return (regcomp_internal(preg, pattern, cflags, PFLAG_LEGACY_ESC));
 385 }
 386
 387 __sym_compat(regcomp, freebsd12_regcomp, FBSD_1.0);
 388 #endif  /* !LIBREGEX */
 389
 390 /*
 391  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
 392  - return whether we should terminate or not
 393  == static bool p_ere_exp(struct parse *p);
 394  */
 395 static bool
 396 p_ere_exp(struct parse *p, struct branchc *bc)
 397 {
 398         char c;
 399         wint_t wc;
 400         sopno pos;
 401         int count;
 402         int count2;
 403 #ifdef LIBREGEX
 404         int i;
 405         int handled;
 406 #endif
 407         sopno subno;
 408         int wascaret = 0;
 409
 410         (void)bc;
 411         assert(MORE());         /* caller should have ensured this */
 412         c = GETNEXT();
 413
 414 #ifdef LIBREGEX
 415         handled = 0;
 416 #endif
 417         pos = HERE();
 418         switch (c) {
 419         case '(':
 420                 (void)REQUIRE(MORE(), REG_EPAREN);
 421                 p->g->nsub++;
 422                 subno = p->g->nsub;
 423                 if (subno < NPAREN)
 424                         p->pbegin[subno] = HERE();
 425                 EMIT(OLPAREN, subno);
 426                 if (!SEE(')'))
 427                         p_re(p, ')', IGN);
 428                 if (subno < NPAREN) {
 429                         p->pend[subno] = HERE();
 430                         assert(p->pend[subno] != 0);
 431                 }
 432                 EMIT(ORPAREN, subno);
 433                 (void)MUSTEAT(')', REG_EPAREN);
 434                 break;
 435 #ifndef POSIX_MISTAKE
 436         case ')':               /* happens only if no current unmatched ( */
 437                 /*
 438                  * You may ask, why the ifndef?  Because I didn't notice
 439                  * this until slightly too late for 1003.2, and none of the
 440                  * other 1003.2 regular-expression reviewers noticed it at
 441                  * all.  So an unmatched ) is legal POSIX, at least until
 442                  * we can get it fixed.
 443                  */
 444                 SETERROR(REG_EPAREN);
 445                 break;
 446 #endif
 447         case '^':
 448                 EMIT(OBOL, 0);
 449                 p->g->iflags |= USEBOL;
 450                 p->g->nbol++;
 451                 wascaret = 1;
 452                 break;
 453         case '$':
 454                 EMIT(OEOL, 0);
 455                 p->g->iflags |= USEEOL;
 456                 p->g->neol++;
 457                 break;
 458         case '|':
 459                 SETERROR(REG_EMPTY);
 460                 break;
 461         case '*':
 462         case '+':
 463         case '?':
 464         case '{':
 465                 SETERROR(REG_BADRPT);
 466                 break;
 467         case '.':
 468                 if (p->g->cflags&REG_NEWLINE)
 469                         nonnewline(p);
 470                 else
 471                         EMIT(OANY, 0);
 472                 break;
 473         case '[':
 474                 p_bracket(p);
 475                 break;
 476         case '\\':
 477                 (void)REQUIRE(MORE(), REG_EESCAPE);
 478                 wc = WGETNEXT();
 479 #ifdef LIBREGEX
 480                 if (p->gnuext) {
 481                         handled = 1;
 482                         switch (wc) {
 483                         case '`':
 484                                 EMIT(OBOS, 0);
 485                                 break;
 486                         case '\'':
 487                                 EMIT(OEOS, 0);
 488                                 break;
 489                         case 'B':
 490                                 EMIT(ONWBND, 0);
 491                                 break;
 492                         case 'b':
 493                                 EMIT(OWBND, 0);
 494                                 break;
 495                         case 'W':
 496                         case 'w':
 497                         case 'S':
 498                         case 's':
 499                                 p_b_pseudoclass(p, wc);
 500                                 break;
 501                         case '1':
 502                         case '2':
 503                         case '3':
 504                         case '4':
 505                         case '5':
 506                         case '6':
 507                         case '7':
 508                         case '8':
 509                         case '9':
 510                                 i = wc - '0';
 511                                 assert(i < NPAREN);
 512                                 if (p->pend[i] != 0) {
 513                                         assert(i <= p->g->nsub);
 514                                         EMIT(OBACK_, i);
 515                                         assert(p->pbegin[i] != 0);
 516                                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 517                                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 518                                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 519                                         EMIT(O_BACK, i);
 520                                 } else
 521                                         SETERROR(REG_ESUBREG);
 522                                 p->g->backrefs = 1;
 523                                 break;
 524                         default:
 525                                 handled = 0;
 526                         }
 527                         /* Don't proceed to the POSIX bits if we've already handled it */
 528                         if (handled)
 529                                 break;
 530                 }
 531 #endif
 532                 switch (wc) {
 533                 case '<':
 534                         EMIT(OBOW, 0);
 535                         break;
 536                 case '>':
 537                         EMIT(OEOW, 0);
 538                         break;
 539                 default:
 540                         if (may_escape(p, wc))
 541                                 ordinary(p, wc);
 542                         else
 543                                 SETERROR(REG_EESCAPE);
 544                         break;
 545                 }
 546                 break;
 547         default:
 548                 if (p->error != 0)
 549                         return (false);
 550                 p->next--;
 551                 wc = WGETNEXT();
 552                 ordinary(p, wc);
 553                 break;
 554         }
 555
 556         if (!MORE())
 557                 return (false);
 558         c = PEEK();
 559         /* we call { a repetition if followed by a digit */
 560         if (!( c == '*' || c == '+' || c == '?' || c == '{'))
 561                 return (false);         /* no repetition, we're done */
 562         else if (c == '{')
 563                 (void)REQUIRE(MORE2() && \
 564                     (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
 565         NEXT();
 566
 567         (void)REQUIRE(!wascaret, REG_BADRPT);
 568         switch (c) {
 569         case '*':       /* implemented as +? */
 570                 /* this case does not require the (y|) trick, noKLUDGE */
 571                 INSERT(OPLUS_, pos);
 572                 ASTERN(O_PLUS, pos);
 573                 INSERT(OQUEST_, pos);
 574                 ASTERN(O_QUEST, pos);
 575                 break;
 576         case '+':
 577                 INSERT(OPLUS_, pos);
 578                 ASTERN(O_PLUS, pos);
 579                 break;
 580         case '?':
 581                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 582                 INSERT(OCH_, pos);              /* offset slightly wrong */
 583                 ASTERN(OOR1, pos);              /* this one's right */
 584                 AHEAD(pos);                     /* fix the OCH_ */
 585                 EMIT(OOR2, 0);                  /* offset very wrong... */
 586                 AHEAD(THERE());                 /* ...so fix it */
 587                 ASTERN(O_CH, THERETHERE());
 588                 break;
 589         case '{':
 590                 count = p_count(p);
 591                 if (EAT(',')) {
 592                         if (isdigit((uch)PEEK())) {
 593                                 count2 = p_count(p);
 594                                 (void)REQUIRE(count <= count2, REG_BADBR);
 595                         } else          /* single number with comma */
 596                                 count2 = INFINITY;
 597                 } else          /* just a single number */
 598                         count2 = count;
 599                 repeat(p, pos, count, count2);
 600                 if (!EAT('}')) {        /* error heuristics */
 601                         while (MORE() && PEEK() != '}')
 602                                 NEXT();
 603                         (void)REQUIRE(MORE(), REG_EBRACE);
 604                         SETERROR(REG_BADBR);
 605                 }
 606                 break;
 607         }
 608
 609         if (!MORE())
 610                 return (false);
 611         c = PEEK();
 612         if (!( c == '*' || c == '+' || c == '?' ||
 613                                 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
 614                 return (false);
 615         SETERROR(REG_BADRPT);
 616         return (false);
 617 }
 618
 619 /*
 620  - p_str - string (no metacharacters) "parser"
 621  == static void p_str(struct parse *p);
 622  */
 623 static void
 624 p_str(struct parse *p)
 625 {
 626         (void)REQUIRE(MORE(), REG_EMPTY);
 627         while (MORE())
 628                 ordinary(p, WGETNEXT());
 629 }
 630
 631 /*
 632  * Eat consecutive branch delimiters for the kind of expression that we are
 633  * parsing, return the number of delimiters that we ate.
 634  */
 635 static int
 636 p_branch_eat_delim(struct parse *p, struct branchc *bc)
 637 {
 638         int nskip;
 639
 640         (void)bc;
 641         nskip = 0;
 642         while (EATSPEC('|'))
 643                 ++nskip;
 644         return (nskip);
 645 }
 646
 647 /*
 648  * Insert necessary branch book-keeping operations. This emits a
 649  * bogus 'next' offset, since we still have more to parse
 650  */
 651 static void
 652 p_branch_ins_offset(struct parse *p, struct branchc *bc)
 653 {
 654
 655         if (bc->nbranch == 0) {
 656                 INSERT(OCH_, bc->start);        /* offset is wrong */
 657                 bc->fwd = bc->start;
 658                 bc->back = bc->start;
 659         }
 660
 661         ASTERN(OOR1, bc->back);
 662         bc->back = THERE();
 663         AHEAD(bc->fwd);                 /* fix previous offset */
 664         bc->fwd = HERE();
 665         EMIT(OOR2, 0);                  /* offset is very wrong */
 666         ++bc->nbranch;
 667 }
 668
 669 /*
 670  * Fix the offset of the tail branch, if we actually had any branches.
 671  * This is to correct the bogus placeholder offset that we use.
 672  */
 673 static void
 674 p_branch_fix_tail(struct parse *p, struct branchc *bc)
 675 {
 676
 677         /* Fix bogus offset at the tail if we actually have branches */
 678         if (bc->nbranch > 0) {
 679                 AHEAD(bc->fwd);
 680                 ASTERN(O_CH, bc->back);
 681         }
 682 }
 683
 684 /*
 685  * Signal to the parser that an empty branch has been encountered; this will,
 686  * in the future, be used to allow for more permissive behavior with empty
 687  * branches. The return value should indicate whether parsing may continue
 688  * or not.
 689  */
 690 static bool
 691 p_branch_empty(struct parse *p, struct branchc *bc)
 692 {
 693
 694         (void)bc;
 695         SETERROR(REG_EMPTY);
 696         return (false);
 697 }
 698
 699 /*
 700  * Take care of any branching requirements. This includes inserting the
 701  * appropriate branching instructions as well as eating all of the branch
 702  * delimiters until we either run out of pattern or need to parse more pattern.
 703  */
 704 static bool
 705 p_branch_do(struct parse *p, struct branchc *bc)
 706 {
 707         int ate = 0;
 708
 709         ate = p_branch_eat_delim(p, bc);
 710         if (ate == 0)
 711                 return (false);
 712         else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
 713                 /*
 714                  * Halt parsing only if we have an empty branch and p_branch_empty
 715                  * indicates that we must not continue. In the future, this will not
 716                  * necessarily be an error.
 717                  */
 718                 return (false);
 719         p_branch_ins_offset(p, bc);
 720
 721         return (true);
 722 }
 723
 724 static void
 725 p_bre_pre_parse(struct parse *p, struct branchc *bc)
 726 {
 727
 728         (void) bc;
 729         /*
 730          * Does not move cleanly into expression parser because of
 731          * ordinary interpration of * at the beginning position of
 732          * an expression.
 733          */
 734         if (EAT('^')) {
 735                 EMIT(OBOL, 0);
 736                 p->g->iflags |= USEBOL;
 737                 p->g->nbol++;
 738         }
 739 }
 740
 741 static void
 742 p_bre_post_parse(struct parse *p, struct branchc *bc)
 743 {
 744
 745         /* Expression is terminating due to EOL token */
 746         if (bc->terminate) {
 747                 DROP(1);
 748                 EMIT(OEOL, 0);
 749                 p->g->iflags |= USEEOL;
 750                 p->g->neol++;
 751         }
 752 }
 753
 754 /*
 755  - p_re - Top level parser, concatenation and BRE anchoring
 756  == static void p_re(struct parse *p, int end1, int end2);
 757  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 758  *
 759  * This implementation is a bit of a kludge, in that a trailing $ is first
 760  * taken as an ordinary character and then revised to be an anchor.
 761  * The amount of lookahead needed to avoid this kludge is excessive.
 762  */
 763 static void
 764 p_re(struct parse *p,
 765         int end1,       /* first terminating character */
 766         int end2)       /* second terminating character; ignored for EREs */
 767 {
 768         struct branchc bc;
 769
 770         bc.nbranch = 0;
 771         if (end1 == OUT && end2 == OUT)
 772                 bc.outer = true;
 773         else
 774                 bc.outer = false;
 775 #define SEEEND()        (!p->bre ? SEE(end1) : SEETWO(end1, end2))
 776         for (;;) {
 777                 bc.start = HERE();
 778                 bc.nchain = 0;
 779                 bc.terminate = false;
 780                 if (p->pre_parse != NULL)
 781                         p->pre_parse(p, &bc);
 782                 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
 783                         bc.terminate = p->parse_expr(p, &bc);
 784                         ++bc.nchain;
 785                 }
 786                 if (p->post_parse != NULL)
 787                         p->post_parse(p, &bc);
 788                 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
 789 #ifdef LIBREGEX
 790                 if (HERE() == bc.start && !p_branch_empty(p, &bc))
 791                         break;
 792 #endif
 793                 if (!p->allowbranch)
 794                         break;
 795                 /*
 796                  * p_branch_do's return value indicates whether we should
 797                  * continue parsing or not. This is both for correctness and
 798                  * a slight optimization, because it will check if we've
 799                  * encountered an empty branch or the end of the string
 800                  * immediately following a branch delimiter.
 801                  */
 802                 if (!p_branch_do(p, &bc))
 803                         break;
 804         }
 805 #undef SEE_END
 806         if (p->allowbranch)
 807                 p_branch_fix_tail(p, &bc);
 808         assert(!MORE() || SEE(end1));
 809 }
 810
 811 /*
 812  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 813  == static bool p_simp_re(struct parse *p, struct branchc *bc);
 814  */
 815 static bool                     /* was the simple RE an unbackslashed $? */
 816 p_simp_re(struct parse *p, struct branchc *bc)
 817 {
 818         int c;
 819         int cc;                 /* convenient/control character */
 820         int count;
 821         int count2;
 822         sopno pos;
 823         bool handled;
 824         int i;
 825         wint_t wc;
 826         sopno subno;
 827 #       define  BACKSL  (1<<CHAR_BIT)
 828
 829         pos = HERE();           /* repetition op, if any, covers from here */
 830         handled = false;
 831
 832         assert(MORE());         /* caller should have ensured this */
 833         c = GETNEXT();
 834         if (c == '\\') {
 835                 (void)REQUIRE(MORE(), REG_EESCAPE);
 836                 cc = GETNEXT();
 837                 c = BACKSL | cc;
 838 #ifdef LIBREGEX
 839                 if (p->gnuext) {
 840                         handled = true;
 841                         switch (c) {
 842                         case BACKSL|'`':
 843                                 EMIT(OBOS, 0);
 844                                 break;
 845                         case BACKSL|'\'':
 846                                 EMIT(OEOS, 0);
 847                                 break;
 848                         case BACKSL|'B':
 849                                 EMIT(ONWBND, 0);
 850                                 break;
 851                         case BACKSL|'b':
 852                                 EMIT(OWBND, 0);
 853                                 break;
 854                         case BACKSL|'W':
 855                         case BACKSL|'w':
 856                         case BACKSL|'S':
 857                         case BACKSL|'s':
 858                                 p_b_pseudoclass(p, cc);
 859                                 break;
 860                         default:
 861                                 handled = false;
 862                         }
 863                 }
 864 #endif
 865         }
 866         if (!handled) {
 867                 switch (c) {
 868                 case '.':
 869                         if (p->g->cflags&REG_NEWLINE)
 870                                 nonnewline(p);
 871                         else
 872                                 EMIT(OANY, 0);
 873                         break;
 874                 case '[':
 875                         p_bracket(p);
 876                         break;
 877                 case BACKSL|'<':
 878                         EMIT(OBOW, 0);
 879                         break;
 880                 case BACKSL|'>':
 881                         EMIT(OEOW, 0);
 882                         break;
 883                 case BACKSL|'{':
 884                         SETERROR(REG_BADRPT);
 885                         break;
 886                 case BACKSL|'(':
 887                         p->g->nsub++;
 888                         subno = p->g->nsub;
 889                         if (subno < NPAREN)
 890                                 p->pbegin[subno] = HERE();
 891                         EMIT(OLPAREN, subno);
 892                         /* the MORE here is an error heuristic */
 893                         if (MORE() && !SEETWO('\\', ')'))
 894                                 p_re(p, '\\', ')');
 895                         if (subno < NPAREN) {
 896                                 p->pend[subno] = HERE();
 897                                 assert(p->pend[subno] != 0);
 898                         }
 899                         EMIT(ORPAREN, subno);
 900                         (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 901                         break;
 902                 case BACKSL|')':        /* should not get here -- must be user */
 903                         SETERROR(REG_EPAREN);
 904                         break;
 905                 case BACKSL|'1':
 906                 case BACKSL|'2':
 907                 case BACKSL|'3':
 908                 case BACKSL|'4':
 909                 case BACKSL|'5':
 910                 case BACKSL|'6':
 911                 case BACKSL|'7':
 912                 case BACKSL|'8':
 913                 case BACKSL|'9':
 914                         i = (c&~BACKSL) - '0';
 915                         assert(i < NPAREN);
 916                         if (p->pend[i] != 0) {
 917                                 assert(i <= p->g->nsub);
 918                                 EMIT(OBACK_, i);
 919                                 assert(p->pbegin[i] != 0);
 920                                 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 921                                 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 922                                 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 923                                 EMIT(O_BACK, i);
 924                         } else
 925                                 SETERROR(REG_ESUBREG);
 926                         p->g->backrefs = 1;
 927                         break;
 928                 case '*':
 929                         /*
 930                          * Ordinary if used as the first character beyond BOL anchor of
 931                          * a (sub-)expression, counts as a bad repetition operator if it
 932                          * appears otherwise.
 933                          */
 934                         (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
 935                         /* FALLTHROUGH */
 936                 default:
 937                         if (p->error != 0)
 938                                 return (false); /* Definitely not $... */
 939                         p->next--;
 940                         wc = WGETNEXT();
 941                         if ((c & BACKSL) == 0 || may_escape(p, wc))
 942                                 ordinary(p, wc);
 943                         else
 944                                 SETERROR(REG_EESCAPE);
 945                         break;
 946                 }
 947         }
 948
 949         if (EAT('*')) {         /* implemented as +? */
 950                 /* this case does not require the (y|) trick, noKLUDGE */
 951                 INSERT(OPLUS_, pos);
 952                 ASTERN(O_PLUS, pos);
 953                 INSERT(OQUEST_, pos);
 954                 ASTERN(O_QUEST, pos);
 955 #ifdef LIBREGEX
 956         } else if (p->gnuext && EATTWO('\\', '?')) {
 957                 INSERT(OQUEST_, pos);
 958                 ASTERN(O_QUEST, pos);
 959         } else if (p->gnuext && EATTWO('\\', '+')) {
 960                 INSERT(OPLUS_, pos);
 961                 ASTERN(O_PLUS, pos);
 962 #endif
 963         } else if (EATTWO('\\', '{')) {
 964                 count = p_count(p);
 965                 if (EAT(',')) {
 966                         if (MORE() && isdigit((uch)PEEK())) {
 967                                 count2 = p_count(p);
 968                                 (void)REQUIRE(count <= count2, REG_BADBR);
 969                         } else          /* single number with comma */
 970                                 count2 = INFINITY;
 971                 } else          /* just a single number */
 972                         count2 = count;
 973                 repeat(p, pos, count, count2);
 974                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 975                         while (MORE() && !SEETWO('\\', '}'))
 976                                 NEXT();
 977                         (void)REQUIRE(MORE(), REG_EBRACE);
 978                         SETERROR(REG_BADBR);
 979                 }
 980         } else if (c == '$')     /* $ (but not \$) ends it */
 981                 return (true);
 982
 983         return (false);
 984 }
 985
 986 /*
 987  - p_count - parse a repetition count
 988  == static int p_count(struct parse *p);
 989  */
 990 static int                      /* the value */
 991 p_count(struct parse *p)
 992 {
 993         int count = 0;
 994         int ndigits = 0;
 995
 996         while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
 997                 count = count*10 + (GETNEXT() - '0');
 998                 ndigits++;
 999         }
1000
1001         (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1002         return(count);
1003 }
1004
1005 /*
1006  - p_bracket - parse a bracketed character list
1007  == static void p_bracket(struct parse *p);
1008  */
1009 static void
1010 p_bracket(struct parse *p)
1011 {
1012         cset *cs;
1013         wint_t ch;
1014
1015         /* Dept of Truly Sickening Special-Case Kludges */
1016         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1017                 EMIT(OBOW, 0);
1018                 NEXTn(6);
1019                 return;
1020         }
1021         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1022                 EMIT(OEOW, 0);
1023                 NEXTn(6);
1024                 return;
1025         }
1026
1027         if ((cs = allocset(p)) == NULL)
1028                 return;
1029
1030         if (p->g->cflags&REG_ICASE)
1031                 cs->icase = 1;
1032         if (EAT('^'))
1033                 cs->invert = 1;
1034         if (EAT(']'))
1035                 CHadd(p, cs, ']');
1036         else if (EAT('-'))
1037                 CHadd(p, cs, '-');
1038         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1039                 p_b_term(p, cs);
1040         if (EAT('-'))
1041                 CHadd(p, cs, '-');
1042         (void)MUSTEAT(']', REG_EBRACK);
1043
1044         if (p->error != 0)      /* don't mess things up further */
1045                 return;
1046
1047         if (cs->invert && p->g->cflags&REG_NEWLINE)
1048                 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1049
1050         if ((ch = singleton(cs)) != OUT) {      /* optimize singleton sets */
1051                 ordinary(p, ch);
1052                 freeset(p, cs);
1053         } else
1054                 EMIT(OANYOF, (int)(cs - p->g->sets));
1055 }
1056
1057 static int
1058 p_range_cmp(wchar_t c1, wchar_t c2)
1059 {
1060 #ifndef LIBREGEX
1061         return __wcollate_range_cmp(c1, c2);
1062 #else
1063         /* Copied from libc/collate __wcollate_range_cmp */
1064         wchar_t s1[2], s2[2];
1065
1066         s1[0] = c1;
1067         s1[1] = L'\0';
1068         s2[0] = c2;
1069         s2[1] = L'\0';
1070         return (wcscoll(s1, s2));
1071 #endif
1072 }
1073
1074 /*
1075  - p_b_term - parse one term of a bracketed character list
1076  == static void p_b_term(struct parse *p, cset *cs);
1077  */
1078 static void
1079 p_b_term(struct parse *p, cset *cs)
1080 {
1081         char c;
1082         wint_t start, finish;
1083         wint_t i;
1084 #ifndef LIBREGEX
1085         struct xlocale_collate *table =
1086                 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1087 #endif
1088         /* classify what we've got */
1089         switch ((MORE()) ? PEEK() : '\0') {
1090         case '[':
1091                 c = (MORE2()) ? PEEK2() : '\0';
1092                 break;
1093         case '-':
1094                 SETERROR(REG_ERANGE);
1095                 return;                 /* NOTE RETURN */
1096         default:
1097                 c = '\0';
1098                 break;
1099         }
1100
1101         switch (c) {
1102         case ':':               /* character class */
1103                 NEXT2();
1104                 (void)REQUIRE(MORE(), REG_EBRACK);
1105                 c = PEEK();
1106                 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1107                 p_b_cclass(p, cs);
1108                 (void)REQUIRE(MORE(), REG_EBRACK);
1109                 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1110                 break;
1111         case '=':               /* equivalence class */
1112                 NEXT2();
1113                 (void)REQUIRE(MORE(), REG_EBRACK);
1114                 c = PEEK();
1115                 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1116                 p_b_eclass(p, cs);
1117                 (void)REQUIRE(MORE(), REG_EBRACK);
1118                 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1119                 break;
1120         default:                /* symbol, ordinary character, or range */
1121                 start = p_b_symbol(p);
1122                 if (SEE('-') && MORE2() && PEEK2() != ']') {
1123                         /* range */
1124                         NEXT();
1125                         if (EAT('-'))
1126                                 finish = '-';
1127                         else
1128                                 finish = p_b_symbol(p);
1129                 } else
1130                         finish = start;
1131                 if (start == finish)
1132                         CHadd(p, cs, start);
1133                 else {
1134 #ifndef LIBREGEX
1135                         if (table->__collate_load_error || MB_CUR_MAX > 1) {
1136 #else
1137                         if (MB_CUR_MAX > 1) {
1138 #endif
1139                                 (void)REQUIRE(start <= finish, REG_ERANGE);
1140                                 CHaddrange(p, cs, start, finish);
1141                         } else {
1142                                 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1143                                 for (i = 0; i <= UCHAR_MAX; i++) {
1144                                         if (p_range_cmp(start, i) <= 0 &&
1145                                             p_range_cmp(i, finish) <= 0 )
1146                                                 CHadd(p, cs, i);
1147                                 }
1148                         }
1149                 }
1150                 break;
1151         }
1152 }
1153
1154 /*
1155  - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1156  == static int p_b_pseudoclass(struct parse *p, char c)
1157  */
1158 static int
1159 p_b_pseudoclass(struct parse *p, char c) {
1160         cset *cs;
1161
1162         if ((cs = allocset(p)) == NULL)
1163                 return(0);
1164
1165         if (p->g->cflags&REG_ICASE)
1166                 cs->icase = 1;
1167
1168         switch (c) {
1169         case 'W':
1170                 cs->invert = 1;
1171                 /* PASSTHROUGH */
1172         case 'w':
1173                 p_b_cclass_named(p, cs, "alnum");
1174                 break;
1175         case 'S':
1176                 cs->invert = 1;
1177                 /* PASSTHROUGH */
1178         case 's':
1179                 p_b_cclass_named(p, cs, "space");
1180                 break;
1181         default:
1182                 return(0);
1183         }
1184
1185         EMIT(OANYOF, (int)(cs - p->g->sets));
1186         return(1);
1187 }
1188
1189 /*
1190  - p_b_cclass - parse a character-class name and deal with it
1191  == static void p_b_cclass(struct parse *p, cset *cs);
1192  */
1193 static void
1194 p_b_cclass(struct parse *p, cset *cs)
1195 {
1196         const char *sp = p->next;
1197         size_t len;
1198         char clname[16];
1199
1200         while (MORE() && isalpha((uch)PEEK()))
1201                 NEXT();
1202         len = p->next - sp;
1203         if (len >= sizeof(clname) - 1) {
1204                 SETERROR(REG_ECTYPE);
1205                 return;
1206         }
1207         memcpy(clname, sp, len);
1208         clname[len] = '\0';
1209
1210         p_b_cclass_named(p, cs, clname);
1211 }
1212 /*
1213  - p_b_cclass_named - deal with a named character class
1214  == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1215  */
1216 static void
1217 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1218         wctype_t wct;
1219
1220         if ((wct = wctype(clname)) == 0) {
1221                 SETERROR(REG_ECTYPE);
1222                 return;
1223         }
1224         CHaddtype(p, cs, wct);
1225 }
1226
1227 /*
1228  - p_b_eclass - parse an equivalence-class name and deal with it
1229  == static void p_b_eclass(struct parse *p, cset *cs);
1230  *
1231  * This implementation is incomplete. xxx
1232  */
1233 static void
1234 p_b_eclass(struct parse *p, cset *cs)
1235 {
1236         wint_t c;
1237
1238         c = p_b_coll_elem(p, '=');
1239         CHadd(p, cs, c);
1240 }
1241
1242 /*
1243  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1244  == static wint_t p_b_symbol(struct parse *p);
1245  */
1246 static wint_t                   /* value of symbol */
1247 p_b_symbol(struct parse *p)
1248 {
1249         wint_t value;
1250
1251         (void)REQUIRE(MORE(), REG_EBRACK);
1252         if (!EATTWO('[', '.'))
1253                 return(WGETNEXT());
1254
1255         /* collating symbol */
1256         value = p_b_coll_elem(p, '.');
1257         (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1258         return(value);
1259 }
1260
1261 /*
1262  - p_b_coll_elem - parse a collating-element name and look it up
1263  == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1264  */
1265 static wint_t                   /* value of collating element */
1266 p_b_coll_elem(struct parse *p,
1267         wint_t endc)            /* name ended by endc,']' */
1268 {
1269         const char *sp = p->next;
1270         struct cname *cp;
1271         mbstate_t mbs;
1272         wchar_t wc;
1273         size_t clen, len;
1274
1275         while (MORE() && !SEETWO(endc, ']'))
1276                 NEXT();
1277         if (!MORE()) {
1278                 SETERROR(REG_EBRACK);
1279                 return(0);
1280         }
1281         len = p->next - sp;
1282         for (cp = cnames; cp->name != NULL; cp++)
1283                 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1284                         return(cp->code);       /* known name */
1285         memset(&mbs, 0, sizeof(mbs));
1286         if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1287                 return (wc);                    /* single character */
1288         else if (clen == (size_t)-1 || clen == (size_t)-2)
1289                 SETERROR(REG_ILLSEQ);
1290         else
1291                 SETERROR(REG_ECOLLATE);         /* neither */
1292         return(0);
1293 }
1294
1295 /*
1296  - may_escape - determine whether 'ch' is escape-able in the current context
1297  == static int may_escape(struct parse *p, const wint_t ch)
1298  */
1299 static bool
1300 may_escape(struct parse *p, const wint_t ch)
1301 {
1302
1303         if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1304                 return (true);
1305         if (isalpha(ch) || ch == '\'' || ch == '`')
1306                 return (false);
1307         return (true);
1308 #ifdef NOTYET
1309         /*
1310          * Build a whitelist of characters that may be escaped to produce an
1311          * ordinary in the current context. This assumes that these have not
1312          * been otherwise interpreted as a special character. Escaping an
1313          * ordinary character yields undefined results according to
1314          * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1315          * advantage of this and use escaped ordinary characters to provide
1316          * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1317          */
1318         switch(ch) {
1319         case '|':
1320         case '+':
1321         case '?':
1322                 /* The above characters may not be escaped in BREs */
1323                 if (!(p->g->cflags&REG_EXTENDED))
1324                         return (false);
1325                 /* Fallthrough */
1326         case '(':
1327         case ')':
1328         case '{':
1329         case '}':
1330         case '.':
1331         case '[':
1332         case ']':
1333         case '\\':
1334         case '*':
1335         case '^':
1336         case '$':
1337                 return (true);
1338         default:
1339                 return (false);
1340         }
1341 #endif
1342 }
1343
1344 /*
1345  - othercase - return the case counterpart of an alphabetic
1346  == static wint_t othercase(wint_t ch);
1347  */
1348 static wint_t                   /* if no counterpart, return ch */
1349 othercase(wint_t ch)
1350 {
1351         assert(iswalpha(ch));
1352         if (iswupper(ch))
1353                 return(towlower(ch));
1354         else if (iswlower(ch))
1355                 return(towupper(ch));
1356         else                    /* peculiar, but could happen */
1357                 return(ch);
1358 }
1359
1360 /*
1361  - bothcases - emit a dualcase version of a two-case character
1362  == static void bothcases(struct parse *p, wint_t ch);
1363  *
1364  * Boy, is this implementation ever a kludge...
1365  */
1366 static void
1367 bothcases(struct parse *p, wint_t ch)
1368 {
1369         const char *oldnext = p->next;
1370         const char *oldend = p->end;
1371         char bracket[3 + MB_LEN_MAX];
1372         size_t n;
1373         mbstate_t mbs;
1374
1375         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
1376         p->next = bracket;
1377         memset(&mbs, 0, sizeof(mbs));
1378         n = wcrtomb(bracket, ch, &mbs);
1379         assert(n != (size_t)-1);
1380         bracket[n] = ']';
1381         bracket[n + 1] = '\0';
1382         p->end = bracket+n+1;
1383         p_bracket(p);
1384         assert(p->next == p->end);
1385         p->next = oldnext;
1386         p->end = oldend;
1387 }
1388
1389 /*
1390  - ordinary - emit an ordinary character
1391  == static void ordinary(struct parse *p, wint_t ch);
1392  */
1393 static void
1394 ordinary(struct parse *p, wint_t ch)
1395 {
1396         cset *cs;
1397
1398         if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1399                 bothcases(p, ch);
1400         else if ((ch & OPDMASK) == ch)
1401                 EMIT(OCHAR, ch);
1402         else {
1403                 /*
1404                  * Kludge: character is too big to fit into an OCHAR operand.
1405                  * Emit a singleton set.
1406                  */
1407                 if ((cs = allocset(p)) == NULL)
1408                         return;
1409                 CHadd(p, cs, ch);
1410                 EMIT(OANYOF, (int)(cs - p->g->sets));
1411         }
1412 }
1413
1414 /*
1415  - nonnewline - emit REG_NEWLINE version of OANY
1416  == static void nonnewline(struct parse *p);
1417  *
1418  * Boy, is this implementation ever a kludge...
1419  */
1420 static void
1421 nonnewline(struct parse *p)
1422 {
1423         const char *oldnext = p->next;
1424         const char *oldend = p->end;
1425         char bracket[4];
1426
1427         p->next = bracket;
1428         p->end = bracket+3;
1429         bracket[0] = '^';
1430         bracket[1] = '\n';
1431         bracket[2] = ']';
1432         bracket[3] = '\0';
1433         p_bracket(p);
1434         assert(p->next == bracket+3);
1435         p->next = oldnext;
1436         p->end = oldend;
1437 }
1438
1439 /*
1440  - repeat - generate code for a bounded repetition, recursively if needed
1441  == static void repeat(struct parse *p, sopno start, int from, int to);
1442  */
1443 static void
1444 repeat(struct parse *p,
1445         sopno start,            /* operand from here to end of strip */
1446         int from,               /* repeated from this number */
1447         int to)                 /* to this number of times (maybe INFINITY) */
1448 {
1449         sopno finish = HERE();
1450 #       define  N       2
1451 #       define  INF     3
1452 #       define  REP(f, t)       ((f)*8 + (t))
1453 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1454         sopno copy;
1455
1456         if (p->error != 0)      /* head off possible runaway recursion */
1457                 return;
1458
1459         assert(from <= to);
1460
1461         switch (REP(MAP(from), MAP(to))) {
1462         case REP(0, 0):                 /* must be user doing this */
1463                 DROP(finish-start);     /* drop the operand */
1464                 break;
1465         case REP(0, 1):                 /* as x{1,1}? */
1466         case REP(0, N):                 /* as x{1,n}? */
1467         case REP(0, INF):               /* as x{1,}? */
1468                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1469                 INSERT(OCH_, start);            /* offset is wrong... */
1470                 repeat(p, start+1, 1, to);
1471                 ASTERN(OOR1, start);
1472                 AHEAD(start);                   /* ... fix it */
1473                 EMIT(OOR2, 0);
1474                 AHEAD(THERE());
1475                 ASTERN(O_CH, THERETHERE());
1476                 break;
1477         case REP(1, 1):                 /* trivial case */
1478                 /* done */
1479                 break;
1480         case REP(1, N):                 /* as x?x{1,n-1} */
1481                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1482                 INSERT(OCH_, start);
1483                 ASTERN(OOR1, start);
1484                 AHEAD(start);
1485                 EMIT(OOR2, 0);                  /* offset very wrong... */
1486                 AHEAD(THERE());                 /* ...so fix it */
1487                 ASTERN(O_CH, THERETHERE());
1488                 copy = dupl(p, start+1, finish+1);
1489                 assert(copy == finish+4);
1490                 repeat(p, copy, 1, to-1);
1491                 break;
1492         case REP(1, INF):               /* as x+ */
1493                 INSERT(OPLUS_, start);
1494                 ASTERN(O_PLUS, start);
1495                 break;
1496         case REP(N, N):                 /* as xx{m-1,n-1} */
1497                 copy = dupl(p, start, finish);
1498                 repeat(p, copy, from-1, to-1);
1499                 break;
1500         case REP(N, INF):               /* as xx{n-1,INF} */
1501                 copy = dupl(p, start, finish);
1502                 repeat(p, copy, from-1, to);
1503                 break;
1504         default:                        /* "can't happen" */
1505                 SETERROR(REG_ASSERT);   /* just in case */
1506                 break;
1507         }
1508 }
1509
1510 /*
1511  - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1512  - character from the parse struct, signals a REG_ILLSEQ error if the
1513  - character can't be converted. Returns the number of bytes consumed.
1514  */
1515 static wint_t
1516 wgetnext(struct parse *p)
1517 {
1518         mbstate_t mbs;
1519         wchar_t wc;
1520         size_t n;
1521
1522         memset(&mbs, 0, sizeof(mbs));
1523         n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1524         if (n == (size_t)-1 || n == (size_t)-2) {
1525                 SETERROR(REG_ILLSEQ);
1526                 return (0);
1527         }
1528         if (n == 0)
1529                 n = 1;
1530         p->next += n;
1531         return (wc);
1532 }
1533
1534 /*
1535  - seterr - set an error condition
1536  == static int seterr(struct parse *p, int e);
1537  */
1538 static int                      /* useless but makes type checking happy */
1539 seterr(struct parse *p, int e)
1540 {
1541         if (p->error == 0)      /* keep earliest error condition */
1542                 p->error = e;
1543         p->next = nuls;         /* try to bring things to a halt */
1544         p->end = nuls;
1545         return(0);              /* make the return value well-defined */
1546 }
1547
1548 /*
1549  - allocset - allocate a set of characters for []
1550  == static cset *allocset(struct parse *p);
1551  */
1552 static cset *
1553 allocset(struct parse *p)
1554 {
1555         cset *cs, *ncs;
1556
1557         ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1558         if (ncs == NULL) {
1559                 SETERROR(REG_ESPACE);
1560                 return (NULL);
1561         }
1562         p->g->sets = ncs;
1563         cs = &p->g->sets[p->g->ncsets++];
1564         memset(cs, 0, sizeof(*cs));
1565
1566         return(cs);
1567 }
1568
1569 /*
1570  - freeset - free a now-unused set
1571  == static void freeset(struct parse *p, cset *cs);
1572  */
1573 static void
1574 freeset(struct parse *p, cset *cs)
1575 {
1576         cset *top = &p->g->sets[p->g->ncsets];
1577
1578         free(cs->wides);
1579         free(cs->ranges);
1580         free(cs->types);
1581         memset(cs, 0, sizeof(*cs));
1582         if (cs == top-1)        /* recover only the easy case */
1583                 p->g->ncsets--;
1584 }
1585
1586 /*
1587  - singleton - Determine whether a set contains only one character,
1588  - returning it if so, otherwise returning OUT.
1589  */
1590 static wint_t
1591 singleton(cset *cs)
1592 {
1593         wint_t i, s, n;
1594
1595         for (i = n = 0; i < NC; i++)
1596                 if (CHIN(cs, i)) {
1597                         n++;
1598                         s = i;
1599                 }
1600         if (n == 1)
1601                 return (s);
1602         if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1603             cs->icase == 0)
1604                 return (cs->wides[0]);
1605         /* Don't bother handling the other cases. */
1606         return (OUT);
1607 }
1608
1609 /*
1610  - CHadd - add character to character set.
1611  */
1612 static void
1613 CHadd(struct parse *p, cset *cs, wint_t ch)
1614 {
1615         wint_t nch, *newwides;
1616         assert(ch >= 0);
1617         if (ch < NC)
1618                 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1619         else {
1620                 newwides = reallocarray(cs->wides, cs->nwides + 1,
1621                     sizeof(*cs->wides));
1622                 if (newwides == NULL) {
1623                         SETERROR(REG_ESPACE);
1624                         return;
1625                 }
1626                 cs->wides = newwides;
1627                 cs->wides[cs->nwides++] = ch;
1628         }
1629         if (cs->icase) {
1630                 if ((nch = towlower(ch)) < NC)
1631                         cs->bmp[nch >> 3] |= 1 << (nch & 7);
1632                 if ((nch = towupper(ch)) < NC)
1633                         cs->bmp[nch >> 3] |= 1 << (nch & 7);
1634         }
1635 }
1636
1637 /*
1638  - CHaddrange - add all characters in the range [min,max] to a character set.
1639  */
1640 static void
1641 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1642 {
1643         crange *newranges;
1644
1645         for (; min < NC && min <= max; min++)
1646                 CHadd(p, cs, min);
1647         if (min >= max)
1648                 return;
1649         newranges = reallocarray(cs->ranges, cs->nranges + 1,
1650             sizeof(*cs->ranges));
1651         if (newranges == NULL) {
1652                 SETERROR(REG_ESPACE);
1653                 return;
1654         }
1655         cs->ranges = newranges;
1656         cs->ranges[cs->nranges].min = min;
1657         cs->ranges[cs->nranges].max = max;
1658         cs->nranges++;
1659 }
1660
1661 /*
1662  - CHaddtype - add all characters of a certain type to a character set.
1663  */
1664 static void
1665 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1666 {
1667         wint_t i;
1668         wctype_t *newtypes;
1669
1670         for (i = 0; i < NC; i++)
1671                 if (iswctype(i, wct))
1672                         CHadd(p, cs, i);
1673         newtypes = reallocarray(cs->types, cs->ntypes + 1,
1674             sizeof(*cs->types));
1675         if (newtypes == NULL) {
1676                 SETERROR(REG_ESPACE);
1677                 return;
1678         }
1679         cs->types = newtypes;
1680         cs->types[cs->ntypes++] = wct;
1681 }
1682
1683 /*
1684  - dupl - emit a duplicate of a bunch of sops
1685  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1686  */
1687 static sopno                    /* start of duplicate */
1688 dupl(struct parse *p,
1689         sopno start,            /* from here */
1690         sopno finish)           /* to this less one */
1691 {
1692         sopno ret = HERE();
1693         sopno len = finish - start;
1694
1695         assert(finish >= start);
1696         if (len == 0)
1697                 return(ret);
1698         if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1699                 return(ret);
1700         (void) memcpy((char *)(p->strip + p->slen),
1701                 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1702         p->slen += len;
1703         return(ret);
1704 }
1705
1706 /*
1707  - doemit - emit a strip operator
1708  == static void doemit(struct parse *p, sop op, size_t opnd);
1709  *
1710  * It might seem better to implement this as a macro with a function as
1711  * hard-case backup, but it's just too big and messy unless there are
1712  * some changes to the data structures.  Maybe later.
1713  */
1714 static void
1715 doemit(struct parse *p, sop op, size_t opnd)
1716 {
1717         /* avoid making error situations worse */
1718         if (p->error != 0)
1719                 return;
1720
1721         /* deal with oversize operands ("can't happen", more or less) */
1722         assert(opnd < 1<<OPSHIFT);
1723
1724         /* deal with undersized strip */
1725         if (p->slen >= p->ssize)
1726                 if (!enlarge(p, (p->ssize+1) / 2 * 3))  /* +50% */
1727                         return;
1728
1729         /* finally, it's all reduced to the easy case */
1730         p->strip[p->slen++] = SOP(op, opnd);
1731 }
1732
1733 /*
1734  - doinsert - insert a sop into the strip
1735  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1736  */
1737 static void
1738 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1739 {
1740         sopno sn;
1741         sop s;
1742         int i;
1743
1744         /* avoid making error situations worse */
1745         if (p->error != 0)
1746                 return;
1747
1748         sn = HERE();
1749         EMIT(op, opnd);         /* do checks, ensure space */
1750         assert(HERE() == sn+1);
1751         s = p->strip[sn];
1752
1753         /* adjust paren pointers */
1754         assert(pos > 0);
1755         for (i = 1; i < NPAREN; i++) {
1756                 if (p->pbegin[i] >= pos) {
1757                         p->pbegin[i]++;
1758                 }
1759                 if (p->pend[i] >= pos) {
1760                         p->pend[i]++;
1761                 }
1762         }
1763
1764         memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1765                                                 (HERE()-pos-1)*sizeof(sop));
1766         p->strip[pos] = s;
1767 }
1768
1769 /*
1770  - dofwd - complete a forward reference
1771  == static void dofwd(struct parse *p, sopno pos, sop value);
1772  */
1773 static void
1774 dofwd(struct parse *p, sopno pos, sop value)
1775 {
1776         /* avoid making error situations worse */
1777         if (p->error != 0)
1778                 return;
1779
1780         assert(value < 1<<OPSHIFT);
1781         p->strip[pos] = OP(p->strip[pos]) | value;
1782 }
1783
1784 /*
1785  - enlarge - enlarge the strip
1786  == static int enlarge(struct parse *p, sopno size);
1787  */
1788 static int
1789 enlarge(struct parse *p, sopno size)
1790 {
1791         sop *sp;
1792
1793         if (p->ssize >= size)
1794                 return 1;
1795
1796         sp = reallocarray(p->strip, size, sizeof(sop));
1797         if (sp == NULL) {
1798                 SETERROR(REG_ESPACE);
1799                 return 0;
1800         }
1801         p->strip = sp;
1802         p->ssize = size;
1803         return 1;
1804 }
1805
1806 /*
1807  - stripsnug - compact the strip
1808  == static void stripsnug(struct parse *p, struct re_guts *g);
1809  */
1810 static void
1811 stripsnug(struct parse *p, struct re_guts *g)
1812 {
1813         g->nstates = p->slen;
1814         g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop));
1815         if (g->strip == NULL) {
1816                 SETERROR(REG_ESPACE);
1817                 g->strip = p->strip;
1818         }
1819 }
1820
1821 /*
1822  - findmust - fill in must and mlen with longest mandatory literal string
1823  == static void findmust(struct parse *p, struct re_guts *g);
1824  *
1825  * This algorithm could do fancy things like analyzing the operands of |
1826  * for common subsequences.  Someday.  This code is simple and finds most
1827  * of the interesting cases.
1828  *
1829  * Note that must and mlen got initialized during setup.
1830  */
1831 static void
1832 findmust(struct parse *p, struct re_guts *g)
1833 {
1834         sop *scan;
1835         sop *start = NULL;
1836         sop *newstart = NULL;
1837         sopno newlen;
1838         sop s;
1839         char *cp;
1840         int offset;
1841         char buf[MB_LEN_MAX];
1842         size_t clen;
1843         mbstate_t mbs;
1844
1845         /* avoid making error situations worse */
1846         if (p->error != 0)
1847                 return;
1848
1849         /*
1850          * It's not generally safe to do a ``char'' substring search on
1851          * multibyte character strings, but it's safe for at least
1852          * UTF-8 (see RFC 3629).
1853          */
1854         if (MB_CUR_MAX > 1 &&
1855             strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1856                 return;
1857
1858         /* find the longest OCHAR sequence in strip */
1859         newlen = 0;
1860         offset = 0;
1861         g->moffset = 0;
1862         scan = g->strip + 1;
1863         do {
1864                 s = *scan++;
1865                 switch (OP(s)) {
1866                 case OCHAR:             /* sequence member */
1867                         if (newlen == 0) {              /* new sequence */
1868                                 memset(&mbs, 0, sizeof(mbs));
1869                                 newstart = scan - 1;
1870                         }
1871                         clen = wcrtomb(buf, OPND(s), &mbs);
1872                         if (clen == (size_t)-1)
1873                                 goto toohard;
1874                         newlen += clen;
1875                         break;
1876                 case OPLUS_:            /* things that don't break one */
1877                 case OLPAREN:
1878                 case ORPAREN:
1879                         break;
1880                 case OQUEST_:           /* things that must be skipped */
1881                 case OCH_:
1882                         offset = altoffset(scan, offset);
1883                         scan--;
1884                         do {
1885                                 scan += OPND(s);
1886                                 s = *scan;
1887                                 /* assert() interferes w debug printouts */
1888                                 if (OP(s) != (sop)O_QUEST &&
1889                                     OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1890                                         g->iflags |= BAD;
1891                                         return;
1892                                 }
1893                         } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1894                         /* FALLTHROUGH */
1895                 case OBOW:              /* things that break a sequence */
1896                 case OEOW:
1897                 case OBOL:
1898                 case OEOL:
1899                 case OBOS:
1900                 case OEOS:
1901                 case OWBND:
1902                 case ONWBND:
1903                 case O_QUEST:
1904                 case O_CH:
1905                 case OEND:
1906                         if (newlen > (sopno)g->mlen) {          /* ends one */
1907                                 start = newstart;
1908                                 g->mlen = newlen;
1909                                 if (offset > -1) {
1910                                         g->moffset += offset;
1911                                         offset = newlen;
1912                                 } else
1913                                         g->moffset = offset;
1914                         } else {
1915                                 if (offset > -1)
1916                                         offset += newlen;
1917                         }
1918                         newlen = 0;
1919                         break;
1920                 case OANY:
1921                         if (newlen > (sopno)g->mlen) {          /* ends one */
1922                                 start = newstart;
1923                                 g->mlen = newlen;
1924                                 if (offset > -1) {
1925                                         g->moffset += offset;
1926                                         offset = newlen;
1927                                 } else
1928                                         g->moffset = offset;
1929                         } else {
1930                                 if (offset > -1)
1931                                         offset += newlen;
1932                         }
1933                         if (offset > -1)
1934                                 offset++;
1935                         newlen = 0;
1936                         break;
1937                 case OANYOF:            /* may or may not invalidate offset */
1938                         /* First, everything as OANY */
1939                         if (newlen > (sopno)g->mlen) {          /* ends one */
1940                                 start = newstart;
1941                                 g->mlen = newlen;
1942                                 if (offset > -1) {
1943                                         g->moffset += offset;
1944                                         offset = newlen;
1945                                 } else
1946                                         g->moffset = offset;
1947                         } else {
1948                                 if (offset > -1)
1949                                         offset += newlen;
1950                         }
1951                         if (offset > -1)
1952                                 offset++;
1953                         newlen = 0;
1954                         break;
1955                 toohard:
1956                 default:
1957                         /* Anything here makes it impossible or too hard
1958                          * to calculate the offset -- so we give up;
1959                          * save the last known good offset, in case the
1960                          * must sequence doesn't occur later.
1961                          */
1962                         if (newlen > (sopno)g->mlen) {          /* ends one */
1963                                 start = newstart;
1964                                 g->mlen = newlen;
1965                                 if (offset > -1)
1966                                         g->moffset += offset;
1967                                 else
1968                                         g->moffset = offset;
1969                         }
1970                         offset = -1;
1971                         newlen = 0;
1972                         break;
1973                 }
1974         } while (OP(s) != OEND);
1975
1976         if (g->mlen == 0) {             /* there isn't one */
1977                 g->moffset = -1;
1978                 return;
1979         }
1980
1981         /* turn it into a character string */
1982         g->must = malloc((size_t)g->mlen + 1);
1983         if (g->must == NULL) {          /* argh; just forget it */
1984                 g->mlen = 0;
1985                 g->moffset = -1;
1986                 return;
1987         }
1988         cp = g->must;
1989         scan = start;
1990         memset(&mbs, 0, sizeof(mbs));
1991         while (cp < g->must + g->mlen) {
1992                 while (OP(s = *scan++) != OCHAR)
1993                         continue;
1994                 clen = wcrtomb(cp, OPND(s), &mbs);
1995                 assert(clen != (size_t)-1);
1996                 cp += clen;
1997         }
1998         assert(cp == g->must + g->mlen);
1999         *cp++ = '\0';           /* just on general principles */
2000 }
2001
2002 /*
2003  - altoffset - choose biggest offset among multiple choices
2004  == static int altoffset(sop *scan, int offset);
2005  *
2006  * Compute, recursively if necessary, the largest offset among multiple
2007  * re paths.
2008  */
2009 static int
2010 altoffset(sop *scan, int offset)
2011 {
2012         int largest;
2013         int try;
2014         sop s;
2015
2016         /* If we gave up already on offsets, return */
2017         if (offset == -1)
2018                 return -1;
2019
2020         largest = 0;
2021         try = 0;
2022         s = *scan++;
2023         while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
2024                 switch (OP(s)) {
2025                 case OOR1:
2026                         if (try > largest)
2027                                 largest = try;
2028                         try = 0;
2029                         break;
2030                 case OQUEST_:
2031                 case OCH_:
2032                         try = altoffset(scan, try);
2033                         if (try == -1)
2034                                 return -1;
2035                         scan--;
2036                         do {
2037                                 scan += OPND(s);
2038                                 s = *scan;
2039                                 if (OP(s) != (sop)O_QUEST &&
2040                                     OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
2041                                         return -1;
2042                         } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
2043                         /* We must skip to the next position, or we'll
2044                          * leave altoffset() too early.
2045                          */
2046                         scan++;
2047                         break;
2048                 case OANYOF:
2049                 case OCHAR:
2050                 case OANY:
2051                         try++;
2052                 case OBOW:
2053                 case OEOW:
2054                 case OWBND:
2055                 case ONWBND:
2056                 case OLPAREN:
2057                 case ORPAREN:
2058                 case OOR2:
2059                         break;
2060                 default:
2061                         try = -1;
2062                         break;
2063                 }
2064                 if (try == -1)
2065                         return -1;
2066                 s = *scan++;
2067         }
2068
2069         if (try > largest)
2070                 largest = try;
2071
2072         return largest+offset;
2073 }
2074
2075 /*
2076  - computejumps - compute char jumps for BM scan
2077  == static void computejumps(struct parse *p, struct re_guts *g);
2078  *
2079  * This algorithm assumes g->must exists and is has size greater than
2080  * zero. It's based on the algorithm found on Computer Algorithms by
2081  * Sara Baase.
2082  *
2083  * A char jump is the number of characters one needs to jump based on
2084  * the value of the character from the text that was mismatched.
2085  */
2086 static void
2087 computejumps(struct parse *p, struct re_guts *g)
2088 {
2089         int ch;
2090         int mindex;
2091
2092         /* Avoid making errors worse */
2093         if (p->error != 0)
2094                 return;
2095
2096         g->charjump = (int *)malloc((NC_MAX + 1) * sizeof(int));
2097         if (g->charjump == NULL)        /* Not a fatal error */
2098                 return;
2099         /* Adjust for signed chars, if necessary */
2100         g->charjump = &g->charjump[-(CHAR_MIN)];
2101
2102         /* If the character does not exist in the pattern, the jump
2103          * is equal to the number of characters in the pattern.
2104          */
2105         for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2106                 g->charjump[ch] = g->mlen;
2107
2108         /* If the character does exist, compute the jump that would
2109          * take us to the last character in the pattern equal to it
2110          * (notice that we match right to left, so that last character
2111          * is the first one that would be matched).
2112          */
2113         for (mindex = 0; mindex < g->mlen; mindex++)
2114                 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2115 }
2116
2117 /*
2118  - computematchjumps - compute match jumps for BM scan
2119  == static void computematchjumps(struct parse *p, struct re_guts *g);
2120  *
2121  * This algorithm assumes g->must exists and is has size greater than
2122  * zero. It's based on the algorithm found on Computer Algorithms by
2123  * Sara Baase.
2124  *
2125  * A match jump is the number of characters one needs to advance based
2126  * on the already-matched suffix.
2127  * Notice that all values here are minus (g->mlen-1), because of the way
2128  * the search algorithm works.
2129  */
2130 static void
2131 computematchjumps(struct parse *p, struct re_guts *g)
2132 {
2133         int mindex;             /* General "must" iterator */
2134         int suffix;             /* Keeps track of matching suffix */
2135         int ssuffix;            /* Keeps track of suffixes' suffix */
2136         int* pmatches;          /* pmatches[k] points to the next i
2137                                  * such that i+1...mlen is a substring
2138                                  * of k+1...k+mlen-i-1
2139                                  */
2140
2141         /* Avoid making errors worse */
2142         if (p->error != 0)
2143                 return;
2144
2145         pmatches = (int*) malloc(g->mlen * sizeof(int));
2146         if (pmatches == NULL) {
2147                 g->matchjump = NULL;
2148                 return;
2149         }
2150
2151         g->matchjump = (int*) malloc(g->mlen * sizeof(int));
2152         if (g->matchjump == NULL) {     /* Not a fatal error */
2153                 free(pmatches);
2154                 return;
2155         }
2156
2157         /* Set maximum possible jump for each character in the pattern */
2158         for (mindex = 0; mindex < g->mlen; mindex++)
2159                 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
2160
2161         /* Compute pmatches[] */
2162         for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
2163             mindex--, suffix--) {
2164                 pmatches[mindex] = suffix;
2165
2166                 /* If a mismatch is found, interrupting the substring,
2167                  * compute the matchjump for that position. If no
2168                  * mismatch is found, then a text substring mismatched
2169                  * against the suffix will also mismatch against the
2170                  * substring.
2171                  */
2172                 while (suffix < g->mlen
2173                     && g->must[mindex] != g->must[suffix]) {
2174                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
2175                             g->mlen - mindex - 1);
2176                         suffix = pmatches[suffix];
2177                 }
2178         }
2179
2180         /* Compute the matchjump up to the last substring found to jump
2181          * to the beginning of the largest must pattern prefix matching
2182          * it's own suffix.
2183          */
2184         for (mindex = 0; mindex <= suffix; mindex++)
2185                 g->matchjump[mindex] = MIN(g->matchjump[mindex],
2186                     g->mlen + suffix - mindex);
2187
2188         ssuffix = pmatches[suffix];
2189         while (suffix < g->mlen) {
2190                 while (suffix <= ssuffix && suffix < g->mlen) {
2191                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
2192                             g->mlen + ssuffix - suffix);
2193                         suffix++;
2194                 }
2195                 if (suffix < g->mlen)
2196                         ssuffix = pmatches[ssuffix];
2197         }
2198
2199         free(pmatches);
2200 }
2201
2202 /*
2203  - pluscount - count + nesting
2204  == static sopno pluscount(struct parse *p, struct re_guts *g);
2205  */
2206 static sopno                    /* nesting depth */
2207 pluscount(struct parse *p, struct re_guts *g)
2208 {
2209         sop *scan;
2210         sop s;
2211         sopno plusnest = 0;
2212         sopno maxnest = 0;
2213
2214         if (p->error != 0)
2215                 return(0);      /* there may not be an OEND */
2216
2217         scan = g->strip + 1;
2218         do {
2219                 s = *scan++;
2220                 switch (OP(s)) {
2221                 case OPLUS_:
2222                         plusnest++;
2223                         break;
2224                 case O_PLUS:
2225                         if (plusnest > maxnest)
2226                                 maxnest = plusnest;
2227                         plusnest--;
2228                         break;
2229                 }
2230         } while (OP(s) != OEND);
2231         if (plusnest != 0)
2232                 g->iflags |= BAD;
2233         return(maxnest);
2234 }