From 1d807124bbeab68ebeeb894ae28814add74fd4c2 Mon Sep 17 00:00:00 2001 From: jkim Date: Thu, 16 May 2013 23:23:37 +0000 Subject: [PATCH] Import libregex from GNU libc 2.17. --- posix/regex.h | 593 ----------------------------- regcomp.c | 903 +++++++++++++++++++++------------------------ regex.c | 39 +- regex.h | 600 ++++++++++++++++++++++++++++-- regex_internal.c | 684 ++++++++++++++++++---------------- regex_internal.h | 209 +++++------ regexec.c | 943 +++++++++++++++++++++++++---------------------- 7 files changed, 1958 insertions(+), 2013 deletions(-) delete mode 100644 posix/regex.h diff --git a/posix/regex.h b/posix/regex.h deleted file mode 100644 index b2d9a62fec9..00000000000 --- a/posix/regex.h +++ /dev/null @@ -1,593 +0,0 @@ -/* Definitions for data structures and routines for the regular - expression library. - Copyright (C) 1985,1989-93,1995-98,2000,2001,2002,2003 - Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#ifndef _REGEX_H -#define _REGEX_H 1 - -#include - -/* Allow the use in C++ code. */ -#ifdef __cplusplus -extern "C" { -#endif - -/* POSIX says that must be included (by the caller) before - . */ - -#if !defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE && defined VMS -/* VMS doesn't have `size_t' in , even though POSIX says it - should be there. */ -# include -#endif - -/* The following two types have to be signed and unsigned integer type - wide enough to hold a value of a pointer. For most ANSI compilers - ptrdiff_t and size_t should be likely OK. Still size of these two - types is 2 for Microsoft C. Ugh... */ -typedef long int s_reg_t; -typedef unsigned long int active_reg_t; - -/* The following bits are used to determine the regexp syntax we - recognize. The set/not-set meanings are chosen so that Emacs syntax - remains the value 0. The bits are given in alphabetical order, and - the definitions shifted by one from the previous bit; thus, when we - add or remove a bit, only one other definition need change. */ -typedef unsigned long int reg_syntax_t; - -/* If this bit is not set, then \ inside a bracket expression is literal. - If set, then such a \ quotes the following character. */ -#define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1) - -/* If this bit is not set, then + and ? are operators, and \+ and \? are - literals. - If set, then \+ and \? are operators and + and ? are literals. */ -#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) - -/* If this bit is set, then character classes are supported. They are: - [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], - [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. - If not set, then character classes are not supported. */ -#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) - -/* If this bit is set, then ^ and $ are always anchors (outside bracket - expressions, of course). - If this bit is not set, then it depends: - ^ is an anchor if it is at the beginning of a regular - expression or after an open-group or an alternation operator; - $ is an anchor if it is at the end of a regular expression, or - before a close-group or an alternation operator. - - This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because - POSIX draft 11.2 says that * etc. in leading positions is undefined. - We already implemented a previous draft which made those constructs - invalid, though, so we haven't changed the code back. */ -#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) - -/* If this bit is set, then special characters are always special - regardless of where they are in the pattern. - If this bit is not set, then special characters are special only in - some contexts; otherwise they are ordinary. Specifically, - * + ? and intervals are only special when not after the beginning, - open-group, or alternation operator. */ -#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) - -/* If this bit is set, then *, +, ?, and { cannot be first in an re or - immediately after an alternation or begin-group operator. */ -#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) - -/* If this bit is set, then . matches newline. - If not set, then it doesn't. */ -#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) - -/* If this bit is set, then . doesn't match NUL. - If not set, then it does. */ -#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) - -/* If this bit is set, nonmatching lists [^...] do not match newline. - If not set, they do. */ -#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) - -/* If this bit is set, either \{...\} or {...} defines an - interval, depending on RE_NO_BK_BRACES. - If not set, \{, \}, {, and } are literals. */ -#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) - -/* If this bit is set, +, ? and | aren't recognized as operators. - If not set, they are. */ -#define RE_LIMITED_OPS (RE_INTERVALS << 1) - -/* If this bit is set, newline is an alternation operator. - If not set, newline is literal. */ -#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) - -/* If this bit is set, then `{...}' defines an interval, and \{ and \} - are literals. - If not set, then `\{...\}' defines an interval. */ -#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) - -/* If this bit is set, (...) defines a group, and \( and \) are literals. - If not set, \(...\) defines a group, and ( and ) are literals. */ -#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) - -/* If this bit is set, then \ matches . - If not set, then \ is a back-reference. */ -#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) - -/* If this bit is set, then | is an alternation operator, and \| is literal. - If not set, then \| is an alternation operator, and | is literal. */ -#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) - -/* If this bit is set, then an ending range point collating higher - than the starting range point, as in [z-a], is invalid. - If not set, then when ending range point collates higher than the - starting range point, the range is ignored. */ -#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) - -/* If this bit is set, then an unmatched ) is ordinary. - If not set, then an unmatched ) is invalid. */ -#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) - -/* If this bit is set, succeed as soon as we match the whole pattern, - without further backtracking. */ -#define RE_NO_POSIX_BACKTRACKING (RE_UNMATCHED_RIGHT_PAREN_ORD << 1) - -/* If this bit is set, do not process the GNU regex operators. - If not set, then the GNU regex operators are recognized. */ -#define RE_NO_GNU_OPS (RE_NO_POSIX_BACKTRACKING << 1) - -/* If this bit is set, turn on internal regex debugging. - If not set, and debugging was on, turn it off. - This only works if regex.c is compiled -DDEBUG. - We define this bit always, so that all that's needed to turn on - debugging is to recompile regex.c; the calling code can always have - this bit set, and it won't affect anything in the normal case. */ -#define RE_DEBUG (RE_NO_GNU_OPS << 1) - -/* If this bit is set, a syntactically invalid interval is treated as - a string of ordinary characters. For example, the ERE 'a{1' is - treated as 'a\{1'. */ -#define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1) - -/* If this bit is set, then ignore case when matching. - If not set, then case is significant. */ -#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) - -/* This bit is used internally like RE_CONTEXT_INDEP_ANCHORS but only - for ^, because it is difficult to scan the regex backwards to find - whether ^ should be special. */ -#define RE_CARET_ANCHORS_HERE (RE_ICASE << 1) - -/* If this bit is set, then \{ cannot be first in an bre or - immediately after an alternation or begin-group operator. */ -#define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1) - -/* If this bit is set, then no_sub will be set to 1 during - re_compile_pattern. */ -#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1) - -/* This global variable defines the particular regexp syntax to use (for - some interfaces). When a regexp is compiled, the syntax used is - stored in the pattern buffer, so changing this does not affect - already-compiled regexps. */ -extern reg_syntax_t re_syntax_options; - -/* Define combinations of the above bits for the standard possibilities. - (The [[[ comments delimit what gets put into the Texinfo file, so - don't delete them!) */ -/* [[[begin syntaxes]]] */ -#define RE_SYNTAX_EMACS 0 - -#define RE_SYNTAX_AWK \ - (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ - | RE_NO_BK_PARENS | RE_NO_BK_REFS \ - | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ - | RE_DOT_NEWLINE | RE_CONTEXT_INDEP_ANCHORS \ - | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS) - -#define RE_SYNTAX_GNU_AWK \ - ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG) \ - & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS \ - | RE_CONTEXT_INVALID_OPS )) - -#define RE_SYNTAX_POSIX_AWK \ - (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \ - | RE_INTERVALS | RE_NO_GNU_OPS) - -#define RE_SYNTAX_GREP \ - (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ - | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ - | RE_NEWLINE_ALT) - -#define RE_SYNTAX_EGREP \ - (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ - | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ - | RE_NO_BK_VBAR) - -#define RE_SYNTAX_POSIX_EGREP \ - (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES \ - | RE_INVALID_INTERVAL_ORD) - -/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ -#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC - -#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC - -/* Syntax bits common to both basic and extended POSIX regex syntax. */ -#define _RE_SYNTAX_POSIX_COMMON \ - (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ - | RE_INTERVALS | RE_NO_EMPTY_RANGES) - -#define RE_SYNTAX_POSIX_BASIC \ - (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM | RE_CONTEXT_INVALID_DUP) - -/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes - RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this - isn't minimal, since other operators, such as \`, aren't disabled. */ -#define RE_SYNTAX_POSIX_MINIMAL_BASIC \ - (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) - -#define RE_SYNTAX_POSIX_EXTENDED \ - (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ - | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ - | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD) - -/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INDEP_OPS is - removed and RE_NO_BK_REFS is added. */ -#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ - (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ - | RE_NO_BK_PARENS | RE_NO_BK_REFS \ - | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) -/* [[[end syntaxes]]] */ - -/* Maximum number of duplicates an interval can allow. Some systems - (erroneously) define this in other header files, but we want our - value, so remove any previous define. */ -#ifdef RE_DUP_MAX -# undef RE_DUP_MAX -#endif -/* If sizeof(int) == 2, then ((1 << 15) - 1) overflows. */ -#define RE_DUP_MAX (0x7fff) - - -/* POSIX `cflags' bits (i.e., information for `regcomp'). */ - -/* If this bit is set, then use extended regular expression syntax. - If not set, then use basic regular expression syntax. */ -#define REG_EXTENDED 1 - -/* If this bit is set, then ignore case when matching. - If not set, then case is significant. */ -#define REG_ICASE (REG_EXTENDED << 1) - -/* If this bit is set, then anchors do not match at newline - characters in the string. - If not set, then anchors do match at newlines. */ -#define REG_NEWLINE (REG_ICASE << 1) - -/* If this bit is set, then report only success or fail in regexec. - If not set, then returns differ between not matching and errors. */ -#define REG_NOSUB (REG_NEWLINE << 1) - - -/* POSIX `eflags' bits (i.e., information for regexec). */ - -/* If this bit is set, then the beginning-of-line operator doesn't match - the beginning of the string (presumably because it's not the - beginning of a line). - If not set, then the beginning-of-line operator does match the - beginning of the string. */ -#define REG_NOTBOL 1 - -/* Like REG_NOTBOL, except for the end-of-line. */ -#define REG_NOTEOL (1 << 1) - -/* Use PMATCH[0] to delimit the start and end of the search in the - buffer. */ -#define REG_STARTEND (1 << 2) - - -/* If any error codes are removed, changed, or added, update the - `re_error_msg' table in regex.c. */ -typedef enum -{ -#ifdef _XOPEN_SOURCE - REG_ENOSYS = -1, /* This will never happen for this implementation. */ -#endif - - REG_NOERROR = 0, /* Success. */ - REG_NOMATCH, /* Didn't find a match (for regexec). */ - - /* POSIX regcomp return error codes. (In the order listed in the - standard.) */ - REG_BADPAT, /* Invalid pattern. */ - REG_ECOLLATE, /* Inalid collating element. */ - REG_ECTYPE, /* Invalid character class name. */ - REG_EESCAPE, /* Trailing backslash. */ - REG_ESUBREG, /* Invalid back reference. */ - REG_EBRACK, /* Unmatched left bracket. */ - REG_EPAREN, /* Parenthesis imbalance. */ - REG_EBRACE, /* Unmatched \{. */ - REG_BADBR, /* Invalid contents of \{\}. */ - REG_ERANGE, /* Invalid range end. */ - REG_ESPACE, /* Ran out of memory. */ - REG_BADRPT, /* No preceding re for repetition op. */ - - /* Error codes we've added. */ - REG_EEND, /* Premature end. */ - REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ - REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ -} reg_errcode_t; - -/* This data structure represents a compiled pattern. Before calling - the pattern compiler, the fields `buffer', `allocated', `fastmap', - `translate', and `no_sub' can be set. After the pattern has been - compiled, the `re_nsub' field is available. All other fields are - private to the regex routines. */ - -#ifndef RE_TRANSLATE_TYPE -# define RE_TRANSLATE_TYPE char * -#endif - -struct re_pattern_buffer -{ -/* [[[begin pattern_buffer]]] */ - /* Space that holds the compiled pattern. It is declared as - `unsigned char *' because its elements are - sometimes used as array indexes. */ - unsigned char *buffer; - - /* Number of bytes to which `buffer' points. */ - unsigned long int allocated; - - /* Number of bytes actually used in `buffer'. */ - unsigned long int used; - - /* Syntax setting with which the pattern was compiled. */ - reg_syntax_t syntax; - - /* Pointer to a fastmap, if any, otherwise zero. re_search uses - the fastmap, if there is one, to skip over impossible - starting points for matches. */ - char *fastmap; - - /* Either a translate table to apply to all characters before - comparing them, or zero for no translation. The translation - is applied to a pattern when it is compiled and to a string - when it is matched. */ - RE_TRANSLATE_TYPE translate; - - /* Number of subexpressions found by the compiler. */ - size_t re_nsub; - - /* Zero if this pattern cannot match the empty string, one else. - Well, in truth it's used only in `re_search_2', to see - whether or not we should use the fastmap, so we don't set - this absolutely perfectly; see `re_compile_fastmap' (the - `duplicate' case). */ - unsigned can_be_null : 1; - - /* If REGS_UNALLOCATED, allocate space in the `regs' structure - for `max (RE_NREGS, re_nsub + 1)' groups. - If REGS_REALLOCATE, reallocate space if necessary. - If REGS_FIXED, use what's there. */ -#define REGS_UNALLOCATED 0 -#define REGS_REALLOCATE 1 -#define REGS_FIXED 2 - unsigned regs_allocated : 2; - - /* Set to zero when `regex_compile' compiles a pattern; set to one - by `re_compile_fastmap' if it updates the fastmap. */ - unsigned fastmap_accurate : 1; - - /* If set, `re_match_2' does not return information about - subexpressions. */ - unsigned no_sub : 1; - - /* If set, a beginning-of-line anchor doesn't match at the - beginning of the string. */ - unsigned not_bol : 1; - - /* Similarly for an end-of-line anchor. */ - unsigned not_eol : 1; - - /* If true, an anchor at a newline matches. */ - unsigned newline_anchor : 1; - -/* [[[end pattern_buffer]]] */ -}; - -typedef struct re_pattern_buffer regex_t; - -/* Type for byte offsets within the string. POSIX mandates this. */ -typedef int regoff_t; - - -/* This is the structure we store register match data in. See - regex.texinfo for a full description of what registers match. */ -struct re_registers -{ - unsigned num_regs; - regoff_t *start; - regoff_t *end; -}; - - -/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, - `re_match_2' returns information about at least this many registers - the first time a `regs' structure is passed. */ -#ifndef RE_NREGS -# define RE_NREGS 30 -#endif - - -/* POSIX specification for registers. Aside from the different names than - `re_registers', POSIX uses an array of structures, instead of a - structure of arrays. */ -typedef struct -{ - regoff_t rm_so; /* Byte offset from string's start to substring's start. */ - regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ -} regmatch_t; - -/* Declarations for routines. */ - -/* To avoid duplicating every routine declaration -- once with a - prototype (if we are ANSI), and once without (if we aren't) -- we - use the following macro to declare argument types. This - unfortunately clutters up the declarations a bit, but I think it's - worth it. */ - -#if __STDC__ - -# define _RE_ARGS(args) args - -#else /* not __STDC__ */ - -# define _RE_ARGS(args) () - -#endif /* not __STDC__ */ - -/* Sets the current default syntax to SYNTAX, and return the old syntax. - You can also simply assign to the `re_syntax_options' variable. */ -extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); - -/* Compile the regular expression PATTERN, with length LENGTH - and syntax given by the global `re_syntax_options', into the buffer - BUFFER. Return NULL if successful, and an error string if not. */ -extern const char *re_compile_pattern - _RE_ARGS ((const char *pattern, size_t length, - struct re_pattern_buffer *buffer)); - - -/* Compile a fastmap for the compiled pattern in BUFFER; used to - accelerate searches. Return 0 if successful and -2 if was an - internal error. */ -extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); - - -/* Search in the string STRING (with length LENGTH) for the pattern - compiled into BUFFER. Start searching at position START, for RANGE - characters. Return the starting position of the match, -1 for no - match, or -2 for an internal error. Also return register - information in REGS (if REGS and BUFFER->no_sub are nonzero). */ -extern int re_search - _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, - int length, int start, int range, struct re_registers *regs)); - - -/* Like `re_search', but search in the concatenation of STRING1 and - STRING2. Also, stop searching at index START + STOP. */ -extern int re_search_2 - _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, - int length1, const char *string2, int length2, - int start, int range, struct re_registers *regs, int stop)); - - -/* Like `re_search', but return how many characters in STRING the regexp - in BUFFER matched, starting at position START. */ -extern int re_match - _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, - int length, int start, struct re_registers *regs)); - - -/* Relates to `re_match' as `re_search_2' relates to `re_search'. */ -extern int re_match_2 - _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, - int length1, const char *string2, int length2, - int start, struct re_registers *regs, int stop)); - - -/* Set REGS to hold NUM_REGS registers, storing them in STARTS and - ENDS. Subsequent matches using BUFFER and REGS will use this memory - for recording register information. STARTS and ENDS must be - allocated with malloc, and must each be at least `NUM_REGS * sizeof - (regoff_t)' bytes long. - - If NUM_REGS == 0, then subsequent matches should allocate their own - register data. - - Unless this function is called, the first search or match using - PATTERN_BUFFER will allocate its own register data, without - freeing the old data. */ -extern void re_set_registers - _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, - unsigned num_regs, regoff_t *starts, regoff_t *ends)); - -#if defined _REGEX_RE_COMP || defined _LIBC -# ifndef _CRAY -/* 4.2 bsd compatibility. */ -extern char *re_comp _RE_ARGS ((const char *)); -extern int re_exec _RE_ARGS ((const char *)); -# endif -#endif - -/* GCC 2.95 and later have "__restrict"; C99 compilers have - "restrict", and "configure" may have defined "restrict". */ -#ifndef __restrict -# if ! (2 < __GNUC__ || (2 == __GNUC__ && 95 <= __GNUC_MINOR__)) -# if defined restrict || 199901L <= __STDC_VERSION__ -# define __restrict restrict -# else -# define __restrict -# endif -# endif -#endif -/* gcc 3.1 and up support the [restrict] syntax. */ -#ifndef __restrict_arr -# if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1) -# define __restrict_arr __restrict -# else -# define __restrict_arr -# endif -#endif - -/* POSIX compatibility. */ -extern int regcomp _RE_ARGS ((regex_t *__restrict __preg, - const char *__restrict __pattern, - int __cflags)); - -extern int regexec _RE_ARGS ((const regex_t *__restrict __preg, - const char *__restrict __string, size_t __nmatch, - regmatch_t __pmatch[__restrict_arr], - int __eflags)); - -extern size_t regerror _RE_ARGS ((int __errcode, const regex_t *__preg, - char *__errbuf, size_t __errbuf_size)); - -extern void regfree _RE_ARGS ((regex_t *__preg)); - - -#ifdef __cplusplus -} -#endif /* C++ */ - -#endif /* regex.h */ - -/* -Local variables: -make-backup-files: t -version-control: t -trim-versions-without-asking: nil -End: -*/ diff --git a/regcomp.c b/regcomp.c index 68e2bdab92d..e85b2351459 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1,5 +1,5 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . @@ -14,17 +14,15 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + . */ static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, - int length, reg_syntax_t syntax); + size_t length, reg_syntax_t syntax); static void re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, char *fastmap); -static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len); -static void init_word_char (re_dfa_t *dfa); +static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); #ifdef RE_ENABLE_I18N static void free_charset (re_charset_t *cset); #endif /* RE_ENABLE_I18N */ @@ -34,7 +32,6 @@ static reg_errcode_t create_initial_state (re_dfa_t *dfa); static void optimize_utf8 (re_dfa_t *dfa); #endif static reg_errcode_t analyze (regex_t *preg); -static reg_errcode_t create_initial_state (re_dfa_t *dfa); static reg_errcode_t preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), void *extra); @@ -48,12 +45,8 @@ static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, static reg_errcode_t calc_first (void *extra, bin_tree_t *node); static reg_errcode_t calc_next (void *extra, bin_tree_t *node); static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); -static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node, - int top_clone_node, int root_node, - unsigned int constraint); -static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx, - unsigned int constraint); -static int search_duplicated_node (re_dfa_t *dfa, int org_node, +static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint); +static int search_duplicated_node (const re_dfa_t *dfa, int org_node, unsigned int constraint); static reg_errcode_t calc_eclosure (re_dfa_t *dfa); static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, @@ -61,12 +54,8 @@ static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); static int fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax); -static void fetch_token (re_token_t *result, re_string_t *input, - reg_syntax_t syntax); static int peek_token (re_token_t *token, re_string_t *input, - reg_syntax_t syntax); -static int peek_token_bracket (re_token_t *token, re_string_t *input, - reg_syntax_t syntax); + reg_syntax_t syntax) internal_function; static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, reg_errcode_t *err); static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, @@ -96,45 +85,27 @@ static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, re_token_t *token); -#ifndef _LIBC -# ifdef RE_ENABLE_I18N -static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, - re_charset_t *mbcset, int *range_alloc, - bracket_elem_t *start_elem, - bracket_elem_t *end_elem); -static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, - re_charset_t *mbcset, - int *coll_sym_alloc, - const unsigned char *name); -# else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, - bracket_elem_t *start_elem, - bracket_elem_t *end_elem); -static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, - const unsigned char *name); -# endif /* not RE_ENABLE_I18N */ -#endif /* not _LIBC */ #ifdef RE_ENABLE_I18N -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, int *equiv_class_alloc, const unsigned char *name); -static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, +static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, + bitset_t sbcset, re_charset_t *mbcset, int *char_class_alloc, const unsigned char *class_name, reg_syntax_t syntax); #else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset_t sbcset, const unsigned char *name); -static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, +static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, + bitset_t sbcset, const unsigned char *class_name, reg_syntax_t syntax); #endif /* not RE_ENABLE_I18N */ static bin_tree_t *build_charclass_op (re_dfa_t *dfa, - unsigned RE_TRANSLATE_TYPE trans, + RE_TRANSLATE_TYPE trans, const unsigned char *class_name, const unsigned char *extra, int non_match, reg_errcode_t *err); @@ -327,10 +298,8 @@ re_set_fastmap (char *fastmap, int icase, int ch) Compile fastmap for the initial_state INIT_STATE. */ static void -re_compile_fastmap_iter (bufp, init_state, fastmap) - regex_t *bufp; - const re_dfastate_t *init_state; - char *fastmap; +re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, + char *fastmap) { re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; int node_cnt; @@ -356,9 +325,9 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) && dfa->nodes[node].type == CHARACTER && dfa->nodes[node].mb_partial) *p++ = dfa->nodes[node].opr.c; - memset (&state, 0, sizeof (state)); - if (mbrtowc (&wc, (const char *) buf, p - buf, - &state) == p - buf + memset (&state, '\0', sizeof (state)); + if (__mbrtowc (&wc, (const char *) buf, p - buf, + &state) == p - buf && (__wcrtomb ((char *) buf, towlower (wc), &state) != (size_t) -1)) re_set_fastmap (fastmap, 0, buf[0]); @@ -367,56 +336,78 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) } else if (type == SIMPLE_BRACKET) { - int i, j, ch; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (dfa->nodes[node].opr.sbcset[i] & (1 << j)) - re_set_fastmap (fastmap, icase, ch); + int i, ch; + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + { + int j; + bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) + if (w & ((bitset_word_t) 1 << j)) + re_set_fastmap (fastmap, icase, ch); + } } #ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) { - int i; re_charset_t *cset = dfa->nodes[node].opr.mbcset; - if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes - || cset->nranges || cset->nchar_classes) - { + int i; + # ifdef _LIBC - if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) + /* See if we have to try all bytes which start multiple collation + elements. + e.g. In da_DK, we want to catch 'a' since "aa" is a valid + collation element, and don't catch 'b' since 'b' is + the only collation element which starts from 'b' (and + it is caught by SIMPLE_BRACKET). */ + if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 + && (cset->ncoll_syms || cset->nranges)) { - /* In this case we want to catch the bytes which are - the first byte of any collation elements. - e.g. In da_DK, we want to catch 'a' since "aa" - is a valid collation element, and don't catch - 'b' since 'b' is the only collation element - which starts from 'b'. */ - int j, ch; const int32_t *table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (table[ch] < 0) - re_set_fastmap (fastmap, icase, ch); + for (i = 0; i < SBC_MAX; ++i) + if (table[i] < 0) + re_set_fastmap (fastmap, icase, i); } -# else - if (dfa->mb_cur_max > 1) - for (i = 0; i < SBC_MAX; ++i) - if (__btowc (i) == WEOF) - re_set_fastmap (fastmap, icase, i); -# endif /* not _LIBC */ +# endif /* _LIBC */ + + /* See if we have to start the match at all multibyte characters, + i.e. where we would not find an invalid sequence. This only + applies to multibyte character sets; for single byte character + sets, the SIMPLE_BRACKET again suffices. */ + if (dfa->mb_cur_max > 1 + && (cset->nchar_classes || cset->non_match || cset->nranges +# ifdef _LIBC + || cset->nequiv_classes +# endif /* _LIBC */ + )) + { + unsigned char c = 0; + do + { + mbstate_t mbs; + memset (&mbs, 0, sizeof (mbs)); + if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) + re_set_fastmap (fastmap, false, (int) c); + } + while (++c != 0); } - for (i = 0; i < cset->nmbchars; ++i) + + else { - char buf[256]; - mbstate_t state; - memset (&state, '\0', sizeof (state)); - if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) - re_set_fastmap (fastmap, icase, *(unsigned char *) buf); - if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + /* ... Else catch all bytes which can start the mbchars. */ + for (i = 0; i < cset->nmbchars; ++i) { - if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) - != (size_t) -1) - re_set_fastmap (fastmap, 0, *(unsigned char *) buf); + char buf[256]; + mbstate_t state; + memset (&state, '\0', sizeof (state)); + if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) + re_set_fastmap (fastmap, icase, *(unsigned char *) buf); + if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + { + if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) + != (size_t) -1) + re_set_fastmap (fastmap, false, *(unsigned char *) buf); + } } } } @@ -536,8 +527,8 @@ weak_alias (__regcomp, regcomp) size_t regerror (errcode, preg, errbuf, errbuf_size) int errcode; - const regex_t *preg; - char *errbuf; + const regex_t *__restrict preg; + char *__restrict errbuf; size_t errbuf_size; { const char *msg; @@ -583,14 +574,10 @@ weak_alias (__regerror, regerror) UTF-8 is used. Otherwise we would allocate memory just to initialize it the same all the time. UTF-8 is the preferred encoding so this is a worthwhile optimization. */ -static const bitset utf8_sb_map = +static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if UINT_MAX == 0xffffffff - 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff -# else -# error "Add case for new unsigned int size" -# endif + [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX }; #endif @@ -627,7 +614,7 @@ free_dfa_content (re_dfa_t *dfa) re_dfastate_t *state = entry->array[j]; free_state (state); } - re_free (entry->array); + re_free (entry->array); } re_free (dfa->state_table); #ifdef RE_ENABLE_I18N @@ -739,11 +726,8 @@ libc_freeres_fn (free_mem) SYNTAX indicate regular expression's syntax. */ static reg_errcode_t -re_compile_internal (preg, pattern, length, syntax) - regex_t *preg; - const char * pattern; - int length; - reg_syntax_t syntax; +re_compile_internal (regex_t *preg, const char * pattern, size_t length, + reg_syntax_t syntax) { reg_errcode_t err = REG_NOERROR; re_dfa_t *dfa; @@ -783,10 +767,13 @@ re_compile_internal (preg, pattern, length, syntax) return err; } #ifdef DEBUG + /* Note: length+1 will not overflow since it is checked in init_dfa. */ dfa->re_str = re_malloc (char, length + 1); strncpy (dfa->re_str, pattern, length + 1); #endif + __libc_lock_init (dfa->lock); + err = re_string_construct (®exp, pattern, length, preg->translate, syntax & RE_ICASE, dfa); if (BE (err != REG_NOERROR, 0)) @@ -838,11 +825,9 @@ re_compile_internal (preg, pattern, length, syntax) as the initial length of some arrays. */ static reg_errcode_t -init_dfa (dfa, pat_len) - re_dfa_t *dfa; - int pat_len; +init_dfa (re_dfa_t *dfa, size_t pat_len) { - int table_size; + unsigned int table_size; #ifndef _LIBC char *codeset_name; #endif @@ -852,13 +837,15 @@ init_dfa (dfa, pat_len) /* Force allocation of str_tree_storage the first time. */ dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; + /* Avoid overflows. */ + if (pat_len == SIZE_MAX) + return REG_ESPACE; + dfa->nodes_alloc = pat_len + 1; dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); - dfa->states_alloc = pat_len + 1; - /* table_size = 2 ^ ceil(log pat_len) */ - for (table_size = 1; table_size > 0; table_size <<= 1) + for (table_size = 1; ; table_size <<= 1) if (table_size > pat_len) break; @@ -905,22 +892,19 @@ init_dfa (dfa, pat_len) { int i, j, ch; - dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); + dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); if (BE (dfa->sb_char == NULL, 0)) return REG_ESPACE; - /* Clear all bits by, then set those corresponding to single - byte chars. */ - bitset_empty (dfa->sb_char); - - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + /* Set the bits corresponding to single byte chars. */ + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) { - wchar_t wch = __btowc (ch); + wint_t wch = __btowc (ch); if (wch != WEOF) - dfa->sb_char[i] |= 1 << j; + dfa->sb_char[i] |= (bitset_word_t) 1 << j; # ifndef _LIBC - if (isascii (ch) && wch != (wchar_t) ch) + if (isascii (ch) && wch != ch) dfa->map_notascii = 1; # endif } @@ -938,22 +922,53 @@ init_dfa (dfa, pat_len) character used by some operators like "\<", "\>", etc. */ static void -init_word_char (dfa) - re_dfa_t *dfa; +internal_function +init_word_char (re_dfa_t *dfa) { - int i, j, ch; dfa->word_ops_used = 1; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + int i = 0; + int ch = 0; + if (BE (dfa->map_notascii == 0, 1)) + { + if (sizeof (dfa->word_char[0]) == 8) + { + /* The extra temporaries here avoid "implicitly truncated" + warnings in the case when this is dead code, i.e. 32-bit. */ + const uint64_t wc0 = UINT64_C (0x03ff000000000000); + const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe); + dfa->word_char[0] = wc0; + dfa->word_char[1] = wc1; + i = 2; + } + else if (sizeof (dfa->word_char[0]) == 4) + { + dfa->word_char[0] = UINT32_C (0x00000000); + dfa->word_char[1] = UINT32_C (0x03ff0000); + dfa->word_char[2] = UINT32_C (0x87fffffe); + dfa->word_char[3] = UINT32_C (0x07fffffe); + i = 4; + } + else + abort (); + ch = 128; + + if (BE (dfa->is_utf8, 1)) + { + memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); + return; + } + } + + for (; i < BITSET_WORDS; ++i) + for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') - dfa->word_char[i] |= 1 << j; + dfa->word_char[i] |= (bitset_word_t) 1 << j; } /* Free the work area which are only used while compiling. */ static void -free_workarea_compile (preg) - regex_t *preg; +free_workarea_compile (regex_t *preg) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_storage_t *storage, *next; @@ -972,8 +987,7 @@ free_workarea_compile (preg) /* Create initial states for all contexts. */ static reg_errcode_t -create_initial_state (dfa) - re_dfa_t *dfa; +create_initial_state (re_dfa_t *dfa) { int first, i; reg_errcode_t err; @@ -1016,7 +1030,11 @@ create_initial_state (dfa) int dest_idx = dfa->edests[node_idx].elems[0]; if (!re_node_set_contains (&init_nodes, dest_idx)) { - re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); + reg_errcode_t err = re_node_set_merge (&init_nodes, + dfa->eclosures + + dest_idx); + if (err != REG_NOERROR) + return err; i = 0; } } @@ -1055,8 +1073,7 @@ create_initial_state (dfa) DFA nodes where needed. */ static void -optimize_utf8 (dfa) - re_dfa_t *dfa; +optimize_utf8 (re_dfa_t *dfa) { int node, i, mb_chars = 0, has_period = 0; @@ -1068,7 +1085,7 @@ optimize_utf8 (dfa) mb_chars = 1; break; case ANCHOR: - switch (dfa->nodes[node].opr.idx) + switch (dfa->nodes[node].opr.ctx_type) { case LINE_FIRST: case LINE_LAST: @@ -1076,13 +1093,15 @@ optimize_utf8 (dfa) case BUF_LAST: break; default: - /* Word anchors etc. cannot be handled. */ + /* Word anchors etc. cannot be handled. It's okay to test + opr.ctx_type since constraints (for all DFA nodes) are + created by ORing one or more opr.ctx_type values. */ return; } break; case OP_PERIOD: - has_period = 1; - break; + has_period = 1; + break; case OP_BACK_REF: case OP_ALT: case END_OF_RE: @@ -1093,8 +1112,9 @@ optimize_utf8 (dfa) case COMPLEX_BRACKET: return; case SIMPLE_BRACKET: - /* Just double check. */ - for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i) + /* Just double check. The non-ASCII range starts at 0x80. */ + assert (0x80 % BITSET_WORD_BITS == 0); + for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) if (dfa->nodes[node].opr.sbcset[i]) return; break; @@ -1123,8 +1143,7 @@ optimize_utf8 (dfa) "eclosure", and "inveclosure". */ static reg_errcode_t -analyze (preg) - regex_t *preg; +analyze (regex_t *preg) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; reg_errcode_t ret; @@ -1176,7 +1195,7 @@ analyze (preg) { dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); if (BE (dfa->inveclosures == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; ret = calc_inveclosure (dfa); } @@ -1187,10 +1206,8 @@ analyze (preg) implement parse tree visits. Instead, we use parent pointers and some hairy code in these two functions. */ static reg_errcode_t -postorder (root, fn, extra) - bin_tree_t *root; - reg_errcode_t (fn (void *, bin_tree_t *)); - void *extra; +postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), + void *extra) { bin_tree_t *node, *prev; @@ -1200,16 +1217,16 @@ postorder (root, fn, extra) if that's the only child). */ while (node->left || node->right) if (node->left) - node = node->left; - else - node = node->right; + node = node->left; + else + node = node->right; do { reg_errcode_t err = fn (extra, node); if (BE (err != REG_NOERROR, 0)) return err; - if (node->parent == NULL) + if (node->parent == NULL) return REG_NOERROR; prev = node; node = node->parent; @@ -1221,10 +1238,8 @@ postorder (root, fn, extra) } static reg_errcode_t -preorder (root, fn, extra) - bin_tree_t *root; - reg_errcode_t (fn (void *, bin_tree_t *)); - void *extra; +preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), + void *extra) { bin_tree_t *node; @@ -1245,7 +1260,7 @@ preorder (root, fn, extra) prev = node; node = node->parent; if (!node) - return REG_NOERROR; + return REG_NOERROR; } node = node->right; } @@ -1256,9 +1271,7 @@ preorder (root, fn, extra) re_search_internal to map the inner one's opr.idx to this one's. Adjust backreferences as well. Requires a preorder visit. */ static reg_errcode_t -optimize_subexps (extra, node) - void *extra; - bin_tree_t *node; +optimize_subexps (void *extra, bin_tree_t *node) { re_dfa_t *dfa = (re_dfa_t *) extra; @@ -1270,17 +1283,17 @@ optimize_subexps (extra, node) } else if (node->token.type == SUBEXP - && node->left && node->left->token.type == SUBEXP) + && node->left && node->left->token.type == SUBEXP) { int other_idx = node->left->token.opr.idx; node->left = node->left->left; if (node->left) - node->left->parent = node; + node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; - if (other_idx < 8 * sizeof (dfa->used_bkref_map)) - dfa->used_bkref_map &= ~(1 << other_idx); + if (other_idx < BITSET_WORD_BITS) + dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); } return REG_NOERROR; @@ -1289,9 +1302,7 @@ optimize_subexps (extra, node) /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ static reg_errcode_t -lower_subexps (extra, node) - void *extra; - bin_tree_t *node; +lower_subexps (void *extra, bin_tree_t *node) { regex_t *preg = (regex_t *) extra; reg_errcode_t err = REG_NOERROR; @@ -1313,10 +1324,7 @@ lower_subexps (extra, node) } static bin_tree_t * -lower_subexp (err, preg, node) - reg_errcode_t *err; - regex_t *preg; - bin_tree_t *node; +lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *body = node->left; @@ -1328,8 +1336,9 @@ lower_subexp (err, preg, node) very common, so we do not lose much. An example that triggers this case is the sed "script" /\(\)/x. */ && node->left != NULL - && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map) - || !(dfa->used_bkref_map & (1 << node->token.opr.idx)))) + && (node->token.opr.idx >= BITSET_WORD_BITS + || !(dfa->used_bkref_map + & ((bitset_word_t) 1 << node->token.opr.idx)))) return node->left; /* Convert the SUBEXP node to the concatenation of an @@ -1352,9 +1361,7 @@ lower_subexp (err, preg, node) /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton nodes. Requires a postorder visit. */ static reg_errcode_t -calc_first (extra, node) - void *extra; - bin_tree_t *node; +calc_first (void *extra, bin_tree_t *node) { re_dfa_t *dfa = (re_dfa_t *) extra; if (node->token.type == CONCAT) @@ -1367,16 +1374,16 @@ calc_first (extra, node) node->first = node; node->node_idx = re_dfa_add_node (dfa, node->token); if (BE (node->node_idx == -1, 0)) - return REG_ESPACE; + return REG_ESPACE; + if (node->token.type == ANCHOR) + dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; } return REG_NOERROR; } /* Pass 2: compute NEXT on the tree. Preorder visit. */ static reg_errcode_t -calc_next (extra, node) - void *extra; - bin_tree_t *node; +calc_next (void *extra, bin_tree_t *node) { switch (node->token.type) { @@ -1391,7 +1398,7 @@ calc_next (extra, node) if (node->left) node->left->next = node->next; if (node->right) - node->right->next = node->next; + node->right->next = node->next; break; } return REG_NOERROR; @@ -1399,9 +1406,7 @@ calc_next (extra, node) /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ static reg_errcode_t -link_nfa_nodes (extra, node) - void *extra; - bin_tree_t *node; +link_nfa_nodes (void *extra, bin_tree_t *node) { re_dfa_t *dfa = (re_dfa_t *) extra; int idx = node->node_idx; @@ -1444,7 +1449,7 @@ link_nfa_nodes (extra, node) case OP_BACK_REF: dfa->nexts[idx] = node->next->node_idx; if (node->token.type == OP_BACK_REF) - re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); + err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); break; default: @@ -1461,13 +1466,10 @@ link_nfa_nodes (extra, node) to their own constraint. */ static reg_errcode_t -duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, - init_constraint) - re_dfa_t *dfa; - int top_org_node, top_clone_node, root_node; - unsigned int init_constraint; +internal_function +duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, + int root_node, unsigned int init_constraint) { - reg_errcode_t err; int org_node, clone_node, ret; unsigned int constraint = init_constraint; for (org_node = top_org_node, clone_node = top_clone_node;;) @@ -1481,9 +1483,9 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, edests of the back reference. */ org_dest = dfa->nexts[org_node]; re_node_set_empty (dfa->edests + clone_node); - err = duplicate_node (&clone_dest, dfa, org_dest, constraint); - if (BE (err != REG_NOERROR, 0)) - return err; + clone_dest = duplicate_node (dfa, org_dest, constraint); + if (BE (clone_dest == -1, 0)) + return REG_ESPACE; dfa->nexts[clone_node] = dfa->nexts[org_node]; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) @@ -1503,25 +1505,20 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, destination. */ org_dest = dfa->edests[org_node].elems[0]; re_node_set_empty (dfa->edests + clone_node); - if (dfa->nodes[org_node].type == ANCHOR) + /* If the node is root_node itself, it means the epsilon clsoure + has a loop. Then tie it to the destination of the root_node. */ + if (org_node == root_node && clone_node != org_node) { - /* In case of the node has another constraint, append it. */ - if (org_node == root_node && clone_node != org_node) - { - /* ...but if the node is root_node itself, it means the - epsilon closure have a loop, then tie it to the - destination of the root_node. */ - ret = re_node_set_insert (dfa->edests + clone_node, - org_dest); - if (BE (ret < 0, 0)) - return REG_ESPACE; - break; - } - constraint |= dfa->nodes[org_node].opr.ctx_type; + ret = re_node_set_insert (dfa->edests + clone_node, org_dest); + if (BE (ret < 0, 0)) + return REG_ESPACE; + break; } - err = duplicate_node (&clone_dest, dfa, org_dest, constraint); - if (BE (err != REG_NOERROR, 0)) - return err; + /* In case of the node has another constraint, add it. */ + constraint |= dfa->nodes[org_node].constraint; + clone_dest = duplicate_node (dfa, org_dest, constraint); + if (BE (clone_dest == -1, 0)) + return REG_ESPACE; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) return REG_ESPACE; @@ -1536,10 +1533,11 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, clone_dest = search_duplicated_node (dfa, org_dest, constraint); if (clone_dest == -1) { - /* There are no such a duplicated node, create a new one. */ - err = duplicate_node (&clone_dest, dfa, org_dest, constraint); - if (BE (err != REG_NOERROR, 0)) - return err; + /* There is no such duplicated node, create a new one. */ + reg_errcode_t err; + clone_dest = duplicate_node (dfa, org_dest, constraint); + if (BE (clone_dest == -1, 0)) + return REG_ESPACE; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) return REG_ESPACE; @@ -1550,7 +1548,7 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, } else { - /* There are a duplicated node which satisfy the constraint, + /* There is a duplicated node which satisfies the constraint, use it to avoid infinite loop. */ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) @@ -1558,9 +1556,9 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, } org_dest = dfa->edests[org_node].elems[1]; - err = duplicate_node (&clone_dest, dfa, org_dest, constraint); - if (BE (err != REG_NOERROR, 0)) - return err; + clone_dest = duplicate_node (dfa, org_dest, constraint); + if (BE (clone_dest == -1, 0)) + return REG_ESPACE; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) return REG_ESPACE; @@ -1575,10 +1573,8 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, satisfies the constraint CONSTRAINT. */ static int -search_duplicated_node (dfa, org_node, constraint) - re_dfa_t *dfa; - int org_node; - unsigned int constraint; +search_duplicated_node (const re_dfa_t *dfa, int org_node, + unsigned int constraint) { int idx; for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) @@ -1591,32 +1587,27 @@ search_duplicated_node (dfa, org_node, constraint) } /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. - The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded, - otherwise return the error code. */ + Return the index of the new node, or -1 if insufficient storage is + available. */ -static reg_errcode_t -duplicate_node (new_idx, dfa, org_idx, constraint) - re_dfa_t *dfa; - int *new_idx, org_idx; - unsigned int constraint; +static int +duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint) { int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); - if (BE (dup_idx == -1, 0)) - return REG_ESPACE; - dfa->nodes[dup_idx].constraint = constraint; - if (dfa->nodes[org_idx].type == ANCHOR) - dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; - dfa->nodes[dup_idx].duplicated = 1; - - /* Store the index of the original node. */ - dfa->org_indices[dup_idx] = org_idx; - *new_idx = dup_idx; - return REG_NOERROR; + if (BE (dup_idx != -1, 1)) + { + dfa->nodes[dup_idx].constraint = constraint; + dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; + dfa->nodes[dup_idx].duplicated = 1; + + /* Store the index of the original node. */ + dfa->org_indices[dup_idx] = org_idx; + } + return dup_idx; } static reg_errcode_t -calc_inveclosure (dfa) - re_dfa_t *dfa; +calc_inveclosure (re_dfa_t *dfa) { int src, idx, ret; for (idx = 0; idx < dfa->nodes_len; ++idx) @@ -1639,8 +1630,7 @@ calc_inveclosure (dfa) /* Calculate "eclosure" for all the node in DFA. */ static reg_errcode_t -calc_eclosure (dfa) - re_dfa_t *dfa; +calc_eclosure (re_dfa_t *dfa) { int node_idx, incomplete; #ifdef DEBUG @@ -1684,16 +1674,13 @@ calc_eclosure (dfa) /* Calculate epsilon closure of NODE. */ static reg_errcode_t -calc_eclosure_iter (new_set, dfa, node, root) - re_node_set *new_set; - re_dfa_t *dfa; - int node, root; +calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root) { reg_errcode_t err; - unsigned int constraint; - int i, incomplete; + int i; re_node_set eclosure; - incomplete = 0; + int ret; + int incomplete = 0; err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); if (BE (err != REG_NOERROR, 0)) return err; @@ -1702,17 +1689,14 @@ calc_eclosure_iter (new_set, dfa, node, root) We reference this value to avoid infinite loop. */ dfa->eclosures[node].nelem = -1; - constraint = ((dfa->nodes[node].type == ANCHOR) - ? dfa->nodes[node].opr.ctx_type : 0); - /* If the current node has constraints, duplicate all nodes. - Since they must inherit the constraints. */ - if (constraint + /* If the current node has constraints, duplicate all nodes + since they must inherit the constraints. */ + if (dfa->nodes[node].constraint && dfa->edests[node].nelem && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) { - int org_node, cur_node; - org_node = cur_node = node; - err = duplicate_node_closure (dfa, node, node, node, constraint); + err = duplicate_node_closure (dfa, node, node, node, + dfa->nodes[node].constraint); if (BE (err != REG_NOERROR, 0)) return err; } @@ -1741,7 +1725,9 @@ calc_eclosure_iter (new_set, dfa, node, root) else eclosure_elem = dfa->eclosures[edest]; /* Merge the epsilon closure of `edest'. */ - re_node_set_merge (&eclosure, &eclosure_elem); + err = re_node_set_merge (&eclosure, &eclosure_elem); + if (BE (err != REG_NOERROR, 0)) + return err; /* If the epsilon closure of `edest' is incomplete, the epsilon closure of this node is also incomplete. */ if (dfa->eclosures[edest].nelem == 0) @@ -1751,8 +1737,10 @@ calc_eclosure_iter (new_set, dfa, node, root) } } - /* Epsilon closures include itself. */ - re_node_set_insert (&eclosure, node); + /* An epsilon closure includes itself. */ + ret = re_node_set_insert (&eclosure, node); + if (BE (ret < 0, 0)) + return REG_ESPACE; if (incomplete && !root) dfa->eclosures[node].nelem = 0; else @@ -1767,10 +1755,8 @@ calc_eclosure_iter (new_set, dfa, node, root) We must not use this function inside bracket expressions. */ static void -fetch_token (result, input, syntax) - re_token_t *result; - re_string_t *input; - reg_syntax_t syntax; +internal_function +fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) { re_string_skip_bytes (input, peek_token (result, input, syntax)); } @@ -1779,10 +1765,8 @@ fetch_token (result, input, syntax) We must not use this function inside bracket expressions. */ static int -peek_token (token, input, syntax) - re_token_t *token; - re_string_t *input; - reg_syntax_t syntax; +internal_function +peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) { unsigned char c; @@ -2020,10 +2004,8 @@ peek_token (token, input, syntax) We must not use this function out of bracket expressions. */ static int -peek_token_bracket (token, input, syntax) - re_token_t *token; - re_string_t *input; - reg_syntax_t syntax; +internal_function +peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) { unsigned char c; if (re_string_eoi (input)) @@ -2119,11 +2101,8 @@ peek_token_bracket (token, input, syntax) EOR means end of regular expression. */ static bin_tree_t * -parse (regexp, preg, syntax, err) - re_string_t *regexp; - regex_t *preg; - reg_syntax_t syntax; - reg_errcode_t *err; +parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, + reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree, *eor, *root; @@ -2156,13 +2135,8 @@ parse (regexp, preg, syntax, err) ALT means alternative, which represents the operator `|'. */ static bin_tree_t * -parse_reg_exp (regexp, preg, token, syntax, nest, err) - re_string_t *regexp; - regex_t *preg; - re_token_t *token; - reg_syntax_t syntax; - int nest; - reg_errcode_t *err; +parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, + reg_syntax_t syntax, int nest, reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree, *branch = NULL; @@ -2202,13 +2176,8 @@ parse_reg_exp (regexp, preg, token, syntax, nest, err) CAT means concatenation. */ static bin_tree_t * -parse_branch (regexp, preg, token, syntax, nest, err) - re_string_t *regexp; - regex_t *preg; - re_token_t *token; - reg_syntax_t syntax; - int nest; - reg_errcode_t *err; +parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, + reg_syntax_t syntax, int nest, reg_errcode_t *err) { bin_tree_t *tree, *exp; re_dfa_t *dfa = (re_dfa_t *) preg->buffer; @@ -2222,16 +2191,21 @@ parse_branch (regexp, preg, token, syntax, nest, err) exp = parse_expression (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && exp == NULL, 0)) { + if (tree != NULL) + postorder (tree, free_tree, NULL); return NULL; } if (tree != NULL && exp != NULL) { - tree = create_tree (dfa, tree, exp, CONCAT); - if (tree == NULL) + bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT); + if (newtree == NULL) { + postorder (exp, free_tree, NULL); + postorder (tree, free_tree, NULL); *err = REG_ESPACE; return NULL; } + tree = newtree; } else if (tree == NULL) tree = exp; @@ -2247,13 +2221,8 @@ parse_branch (regexp, preg, token, syntax, nest, err) */ static bin_tree_t * -parse_expression (regexp, preg, token, syntax, nest, err) - re_string_t *regexp; - regex_t *preg; - re_token_t *token; - reg_syntax_t syntax; - int nest; - reg_errcode_t *err; +parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, + reg_syntax_t syntax, int nest, reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree; @@ -2360,7 +2329,7 @@ parse_expression (regexp, preg, token, syntax, nest, err) && dfa->word_ops_used == 0) init_word_char (dfa); if (token->opr.ctx_type == WORD_DELIM - || token->opr.ctx_type == NOT_WORD_DELIM) + || token->opr.ctx_type == NOT_WORD_DELIM) { bin_tree_t *tree_first, *tree_last; if (token->opr.ctx_type == WORD_DELIM) @@ -2368,13 +2337,13 @@ parse_expression (regexp, preg, token, syntax, nest, err) token->opr.ctx_type = WORD_FIRST; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = WORD_LAST; - } - else - { + } + else + { token->opr.ctx_type = INSIDE_WORD; tree_first = create_token_tree (dfa, NULL, NULL, token); token->opr.ctx_type = INSIDE_NOTWORD; - } + } tree_last = create_token_tree (dfa, NULL, NULL, token); tree = create_tree (dfa, tree_first, tree_last, OP_ALT); if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) @@ -2468,13 +2437,8 @@ parse_expression (regexp, preg, token, syntax, nest, err) */ static bin_tree_t * -parse_sub_exp (regexp, preg, token, syntax, nest, err) - re_string_t *regexp; - regex_t *preg; - re_token_t *token; - reg_syntax_t syntax; - int nest; - reg_errcode_t *err; +parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, + reg_syntax_t syntax, int nest, reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree; @@ -2490,11 +2454,17 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err) { tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) - *err = REG_EPAREN; + { + if (tree != NULL) + postorder (tree, free_tree, NULL); + *err = REG_EPAREN; + } if (BE (*err != REG_NOERROR, 0)) return NULL; } - dfa->completed_bkref_map |= 1 << cur_nsub; + + if (cur_nsub <= '9' - '1') + dfa->completed_bkref_map |= 1 << cur_nsub; tree = create_tree (dfa, tree, NULL, SUBEXP); if (BE (tree == NULL, 0)) @@ -2509,13 +2479,8 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err) /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ static bin_tree_t * -parse_dup_op (elem, regexp, dfa, token, syntax, err) - bin_tree_t *elem; - re_string_t *regexp; - re_dfa_t *dfa; - re_token_t *token; - reg_syntax_t syntax; - reg_errcode_t *err; +parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, + re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) { bin_tree_t *tree = NULL, *old_tree = NULL; int i, start, end, start_idx = re_string_cur_idx (regexp); @@ -2564,7 +2529,7 @@ parse_dup_op (elem, regexp, dfa, token, syntax, err) return elem; } - if (BE (end != -1 && start > end, 0)) + if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0)) { /* First number greater than second. */ *err = REG_BADBR; @@ -2624,11 +2589,11 @@ parse_dup_op (elem, regexp, dfa, token, syntax, err) elem = duplicate_tree (elem, dfa); tree = create_tree (dfa, tree, elem, CONCAT); if (BE (elem == NULL || tree == NULL, 0)) - goto parse_dup_op_espace; + goto parse_dup_op_espace; tree = create_tree (dfa, tree, NULL, OP_ALT); if (BE (tree == NULL, 0)) - goto parse_dup_op_espace; + goto parse_dup_op_espace; } if (old_tree) @@ -2654,15 +2619,14 @@ parse_dup_op (elem, regexp, dfa, token, syntax, err) update it. */ static reg_errcode_t +internal_function # ifdef RE_ENABLE_I18N -build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) - re_charset_t *mbcset; - int *range_alloc; +build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, + bracket_elem_t *start_elem, bracket_elem_t *end_elem) # else /* not RE_ENABLE_I18N */ -build_range_exp (sbcset, start_elem, end_elem) +build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, + bracket_elem_t *end_elem) # endif /* not RE_ENABLE_I18N */ - re_bitset_ptr_t sbcset; - bracket_elem_t *start_elem, *end_elem; { unsigned int start_ch, end_ch; /* Equivalence Classes and Character Classes can't be a range start/end. */ @@ -2681,7 +2645,9 @@ build_range_exp (sbcset, start_elem, end_elem) # ifdef RE_ENABLE_I18N { - wchar_t wc, start_wc, end_wc; + wchar_t wc; + wint_t start_wc; + wint_t end_wc; wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch @@ -2708,9 +2674,9 @@ build_range_exp (sbcset, start_elem, end_elem) no MBCSET if dfa->mb_cur_max == 1. */ if (mbcset) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) - { + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) + { /* There is not enough space, need realloc. */ wchar_t *new_array_start, *new_array_end; int new_nranges; @@ -2720,9 +2686,9 @@ build_range_exp (sbcset, start_elem, end_elem) /* Use realloc since mbcset->range_starts and mbcset->range_ends are NULL if *range_alloc == 0. */ new_array_start = re_realloc (mbcset->range_starts, wchar_t, - new_nranges); + new_nranges); new_array_end = re_realloc (mbcset->range_ends, wchar_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) return REG_ESPACE; @@ -2730,10 +2696,10 @@ build_range_exp (sbcset, start_elem, end_elem) mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; - } + } - mbcset->range_starts[mbcset->nranges] = start_wc; - mbcset->range_ends[mbcset->nranges++] = end_wc; + mbcset->range_starts[mbcset->nranges] = start_wc; + mbcset->range_ends[mbcset->nranges++] = end_wc; } /* Build the table for single byte characters. */ @@ -2774,15 +2740,13 @@ build_range_exp (sbcset, start_elem, end_elem) pointer argument since we may update it. */ static reg_errcode_t +internal_function # ifdef RE_ENABLE_I18N -build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) - re_charset_t *mbcset; - int *coll_sym_alloc; +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + int *coll_sym_alloc, const unsigned char *name) # else /* not RE_ENABLE_I18N */ -build_collating_symbol (sbcset, name) +build_collating_symbol (bitset_t sbcset, const unsigned char *name) # endif /* not RE_ENABLE_I18N */ - re_bitset_ptr_t sbcset; - const unsigned char *name; { size_t name_len = strlen ((const char *) name); if (BE (name_len != 1, 0)) @@ -2799,12 +2763,8 @@ build_collating_symbol (sbcset, name) "[[.a-a.]]" etc. */ static bin_tree_t * -parse_bracket_exp (regexp, dfa, token, syntax, err) - re_string_t *regexp; - re_dfa_t *dfa; - re_token_t *token; - reg_syntax_t syntax; - reg_errcode_t *err; +parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, + reg_syntax_t syntax, reg_errcode_t *err) { #ifdef _LIBC const unsigned char *collseqmb; @@ -2826,28 +2786,33 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) { int32_t hash = elem_hash ((const char *) name, name_len); int32_t elem = hash % table_size; - int32_t second = hash % (table_size - 2); - while (symb_table[2 * elem] != 0) - { - /* First compare the hashing value. */ - if (symb_table[2 * elem] == hash - /* Compare the length of the name. */ - && name_len == extra[symb_table[2 * elem + 1]] - /* Compare the name. */ - && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], - name_len) == 0) + if (symb_table[2 * elem] != 0) + { + int32_t second = hash % (table_size - 2) + 1; + + do { - /* Yep, this is the entry. */ - break; - } + /* First compare the hashing value. */ + if (symb_table[2 * elem] == hash + /* Compare the length of the name. */ + && name_len == extra[symb_table[2 * elem + 1]] + /* Compare the name. */ + && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], + name_len) == 0) + { + /* Yep, this is the entry. */ + break; + } - /* Next entry. */ - elem += second; + /* Next entry. */ + elem += second; + } + while (symb_table[2 * elem] != 0); } return elem; } - /* Local function for parse_bracket_exp used in _LIBC environement. + /* Local function for parse_bracket_exp used in _LIBC environment. Look up the collation sequence value of BR_ELEM. Return the value if succeeded, UINT_MAX otherwise. */ @@ -2871,7 +2836,8 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) } else if (br_elem->type == MB_CHAR) { - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); } else if (br_elem->type == COLL_SYM) { @@ -2924,7 +2890,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) re_charset_t *mbcset; int *range_alloc; - re_bitset_ptr_t sbcset; + bitset_t sbcset; bracket_elem_t *start_elem, *end_elem; { unsigned int ch; @@ -2952,8 +2918,8 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) build below suffices. */ if (nrules > 0 || dfa->mb_cur_max > 1) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) + /* Check the space of the arrays. */ + if (BE (*range_alloc == mbcset->nranges, 0)) { /* There is not enough space, need realloc. */ uint32_t *new_array_start; @@ -2965,18 +2931,18 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) new_array_start = re_realloc (mbcset->range_starts, uint32_t, new_nranges); new_array_end = re_realloc (mbcset->range_ends, uint32_t, - new_nranges); + new_nranges); if (BE (new_array_start == NULL || new_array_end == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; mbcset->range_starts = new_array_start; mbcset->range_ends = new_array_end; *range_alloc = new_nranges; } - mbcset->range_starts[mbcset->nranges] = start_collseq; - mbcset->range_ends[mbcset->nranges++] = end_collseq; + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; } /* Build the table for single byte characters. */ @@ -3007,7 +2973,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) re_charset_t *mbcset; int *coll_sym_alloc; - re_bitset_ptr_t sbcset; + bitset_t sbcset; const unsigned char *name; { int32_t elem, idx; @@ -3084,7 +3050,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) /* if (MB_CUR_MAX > 1) */ - collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); + collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB); @@ -3092,7 +3058,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) _NL_COLLATE_SYMB_EXTRAMB); } #endif - sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); #ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); #endif /* RE_ENABLE_I18N */ @@ -3102,6 +3068,10 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) if (BE (sbcset == NULL, 0)) #endif /* RE_ENABLE_I18N */ { + re_free (sbcset); +#ifdef RE_ENABLE_I18N + re_free (mbcset); +#endif *err = REG_ESPACE; return NULL; } @@ -3119,7 +3089,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) #endif /* not RE_ENABLE_I18N */ non_match = 1; if (syntax & RE_HAT_LISTS_NOT_NEWLINE) - bitset_set (sbcset, '\0'); + bitset_set (sbcset, '\n'); re_string_skip_bytes (regexp, token_len); /* Skip a token. */ token_len = peek_token_bracket (token, regexp, syntax); if (BE (token->type == END_OF_RE, 0)) @@ -3302,24 +3272,24 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (mbc_tree == NULL, 0)) goto parse_bracket_exp_espace; - for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx) + for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) if (sbcset[sbc_idx]) break; /* If there are no bits set in sbcset, there is no point of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ - if (sbc_idx < BITSET_UINTS) + if (sbc_idx < BITSET_WORDS) { - /* Build a tree for simple bracket. */ - br_token.type = SIMPLE_BRACKET; - br_token.opr.sbcset = sbcset; - work_tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Build a tree for simple bracket. */ + br_token.type = SIMPLE_BRACKET; + br_token.opr.sbcset = sbcset; + work_tree = create_token_tree (dfa, NULL, NULL, &br_token); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; - /* Then join them by ALT node. */ - work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); - if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + /* Then join them by ALT node. */ + work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); + if (BE (work_tree == NULL, 0)) + goto parse_bracket_exp_espace; } else { @@ -3338,7 +3308,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) br_token.opr.sbcset = sbcset; work_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (work_tree == NULL, 0)) - goto parse_bracket_exp_espace; + goto parse_bracket_exp_espace; } return work_tree; @@ -3355,15 +3325,9 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) /* Parse an element in the bracket expression. */ static reg_errcode_t -parse_bracket_element (elem, regexp, token, token_len, dfa, syntax, - accept_hyphen) - bracket_elem_t *elem; - re_string_t *regexp; - re_token_t *token; - int token_len; - re_dfa_t *dfa; - reg_syntax_t syntax; - int accept_hyphen; +parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, + re_token_t *token, int token_len, re_dfa_t *dfa, + reg_syntax_t syntax, int accept_hyphen) { #ifdef RE_ENABLE_I18N int cur_char_size; @@ -3401,10 +3365,8 @@ parse_bracket_element (elem, regexp, token, token_len, dfa, syntax, [==]. */ static reg_errcode_t -parse_bracket_symbol (elem, regexp, token) - bracket_elem_t *elem; - re_string_t *regexp; - re_token_t *token; +parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, + re_token_t *token) { unsigned char ch, delim = token->opr.c; int i = 0; @@ -3451,16 +3413,13 @@ parse_bracket_symbol (elem, regexp, token) static reg_errcode_t #ifdef RE_ENABLE_I18N -build_equiv_class (sbcset, mbcset, equiv_class_alloc, name) - re_charset_t *mbcset; - int *equiv_class_alloc; +build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, + int *equiv_class_alloc, const unsigned char *name) #else /* not RE_ENABLE_I18N */ -build_equiv_class (sbcset, name) +build_equiv_class (bitset_t sbcset, const unsigned char *name) #endif /* not RE_ENABLE_I18N */ - re_bitset_ptr_t sbcset; - const unsigned char *name; { -#if defined _LIBC +#ifdef _LIBC uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules != 0) { @@ -3481,30 +3440,33 @@ build_equiv_class (sbcset, name) _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - idx1 = findidx (&cp); - if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) + idx1 = findidx (&cp, -1); + if (BE (idx1 == 0 || *cp != '\0', 0)) /* This isn't a valid character. */ return REG_ECOLLATE; /* Build single byte matcing table for this equivalence class. */ - char_buf[1] = (unsigned char) '\0'; - len = weights[idx1]; + len = weights[idx1 & 0xffffff]; for (ch = 0; ch < SBC_MAX; ++ch) { char_buf[0] = ch; cp = char_buf; - idx2 = findidx (&cp); + idx2 = findidx (&cp, 1); /* idx2 = table[ch]; */ if (idx2 == 0) /* This isn't a valid character. */ continue; - if (len == weights[idx2]) + /* Compare only if the length matches and the collation rule + index is the same. */ + if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) { int cnt = 0; + while (cnt <= len && - weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) + weights[(idx1 & 0xffffff) + 1 + cnt] + == weights[(idx2 & 0xffffff) + 1 + cnt]) ++cnt; if (cnt > len) @@ -3546,16 +3508,13 @@ build_equiv_class (sbcset, name) static reg_errcode_t #ifdef RE_ENABLE_I18N -build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax) - re_charset_t *mbcset; - int *char_class_alloc; +build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, + re_charset_t *mbcset, int *char_class_alloc, + const unsigned char *class_name, reg_syntax_t syntax) #else /* not RE_ENABLE_I18N */ -build_charclass (trans, sbcset, class_name, syntax) +build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, + const unsigned char *class_name, reg_syntax_t syntax) #endif /* not RE_ENABLE_I18N */ - unsigned RE_TRANSLATE_TYPE trans; - re_bitset_ptr_t sbcset; - const unsigned char *class_name; - reg_syntax_t syntax; { int i; const char *name = (const char *) class_name; @@ -3585,39 +3544,45 @@ build_charclass (trans, sbcset, class_name, syntax) #endif /* RE_ENABLE_I18N */ #define BUILD_CHARCLASS_LOOP(ctype_func) \ - for (i = 0; i < SBC_MAX; ++i) \ + do { \ + if (BE (trans != NULL, 0)) \ { \ - if (ctype_func (i)) \ - { \ - int ch = trans ? trans[i] : i; \ - bitset_set (sbcset, ch); \ - } \ - } + for (i = 0; i < SBC_MAX; ++i) \ + if (ctype_func (i)) \ + bitset_set (sbcset, trans[i]); \ + } \ + else \ + { \ + for (i = 0; i < SBC_MAX; ++i) \ + if (ctype_func (i)) \ + bitset_set (sbcset, i); \ + } \ + } while (0) if (strcmp (name, "alnum") == 0) - BUILD_CHARCLASS_LOOP (isalnum) + BUILD_CHARCLASS_LOOP (isalnum); else if (strcmp (name, "cntrl") == 0) - BUILD_CHARCLASS_LOOP (iscntrl) + BUILD_CHARCLASS_LOOP (iscntrl); else if (strcmp (name, "lower") == 0) - BUILD_CHARCLASS_LOOP (islower) + BUILD_CHARCLASS_LOOP (islower); else if (strcmp (name, "space") == 0) - BUILD_CHARCLASS_LOOP (isspace) + BUILD_CHARCLASS_LOOP (isspace); else if (strcmp (name, "alpha") == 0) - BUILD_CHARCLASS_LOOP (isalpha) + BUILD_CHARCLASS_LOOP (isalpha); else if (strcmp (name, "digit") == 0) - BUILD_CHARCLASS_LOOP (isdigit) + BUILD_CHARCLASS_LOOP (isdigit); else if (strcmp (name, "print") == 0) - BUILD_CHARCLASS_LOOP (isprint) + BUILD_CHARCLASS_LOOP (isprint); else if (strcmp (name, "upper") == 0) - BUILD_CHARCLASS_LOOP (isupper) + BUILD_CHARCLASS_LOOP (isupper); else if (strcmp (name, "blank") == 0) - BUILD_CHARCLASS_LOOP (isblank) + BUILD_CHARCLASS_LOOP (isblank); else if (strcmp (name, "graph") == 0) - BUILD_CHARCLASS_LOOP (isgraph) + BUILD_CHARCLASS_LOOP (isgraph); else if (strcmp (name, "punct") == 0) - BUILD_CHARCLASS_LOOP (ispunct) + BUILD_CHARCLASS_LOOP (ispunct); else if (strcmp (name, "xdigit") == 0) - BUILD_CHARCLASS_LOOP (isxdigit) + BUILD_CHARCLASS_LOOP (isxdigit); else return REG_ECTYPE; @@ -3625,13 +3590,10 @@ build_charclass (trans, sbcset, class_name, syntax) } static bin_tree_t * -build_charclass_op (dfa, trans, class_name, extra, non_match, err) - re_dfa_t *dfa; - unsigned RE_TRANSLATE_TYPE trans; - const unsigned char *class_name; - const unsigned char *extra; - int non_match; - reg_errcode_t *err; +build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, + const unsigned char *class_name, + const unsigned char *extra, int non_match, + reg_errcode_t *err) { re_bitset_ptr_t sbcset; #ifdef RE_ENABLE_I18N @@ -3642,7 +3604,7 @@ build_charclass_op (dfa, trans, class_name, extra, non_match, err) re_token_t br_token; bin_tree_t *tree; - sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); #ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); #endif /* RE_ENABLE_I18N */ @@ -3660,10 +3622,6 @@ build_charclass_op (dfa, trans, class_name, extra, non_match, err) if (non_match) { #ifdef RE_ENABLE_I18N - /* - if (syntax & RE_HAT_LISTS_NOT_NEWLINE) - bitset_set(cset->sbcset, '\0'); - */ mbcset->non_match = 1; #endif /* not RE_ENABLE_I18N */ } @@ -3745,10 +3703,7 @@ build_charclass_op (dfa, trans, class_name, extra, non_match, err) Return -2, If an error is occured. */ static int -fetch_number (input, token, syntax) - re_string_t *input; - re_token_t *token; - reg_syntax_t syntax; +fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) { int num = -1; unsigned char c; @@ -3788,11 +3743,8 @@ free_charset (re_charset_t *cset) /* Create a tree node. */ static bin_tree_t * -create_tree (dfa, left, right, type) - re_dfa_t *dfa; - bin_tree_t *left; - bin_tree_t *right; - re_token_type_t type; +create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, + re_token_type_t type) { re_token_t t; t.type = type; @@ -3800,11 +3752,8 @@ create_tree (dfa, left, right, type) } static bin_tree_t * -create_token_tree (dfa, left, right, token) - re_dfa_t *dfa; - bin_tree_t *left; - bin_tree_t *right; - const re_token_t *token; +create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, + const re_token_t *token) { bin_tree_t *tree; if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) @@ -3840,9 +3789,7 @@ create_token_tree (dfa, left, right, token) To be called from preorder or postorder. */ static reg_errcode_t -mark_opt_subexp (extra, node) - void *extra; - bin_tree_t *node; +mark_opt_subexp (void *extra, bin_tree_t *node) { int idx = (int) (long) extra; if (node->token.type == SUBEXP && node->token.opr.idx == idx) @@ -3882,9 +3829,7 @@ free_tree (void *extra, bin_tree_t *node) it's easier to duplicate. */ static bin_tree_t * -duplicate_tree (root, dfa) - const bin_tree_t *root; - re_dfa_t *dfa; +duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) { const bin_tree_t *node; bin_tree_t *dup_root; @@ -3915,7 +3860,7 @@ duplicate_tree (root, dfa) node = node->parent; dup_node = dup_node->parent; if (!node) - return dup_root; + return dup_root; } node = node->right; p_new = &dup_node->right; diff --git a/regex.c b/regex.c index 7a4f304cddc..3ab9a6adcb7 100644 --- a/regex.c +++ b/regex.c @@ -1,5 +1,5 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003 Free Software Foundation, Inc. + Copyright (C) 2002-2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . @@ -14,36 +14,16 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + . */ #ifdef HAVE_CONFIG_H #include "config.h" #endif -#ifdef _AIX -#pragma alloca -#else -# ifndef allocax /* predefined by HP cc +Olibcalls */ -# ifdef __GNUC__ -# define alloca(size) __builtin_alloca (size) -# else -# if HAVE_ALLOCA_H -# include -# else -# ifdef __hpux - void *alloca (); -# else -# if !defined __OS2__ && !defined WIN32 - char *alloca (); -# else -# include /* OS/2 defines alloca in here */ -# endif -# endif -# endif -# endif -# endif +/* Make sure noone compiles this code with a C++ compiler. */ +#ifdef __cplusplus +# error "This is C code, use a C compiler" #endif #ifdef _LIBC @@ -71,15 +51,14 @@ # include "../locale/localeinfo.h" #endif -/* POSIX says that must be included (by the caller) before - . */ -#include - /* On some systems, limits.h sets RE_DUP_MAX to a lower value than GNU regex allows. Include it before , which correctly #undefs RE_DUP_MAX and sets it to the right value. */ #include +/* This header defines the MIN and MAX macros. */ +#include + #include #include "regex_internal.h" diff --git a/regex.h b/regex.h index 81789be8978..469a22b1d3b 100644 --- a/regex.h +++ b/regex.h @@ -1,46 +1,582 @@ +/* Definitions for data structures and routines for the regular + expression library. + Copyright (C) 1985,1989-93,1995-98,2000,2001,2002,2003,2005,2006,2008,2011 + Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + #ifndef _REGEX_H -#include +#define _REGEX_H 1 + +#include + +/* Allow the use in C++ code. */ +#ifdef __cplusplus +extern "C" { +#endif + +/* The following two types have to be signed and unsigned integer type + wide enough to hold a value of a pointer. For most ANSI compilers + ptrdiff_t and size_t should be likely OK. Still size of these two + types is 2 for Microsoft C. Ugh... */ +typedef long int s_reg_t; +typedef unsigned long int active_reg_t; + +/* The following bits are used to determine the regexp syntax we + recognize. The set/not-set meanings are chosen so that Emacs syntax + remains the value 0. The bits are given in alphabetical order, and + the definitions shifted by one from the previous bit; thus, when we + add or remove a bit, only one other definition need change. */ +typedef unsigned long int reg_syntax_t; + +#ifdef __USE_GNU +/* If this bit is not set, then \ inside a bracket expression is literal. + If set, then such a \ quotes the following character. */ +# define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1) + +/* If this bit is not set, then + and ? are operators, and \+ and \? are + literals. + If set, then \+ and \? are operators and + and ? are literals. */ +# define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) + +/* If this bit is set, then character classes are supported. They are: + [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], + [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. + If not set, then character classes are not supported. */ +# define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) + +/* If this bit is set, then ^ and $ are always anchors (outside bracket + expressions, of course). + If this bit is not set, then it depends: + ^ is an anchor if it is at the beginning of a regular + expression or after an open-group or an alternation operator; + $ is an anchor if it is at the end of a regular expression, or + before a close-group or an alternation operator. + + This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because + POSIX draft 11.2 says that * etc. in leading positions is undefined. + We already implemented a previous draft which made those constructs + invalid, though, so we haven't changed the code back. */ +# define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) + +/* If this bit is set, then special characters are always special + regardless of where they are in the pattern. + If this bit is not set, then special characters are special only in + some contexts; otherwise they are ordinary. Specifically, + * + ? and intervals are only special when not after the beginning, + open-group, or alternation operator. */ +# define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) + +/* If this bit is set, then *, +, ?, and { cannot be first in an re or + immediately after an alternation or begin-group operator. */ +# define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) + +/* If this bit is set, then . matches newline. + If not set, then it doesn't. */ +# define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) + +/* If this bit is set, then . doesn't match NUL. + If not set, then it does. */ +# define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) + +/* If this bit is set, nonmatching lists [^...] do not match newline. + If not set, they do. */ +# define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) + +/* If this bit is set, either \{...\} or {...} defines an + interval, depending on RE_NO_BK_BRACES. + If not set, \{, \}, {, and } are literals. */ +# define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) + +/* If this bit is set, +, ? and | aren't recognized as operators. + If not set, they are. */ +# define RE_LIMITED_OPS (RE_INTERVALS << 1) + +/* If this bit is set, newline is an alternation operator. + If not set, newline is literal. */ +# define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) + +/* If this bit is set, then `{...}' defines an interval, and \{ and \} + are literals. + If not set, then `\{...\}' defines an interval. */ +# define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) + +/* If this bit is set, (...) defines a group, and \( and \) are literals. + If not set, \(...\) defines a group, and ( and ) are literals. */ +# define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) + +/* If this bit is set, then \ matches . + If not set, then \ is a back-reference. */ +# define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) + +/* If this bit is set, then | is an alternation operator, and \| is literal. + If not set, then \| is an alternation operator, and | is literal. */ +# define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) + +/* If this bit is set, then an ending range point collating higher + than the starting range point, as in [z-a], is invalid. + If not set, then when ending range point collates higher than the + starting range point, the range is ignored. */ +# define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) + +/* If this bit is set, then an unmatched ) is ordinary. + If not set, then an unmatched ) is invalid. */ +# define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) + +/* If this bit is set, succeed as soon as we match the whole pattern, + without further backtracking. */ +# define RE_NO_POSIX_BACKTRACKING (RE_UNMATCHED_RIGHT_PAREN_ORD << 1) + +/* If this bit is set, do not process the GNU regex operators. + If not set, then the GNU regex operators are recognized. */ +# define RE_NO_GNU_OPS (RE_NO_POSIX_BACKTRACKING << 1) + +/* If this bit is set, turn on internal regex debugging. + If not set, and debugging was on, turn it off. + This only works if regex.c is compiled -DDEBUG. + We define this bit always, so that all that's needed to turn on + debugging is to recompile regex.c; the calling code can always have + this bit set, and it won't affect anything in the normal case. */ +# define RE_DEBUG (RE_NO_GNU_OPS << 1) + +/* If this bit is set, a syntactically invalid interval is treated as + a string of ordinary characters. For example, the ERE 'a{1' is + treated as 'a\{1'. */ +# define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1) + +/* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ +# define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) + +/* This bit is used internally like RE_CONTEXT_INDEP_ANCHORS but only + for ^, because it is difficult to scan the regex backwards to find + whether ^ should be special. */ +# define RE_CARET_ANCHORS_HERE (RE_ICASE << 1) + +/* If this bit is set, then \{ cannot be first in an bre or + immediately after an alternation or begin-group operator. */ +# define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1) + +/* If this bit is set, then no_sub will be set to 1 during + re_compile_pattern. */ +# define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1) +#endif + +/* This global variable defines the particular regexp syntax to use (for + some interfaces). When a regexp is compiled, the syntax used is + stored in the pattern buffer, so changing this does not affect + already-compiled regexps. */ +extern reg_syntax_t re_syntax_options; + +#ifdef __USE_GNU +/* Define combinations of the above bits for the standard possibilities. + (The [[[ comments delimit what gets put into the Texinfo file, so + don't delete them!) */ +/* [[[begin syntaxes]]] */ +#define RE_SYNTAX_EMACS 0 + +#define RE_SYNTAX_AWK \ + (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ + | RE_DOT_NEWLINE | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CHAR_CLASSES \ + | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS) + +#define RE_SYNTAX_GNU_AWK \ + ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \ + | RE_INVALID_INTERVAL_ORD) \ + & ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS \ + | RE_CONTEXT_INVALID_OPS )) + +#define RE_SYNTAX_POSIX_AWK \ + (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \ + | RE_INTERVALS | RE_NO_GNU_OPS \ + | RE_INVALID_INTERVAL_ORD) + +#define RE_SYNTAX_GREP \ + (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ + | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ + | RE_NEWLINE_ALT) + +#define RE_SYNTAX_EGREP \ + (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ + | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ + | RE_NO_BK_VBAR) + +#define RE_SYNTAX_POSIX_EGREP \ + (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES \ + | RE_INVALID_INTERVAL_ORD) + +/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ +#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC + +#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC + +/* Syntax bits common to both basic and extended POSIX regex syntax. */ +#define _RE_SYNTAX_POSIX_COMMON \ + (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ + | RE_INTERVALS | RE_NO_EMPTY_RANGES) + +#define RE_SYNTAX_POSIX_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM | RE_CONTEXT_INVALID_DUP) + +/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes + RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this + isn't minimal, since other operators, such as \`, aren't disabled. */ +#define RE_SYNTAX_POSIX_MINIMAL_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) + +#define RE_SYNTAX_POSIX_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD) + +/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INDEP_OPS is + removed and RE_NO_BK_REFS is added. */ +#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) +/* [[[end syntaxes]]] */ + +/* Maximum number of duplicates an interval can allow. Some systems + (erroneously) define this in other header files, but we want our + value, so remove any previous define. */ +# ifdef RE_DUP_MAX +# undef RE_DUP_MAX +# endif +/* If sizeof(int) == 2, then ((1 << 15) - 1) overflows. */ +# define RE_DUP_MAX (0x7fff) +#endif + + +/* POSIX `cflags' bits (i.e., information for `regcomp'). */ + +/* If this bit is set, then use extended regular expression syntax. + If not set, then use basic regular expression syntax. */ +#define REG_EXTENDED 1 + +/* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ +#define REG_ICASE (REG_EXTENDED << 1) -/* Document internal interfaces. */ -extern reg_syntax_t __re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); +/* If this bit is set, then anchors do not match at newline + characters in the string. + If not set, then anchors do match at newlines. */ +#define REG_NEWLINE (REG_ICASE << 1) -extern const char *__re_compile_pattern - _RE_ARGS ((const char *pattern, size_t length, - struct re_pattern_buffer *buffer)); +/* If this bit is set, then report only success or fail in regexec. + If not set, then returns differ between not matching and errors. */ +#define REG_NOSUB (REG_NEWLINE << 1) -extern int __re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); -extern int __re_search - _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, - int length, int start, int range, struct re_registers *regs)); +/* POSIX `eflags' bits (i.e., information for regexec). */ -extern int __re_search_2 - _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, - int length1, const char *string2, int length2, - int start, int range, struct re_registers *regs, int stop)); +/* If this bit is set, then the beginning-of-line operator doesn't match + the beginning of the string (presumably because it's not the + beginning of a line). + If not set, then the beginning-of-line operator does match the + beginning of the string. */ +#define REG_NOTBOL 1 -extern int __re_match - _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, - int length, int start, struct re_registers *regs)); +/* Like REG_NOTBOL, except for the end-of-line. */ +#define REG_NOTEOL (1 << 1) -extern int __re_match_2 - _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, - int length1, const char *string2, int length2, - int start, struct re_registers *regs, int stop)); +/* Use PMATCH[0] to delimit the start and end of the search in the + buffer. */ +#define REG_STARTEND (1 << 2) -extern void __re_set_registers - _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, - unsigned num_regs, regoff_t *starts, regoff_t *ends)); -extern int __regcomp _RE_ARGS ((regex_t *__preg, const char *__pattern, - int __cflags)); +/* If any error codes are removed, changed, or added, update the + `re_error_msg' table in regex.c. */ +typedef enum +{ +#if defined _XOPEN_SOURCE || defined __USE_XOPEN2K + REG_ENOSYS = -1, /* This will never happen for this implementation. */ +#endif + + REG_NOERROR = 0, /* Success. */ + REG_NOMATCH, /* Didn't find a match (for regexec). */ + + /* POSIX regcomp return error codes. (In the order listed in the + standard.) */ + REG_BADPAT, /* Invalid pattern. */ + REG_ECOLLATE, /* Inalid collating element. */ + REG_ECTYPE, /* Invalid character class name. */ + REG_EESCAPE, /* Trailing backslash. */ + REG_ESUBREG, /* Invalid back reference. */ + REG_EBRACK, /* Unmatched left bracket. */ + REG_EPAREN, /* Parenthesis imbalance. */ + REG_EBRACE, /* Unmatched \{. */ + REG_BADBR, /* Invalid contents of \{\}. */ + REG_ERANGE, /* Invalid range end. */ + REG_ESPACE, /* Ran out of memory. */ + REG_BADRPT, /* No preceding re for repetition op. */ + + /* Error codes we've added. */ + REG_EEND, /* Premature end. */ + REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ + REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ +} reg_errcode_t; + +/* This data structure represents a compiled pattern. Before calling + the pattern compiler, the fields `buffer', `allocated', `fastmap', + and `translate' can be set. After the pattern has been compiled, + the fields `re_nsub', `not_bol' and `not_eol' are available. All + other fields are private to the regex routines. */ + +#ifndef RE_TRANSLATE_TYPE +# define __RE_TRANSLATE_TYPE unsigned char * +# ifdef __USE_GNU +# define RE_TRANSLATE_TYPE __RE_TRANSLATE_TYPE +# endif +#endif + +#ifdef __USE_GNU +# define __REPB_PREFIX(name) name +#else +# define __REPB_PREFIX(name) __##name +#endif + +struct re_pattern_buffer +{ + /* Space that holds the compiled pattern. It is declared as + `unsigned char *' because its elements are sometimes used as + array indexes. */ + unsigned char *__REPB_PREFIX(buffer); + + /* Number of bytes to which `buffer' points. */ + unsigned long int __REPB_PREFIX(allocated); + + /* Number of bytes actually used in `buffer'. */ + unsigned long int __REPB_PREFIX(used); + + /* Syntax setting with which the pattern was compiled. */ + reg_syntax_t __REPB_PREFIX(syntax); + + /* Pointer to a fastmap, if any, otherwise zero. re_search uses the + fastmap, if there is one, to skip over impossible starting points + for matches. */ + char *__REPB_PREFIX(fastmap); + + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation is + applied to a pattern when it is compiled and to a string when it + is matched. */ + __RE_TRANSLATE_TYPE __REPB_PREFIX(translate); + + /* Number of subexpressions found by the compiler. */ + size_t re_nsub; + + /* Zero if this pattern cannot match the empty string, one else. + Well, in truth it's used only in `re_search_2', to see whether or + not we should use the fastmap, so we don't set this absolutely + perfectly; see `re_compile_fastmap' (the `duplicate' case). */ + unsigned __REPB_PREFIX(can_be_null) : 1; + + /* If REGS_UNALLOCATED, allocate space in the `regs' structure + for `max (RE_NREGS, re_nsub + 1)' groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ +#ifdef __USE_GNU +# define REGS_UNALLOCATED 0 +# define REGS_REALLOCATE 1 +# define REGS_FIXED 2 +#endif + unsigned __REPB_PREFIX(regs_allocated) : 2; + + /* Set to zero when `regex_compile' compiles a pattern; set to one + by `re_compile_fastmap' if it updates the fastmap. */ + unsigned __REPB_PREFIX(fastmap_accurate) : 1; + + /* If set, `re_match_2' does not return information about + subexpressions. */ + unsigned __REPB_PREFIX(no_sub) : 1; + + /* If set, a beginning-of-line anchor doesn't match at the beginning + of the string. */ + unsigned __REPB_PREFIX(not_bol) : 1; + + /* Similarly for an end-of-line anchor. */ + unsigned __REPB_PREFIX(not_eol) : 1; + + /* If true, an anchor at a newline matches. */ + unsigned __REPB_PREFIX(newline_anchor) : 1; +}; + +typedef struct re_pattern_buffer regex_t; + +/* Type for byte offsets within the string. POSIX mandates this. */ +typedef int regoff_t; + + +#ifdef __USE_GNU +/* This is the structure we store register match data in. See + regex.texinfo for a full description of what registers match. */ +struct re_registers +{ + unsigned num_regs; + regoff_t *start; + regoff_t *end; +}; + + +/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, + `re_match_2' returns information about at least this many registers + the first time a `regs' structure is passed. */ +# ifndef RE_NREGS +# define RE_NREGS 30 +# endif +#endif + + +/* POSIX specification for registers. Aside from the different names than + `re_registers', POSIX uses an array of structures, instead of a + structure of arrays. */ +typedef struct +{ + regoff_t rm_so; /* Byte offset from string's start to substring's start. */ + regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ +} regmatch_t; + +/* Declarations for routines. */ + +#ifdef __USE_GNU +/* Sets the current default syntax to SYNTAX, and return the old syntax. + You can also simply assign to the `re_syntax_options' variable. */ +extern reg_syntax_t re_set_syntax (reg_syntax_t __syntax); + +/* Compile the regular expression PATTERN, with length LENGTH + and syntax given by the global `re_syntax_options', into the buffer + BUFFER. Return NULL if successful, and an error string if not. + + To free the allocated storage, you must call `regfree' on BUFFER. + Note that the translate table must either have been initialised by + `regcomp', with a malloc'ed value, or set to NULL before calling + `regfree'. */ +extern const char *re_compile_pattern (const char *__pattern, size_t __length, + struct re_pattern_buffer *__buffer); + + +/* Compile a fastmap for the compiled pattern in BUFFER; used to + accelerate searches. Return 0 if successful and -2 if was an + internal error. */ +extern int re_compile_fastmap (struct re_pattern_buffer *__buffer); + + +/* Search in the string STRING (with length LENGTH) for the pattern + compiled into BUFFER. Start searching at position START, for RANGE + characters. Return the starting position of the match, -1 for no + match, or -2 for an internal error. Also return register + information in REGS (if REGS and BUFFER->no_sub are nonzero). */ +extern int re_search (struct re_pattern_buffer *__buffer, const char *__string, + int __length, int __start, int __range, + struct re_registers *__regs); + + +/* Like `re_search', but search in the concatenation of STRING1 and + STRING2. Also, stop searching at index START + STOP. */ +extern int re_search_2 (struct re_pattern_buffer *__buffer, + const char *__string1, int __length1, + const char *__string2, int __length2, int __start, + int __range, struct re_registers *__regs, int __stop); + + +/* Like `re_search', but return how many characters in STRING the regexp + in BUFFER matched, starting at position START. */ +extern int re_match (struct re_pattern_buffer *__buffer, const char *__string, + int __length, int __start, struct re_registers *__regs); -extern int __regexec _RE_ARGS ((const regex_t *__preg, - const char *__string, size_t __nmatch, - regmatch_t __pmatch[], int __eflags)); -extern size_t __regerror _RE_ARGS ((int __errcode, const regex_t *__preg, - char *__errbuf, size_t __errbuf_size)); +/* Relates to `re_match' as `re_search_2' relates to `re_search'. */ +extern int re_match_2 (struct re_pattern_buffer *__buffer, + const char *__string1, int __length1, + const char *__string2, int __length2, int __start, + struct re_registers *__regs, int __stop); -extern void __regfree _RE_ARGS ((regex_t *__preg)); + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using BUFFER and REGS will use this memory + for recording register information. STARTS and ENDS must be + allocated with malloc, and must each be at least `NUM_REGS * sizeof + (regoff_t)' bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ +extern void re_set_registers (struct re_pattern_buffer *__buffer, + struct re_registers *__regs, + unsigned int __num_regs, + regoff_t *__starts, regoff_t *__ends); +#endif /* Use GNU */ + +#if defined _REGEX_RE_COMP || (defined _LIBC && defined __USE_BSD) +# ifndef _CRAY +/* 4.2 bsd compatibility. */ +extern char *re_comp (const char *); +extern int re_exec (const char *); +# endif +#endif + +/* GCC 2.95 and later have "__restrict"; C99 compilers have + "restrict", and "configure" may have defined "restrict". */ +#ifndef __restrict +# if ! (2 < __GNUC__ || (2 == __GNUC__ && 95 <= __GNUC_MINOR__)) +# if defined restrict || 199901L <= __STDC_VERSION__ +# define __restrict restrict +# else +# define __restrict +# endif +# endif +#endif +/* gcc 3.1 and up support the [restrict] syntax. */ +#ifndef __restrict_arr +# if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) \ + && !defined __GNUG__ +# define __restrict_arr __restrict +# else +# define __restrict_arr +# endif #endif + +/* POSIX compatibility. */ +extern int regcomp (regex_t *__restrict __preg, + const char *__restrict __pattern, + int __cflags); + +extern int regexec (const regex_t *__restrict __preg, + const char *__restrict __string, size_t __nmatch, + regmatch_t __pmatch[__restrict_arr], + int __eflags); + +extern size_t regerror (int __errcode, const regex_t *__restrict __preg, + char *__restrict __errbuf, size_t __errbuf_size); + +extern void regfree (regex_t *__preg); + + +#ifdef __cplusplus +} +#endif /* C++ */ + +#endif /* regex.h */ diff --git a/regex_internal.c b/regex_internal.c index b3d44c368dd..9be8a532e6d 100644 --- a/regex_internal.c +++ b/regex_internal.c @@ -1,5 +1,5 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2002-2006, 2010, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . @@ -14,29 +14,20 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + . */ static void re_string_construct_common (const char *str, int len, re_string_t *pstr, RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa) internal_function; -#ifdef RE_ENABLE_I18N -static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx, - wint_t *last_wc) internal_function; -#endif /* RE_ENABLE_I18N */ -static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate, - unsigned int hash) internal_function; -static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa, +static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, unsigned int hash) internal_function; -static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa, +static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, unsigned int context, unsigned int hash) internal_function; -static unsigned int inline calc_state_hash (const re_node_set *nodes, - unsigned int context) internal_function; /* Functions for string operation. */ @@ -44,12 +35,9 @@ static unsigned int inline calc_state_hash (const re_node_set *nodes, re_string_reconstruct before using the object. */ static reg_errcode_t -re_string_allocate (pstr, str, len, init_len, trans, icase, dfa) - re_string_t *pstr; - const char *str; - int len, init_len, icase; - RE_TRANSLATE_TYPE trans; - const re_dfa_t *dfa; +internal_function __attribute_warn_unused_result__ +re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len, + RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa) { reg_errcode_t ret; int init_buf_len; @@ -75,12 +63,9 @@ re_string_allocate (pstr, str, len, init_len, trans, icase, dfa) /* This function allocate the buffers, and initialize them. */ static reg_errcode_t -re_string_construct (pstr, str, len, trans, icase, dfa) - re_string_t *pstr; - const char *str; - int len, icase; - RE_TRANSLATE_TYPE trans; - const re_dfa_t *dfa; +internal_function __attribute_warn_unused_result__ +re_string_construct (re_string_t *pstr, const char *str, int len, + RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa) { reg_errcode_t ret; memset (pstr, '\0', sizeof (re_string_t)); @@ -141,33 +126,39 @@ re_string_construct (pstr, str, len, trans, icase, dfa) /* Helper functions for re_string_allocate, and re_string_construct. */ static reg_errcode_t -re_string_realloc_buffers (pstr, new_buf_len) - re_string_t *pstr; - int new_buf_len; +internal_function __attribute_warn_unused_result__ +re_string_realloc_buffers (re_string_t *pstr, int new_buf_len) { #ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) { - wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len); - if (BE (new_array == NULL, 0)) + wint_t *new_wcs; + + /* Avoid overflow in realloc. */ + const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int)); + if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) return REG_ESPACE; - pstr->wcs = new_array; + + new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); + if (BE (new_wcs == NULL, 0)) + return REG_ESPACE; + pstr->wcs = new_wcs; if (pstr->offsets != NULL) { - int *new_array = re_realloc (pstr->offsets, int, new_buf_len); - if (BE (new_array == NULL, 0)) + int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len); + if (BE (new_offsets == NULL, 0)) return REG_ESPACE; - pstr->offsets = new_array; + pstr->offsets = new_offsets; } } #endif /* RE_ENABLE_I18N */ if (pstr->mbs_allocated) { - unsigned char *new_array = re_realloc (pstr->mbs, unsigned char, - new_buf_len); - if (BE (new_array == NULL, 0)) + unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, + new_buf_len); + if (BE (new_mbs == NULL, 0)) return REG_ESPACE; - pstr->mbs = new_array; + pstr->mbs = new_mbs; } pstr->bufs_len = new_buf_len; return REG_NOERROR; @@ -175,18 +166,15 @@ re_string_realloc_buffers (pstr, new_buf_len) static void -re_string_construct_common (str, len, pstr, trans, icase, dfa) - const char *str; - int len; - re_string_t *pstr; - RE_TRANSLATE_TYPE trans; - int icase; - const re_dfa_t *dfa; +internal_function +re_string_construct_common (const char *str, int len, re_string_t *pstr, + RE_TRANSLATE_TYPE trans, int icase, + const re_dfa_t *dfa) { pstr->raw_mbs = (const unsigned char *) str; pstr->len = len; pstr->raw_len = len; - pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans; + pstr->trans = trans; pstr->icase = icase ? 1 : 0; pstr->mbs_allocated = (trans != NULL || icase); pstr->mb_cur_max = dfa->mb_cur_max; @@ -210,12 +198,12 @@ re_string_construct_common (str, len, pstr, trans, icase, dfa) built and starts from PSTR->VALID_LEN. */ static void -build_wcs_buffer (pstr) - re_string_t *pstr; +internal_function +build_wcs_buffer (re_string_t *pstr) { #ifdef _LIBC - unsigned char buf[MB_CUR_MAX]; - assert (MB_CUR_MAX >= pstr->mb_cur_max); + unsigned char buf[MB_LEN_MAX]; + assert (MB_LEN_MAX >= pstr->mb_cur_max); #else unsigned char buf[64]; #endif @@ -247,14 +235,9 @@ build_wcs_buffer (pstr) } else p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; - mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); - if (BE (mbclen == (size_t) -2, 0)) - { - /* The buffer doesn't have enough space, finish to build. */ - pstr->cur_state = prev_st; - break; - } - else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) + mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); + if (BE (mbclen == (size_t) -1 || mbclen == 0 + || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0)) { /* We treat these cases as a singlebyte character. */ mbclen = 1; @@ -263,6 +246,12 @@ build_wcs_buffer (pstr) wc = pstr->trans[wc]; pstr->cur_state = prev_st; } + else if (BE (mbclen == (size_t) -2, 0)) + { + /* The buffer doesn't have enough space, finish to build. */ + pstr->cur_state = prev_st; + break; + } /* Write wide character and padding. */ pstr->wcs[byte_idx++] = wc; @@ -277,16 +266,16 @@ build_wcs_buffer (pstr) /* Build wide character buffer PSTR->WCS like build_wcs_buffer, but for REG_ICASE. */ -static int -build_wcs_upper_buffer (pstr) - re_string_t *pstr; +static reg_errcode_t +internal_function __attribute_warn_unused_result__ +build_wcs_upper_buffer (re_string_t *pstr) { mbstate_t prev_st; int src_idx, byte_idx, end_idx, remain_len; size_t mbclen; #ifdef _LIBC - char buf[MB_CUR_MAX]; - assert (MB_CUR_MAX >= pstr->mb_cur_max); + char buf[MB_LEN_MAX]; + assert (MB_LEN_MAX >= pstr->mb_cur_max); #else char buf[64]; #endif @@ -317,9 +306,9 @@ build_wcs_upper_buffer (pstr) remain_len = end_idx - byte_idx; prev_st = pstr->cur_state; - mbclen = mbrtowc (&wc, - ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx - + byte_idx), remain_len, &pstr->cur_state); + mbclen = __mbrtowc (&wc, + ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx + + byte_idx), remain_len, &pstr->cur_state); if (BE (mbclen + 2 > 2, 1)) { wchar_t wcu = wc; @@ -345,9 +334,11 @@ build_wcs_upper_buffer (pstr) for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) pstr->wcs[byte_idx++] = WEOF; } - else if (mbclen == (size_t) -1 || mbclen == 0) + else if (mbclen == (size_t) -1 || mbclen == 0 + || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) { - /* It is an invalid character or '\0'. Just use the byte. */ + /* It is an invalid character, an incomplete character + at the end of the string, or '\0'. Just use the byte. */ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; pstr->mbs[byte_idx] = ch; /* And also cast it to wide char. */ @@ -387,7 +378,7 @@ build_wcs_upper_buffer (pstr) } else p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; - mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); + mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); if (BE (mbclen + 2 > 2, 1)) { wchar_t wcu = wc; @@ -441,8 +432,8 @@ build_wcs_upper_buffer (pstr) src_idx += mbclen; continue; } - else - memcpy (pstr->mbs + byte_idx, p, mbclen); + else + memcpy (pstr->mbs + byte_idx, p, mbclen); } else memcpy (pstr->mbs + byte_idx, p, mbclen); @@ -460,7 +451,8 @@ build_wcs_upper_buffer (pstr) for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) pstr->wcs[byte_idx++] = WEOF; } - else if (mbclen == (size_t) -1 || mbclen == 0) + else if (mbclen == (size_t) -1 || mbclen == 0 + || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) { /* It is an invalid character or '\0'. Just use the byte. */ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; @@ -494,35 +486,39 @@ build_wcs_upper_buffer (pstr) Return the index. */ static int -re_string_skip_chars (pstr, new_raw_idx, last_wc) - re_string_t *pstr; - int new_raw_idx; - wint_t *last_wc; +internal_function +re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc) { mbstate_t prev_st; int rawbuf_idx; size_t mbclen; - wchar_t wc = 0; + wint_t wc = WEOF; /* Skip the characters which are not necessary to check. */ for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; rawbuf_idx < new_raw_idx;) { - int remain_len; - remain_len = pstr->len - rawbuf_idx; + wchar_t wc2; + int remain_len = pstr->raw_len - rawbuf_idx; prev_st = pstr->cur_state; - mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx, - remain_len, &pstr->cur_state); - if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) + mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, + remain_len, &pstr->cur_state); + if (BE ((ssize_t) mbclen <= 0, 0)) { - /* We treat these cases as a singlebyte character. */ + /* We treat these cases as a single byte character. */ + if (mbclen == 0 || remain_len == 0) + wc = L'\0'; + else + wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); mbclen = 1; pstr->cur_state = prev_st; } + else + wc = (wint_t) wc2; /* Then proceed the next character. */ rawbuf_idx += mbclen; } - *last_wc = (wint_t) wc; + *last_wc = wc; return rawbuf_idx; } #endif /* RE_ENABLE_I18N */ @@ -531,8 +527,8 @@ re_string_skip_chars (pstr, new_raw_idx, last_wc) This function is used in case of REG_ICASE. */ static void -build_upper_buffer (pstr) - re_string_t *pstr; +internal_function +build_upper_buffer (re_string_t *pstr) { int char_idx, end_idx; end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; @@ -554,8 +550,8 @@ build_upper_buffer (pstr) /* Apply TRANS to the buffer in PSTR. */ static void -re_string_translate_buffer (pstr) - re_string_t *pstr; +internal_function +re_string_translate_buffer (re_string_t *pstr) { int buf_idx, end_idx; end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; @@ -575,9 +571,8 @@ re_string_translate_buffer (pstr) convert to upper case in case of REG_ICASE, apply translation. */ static reg_errcode_t -re_string_reconstruct (pstr, idx, eflags) - re_string_t *pstr; - int idx, eflags; +internal_function __attribute_warn_unused_result__ +re_string_reconstruct (re_string_t *pstr, int idx, int eflags) { int offset = idx - pstr->raw_mbs_idx; if (BE (offset < 0, 0)) @@ -602,34 +597,98 @@ re_string_reconstruct (pstr, idx, eflags) if (BE (offset != 0, 1)) { - /* Are the characters which are already checked remain? */ - if (BE (offset < pstr->valid_raw_len, 1) -#ifdef RE_ENABLE_I18N - /* Handling this would enlarge the code too much. - Accept a slowdown in that case. */ - && pstr->offsets_needed == 0 -#endif - ) + /* Should the already checked characters be kept? */ + if (BE (offset < pstr->valid_raw_len, 1)) { /* Yes, move them to the front of the buffer. */ - pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags); #ifdef RE_ENABLE_I18N - if (pstr->mb_cur_max > 1) - memmove (pstr->wcs, pstr->wcs + offset, - (pstr->valid_len - offset) * sizeof (wint_t)); + if (BE (pstr->offsets_needed, 0)) + { + int low = 0, high = pstr->valid_len, mid; + do + { + mid = (high + low) / 2; + if (pstr->offsets[mid] > offset) + high = mid; + else if (pstr->offsets[mid] < offset) + low = mid + 1; + else + break; + } + while (low < high); + if (pstr->offsets[mid] < offset) + ++mid; + pstr->tip_context = re_string_context_at (pstr, mid - 1, + eflags); + /* This can be quite complicated, so handle specially + only the common and easy case where the character with + different length representation of lower and upper + case is present at or after offset. */ + if (pstr->valid_len > offset + && mid == offset && pstr->offsets[mid] == offset) + { + memmove (pstr->wcs, pstr->wcs + offset, + (pstr->valid_len - offset) * sizeof (wint_t)); + memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); + pstr->valid_len -= offset; + pstr->valid_raw_len -= offset; + for (low = 0; low < pstr->valid_len; low++) + pstr->offsets[low] = pstr->offsets[low + offset] - offset; + } + else + { + /* Otherwise, just find out how long the partial multibyte + character at offset is and fill it with WEOF/255. */ + pstr->len = pstr->raw_len - idx + offset; + pstr->stop = pstr->raw_stop - idx + offset; + pstr->offsets_needed = 0; + while (mid > 0 && pstr->offsets[mid - 1] == offset) + --mid; + while (mid < pstr->valid_len) + if (pstr->wcs[mid] != WEOF) + break; + else + ++mid; + if (mid == pstr->valid_len) + pstr->valid_len = 0; + else + { + pstr->valid_len = pstr->offsets[mid] - offset; + if (pstr->valid_len) + { + for (low = 0; low < pstr->valid_len; ++low) + pstr->wcs[low] = WEOF; + memset (pstr->mbs, 255, pstr->valid_len); + } + } + pstr->valid_raw_len = pstr->valid_len; + } + } + else +#endif + { + pstr->tip_context = re_string_context_at (pstr, offset - 1, + eflags); +#ifdef RE_ENABLE_I18N + if (pstr->mb_cur_max > 1) + memmove (pstr->wcs, pstr->wcs + offset, + (pstr->valid_len - offset) * sizeof (wint_t)); #endif /* RE_ENABLE_I18N */ - if (BE (pstr->mbs_allocated, 0)) - memmove (pstr->mbs, pstr->mbs + offset, - pstr->valid_len - offset); - pstr->valid_len -= offset; - pstr->valid_raw_len -= offset; + if (BE (pstr->mbs_allocated, 0)) + memmove (pstr->mbs, pstr->mbs + offset, + pstr->valid_len - offset); + pstr->valid_len -= offset; + pstr->valid_raw_len -= offset; #if DEBUG - assert (pstr->valid_len > 0); + assert (pstr->valid_len > 0); #endif + } } else { /* No, skip all characters until IDX. */ + int prev_valid_len = pstr->valid_len; + #ifdef RE_ENABLE_I18N if (BE (pstr->offsets_needed, 0)) { @@ -639,7 +698,6 @@ re_string_reconstruct (pstr, idx, eflags) } #endif pstr->valid_len = 0; - pstr->valid_raw_len = 0; #ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) { @@ -648,47 +706,72 @@ re_string_reconstruct (pstr, idx, eflags) if (pstr->is_utf8) { - const unsigned char *raw, *p, *q, *end; + const unsigned char *raw, *p, *end; /* Special case UTF-8. Multi-byte chars start with any byte other than 0x80 - 0xbf. */ raw = pstr->raw_mbs + pstr->raw_mbs_idx; end = raw + (offset - pstr->mb_cur_max); - for (p = raw + offset - 1; p >= end; --p) - if ((*p & 0xc0) != 0x80) - { - mbstate_t cur_state; - wchar_t wc2; - int mlen = raw + pstr->len - p; - unsigned char buf[6]; - - q = p; - if (BE (pstr->trans != NULL, 0)) - { - int i = mlen < 6 ? mlen : 6; - while (--i >= 0) - buf[i] = pstr->trans[p[i]]; - q = buf; - } - /* XXX Don't use mbrtowc, we know which conversion - to use (UTF-8 -> UCS4). */ - memset (&cur_state, 0, sizeof (cur_state)); - mlen = (mbrtowc (&wc2, (const char *) p, mlen, - &cur_state) - - (raw + offset - p)); - if (mlen >= 0) - { - memset (&pstr->cur_state, '\0', - sizeof (mbstate_t)); - pstr->valid_len = mlen; - wc = wc2; - } - break; - } + if (end < pstr->raw_mbs) + end = pstr->raw_mbs; + p = raw + offset - 1; +#ifdef _LIBC + /* We know the wchar_t encoding is UCS4, so for the simple + case, ASCII characters, skip the conversion step. */ + if (isascii (*p) && BE (pstr->trans == NULL, 1)) + { + memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); + /* pstr->valid_len = 0; */ + wc = (wchar_t) *p; + } + else +#endif + for (; p >= end; --p) + if ((*p & 0xc0) != 0x80) + { + mbstate_t cur_state; + wchar_t wc2; + int mlen = raw + pstr->len - p; + unsigned char buf[6]; + size_t mbclen; + + const unsigned char *pp = p; + if (BE (pstr->trans != NULL, 0)) + { + int i = mlen < 6 ? mlen : 6; + while (--i >= 0) + buf[i] = pstr->trans[p[i]]; + pp = buf; + } + /* XXX Don't use mbrtowc, we know which conversion + to use (UTF-8 -> UCS4). */ + memset (&cur_state, 0, sizeof (cur_state)); + mbclen = __mbrtowc (&wc2, (const char *) pp, mlen, + &cur_state); + if (raw + offset - p <= mbclen + && mbclen < (size_t) -2) + { + memset (&pstr->cur_state, '\0', + sizeof (mbstate_t)); + pstr->valid_len = mbclen - (raw + offset - p); + wc = wc2; + } + break; + } } if (wc == WEOF) pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; + if (wc == WEOF) + pstr->tip_context + = re_string_context_at (pstr, prev_valid_len - 1, eflags); + else + pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) + && IS_WIDE_WORD_CHAR (wc)) + ? CONTEXT_WORD + : ((IS_WIDE_NEWLINE (wc) + && pstr->newline_anchor) + ? CONTEXT_NEWLINE : 0)); if (BE (pstr->valid_len, 0)) { for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) @@ -697,17 +780,12 @@ re_string_reconstruct (pstr, idx, eflags) memset (pstr->mbs, 255, pstr->valid_len); } pstr->valid_raw_len = pstr->valid_len; - pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) - && IS_WIDE_WORD_CHAR (wc)) - ? CONTEXT_WORD - : ((IS_WIDE_NEWLINE (wc) - && pstr->newline_anchor) - ? CONTEXT_NEWLINE : 0)); } else #endif /* RE_ENABLE_I18N */ { int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; + pstr->valid_raw_len = 0; if (pstr->trans) c = pstr->trans[c]; pstr->tip_context = (bitset_contain (pstr->word_char, c) @@ -729,7 +807,7 @@ re_string_reconstruct (pstr, idx, eflags) { if (pstr->icase) { - int ret = build_wcs_upper_buffer (pstr); + reg_errcode_t ret = build_wcs_upper_buffer (pstr); if (BE (ret != REG_NOERROR, 0)) return ret; } @@ -738,24 +816,23 @@ re_string_reconstruct (pstr, idx, eflags) } else #endif /* RE_ENABLE_I18N */ - if (BE (pstr->mbs_allocated, 0)) - { - if (pstr->icase) - build_upper_buffer (pstr); - else if (pstr->trans != NULL) - re_string_translate_buffer (pstr); - } - else - pstr->valid_len = pstr->len; + if (BE (pstr->mbs_allocated, 0)) + { + if (pstr->icase) + build_upper_buffer (pstr); + else if (pstr->trans != NULL) + re_string_translate_buffer (pstr); + } + else + pstr->valid_len = pstr->len; pstr->cur_idx = 0; return REG_NOERROR; } static unsigned char -re_string_peek_byte_case (pstr, idx) - const re_string_t *pstr; - int idx; +internal_function __attribute ((pure)) +re_string_peek_byte_case (const re_string_t *pstr, int idx) { int ch, off; @@ -790,8 +867,8 @@ re_string_peek_byte_case (pstr, idx) } static unsigned char -re_string_fetch_byte_case (pstr) - re_string_t *pstr; +internal_function +re_string_fetch_byte_case (re_string_t *pstr) { if (BE (!pstr->mbs_allocated, 1)) return re_string_fetch_byte (pstr); @@ -827,8 +904,8 @@ re_string_fetch_byte_case (pstr) } static void -re_string_destruct (pstr) - re_string_t *pstr; +internal_function +re_string_destruct (re_string_t *pstr) { #ifdef RE_ENABLE_I18N re_free (pstr->wcs); @@ -841,9 +918,8 @@ re_string_destruct (pstr) /* Return the context at IDX in INPUT. */ static unsigned int -re_string_context_at (input, idx, eflags) - const re_string_t *input; - int idx, eflags; +internal_function +re_string_context_at (const re_string_t *input, int idx, int eflags) { int c; if (BE (idx < 0, 0)) @@ -887,9 +963,8 @@ re_string_context_at (input, idx, eflags) /* Functions for set operation. */ static reg_errcode_t -re_node_set_alloc (set, size) - re_node_set *set; - int size; +internal_function __attribute_warn_unused_result__ +re_node_set_alloc (re_node_set *set, int size) { set->alloc = size; set->nelem = 0; @@ -900,9 +975,8 @@ re_node_set_alloc (set, size) } static reg_errcode_t -re_node_set_init_1 (set, elem) - re_node_set *set; - int elem; +internal_function __attribute_warn_unused_result__ +re_node_set_init_1 (re_node_set *set, int elem) { set->alloc = 1; set->nelem = 1; @@ -917,9 +991,8 @@ re_node_set_init_1 (set, elem) } static reg_errcode_t -re_node_set_init_2 (set, elem1, elem2) - re_node_set *set; - int elem1, elem2; +internal_function __attribute_warn_unused_result__ +re_node_set_init_2 (re_node_set *set, int elem1, int elem2) { set->alloc = 2; set->elems = re_malloc (int, 2); @@ -948,9 +1021,8 @@ re_node_set_init_2 (set, elem1, elem2) } static reg_errcode_t -re_node_set_init_copy (dest, src) - re_node_set *dest; - const re_node_set *src; +internal_function __attribute_warn_unused_result__ +re_node_set_init_copy (re_node_set *dest, const re_node_set *src) { dest->nelem = src->nelem; if (src->nelem > 0) @@ -974,9 +1046,9 @@ re_node_set_init_copy (dest, src) Note: We assume dest->elems is NULL, when dest->alloc is 0. */ static reg_errcode_t -re_node_set_add_intersect (dest, src1, src2) - re_node_set *dest; - const re_node_set *src1, *src2; +internal_function __attribute_warn_unused_result__ +re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, + const re_node_set *src2) { int i1, i2, is, id, delta, sbase; if (src1->nelem == 0 || src2->nelem == 0) @@ -989,7 +1061,7 @@ re_node_set_add_intersect (dest, src1, src2) int new_alloc = src1->nelem + src2->nelem + dest->alloc; int *new_elems = re_realloc (dest->elems, int, new_alloc); if (BE (new_elems == NULL, 0)) - return REG_ESPACE; + return REG_ESPACE; dest->elems = new_elems; dest->alloc = new_alloc; } @@ -1008,8 +1080,8 @@ re_node_set_add_intersect (dest, src1, src2) while (id >= 0 && dest->elems[id] > src1->elems[i1]) --id; - if (id < 0 || dest->elems[id] != src1->elems[i1]) - dest->elems[--sbase] = src1->elems[i1]; + if (id < 0 || dest->elems[id] != src1->elems[i1]) + dest->elems[--sbase] = src1->elems[i1]; if (--i1 < 0 || --i2 < 0) break; @@ -1039,20 +1111,20 @@ re_node_set_add_intersect (dest, src1, src2) if (delta > 0 && id >= 0) for (;;) { - if (dest->elems[is] > dest->elems[id]) - { - /* Copy from the top. */ - dest->elems[id + delta--] = dest->elems[is--]; - if (delta == 0) - break; - } - else - { - /* Slide from the bottom. */ - dest->elems[id + delta] = dest->elems[id]; - if (--id < 0) - break; - } + if (dest->elems[is] > dest->elems[id]) + { + /* Copy from the top. */ + dest->elems[id + delta--] = dest->elems[is--]; + if (delta == 0) + break; + } + else + { + /* Slide from the bottom. */ + dest->elems[id + delta] = dest->elems[id]; + if (--id < 0) + break; + } } /* Copy remaining SRC elements. */ @@ -1065,9 +1137,9 @@ re_node_set_add_intersect (dest, src1, src2) DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ static reg_errcode_t -re_node_set_init_union (dest, src1, src2) - re_node_set *dest; - const re_node_set *src1, *src2; +internal_function __attribute_warn_unused_result__ +re_node_set_init_union (re_node_set *dest, const re_node_set *src1, + const re_node_set *src2) { int i1, i2, id; if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) @@ -1118,9 +1190,8 @@ re_node_set_init_union (dest, src1, src2) DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ static reg_errcode_t -re_node_set_merge (dest, src) - re_node_set *dest; - const re_node_set *src; +internal_function __attribute_warn_unused_result__ +re_node_set_merge (re_node_set *dest, const re_node_set *src) { int is, id, sbase, delta; if (src == NULL || src->nelem == 0) @@ -1148,11 +1219,11 @@ re_node_set_merge (dest, src) is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; ) { if (dest->elems[id] == src->elems[is]) - is--, id--; + is--, id--; else if (dest->elems[id] < src->elems[is]) - dest->elems[--sbase] = src->elems[is--]; + dest->elems[--sbase] = src->elems[is--]; else /* if (dest->elems[id] > src->elems[is]) */ - --id; + --id; } if (is >= 0) @@ -1174,21 +1245,21 @@ re_node_set_merge (dest, src) for (;;) { if (dest->elems[is] > dest->elems[id]) - { + { /* Copy from the top. */ - dest->elems[id + delta--] = dest->elems[is--]; + dest->elems[id + delta--] = dest->elems[is--]; if (delta == 0) break; } else - { - /* Slide from the bottom. */ - dest->elems[id + delta] = dest->elems[id]; + { + /* Slide from the bottom. */ + dest->elems[id + delta] = dest->elems[id]; if (--id < 0) { /* Copy remaining SRC elements. */ memcpy (dest->elems, dest->elems + sbase, - delta * sizeof (int)); + delta * sizeof (int)); break; } } @@ -1202,9 +1273,8 @@ re_node_set_merge (dest, src) return -1 if an error is occured, return 1 otherwise. */ static int -re_node_set_insert (set, elem) - re_node_set *set; - int elem; +internal_function __attribute_warn_unused_result__ +re_node_set_insert (re_node_set *set, int elem) { int idx; /* In case the set is empty. */ @@ -1227,12 +1297,12 @@ re_node_set_insert (set, elem) /* Realloc if we need. */ if (set->alloc == set->nelem) { - int *new_array; + int *new_elems; set->alloc = set->alloc * 2; - new_array = re_realloc (set->elems, int, set->alloc); - if (BE (new_array == NULL, 0)) + new_elems = re_realloc (set->elems, int, set->alloc); + if (BE (new_elems == NULL, 0)) return -1; - set->elems = new_array; + set->elems = new_elems; } /* Move the elements which follows the new element. Test the @@ -1241,12 +1311,12 @@ re_node_set_insert (set, elem) { idx = 0; for (idx = set->nelem; idx > 0; idx--) - set->elems[idx] = set->elems[idx - 1]; + set->elems[idx] = set->elems[idx - 1]; } else { for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) - set->elems[idx] = set->elems[idx - 1]; + set->elems[idx] = set->elems[idx - 1]; } /* Insert the new element. */ @@ -1260,19 +1330,18 @@ re_node_set_insert (set, elem) Return -1 if an error is occured, return 1 otherwise. */ static int -re_node_set_insert_last (set, elem) - re_node_set *set; - int elem; +internal_function __attribute_warn_unused_result__ +re_node_set_insert_last (re_node_set *set, int elem) { /* Realloc if we need. */ if (set->alloc == set->nelem) { - int *new_array; + int *new_elems; set->alloc = (set->alloc + 1) * 2; - new_array = re_realloc (set->elems, int, set->alloc); - if (BE (new_array == NULL, 0)) + new_elems = re_realloc (set->elems, int, set->alloc); + if (BE (new_elems == NULL, 0)) return -1; - set->elems = new_array; + set->elems = new_elems; } /* Insert the new element. */ @@ -1284,8 +1353,8 @@ re_node_set_insert_last (set, elem) return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */ static int -re_node_set_compare (set1, set2) - const re_node_set *set1, *set2; +internal_function __attribute ((pure)) +re_node_set_compare (const re_node_set *set1, const re_node_set *set2) { int i; if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) @@ -1299,9 +1368,8 @@ re_node_set_compare (set1, set2) /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ static int -re_node_set_contains (set, elem) - const re_node_set *set; - int elem; +internal_function __attribute ((pure)) +re_node_set_contains (const re_node_set *set, int elem) { unsigned int idx, right, mid; if (set->nelem <= 0) @@ -1322,9 +1390,8 @@ re_node_set_contains (set, elem) } static void -re_node_set_remove_at (set, idx) - re_node_set *set; - int idx; +internal_function +re_node_set_remove_at (re_node_set *set, int idx) { if (idx < 0 || idx >= set->nelem) return; @@ -1338,22 +1405,28 @@ re_node_set_remove_at (set, idx) Or return -1, if an error will be occured. */ static int -re_dfa_add_node (dfa, token) - re_dfa_t *dfa; - re_token_t token; +internal_function +re_dfa_add_node (re_dfa_t *dfa, re_token_t token) { int type = token.type; if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) { - int new_nodes_alloc = dfa->nodes_alloc * 2; + size_t new_nodes_alloc = dfa->nodes_alloc * 2; int *new_nexts, *new_indices; re_node_set *new_edests, *new_eclosures; + re_token_t *new_nodes; - re_token_t *new_array = re_realloc (dfa->nodes, re_token_t, - new_nodes_alloc); - if (BE (new_array == NULL, 0)) + /* Avoid overflows in realloc. */ + const size_t max_object_size = MAX (sizeof (re_token_t), + MAX (sizeof (re_node_set), + sizeof (int))); + if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0)) + return -1; + + new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); + if (BE (new_nodes == NULL, 0)) return -1; - dfa->nodes = new_array; + dfa->nodes = new_nodes; new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc); new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc); new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); @@ -1379,10 +1452,9 @@ re_dfa_add_node (dfa, token) return dfa->nodes_len++; } -static unsigned int inline -calc_state_hash (nodes, context) - const re_node_set *nodes; - unsigned int context; +static inline unsigned int +internal_function +calc_state_hash (const re_node_set *nodes, unsigned int context) { unsigned int hash = nodes->nelem + context; int i; @@ -1400,11 +1472,10 @@ calc_state_hash (nodes, context) - We never return non-NULL value in case of any errors, it is for optimization. */ -static re_dfastate_t* -re_acquire_state (err, dfa, nodes) - reg_errcode_t *err; - re_dfa_t *dfa; - const re_node_set *nodes; +static re_dfastate_t * +internal_function __attribute_warn_unused_result__ +re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, + const re_node_set *nodes) { unsigned int hash; re_dfastate_t *new_state; @@ -1429,13 +1500,10 @@ re_acquire_state (err, dfa, nodes) /* There are no appropriate state in the dfa, create the new one. */ new_state = create_ci_newstate (dfa, nodes, hash); - if (BE (new_state != NULL, 1)) - return new_state; - else - { - *err = REG_ESPACE; - return NULL; - } + if (BE (new_state == NULL, 0)) + *err = REG_ESPACE; + + return new_state; } /* Search for the state whose node_set is equivalent to NODES and @@ -1448,12 +1516,10 @@ re_acquire_state (err, dfa, nodes) - We never return non-NULL value in case of any errors, it is for optimization. */ -static re_dfastate_t* -re_acquire_state_context (err, dfa, nodes, context) - reg_errcode_t *err; - re_dfa_t *dfa; - const re_node_set *nodes; - unsigned int context; +static re_dfastate_t * +internal_function __attribute_warn_unused_result__ +re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, + const re_node_set *nodes, unsigned int context) { unsigned int hash; re_dfastate_t *new_state; @@ -1477,13 +1543,10 @@ re_acquire_state_context (err, dfa, nodes, context) } /* There are no appropriate state in `dfa', create the new one. */ new_state = create_cd_newstate (dfa, nodes, context, hash); - if (BE (new_state != NULL, 1)) - return new_state; - else - { - *err = REG_ESPACE; - return NULL; - } + if (BE (new_state == NULL, 0)) + *err = REG_ESPACE; + + return new_state; } /* Finish initialization of the new state NEWSTATE, and using its hash value @@ -1491,10 +1554,9 @@ re_acquire_state_context (err, dfa, nodes, context) indicates the error code if failed. */ static reg_errcode_t -register_state (dfa, newstate, hash) - re_dfa_t *dfa; - re_dfastate_t *newstate; - unsigned int hash; +__attribute_warn_unused_result__ +register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, + unsigned int hash) { struct re_state_table_entry *spot; reg_errcode_t err; @@ -1508,7 +1570,8 @@ register_state (dfa, newstate, hash) { int elem = newstate->nodes.elems[i]; if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) - re_node_set_insert_last (&newstate->non_eps_nodes, elem); + if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0) + return REG_ESPACE; } spot = dfa->state_table + (hash & dfa->state_hash_mask); @@ -1526,14 +1589,29 @@ register_state (dfa, newstate, hash) return REG_NOERROR; } +static void +free_state (re_dfastate_t *state) +{ + re_node_set_free (&state->non_eps_nodes); + re_node_set_free (&state->inveclosure); + if (state->entrance_nodes != &state->nodes) + { + re_node_set_free (state->entrance_nodes); + re_free (state->entrance_nodes); + } + re_node_set_free (&state->nodes); + re_free (state->word_trtable); + re_free (state->trtable); + re_free (state); +} + /* Create the new state which is independ of contexts. Return the new state if succeeded, otherwise return NULL. */ static re_dfastate_t * -create_ci_newstate (dfa, nodes, hash) - re_dfa_t *dfa; - const re_node_set *nodes; - unsigned int hash; +internal_function __attribute_warn_unused_result__ +create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, + unsigned int hash) { int i; reg_errcode_t err; @@ -1581,10 +1659,9 @@ create_ci_newstate (dfa, nodes, hash) Return the new state if succeeded, otherwise return NULL. */ static re_dfastate_t * -create_cd_newstate (dfa, nodes, context, hash) - re_dfa_t *dfa; - const re_node_set *nodes; - unsigned int context, hash; +internal_function __attribute_warn_unused_result__ +create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, + unsigned int context, unsigned int hash) { int i, nctx_nodes = 0; reg_errcode_t err; @@ -1605,11 +1682,9 @@ create_cd_newstate (dfa, nodes, context, hash) for (i = 0 ; i < nodes->nelem ; i++) { - unsigned int constraint = 0; re_token_t *node = dfa->nodes + nodes->elems[i]; re_token_type_t type = node->type; - if (node->constraint) - constraint = node->constraint; + unsigned int constraint = node->constraint; if (type == CHARACTER && !constraint) continue; @@ -1622,8 +1697,6 @@ create_cd_newstate (dfa, nodes, context, hash) newstate->halt = 1; else if (type == OP_BACK_REF) newstate->has_backref = 1; - else if (type == ANCHOR) - constraint = node->opr.ctx_type; if (constraint) { @@ -1635,7 +1708,9 @@ create_cd_newstate (dfa, nodes, context, hash) free_state (newstate); return NULL; } - re_node_set_init_copy (newstate->entrance_nodes, nodes); + if (re_node_set_init_copy (newstate->entrance_nodes, nodes) + != REG_NOERROR) + return NULL; nctx_nodes = 0; newstate->has_constraint = 1; } @@ -1655,20 +1730,3 @@ create_cd_newstate (dfa, nodes, context, hash) } return newstate; } - -static void -free_state (state) - re_dfastate_t *state; -{ - re_node_set_free (&state->non_eps_nodes); - re_node_set_free (&state->inveclosure); - if (state->entrance_nodes != &state->nodes) - { - re_node_set_free (state->entrance_nodes); - re_free (state->entrance_nodes); - } - re_node_set_free (&state->nodes); - re_free (state->word_trtable); - re_free (state->trtable); - re_free (state); -} diff --git a/regex_internal.h b/regex_internal.h index 58fa749e900..6dfdef66394 100644 --- a/regex_internal.h +++ b/regex_internal.h @@ -1,5 +1,5 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2002-2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + . */ #ifndef _REGEX_INTERNAL_H #define _REGEX_INTERNAL_H 1 @@ -39,6 +38,20 @@ #if defined HAVE_WCTYPE_H || defined _LIBC # include #endif /* HAVE_WCTYPE_H || _LIBC */ +#if defined HAVE_STDBOOL_H || defined _LIBC +# include +#endif /* HAVE_STDBOOL_H || _LIBC */ +#if defined HAVE_STDINT_H || defined _LIBC +# include +#endif /* HAVE_STDINT_H || _LIBC */ +#if defined _LIBC +# include +#else +# define __libc_lock_define(CLASS,NAME) +# define __libc_lock_init(NAME) do { } while (0) +# define __libc_lock_lock(NAME) do { } while (0) +# define __libc_lock_unlock(NAME) do { } while (0) +#endif /* In case that the system doesn't have isblank(). */ #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank @@ -60,7 +73,7 @@ # ifdef _LIBC # undef gettext # define gettext(msgid) \ - INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES) + __dcgettext (_libc_intl_domainname, msgid, LC_MESSAGES) # endif #else # define gettext(msgid) (msgid) @@ -72,6 +85,11 @@ # define gettext_noop(String) String #endif +/* For loser systems without the definition. */ +#ifndef SIZE_MAX +# define SIZE_MAX ((size_t) -1) +#endif + #if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC # define RE_ENABLE_I18N #endif @@ -83,8 +101,6 @@ # define inline #endif -/* Number of bits in a byte. */ -#define BYTE_BITS 8 /* Number of single byte character. */ #define SBC_MAX 256 @@ -99,6 +115,7 @@ # define __wctype wctype # define __iswctype iswctype # define __btowc btowc +# define __mbrtowc mbrtowc # define __mempcpy mempcpy # define __wcrtomb wcrtomb # define __regfree regfree @@ -114,26 +131,28 @@ extern const char __re_error_msgid[] attribute_hidden; extern const size_t __re_error_msgid_idx[] attribute_hidden; -/* Number of bits in an unsinged int. */ -#define UINT_BITS (sizeof (unsigned int) * BYTE_BITS) -/* Number of unsigned int in an bit_set. */ -#define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS) -typedef unsigned int bitset[BITSET_UINTS]; -typedef unsigned int *re_bitset_ptr_t; -typedef const unsigned int *re_const_bitset_ptr_t; - -#define bitset_set(set,i) (set[i / UINT_BITS] |= 1 << i % UINT_BITS) -#define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1 << i % UINT_BITS)) -#define bitset_contain(set,i) (set[i / UINT_BITS] & (1 << i % UINT_BITS)) -#define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS) -#define bitset_set_all(set) \ - memset (set, 255, sizeof (unsigned int) * BITSET_UINTS) -#define bitset_copy(dest,src) \ - memcpy (dest, src, sizeof (unsigned int) * BITSET_UINTS) -static inline void bitset_not (bitset set); -static inline void bitset_merge (bitset dest, const bitset src); -static inline void bitset_not_merge (bitset dest, const bitset src); -static inline void bitset_mask (bitset dest, const bitset src); +/* An integer used to represent a set of bits. It must be unsigned, + and must be at least as wide as unsigned int. */ +typedef unsigned long int bitset_word_t; +/* All bits set in a bitset_word_t. */ +#define BITSET_WORD_MAX ULONG_MAX +/* Number of bits in a bitset_word_t. */ +#define BITSET_WORD_BITS (sizeof (bitset_word_t) * CHAR_BIT) +/* Number of bitset_word_t in a bit_set. */ +#define BITSET_WORDS (SBC_MAX / BITSET_WORD_BITS) +typedef bitset_word_t bitset_t[BITSET_WORDS]; +typedef bitset_word_t *re_bitset_ptr_t; +typedef const bitset_word_t *re_const_bitset_ptr_t; + +#define bitset_set(set,i) \ + (set[i / BITSET_WORD_BITS] |= (bitset_word_t) 1 << i % BITSET_WORD_BITS) +#define bitset_clear(set,i) \ + (set[i / BITSET_WORD_BITS] &= ~((bitset_word_t) 1 << i % BITSET_WORD_BITS)) +#define bitset_contain(set,i) \ + (set[i / BITSET_WORD_BITS] & ((bitset_word_t) 1 << i % BITSET_WORD_BITS)) +#define bitset_empty(set) memset (set, '\0', sizeof (bitset_t)) +#define bitset_set_all(set) memset (set, '\xff', sizeof (bitset_t)) +#define bitset_copy(dest,src) memcpy (dest, src, sizeof (bitset_t)) #define PREV_WORD_CONSTRAINT 0x0001 #define PREV_NOTWORD_CONSTRAINT 0x0002 @@ -339,7 +358,7 @@ struct re_string_t the beginning of the input string. */ unsigned int tip_context; /* The translation passed as a part of an argument of re_compile_pattern. */ - unsigned RE_TRANSLATE_TYPE trans; + RE_TRANSLATE_TYPE trans; /* Copy of re_dfa_t's word_char. */ re_const_bitset_ptr_t word_char; /* 1 if REG_ICASE. */ @@ -366,44 +385,20 @@ typedef struct re_dfa_t re_dfa_t; # endif #endif -#ifndef RE_NO_INTERNAL_PROTOTYPES -static reg_errcode_t re_string_allocate (re_string_t *pstr, const char *str, - int len, int init_len, - RE_TRANSLATE_TYPE trans, int icase, - const re_dfa_t *dfa) - internal_function; -static reg_errcode_t re_string_construct (re_string_t *pstr, const char *str, - int len, RE_TRANSLATE_TYPE trans, - int icase, const re_dfa_t *dfa) - internal_function; -static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx, - int eflags) internal_function; +#ifndef NOT_IN_libc static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr, int new_buf_len) internal_function; # ifdef RE_ENABLE_I18N static void build_wcs_buffer (re_string_t *pstr) internal_function; -static int build_wcs_upper_buffer (re_string_t *pstr) internal_function; +static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr) + internal_function; # endif /* RE_ENABLE_I18N */ static void build_upper_buffer (re_string_t *pstr) internal_function; static void re_string_translate_buffer (re_string_t *pstr) internal_function; -static void re_string_destruct (re_string_t *pstr) internal_function; -# ifdef RE_ENABLE_I18N -static int re_string_elem_size_at (const re_string_t *pstr, int idx) - internal_function __attribute ((pure)); -static inline int re_string_char_size_at (const re_string_t *pstr, int idx) - internal_function __attribute ((pure)); -static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx) - internal_function __attribute ((pure)); -# endif /* RE_ENABLE_I18N */ static unsigned int re_string_context_at (const re_string_t *input, int idx, int eflags) internal_function __attribute ((pure)); -static unsigned char re_string_peek_byte_case (const re_string_t *pstr, - int idx) - internal_function __attribute ((pure)); -static unsigned char re_string_fetch_byte_case (re_string_t *pstr) - internal_function __attribute ((pure)); #endif #define re_string_peek_byte(pstr, offset) \ ((pstr)->mbs[(pstr)->cur_idx + offset]) @@ -422,6 +417,21 @@ static unsigned char re_string_fetch_byte_case (re_string_t *pstr) #define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx)) #define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx)) +#include + +#ifndef _LIBC +# if HAVE_ALLOCA +/* The OS usually guarantees only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + allocate anything larger than 4096 bytes. Also care for the possibility + of a few compiler-allocated temporary stack slots. */ +# define __libc_use_alloca(n) ((n) < 4032) +# else +/* alloca is implemented with malloc, so just use malloc. */ +# define __libc_use_alloca(n) 0 +# endif +#endif + #define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t))) #define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t))) #define re_free(p) free (p) @@ -533,7 +543,6 @@ typedef struct { int str_idx; int node; - int next_last_offset; state_array_t *path; int alasts; /* Allocation size of LASTS. */ int nlasts; /* The number of LASTS. */ @@ -556,9 +565,9 @@ typedef struct /* The string object corresponding to the input string. */ re_string_t input; #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) - re_dfa_t *const dfa; + const re_dfa_t *const dfa; #else - re_dfa_t *dfa; + const re_dfa_t *dfa; #endif /* EFLAGS of the argument of regexec. */ int eflags; @@ -605,8 +614,8 @@ struct re_fail_stack_t struct re_dfa_t { re_token_t *nodes; - int nodes_alloc; - int nodes_len; + size_t nodes_alloc; + size_t nodes_len; int *nexts; int *org_indices; re_node_set *edests; @@ -624,13 +633,12 @@ struct re_dfa_t /* number of subexpressions `re_nsub' is in regex_t. */ unsigned int state_hash_mask; - int states_alloc; int init_node; int nbackref; /* The number of backreference in this dfa. */ /* Bitmap expressing which backreference is used. */ - unsigned int used_bkref_map; - unsigned int completed_bkref_map; + bitset_word_t used_bkref_map; + bitset_word_t completed_bkref_map; unsigned int has_plural_match : 1; /* If this dfa has "multibyte node", which is a backreference or @@ -641,52 +649,20 @@ struct re_dfa_t unsigned int map_notascii : 1; unsigned int word_ops_used : 1; int mb_cur_max; - bitset word_char; + bitset_t word_char; reg_syntax_t syntax; int *subexp_map; #ifdef DEBUG char* re_str; #endif + __libc_lock_define (, lock) }; -#ifndef RE_NO_INTERNAL_PROTOTYPES -static reg_errcode_t re_node_set_alloc (re_node_set *set, int size) internal_function; -static reg_errcode_t re_node_set_init_1 (re_node_set *set, int elem) internal_function; -static reg_errcode_t re_node_set_init_2 (re_node_set *set, int elem1, - int elem2) internal_function; #define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set)) -static reg_errcode_t re_node_set_init_copy (re_node_set *dest, - const re_node_set *src) internal_function; -static reg_errcode_t re_node_set_add_intersect (re_node_set *dest, - const re_node_set *src1, - const re_node_set *src2) internal_function; -static reg_errcode_t re_node_set_init_union (re_node_set *dest, - const re_node_set *src1, - const re_node_set *src2) internal_function; -static reg_errcode_t re_node_set_merge (re_node_set *dest, - const re_node_set *src) internal_function; -static int re_node_set_insert (re_node_set *set, int elem) internal_function; -static int re_node_set_insert_last (re_node_set *set, - int elem) internal_function; -static int re_node_set_compare (const re_node_set *set1, - const re_node_set *set2) - internal_function __attribute ((pure)); -static int re_node_set_contains (const re_node_set *set, int elem) - internal_function __attribute ((pure)); -static void re_node_set_remove_at (re_node_set *set, int idx) internal_function; #define re_node_set_remove(set,id) \ (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1)) #define re_node_set_empty(p) ((p)->nelem = 0) #define re_node_set_free(set) re_free ((set)->elems) -static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token) internal_function; -static re_dfastate_t *re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, - const re_node_set *nodes) internal_function; -static re_dfastate_t *re_acquire_state_context (reg_errcode_t *err, - re_dfa_t *dfa, - const re_node_set *nodes, - unsigned int context) internal_function; -static void free_state (re_dfastate_t *state) internal_function; -#endif typedef enum @@ -712,41 +688,33 @@ typedef struct /* Inline functions for bitset operation. */ static inline void -bitset_not (bitset set) +bitset_not (bitset_t set) { int bitset_i; - for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) + for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i) set[bitset_i] = ~set[bitset_i]; } static inline void -bitset_merge (bitset dest, const bitset src) +bitset_merge (bitset_t dest, const bitset_t src) { int bitset_i; - for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) + for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i) dest[bitset_i] |= src[bitset_i]; } static inline void -bitset_not_merge (bitset dest, const bitset src) -{ - int i; - for (i = 0; i < BITSET_UINTS; ++i) - dest[i] |= ~src[i]; -} - -static inline void -bitset_mask (bitset dest, const bitset src) +bitset_mask (bitset_t dest, const bitset_t src) { int bitset_i; - for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) + for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i) dest[bitset_i] &= src[bitset_i]; } -#if defined RE_ENABLE_I18N && !defined RE_NO_INTERNAL_PROTOTYPES +#ifdef RE_ENABLE_I18N /* Inline functions for re_string. */ static inline int -internal_function +internal_function __attribute ((pure)) re_string_char_size_at (const re_string_t *pstr, int idx) { int byte_idx; @@ -759,7 +727,7 @@ re_string_char_size_at (const re_string_t *pstr, int idx) } static inline wint_t -internal_function +internal_function __attribute ((pure)) re_string_wchar_at (const re_string_t *pstr, int idx) { if (pstr->mb_cur_max == 1) @@ -767,15 +735,15 @@ re_string_wchar_at (const re_string_t *pstr, int idx) return (wint_t) pstr->wcs[idx]; } +# ifndef NOT_IN_libc static int -internal_function +internal_function __attribute ((pure)) re_string_elem_size_at (const re_string_t *pstr, int idx) { -#ifdef _LIBC +# ifdef _LIBC const unsigned char *p, *extra; const int32_t *table, *indirect; - int32_t tmp; -# include +# include uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules != 0) @@ -786,13 +754,14 @@ re_string_elem_size_at (const re_string_t *pstr, int idx) indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); p = pstr->mbs + idx; - tmp = findidx (&p); + findidx (&p, pstr->len - idx); return p - pstr->mbs - idx; } else -#endif /* _LIBC */ +# endif /* _LIBC */ return 1; } +# endif #endif /* RE_ENABLE_I18N */ #endif /* _REGEX_INTERNAL_H */ diff --git a/regexec.c b/regexec.c index 3c226e3c20c..ec4ae1316ff 100644 --- a/regexec.c +++ b/regexec.c @@ -1,5 +1,5 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2002-2005,2007,2009,2010,2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + . */ static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, int n) internal_function; @@ -25,7 +24,7 @@ static void match_ctx_free (re_match_context_t *cache) internal_function; static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, int str_idx, int from, int to) internal_function; -static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx) +static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) internal_function; static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx) internal_function; @@ -52,81 +51,76 @@ static int re_search_stub (struct re_pattern_buffer *bufp, int ret_len) internal_function; static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs, int regs_allocated) internal_function; -static inline re_dfastate_t *acquire_init_state_context - (reg_errcode_t *err, const re_match_context_t *mctx, int idx) - __attribute ((always_inline)) internal_function; static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) internal_function; static int check_matching (re_match_context_t *mctx, int fl_longest_match, - int *p_match_first) - internal_function; -static int check_halt_node_context (const re_dfa_t *dfa, int node, - unsigned int context) internal_function; + int *p_match_first) internal_function; static int check_halt_state_context (const re_match_context_t *mctx, const re_dfastate_t *state, int idx) internal_function; -static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, +static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch) internal_function; -static int proceed_next_node (const re_match_context_t *mctx, - int nregs, regmatch_t *regs, - int *pidx, int node, re_node_set *eps_via_nodes, - struct re_fail_stack_t *fs) internal_function; static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node, int nregs, regmatch_t *regs, - re_node_set *eps_via_nodes) internal_function; -static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs, - regmatch_t *regs, re_node_set *eps_via_nodes) internal_function; + re_node_set *eps_via_nodes) + internal_function; static reg_errcode_t set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, regmatch_t *pmatch, int fl_backtrack) internal_function; -static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function; +static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) + internal_function; #ifdef RE_ENABLE_I18N static int sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, - int node_idx, int str_idx, int max_str_idx) internal_function; + int node_idx, int str_idx, int max_str_idx) + internal_function; #endif /* RE_ENABLE_I18N */ -static reg_errcode_t sift_states_backward (re_match_context_t *mctx, - re_sift_context_t *sctx) internal_function; -static reg_errcode_t build_sifted_states (re_match_context_t *mctx, +static reg_errcode_t sift_states_backward (const re_match_context_t *mctx, + re_sift_context_t *sctx) + internal_function; +static reg_errcode_t build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx, int str_idx, - re_node_set *cur_dest) internal_function; -static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx, + re_node_set *cur_dest) + internal_function; +static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx, re_sift_context_t *sctx, int str_idx, - re_node_set *dest_nodes) internal_function; -static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa, - re_node_set *dest_nodes, - const re_node_set *candidates) internal_function; -static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node, + re_node_set *dest_nodes) + internal_function; +static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes, - const re_node_set *and_nodes) internal_function; -static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits, + const re_node_set *candidates) + internal_function; +static int check_dst_limits (const re_match_context_t *mctx, + re_node_set *limits, int dst_node, int dst_idx, int src_node, int src_idx) internal_function; -static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx, +static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, int subexp_idx, - int from_node, int bkref_idx) internal_function; -static int check_dst_limits_calc_pos (re_match_context_t *mctx, + int from_node, int bkref_idx) + internal_function; +static int check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit, int subexp_idx, int node, int str_idx, int bkref_idx) internal_function; -static reg_errcode_t check_subexp_limits (re_dfa_t *dfa, +static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes, const re_node_set *candidates, re_node_set *limits, struct re_backref_cache_entry *bkref_ents, int str_idx) internal_function; -static reg_errcode_t sift_states_bkref (re_match_context_t *mctx, +static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx, - int str_idx, const re_node_set *candidates) internal_function; -static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx, - int next_state_log_idx) internal_function; -static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, - re_dfastate_t **src, int num) internal_function; + int str_idx, const re_node_set *candidates) + internal_function; +static reg_errcode_t merge_state_array (const re_dfa_t *dfa, + re_dfastate_t **dst, + re_dfastate_t **src, int num) + internal_function; static re_dfastate_t *find_recover_state (reg_errcode_t *err, re_match_context_t *mctx) internal_function; static re_dfastate_t *transit_state (reg_errcode_t *err, @@ -134,27 +128,33 @@ static re_dfastate_t *transit_state (reg_errcode_t *err, re_dfastate_t *state) internal_function; static re_dfastate_t *merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx, - re_dfastate_t *next_state) internal_function; + re_dfastate_t *next_state) + internal_function; static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes, int str_idx) internal_function; #if 0 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, - re_dfastate_t *pstate) internal_function; + re_dfastate_t *pstate) + internal_function; #endif #ifdef RE_ENABLE_I18N static reg_errcode_t transit_state_mb (re_match_context_t *mctx, - re_dfastate_t *pstate) internal_function; + re_dfastate_t *pstate) + internal_function; #endif /* RE_ENABLE_I18N */ static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, - const re_node_set *nodes) internal_function; + const re_node_set *nodes) + internal_function; static reg_errcode_t get_subexp (re_match_context_t *mctx, - int bkref_node, int bkref_str_idx) internal_function; + int bkref_node, int bkref_str_idx) + internal_function; static reg_errcode_t get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top, re_sub_match_last_t *sub_last, - int bkref_node, int bkref_str) internal_function; + int bkref_node, int bkref_str) + internal_function; static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, int subexp_idx, int type) internal_function; static reg_errcode_t check_arrival (re_match_context_t *mctx, @@ -164,34 +164,41 @@ static reg_errcode_t check_arrival (re_match_context_t *mctx, static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx, re_node_set *cur_nodes, - re_node_set *next_nodes) internal_function; -static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa, + re_node_set *next_nodes) + internal_function; +static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes, - int ex_subexp, int type) internal_function; -static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa, + int ex_subexp, int type) + internal_function; +static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes, int target, int ex_subexp, int type) internal_function; static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, int cur_str, - int subexp_num, int type) internal_function; -static int build_trtable (re_dfa_t *dfa, + int subexp_num, int type) + internal_function; +static int build_trtable (const re_dfa_t *dfa, re_dfastate_t *state) internal_function; #ifdef RE_ENABLE_I18N -static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx, - const re_string_t *input, int idx) internal_function; +static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, + const re_string_t *input, int idx) + internal_function; # ifdef _LIBC static unsigned int find_collation_sequence_value (const unsigned char *mbs, - size_t name_len) internal_function; + size_t name_len) + internal_function; # endif /* _LIBC */ #endif /* RE_ENABLE_I18N */ -static int group_nodes_into_DFAstates (re_dfa_t *dfa, +static int group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, re_node_set *states_node, - bitset *states_ch) internal_function; + bitset_t *states_ch) internal_function; static int check_node_accept (const re_match_context_t *mctx, - const re_token_t *node, int idx) internal_function; -static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function; + const re_token_t *node, int idx) + internal_function; +static reg_errcode_t extend_buffers (re_match_context_t *mctx) + internal_function; /* Entry point for POSIX code. */ @@ -219,6 +226,7 @@ regexec (preg, string, nmatch, pmatch, eflags) { reg_errcode_t err; int start, length; + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) return REG_BADPAT; @@ -233,12 +241,15 @@ regexec (preg, string, nmatch, pmatch, eflags) start = 0; length = strlen (string); } + + __libc_lock_lock (dfa->lock); if (preg->no_sub) err = re_search_internal (preg, string, length, start, length - start, length, 0, NULL, eflags); else err = re_search_internal (preg, string, length, start, length - start, length, nmatch, pmatch, eflags); + __libc_lock_unlock (dfa->lock); return err != REG_NOERROR; } @@ -356,33 +367,34 @@ re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, const char *str; int rval; int len = length1 + length2; - int free_str = 0; + char *s = NULL; - if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) + if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0)) return -2; /* Concatenate the strings. */ if (length2 > 0) if (length1 > 0) { - char *s = re_malloc (char, len); + s = re_malloc (char, len); if (BE (s == NULL, 0)) return -2; +#ifdef _LIBC + memcpy (__mempcpy (s, string1, length1), string2, length2); +#else memcpy (s, string1, length1); memcpy (s + length1, string2, length2); +#endif str = s; - free_str = 1; } else str = string2; else str = string1; - rval = re_search_stub (bufp, str, len, start, range, stop, regs, - ret_len); - if (free_str) - re_free ((char *) str); + rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len); + re_free (s); return rval; } @@ -402,6 +414,7 @@ re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) regmatch_t *pmatch; int nregs, rval; int eflags = 0; + re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; /* Check for out-of-range. */ if (BE (start < 0 || start > length, 0)) @@ -411,6 +424,8 @@ re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) else if (BE (start + range < 0, 0)) range = -start; + __libc_lock_lock (dfa->lock); + eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; @@ -439,7 +454,10 @@ re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) nregs = bufp->re_nsub + 1; pmatch = re_malloc (regmatch_t, nregs); if (BE (pmatch == NULL, 0)) - return -2; + { + rval = -2; + goto out; + } result = re_search_internal (bufp, string, length, start, range, stop, nregs, pmatch, eflags); @@ -469,6 +487,8 @@ re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) rval = pmatch[0].rm_so; } re_free (pmatch); + out: + __libc_lock_unlock (dfa->lock); return rval; } @@ -488,9 +508,14 @@ re_copy_regs (regs, pmatch, nregs, regs_allocated) if (regs_allocated == REGS_UNALLOCATED) { /* No. So allocate them with malloc. */ regs->start = re_malloc (regoff_t, need_regs); - regs->end = re_malloc (regoff_t, need_regs); - if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0)) + if (BE (regs->start == NULL, 0)) return REGS_UNALLOCATED; + regs->end = re_malloc (regoff_t, need_regs); + if (BE (regs->end == NULL, 0)) + { + re_free (regs->start); + return REGS_UNALLOCATED; + } regs->num_regs = need_regs; } else if (regs_allocated == REGS_REALLOCATE) @@ -500,9 +525,15 @@ re_copy_regs (regs, pmatch, nregs, regs_allocated) if (BE (need_regs > regs->num_regs, 0)) { regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); - regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs); - if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0)) + regoff_t *new_end; + if (BE (new_start == NULL, 0)) return REGS_UNALLOCATED; + new_end = re_realloc (regs->end, regoff_t, need_regs); + if (BE (new_end == NULL, 0)) + { + re_free (new_start); + return REGS_UNALLOCATED; + } regs->start = new_start; regs->end = new_end; regs->num_regs = need_regs; @@ -593,6 +624,7 @@ re_exec (s) (START + RANGE >= 0 && START + RANGE <= LENGTH) */ static reg_errcode_t +__attribute_warn_unused_result__ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, eflags) const regex_t *preg; @@ -602,7 +634,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, regmatch_t pmatch[]; { reg_errcode_t err; - re_dfa_t *dfa = (re_dfa_t *)preg->buffer; + const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; int left_lim, right_lim, incr; int fl_longest_match, match_first, match_kind, match_last = -1; int extra_nmatch; @@ -614,7 +646,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, #endif char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate && range && !preg->can_be_null) ? preg->fastmap : NULL; - unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate; + RE_TRANSLATE_TYPE t = preg->translate; #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) memset (&mctx, '\0', sizeof (re_match_context_t)); @@ -644,7 +676,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, || !preg->newline_anchor)) { if (start != 0 && start + range != 0) - return REG_NOMATCH; + return REG_NOMATCH; start = range = 0; } @@ -669,6 +701,13 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, multi character collating element. */ if (nmatch > 1 || dfa->has_mb_node) { + /* Avoid overflow. */ + if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0)) + { + err = REG_ESPACE; + goto free_return; + } + mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); if (BE (mctx.state_log == NULL, 0)) { @@ -776,10 +815,10 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, break; match_first += incr; if (match_first < left_lim || match_first > right_lim) - { - err = REG_NOMATCH; - goto free_return; - } + { + err = REG_NOMATCH; + goto free_return; + } } break; } @@ -871,14 +910,14 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, #ifdef RE_ENABLE_I18N if (BE (mctx.input.offsets_needed != 0, 0)) { - if (pmatch[reg_idx].rm_so == mctx.input.valid_len) - pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len; - else - pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so]; - if (pmatch[reg_idx].rm_eo == mctx.input.valid_len) - pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len; - else - pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo]; + pmatch[reg_idx].rm_so = + (pmatch[reg_idx].rm_so == mctx.input.valid_len + ? mctx.input.valid_raw_len + : mctx.input.offsets[pmatch[reg_idx].rm_so]); + pmatch[reg_idx].rm_eo = + (pmatch[reg_idx].rm_eo == mctx.input.valid_len + ? mctx.input.valid_raw_len + : mctx.input.offsets[pmatch[reg_idx].rm_eo]); } #else assert (mctx.input.offsets_needed == 0); @@ -893,14 +932,14 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, } if (dfa->subexp_map) - for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) - if (dfa->subexp_map[reg_idx] != reg_idx) - { - pmatch[reg_idx + 1].rm_so - = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; - pmatch[reg_idx + 1].rm_eo - = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; - } + for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) + if (dfa->subexp_map[reg_idx] != reg_idx) + { + pmatch[reg_idx + 1].rm_so + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; + pmatch[reg_idx + 1].rm_eo + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; + } } free_return: @@ -912,10 +951,11 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, } static reg_errcode_t +__attribute_warn_unused_result__ prune_impossible_nodes (mctx) re_match_context_t *mctx; { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; int halt_node, match_last; reg_errcode_t ret; re_dfastate_t **sifted_states; @@ -926,6 +966,11 @@ prune_impossible_nodes (mctx) #endif match_last = mctx->match_last; halt_node = mctx->last_node; + + /* Avoid overflow. */ + if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0)) + return REG_ESPACE; + sifted_states = re_malloc (re_dfastate_t *, match_last + 1); if (BE (sifted_states == NULL, 0)) { @@ -980,6 +1025,11 @@ prune_impossible_nodes (mctx) re_node_set_free (&sctx.limits); if (BE (ret != REG_NOERROR, 0)) goto free_return; + if (sifted_states[0] == NULL) + { + ret = REG_NOMATCH; + goto free_return; + } } re_free (mctx->state_log); mctx->state_log = sifted_states; @@ -998,12 +1048,11 @@ prune_impossible_nodes (mctx) since initial states may have constraints like "\<", "^", etc.. */ static inline re_dfastate_t * -acquire_init_state_context (err, mctx, idx) - reg_errcode_t *err; - const re_match_context_t *mctx; - int idx; +__attribute ((always_inline)) internal_function +acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx, + int idx) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; if (dfa->init_state->has_constraint) { unsigned int context; @@ -1041,12 +1090,11 @@ acquire_init_state_context (err, mctx, idx) index of the buffer. */ static int -check_matching (mctx, fl_longest_match, p_match_first) - re_match_context_t *mctx; - int fl_longest_match; - int *p_match_first; +internal_function __attribute_warn_unused_result__ +check_matching (re_match_context_t *mctx, int fl_longest_match, + int *p_match_first) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; reg_errcode_t err; int match = 0; int match_last = -1; @@ -1081,7 +1129,7 @@ check_matching (mctx, fl_longest_match, p_match_first) { err = transit_state_bkref (mctx, &cur_state->nodes); if (BE (err != REG_NOERROR, 0)) - return err; + return err; } } } @@ -1107,17 +1155,18 @@ check_matching (mctx, fl_longest_match, p_match_first) re_dfastate_t *old_state = cur_state; int next_char_idx = re_string_cur_idx (&mctx->input) + 1; - if (BE (next_char_idx >= mctx->input.bufs_len, 0) - || (BE (next_char_idx >= mctx->input.valid_len, 0) - && mctx->input.valid_len < mctx->input.len)) - { - err = extend_buffers (mctx); - if (BE (err != REG_NOERROR, 0)) + if ((BE (next_char_idx >= mctx->input.bufs_len, 0) + && mctx->input.bufs_len < mctx->input.len) + || (BE (next_char_idx >= mctx->input.valid_len, 0) + && mctx->input.valid_len < mctx->input.len)) + { + err = extend_buffers (mctx); + if (BE (err != REG_NOERROR, 0)) { assert (err == REG_ESPACE); return -2; } - } + } cur_state = transit_state (&err, mctx, cur_state); if (mctx->state_log != NULL) @@ -1173,10 +1222,9 @@ check_matching (mctx, fl_longest_match, p_match_first) /* Check NODE match the current context. */ -static int check_halt_node_context (dfa, node, context) - const re_dfa_t *dfa; - int node; - unsigned int context; +static int +internal_function +check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context) { re_token_type_t type = dfa->nodes[node].type; unsigned int constraint = dfa->nodes[node].constraint; @@ -1194,10 +1242,9 @@ static int check_halt_node_context (dfa, node, context) match the context, return the node. */ static int -check_halt_state_context (mctx, state, idx) - const re_match_context_t *mctx; - const re_dfastate_t *state; - int idx; +internal_function +check_halt_state_context (const re_match_context_t *mctx, + const re_dfastate_t *state, int idx) { int i; unsigned int context; @@ -1217,16 +1264,13 @@ check_halt_state_context (mctx, state, idx) of errors. */ static int -proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs) - const re_match_context_t *mctx; - regmatch_t *regs; - int nregs, *pidx, node; - re_node_set *eps_via_nodes; - struct re_fail_stack_t *fs; +internal_function +proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs, + int *pidx, int node, re_node_set *eps_via_nodes, + struct re_fail_stack_t *fs) { - re_dfa_t *const dfa = mctx->dfa; - int i, err, dest_node; - dest_node = -1; + const re_dfa_t *const dfa = mctx->dfa; + int i, err; if (IS_EPSILON_NODE (dfa->nodes[node].type)) { re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; @@ -1241,20 +1285,20 @@ proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs) int candidate = edests->elems[i]; if (!re_node_set_contains (cur_nodes, candidate)) continue; - if (dest_node == -1) + if (dest_node == -1) dest_node = candidate; - else + else { /* In order to avoid infinite loop like "(a*)*", return the second - epsilon-transition if the first was already considered. */ + epsilon-transition if the first was already considered. */ if (re_node_set_contains (eps_via_nodes, dest_node)) - return candidate; + return candidate; /* Otherwise, push the second epsilon-transition on the fail stack. */ else if (fs != NULL && push_fail_stack (fs, *pidx, candidate, nregs, regs, - eps_via_nodes)) + eps_via_nodes)) return -2; /* We know we are going to exit. */ @@ -1292,6 +1336,7 @@ proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs) if (naccepted == 0) { + int dest_node; err = re_node_set_insert (eps_via_nodes, node); if (BE (err < 0, 0)) return -2; @@ -1305,7 +1350,7 @@ proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs) if (naccepted != 0 || check_node_accept (mctx, dfa->nodes + node, *pidx)) { - dest_node = dfa->nexts[node]; + int dest_node = dfa->nexts[node]; *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, @@ -1319,11 +1364,9 @@ proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs) } static reg_errcode_t -push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes) - struct re_fail_stack_t *fs; - int str_idx, dest_node, nregs; - regmatch_t *regs; - re_node_set *eps_via_nodes; +internal_function __attribute_warn_unused_result__ +push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node, + int nregs, regmatch_t *regs, re_node_set *eps_via_nodes) { reg_errcode_t err; int num = fs->num++; @@ -1348,11 +1391,9 @@ push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes) } static int -pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes) - struct re_fail_stack_t *fs; - int *pidx, nregs; - regmatch_t *regs; - re_node_set *eps_via_nodes; +internal_function +pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs, + regmatch_t *regs, re_node_set *eps_via_nodes) { int num = --fs->num; assert (num >= 0); @@ -1370,19 +1411,17 @@ pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes) pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ static reg_errcode_t -set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) - const regex_t *preg; - const re_match_context_t *mctx; - size_t nmatch; - regmatch_t *pmatch; - int fl_backtrack; +internal_function __attribute_warn_unused_result__ +set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, + regmatch_t *pmatch, int fl_backtrack) { - re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; int idx, cur_node; re_node_set eps_via_nodes; struct re_fail_stack_t *fs; struct re_fail_stack_t fs_body = { 0, 2, NULL }; regmatch_t *prev_idx_match; + int prev_idx_match_malloced = 0; #ifdef DEBUG assert (nmatch > 1); @@ -1401,7 +1440,18 @@ set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) cur_node = dfa->init_node; re_node_set_init_empty (&eps_via_nodes); - prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch); + if (__libc_use_alloca (nmatch * sizeof (regmatch_t))) + prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t)); + else + { + prev_idx_match = re_malloc (regmatch_t, nmatch); + if (prev_idx_match == NULL) + { + free_fail_stack_return (fs); + return REG_ESPACE; + } + prev_idx_match_malloced = 1; + } memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) @@ -1419,6 +1469,8 @@ set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) if (reg_idx == nmatch) { re_node_set_free (&eps_via_nodes); + if (prev_idx_match_malloced) + re_free (prev_idx_match); return free_fail_stack_return (fs); } cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, @@ -1427,6 +1479,8 @@ set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) else { re_node_set_free (&eps_via_nodes); + if (prev_idx_match_malloced) + re_free (prev_idx_match); return REG_NOERROR; } } @@ -1440,6 +1494,8 @@ set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) if (BE (cur_node == -2, 0)) { re_node_set_free (&eps_via_nodes); + if (prev_idx_match_malloced) + re_free (prev_idx_match); free_fail_stack_return (fs); return REG_ESPACE; } @@ -1449,17 +1505,21 @@ set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) else { re_node_set_free (&eps_via_nodes); + if (prev_idx_match_malloced) + re_free (prev_idx_match); return REG_NOMATCH; } } } re_node_set_free (&eps_via_nodes); + if (prev_idx_match_malloced) + re_free (prev_idx_match); return free_fail_stack_return (fs); } static reg_errcode_t -free_fail_stack_return (fs) - struct re_fail_stack_t *fs; +internal_function +free_fail_stack_return (struct re_fail_stack_t *fs) { if (fs) { @@ -1475,10 +1535,9 @@ free_fail_stack_return (fs) } static void -update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch) - re_dfa_t *dfa; - regmatch_t *pmatch, *prev_idx_match; - int cur_node, cur_idx, nmatch; +internal_function +update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, + regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch) { int type = dfa->nodes[cur_node].type; if (type == OP_OPEN_SUBEXP) @@ -1548,9 +1607,8 @@ update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch) ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) static reg_errcode_t -sift_states_backward (mctx, sctx) - re_match_context_t *mctx; - re_sift_context_t *sctx; +internal_function +sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx) { reg_errcode_t err; int null_cnt = 0; @@ -1588,7 +1646,7 @@ sift_states_backward (mctx, sctx) if (mctx->state_log[str_idx]) { err = build_sifted_states (mctx, sctx, str_idx, &cur_dest); - if (BE (err != REG_NOERROR, 0)) + if (BE (err != REG_NOERROR, 0)) goto free_return; } @@ -1607,14 +1665,12 @@ sift_states_backward (mctx, sctx) } static reg_errcode_t -build_sifted_states (mctx, sctx, str_idx, cur_dest) - re_match_context_t *mctx; - re_sift_context_t *sctx; - int str_idx; - re_node_set *cur_dest; +internal_function __attribute_warn_unused_result__ +build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx, + int str_idx, re_node_set *cur_dest) { - re_dfa_t *const dfa = mctx->dfa; - re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; + const re_dfa_t *const dfa = mctx->dfa; + const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; int i; /* Then build the next sifted state. @@ -1671,13 +1727,13 @@ build_sifted_states (mctx, sctx, str_idx, cur_dest) /* Helper functions. */ static reg_errcode_t -clean_state_log_if_needed (mctx, next_state_log_idx) - re_match_context_t *mctx; - int next_state_log_idx; +internal_function +clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx) { int top = mctx->state_log_top; - if (next_state_log_idx >= mctx->input.bufs_len + if ((next_state_log_idx >= mctx->input.bufs_len + && mctx->input.bufs_len < mctx->input.len) || (next_state_log_idx >= mctx->input.valid_len && mctx->input.valid_len < mctx->input.len)) { @@ -1697,11 +1753,9 @@ clean_state_log_if_needed (mctx, next_state_log_idx) } static reg_errcode_t -merge_state_array (dfa, dst, src, num) - re_dfa_t *dfa; - re_dfastate_t **dst; - re_dfastate_t **src; - int num; +internal_function +merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst, + re_dfastate_t **src, int num) { int st_idx; reg_errcode_t err; @@ -1726,14 +1780,13 @@ merge_state_array (dfa, dst, src, num) } static reg_errcode_t -update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes) - re_match_context_t *mctx; - re_sift_context_t *sctx; - int str_idx; - re_node_set *dest_nodes; +internal_function +update_cur_sifted_state (const re_match_context_t *mctx, + re_sift_context_t *sctx, int str_idx, + re_node_set *dest_nodes) { - re_dfa_t *const dfa = mctx->dfa; - reg_errcode_t err; + const re_dfa_t *const dfa = mctx->dfa; + reg_errcode_t err = REG_NOERROR; const re_node_set *candidates; candidates = ((mctx->state_log[str_idx] == NULL) ? NULL : &mctx->state_log[str_idx]->nodes); @@ -1775,10 +1828,9 @@ update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes) } static reg_errcode_t -add_epsilon_src_nodes (dfa, dest_nodes, candidates) - re_dfa_t *dfa; - re_node_set *dest_nodes; - const re_node_set *candidates; +internal_function __attribute_warn_unused_result__ +add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes, + const re_node_set *candidates) { reg_errcode_t err = REG_NOERROR; int i; @@ -1791,21 +1843,23 @@ add_epsilon_src_nodes (dfa, dest_nodes, candidates) { err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem); if (BE (err != REG_NOERROR, 0)) - return REG_ESPACE; + return REG_ESPACE; for (i = 0; i < dest_nodes->nelem; i++) - re_node_set_merge (&state->inveclosure, - dfa->inveclosures + dest_nodes->elems[i]); + { + err = re_node_set_merge (&state->inveclosure, + dfa->inveclosures + dest_nodes->elems[i]); + if (BE (err != REG_NOERROR, 0)) + return REG_ESPACE; + } } return re_node_set_add_intersect (dest_nodes, candidates, &state->inveclosure); } static reg_errcode_t -sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates) - re_dfa_t *dfa; - int node; - re_node_set *dest_nodes; - const re_node_set *candidates; +internal_function +sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes, + const re_node_set *candidates) { int ecl_idx; reg_errcode_t err; @@ -1852,12 +1906,11 @@ sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates) } static int -check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx) - re_match_context_t *mctx; - re_node_set *limits; - int dst_node, dst_idx, src_node, src_idx; +internal_function +check_dst_limits (const re_match_context_t *mctx, re_node_set *limits, + int dst_node, int dst_idx, int src_node, int src_idx) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; int lim_idx, src_pos, dst_pos; int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx); @@ -1889,12 +1942,12 @@ check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx) } static int -check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) - re_match_context_t *mctx; - int boundaries, subexp_idx, from_node, bkref_idx; +internal_function +check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, + int subexp_idx, int from_node, int bkref_idx) { - re_dfa_t *const dfa = mctx->dfa; - re_node_set *eclosures = dfa->eclosures + from_node; + const re_dfa_t *const dfa = mctx->dfa; + const re_node_set *eclosures = dfa->eclosures + from_node; int node_idx; /* Else, we are on the boundary: examine the nodes on the epsilon @@ -1909,14 +1962,15 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) { struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx; do - { + { int dst, cpos; if (ent->node != node) continue; - if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map) - && !(ent->eps_reachable_subexps_map & (1 << subexp_idx))) + if (subexp_idx < BITSET_WORD_BITS + && !(ent->eps_reachable_subexps_map + & ((bitset_word_t) 1 << subexp_idx))) continue; /* Recurse trying to reach the OP_OPEN_SUBEXP and @@ -1929,9 +1983,9 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) if (dst == from_node) { if (boundaries & 1) - return -1; + return -1; else /* if (boundaries & 2) */ - return 0; + return 0; } cpos = @@ -1942,8 +1996,10 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) if (cpos == 0 && (boundaries & 2)) return 0; - ent->eps_reachable_subexps_map &= ~(1 << subexp_idx); - } + if (subexp_idx < BITSET_WORD_BITS) + ent->eps_reachable_subexps_map + &= ~((bitset_word_t) 1 << subexp_idx); + } while (ent++->more); } break; @@ -1967,9 +2023,10 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) } static int -check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx) - re_match_context_t *mctx; - int limit, subexp_idx, from_node, str_idx, bkref_idx; +internal_function +check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit, + int subexp_idx, int from_node, int str_idx, + int bkref_idx) { struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; int boundaries; @@ -1996,13 +2053,10 @@ check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_id which are against limitations from DEST_NODES. */ static reg_errcode_t -check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx) - re_dfa_t *dfa; - re_node_set *dest_nodes; - const re_node_set *candidates; - re_node_set *limits; - struct re_backref_cache_entry *bkref_ents; - int str_idx; +internal_function +check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes, + const re_node_set *candidates, re_node_set *limits, + struct re_backref_cache_entry *bkref_ents, int str_idx) { reg_errcode_t err; int node_idx, lim_idx; @@ -2087,13 +2141,11 @@ check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx) } static reg_errcode_t -sift_states_bkref (mctx, sctx, str_idx, candidates) - re_match_context_t *mctx; - re_sift_context_t *sctx; - int str_idx; - const re_node_set *candidates; +internal_function __attribute_warn_unused_result__ +sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx, + int str_idx, const re_node_set *candidates) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; reg_errcode_t err; int node_idx, node; re_sift_context_t local_sctx; @@ -2121,7 +2173,10 @@ sift_states_bkref (mctx, sctx, str_idx, candidates) enabled_idx = first_idx; do { - int subexp_len, to_idx, dst_node; + int subexp_len; + int to_idx; + int dst_node; + int ret; re_dfastate_t *cur_state; if (entry->node != node) @@ -2147,8 +2202,8 @@ sift_states_bkref (mctx, sctx, str_idx, candidates) } local_sctx.last_node = node; local_sctx.last_str_idx = str_idx; - err = re_node_set_insert (&local_sctx.limits, enabled_idx); - if (BE (err < 0, 0)) + ret = re_node_set_insert (&local_sctx.limits, enabled_idx); + if (BE (ret < 0, 0)) { err = REG_ESPACE; goto free_return; @@ -2169,7 +2224,7 @@ sift_states_bkref (mctx, sctx, str_idx, candidates) re_node_set_remove (&local_sctx.limits, enabled_idx); /* mctx->bkref_ents may have changed, reload the pointer. */ - entry = mctx->bkref_ents + enabled_idx; + entry = mctx->bkref_ents + enabled_idx; } while (enabled_idx++, entry++->more); } @@ -2186,12 +2241,11 @@ sift_states_bkref (mctx, sctx, str_idx, candidates) #ifdef RE_ENABLE_I18N static int -sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx) - const re_match_context_t *mctx; - re_sift_context_t *sctx; - int node_idx, str_idx, max_str_idx; +internal_function +sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, + int node_idx, int str_idx, int max_str_idx) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; int naccepted; /* Check the node can accept `multi byte'. */ naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx); @@ -2217,10 +2271,9 @@ sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx) update the destination of STATE_LOG. */ static re_dfastate_t * -transit_state (err, mctx, state) - reg_errcode_t *err; - re_match_context_t *mctx; - re_dfastate_t *state; +internal_function __attribute_warn_unused_result__ +transit_state (reg_errcode_t *err, re_match_context_t *mctx, + re_dfastate_t *state) { re_dfastate_t **trtable; unsigned char ch; @@ -2252,7 +2305,7 @@ transit_state (err, mctx, state) trtable = state->word_trtable; if (BE (trtable != NULL, 1)) - { + { unsigned int context; context = re_string_context_at (&mctx->input, @@ -2276,12 +2329,11 @@ transit_state (err, mctx, state) /* Update the state_log if we need */ re_dfastate_t * -merge_state_with_log (err, mctx, next_state) - reg_errcode_t *err; - re_match_context_t *mctx; - re_dfastate_t *next_state; +internal_function +merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx, + re_dfastate_t *next_state) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; int cur_idx = re_string_cur_idx (&mctx->input); if (cur_idx > mctx->state_log_top) @@ -2299,21 +2351,21 @@ merge_state_with_log (err, mctx, next_state) unsigned int context; re_node_set next_nodes, *log_nodes, *table_nodes = NULL; /* If (state_log[cur_idx] != 0), it implies that cur_idx is - the destination of a multibyte char/collating element/ - back reference. Then the next state is the union set of - these destinations and the results of the transition table. */ + the destination of a multibyte char/collating element/ + back reference. Then the next state is the union set of + these destinations and the results of the transition table. */ pstate = mctx->state_log[cur_idx]; log_nodes = pstate->entrance_nodes; if (next_state != NULL) - { - table_nodes = next_state->entrance_nodes; - *err = re_node_set_init_union (&next_nodes, table_nodes, + { + table_nodes = next_state->entrance_nodes; + *err = re_node_set_init_union (&next_nodes, table_nodes, log_nodes); - if (BE (*err != REG_NOERROR, 0)) + if (BE (*err != REG_NOERROR, 0)) return NULL; - } + } else - next_nodes = *log_nodes; + next_nodes = *log_nodes; /* Note: We already add the nodes of the initial state, then we don't need to add them here. */ @@ -2321,12 +2373,12 @@ merge_state_with_log (err, mctx, next_state) re_string_cur_idx (&mctx->input) - 1, mctx->eflags); next_state = mctx->state_log[cur_idx] - = re_acquire_state_context (err, dfa, &next_nodes, context); + = re_acquire_state_context (err, dfa, &next_nodes, context); /* We don't need to check errors here, since the return value of - this function is next_state and ERR is already set. */ + this function is next_state and ERR is already set. */ if (table_nodes != NULL) - re_node_set_free (&next_nodes); + re_node_set_free (&next_nodes); } if (BE (dfa->nbackref, 0) && next_state != NULL) @@ -2356,11 +2408,10 @@ merge_state_with_log (err, mctx, next_state) multi-byte match, then look in the log for a state from which to restart matching. */ re_dfastate_t * -find_recover_state (err, mctx) - reg_errcode_t *err; - re_match_context_t *mctx; +internal_function +find_recover_state (reg_errcode_t *err, re_match_context_t *mctx) { - re_dfastate_t *cur_state = NULL; + re_dfastate_t *cur_state; do { int max = mctx->state_log_top; @@ -2368,15 +2419,15 @@ find_recover_state (err, mctx) do { - if (++cur_str_idx > max) - return NULL; - re_string_skip_bytes (&mctx->input, 1); + if (++cur_str_idx > max) + return NULL; + re_string_skip_bytes (&mctx->input, 1); } while (mctx->state_log[cur_str_idx] == NULL); cur_state = merge_state_with_log (err, mctx, NULL); } - while (err == REG_NOERROR && cur_state == NULL); + while (*err == REG_NOERROR && cur_state == NULL); return cur_state; } @@ -2388,12 +2439,11 @@ find_recover_state (err, mctx) correspoding back references. */ static reg_errcode_t -check_subexp_matching_top (mctx, cur_nodes, str_idx) - re_match_context_t *mctx; - re_node_set *cur_nodes; - int str_idx; +internal_function +check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes, + int str_idx) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; int node_idx; reg_errcode_t err; @@ -2406,8 +2456,9 @@ check_subexp_matching_top (mctx, cur_nodes, str_idx) { int node = cur_nodes->elems[node_idx]; if (dfa->nodes[node].type == OP_OPEN_SUBEXP - && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map)) - && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx)) + && dfa->nodes[node].opr.idx < BITSET_WORD_BITS + && (dfa->used_bkref_map + & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx))) { err = match_ctx_add_subtop (mctx, node, str_idx); if (BE (err != REG_NOERROR, 0)) @@ -2422,12 +2473,10 @@ check_subexp_matching_top (mctx, cur_nodes, str_idx) accepting the current input byte. */ static re_dfastate_t * -transit_state_sb (err, mctx, state) - reg_errcode_t *err; - re_match_context_t *mctx; - re_dfastate_t *state; +transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, + re_dfastate_t *state) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; re_node_set next_nodes; re_dfastate_t *next_state; int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input); @@ -2463,11 +2512,10 @@ transit_state_sb (err, mctx, state) #ifdef RE_ENABLE_I18N static reg_errcode_t -transit_state_mb (mctx, pstate) - re_match_context_t *mctx; - re_dfastate_t *pstate; +internal_function +transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; reg_errcode_t err; int i; @@ -2480,7 +2528,7 @@ transit_state_mb (mctx, pstate) re_dfastate_t *dest_state; if (!dfa->nodes[cur_node_idx].accept_mb) - continue; + continue; if (dfa->nodes[cur_node_idx].constraint) { @@ -2520,7 +2568,8 @@ transit_state_mb (mctx, pstate) if (BE (err != REG_NOERROR, 0)) return err; } - context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags); + context = re_string_context_at (&mctx->input, dest_idx - 1, + mctx->eflags); mctx->state_log[dest_idx] = re_acquire_state_context (&err, dfa, &dest_nodes, context); if (dest_state != NULL) @@ -2533,11 +2582,10 @@ transit_state_mb (mctx, pstate) #endif /* RE_ENABLE_I18N */ static reg_errcode_t -transit_state_bkref (mctx, nodes) - re_match_context_t *mctx; - const re_node_set *nodes; +internal_function +transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; reg_errcode_t err; int i; int cur_str_idx = re_string_cur_idx (&mctx->input); @@ -2648,20 +2696,20 @@ transit_state_bkref (mctx, nodes) delay these checking for prune_impossible_nodes(). */ static reg_errcode_t -get_subexp (mctx, bkref_node, bkref_str_idx) - re_match_context_t *mctx; - int bkref_node, bkref_str_idx; +internal_function __attribute_warn_unused_result__ +get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; int subexp_num, sub_top_idx; const char *buf = (const char *) re_string_get_buffer (&mctx->input); /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); if (cache_idx != -1) { - const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx; + const struct re_backref_cache_entry *entry + = mctx->bkref_ents + cache_idx; do - if (entry->node == bkref_node) + if (entry->node == bkref_node) return REG_NOERROR; /* We already checked it. */ while (entry++->more); } @@ -2706,7 +2754,8 @@ get_subexp (mctx, bkref_node, bkref_str_idx) buf = (const char *) re_string_get_buffer (&mctx->input); } if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) - break; /* We don't need to search this sub expression any more. */ + /* We don't need to search this sub expression any more. */ + break; } bkref_str_off += sl_str_diff; sl_str += sl_str_diff; @@ -2757,7 +2806,8 @@ get_subexp (mctx, bkref_node, bkref_str_idx) continue; /* Does this state have a ')' of the sub expression? */ nodes = &mctx->state_log[sl_str]->nodes; - cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP); + cls_node = find_subexp_node (dfa, nodes, subexp_num, + OP_CLOSE_SUBEXP); if (cls_node == -1) continue; /* No. */ if (sub_top->path == NULL) @@ -2770,7 +2820,8 @@ get_subexp (mctx, bkref_node, bkref_str_idx) /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node in the current context? */ err = check_arrival (mctx, sub_top->path, sub_top->node, - sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP); + sub_top->str_idx, cls_node, sl_str, + OP_CLOSE_SUBEXP); if (err == REG_NOMATCH) continue; if (BE (err != REG_NOERROR, 0)) @@ -2794,17 +2845,16 @@ get_subexp (mctx, bkref_node, bkref_str_idx) and SUB_LAST. */ static reg_errcode_t -get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str) - re_match_context_t *mctx; - const re_sub_match_top_t *sub_top; - re_sub_match_last_t *sub_last; - int bkref_node, bkref_str; +internal_function +get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top, + re_sub_match_last_t *sub_last, int bkref_node, int bkref_str) { reg_errcode_t err; int to_idx; /* Can the subexpression arrive the back reference? */ err = check_arrival (mctx, &sub_last->path, sub_last->node, - sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP); + sub_last->str_idx, bkref_node, bkref_str, + OP_OPEN_SUBEXP); if (err != REG_NOERROR) return err; err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, @@ -2824,10 +2874,9 @@ get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str) E.g. RE: (a){2} */ static int -find_subexp_node (dfa, nodes, subexp_idx, type) - const re_dfa_t *dfa; - const re_node_set *nodes; - int subexp_idx, type; +internal_function +find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, + int subexp_idx, int type) { int cls_idx; for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) @@ -2847,14 +2896,12 @@ find_subexp_node (dfa, nodes, subexp_idx, type) Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ static reg_errcode_t -check_arrival (mctx, path, top_node, top_str, last_node, last_str, - type) - re_match_context_t *mctx; - state_array_t *path; - int top_node, top_str, last_node, last_str, type; +internal_function __attribute_warn_unused_result__ +check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node, + int top_str, int last_node, int last_str, int type) { - re_dfa_t *const dfa = mctx->dfa; - reg_errcode_t err; + const re_dfa_t *const dfa = mctx->dfa; + reg_errcode_t err = REG_NOERROR; int subexp_num, backup_cur_idx, str_idx, null_cnt; re_dfastate_t *cur_state = NULL; re_node_set *cur_nodes, next_nodes; @@ -2869,7 +2916,7 @@ check_arrival (mctx, path, top_node, top_str, last_node, last_str, int old_alloc = path->alloc; path->alloc += last_str + mctx->max_mb_elem_len + 1; new_array = re_realloc (path->array, re_dfastate_t *, path->alloc); - if (new_array == NULL) + if (BE (new_array == NULL, 0)) { path->alloc = old_alloc; return REG_ESPACE; @@ -2879,7 +2926,7 @@ check_arrival (mctx, path, top_node, top_str, last_node, last_str, sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); } - str_idx = path->next_idx == 0 ? top_str : path->next_idx; + str_idx = path->next_idx ?: top_str; /* Temporary modify MCTX. */ backup_state_log = mctx->state_log; @@ -2907,7 +2954,7 @@ check_arrival (mctx, path, top_node, top_str, last_node, last_str, if (cur_state && cur_state->has_backref) { err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); - if (BE ( err != REG_NOERROR, 0)) + if (BE (err != REG_NOERROR, 0)) return err; } else @@ -2919,7 +2966,7 @@ check_arrival (mctx, path, top_node, top_str, last_node, last_str, { err = expand_bkref_cache (mctx, &next_nodes, str_idx, subexp_num, type); - if (BE ( err != REG_NOERROR, 0)) + if (BE (err != REG_NOERROR, 0)) { re_node_set_free (&next_nodes); return err; @@ -2950,7 +2997,8 @@ check_arrival (mctx, path, top_node, top_str, last_node, last_str, if (cur_state) { err = check_arrival_add_next_nodes (mctx, str_idx, - &cur_state->non_eps_nodes, &next_nodes); + &cur_state->non_eps_nodes, + &next_nodes); if (BE (err != REG_NOERROR, 0)) { re_node_set_free (&next_nodes); @@ -2968,7 +3016,7 @@ check_arrival (mctx, path, top_node, top_str, last_node, last_str, } err = expand_bkref_cache (mctx, &next_nodes, str_idx, subexp_num, type); - if (BE ( err != REG_NOERROR, 0)) + if (BE (err != REG_NOERROR, 0)) { re_node_set_free (&next_nodes); return err; @@ -3009,15 +3057,14 @@ check_arrival (mctx, path, top_node, top_str, last_node, last_str, Can't we unify them? */ static reg_errcode_t -check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes) - re_match_context_t *mctx; - int str_idx; - re_node_set *cur_nodes, *next_nodes; +internal_function __attribute_warn_unused_result__ +check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx, + re_node_set *cur_nodes, re_node_set *next_nodes) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; int result; int cur_idx; - reg_errcode_t err; + reg_errcode_t err = REG_NOERROR; re_node_set union_set; re_node_set_init_empty (&union_set); for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) @@ -3089,10 +3136,9 @@ check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes) */ static reg_errcode_t -check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type) - re_dfa_t *dfa; - re_node_set *cur_nodes; - int ex_subexp, type; +internal_function +check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes, + int ex_subexp, int type) { reg_errcode_t err; int idx, outside_node; @@ -3109,7 +3155,7 @@ check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type) for (idx = 0; idx < cur_nodes->nelem; ++idx) { int cur_node = cur_nodes->elems[idx]; - re_node_set *eclosure = dfa->eclosures + cur_node; + const re_node_set *eclosure = dfa->eclosures + cur_node; outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type); if (outside_node == -1) { @@ -3143,10 +3189,9 @@ check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type) problematic append it to DST_NODES. */ static reg_errcode_t -check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type) - re_dfa_t *dfa; - int target, ex_subexp, type; - re_node_set *dst_nodes; +internal_function __attribute_warn_unused_result__ +check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes, + int target, int ex_subexp, int type) { int cur_node; for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) @@ -3188,13 +3233,11 @@ check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type) in MCTX->BKREF_ENTS. */ static reg_errcode_t -expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num, - type) - re_match_context_t *mctx; - int cur_str, subexp_num, type; - re_node_set *cur_nodes; +internal_function __attribute_warn_unused_result__ +expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, + int cur_str, int subexp_num, int type) { - re_dfa_t *const dfa = mctx->dfa; + const re_dfa_t *const dfa = mctx->dfa; reg_errcode_t err; int cache_idx_start = search_cur_bkref_entry (mctx, cur_str); struct re_backref_cache_entry *ent; @@ -3279,39 +3322,42 @@ expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num, Return 1 if succeeded, otherwise return NULL. */ static int -build_trtable (dfa, state) - re_dfa_t *dfa; - re_dfastate_t *state; +internal_function +build_trtable (const re_dfa_t *dfa, re_dfastate_t *state) { reg_errcode_t err; int i, j, ch, need_word_trtable = 0; - unsigned int elem, mask; - int dests_node_malloced = 0, dest_states_malloced = 0; + bitset_word_t elem, mask; + bool dests_node_malloced = false; + bool dest_states_malloced = false; int ndests; /* Number of the destination states from `state'. */ re_dfastate_t **trtable; re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; re_node_set follows, *dests_node; - bitset *dests_ch; - bitset acceptable; + bitset_t *dests_ch; + bitset_t acceptable; + + struct dests_alloc + { + re_node_set dests_node[SBC_MAX]; + bitset_t dests_ch[SBC_MAX]; + } *dests_alloc; /* We build DFA states which corresponds to the destination nodes from `state'. `dests_node[i]' represents the nodes which i-th destination state contains, and `dests_ch[i]' represents the characters which i-th destination state accepts. */ -#ifdef _LIBC - if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)) - dests_node = (re_node_set *) - alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); + if (__libc_use_alloca (sizeof (struct dests_alloc))) + dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc)); else -#endif { - dests_node = (re_node_set *) - malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); - if (BE (dests_node == NULL, 0)) + dests_alloc = re_malloc (struct dests_alloc, 1); + if (BE (dests_alloc == NULL, 0)) return 0; - dests_node_malloced = 1; + dests_node_malloced = true; } - dests_ch = (bitset *) (dests_node + SBC_MAX); + dests_node = dests_alloc->dests_node; + dests_ch = dests_alloc->dests_ch; /* Initialize transiton table. */ state->word_trtable = state->trtable = NULL; @@ -3322,12 +3368,14 @@ build_trtable (dfa, state) if (BE (ndests <= 0, 0)) { if (dests_node_malloced) - free (dests_node); + free (dests_alloc); /* Return 0 in case of an error, 1 otherwise. */ if (ndests == 0) { state->trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); + if (BE (state->trtable == NULL, 0)) + return 0; return 1; } return 0; @@ -3337,13 +3385,18 @@ build_trtable (dfa, state) if (BE (err != REG_NOERROR, 0)) goto out_free; -#ifdef _LIBC - if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX + /* Avoid arithmetic overflow in size calculation. */ + if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX) + / (3 * sizeof (re_dfastate_t *))) + < ndests), + 0)) + goto out_free; + + if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX + ndests * 3 * sizeof (re_dfastate_t *))) dest_states = (re_dfastate_t **) alloca (ndests * 3 * sizeof (re_dfastate_t *)); else -#endif { dest_states = (re_dfastate_t **) malloc (ndests * 3 * sizeof (re_dfastate_t *)); @@ -3356,10 +3409,10 @@ build_trtable (dfa, state) for (i = 0; i < ndests; ++i) re_node_set_free (dests_node + i); if (dests_node_malloced) - free (dests_node); + free (dests_alloc); return 0; } - dest_states_malloced = 1; + dest_states_malloced = true; } dest_states_word = dest_states + ndests; dest_states_nl = dest_states_word + ndests; @@ -3421,8 +3474,8 @@ build_trtable (dfa, state) goto out_free; /* For all characters ch...: */ - for (i = 0; i < BITSET_UINTS; ++i) - for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1; + for (i = 0; i < BITSET_WORDS; ++i) + for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; elem; mask <<= 1, elem >>= 1, ++ch) if (BE (elem & 1, 0)) @@ -3452,8 +3505,8 @@ build_trtable (dfa, state) goto out_free; /* For all characters ch...: */ - for (i = 0; i < BITSET_UINTS; ++i) - for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1; + for (i = 0; i < BITSET_WORDS; ++i) + for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; elem; mask <<= 1, elem >>= 1, ++ch) if (BE (elem & 1, 0)) @@ -3494,7 +3547,7 @@ build_trtable (dfa, state) re_node_set_free (dests_node + i); if (dests_node_malloced) - free (dests_node); + free (dests_alloc); return 1; } @@ -3505,17 +3558,15 @@ build_trtable (dfa, state) to DEST_CH[i]. This function return the number of destinations. */ static int -group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) - re_dfa_t *dfa; - const re_dfastate_t *state; - re_node_set *dests_node; - bitset *dests_ch; +internal_function +group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, + re_node_set *dests_node, bitset_t *dests_ch) { reg_errcode_t err; int result; int i, j, k; int ndests; /* Number of the destinations from `state'. */ - bitset accepts; /* Characters a node can accept. */ + bitset_t accepts; /* Characters a node can accept. */ const re_node_set *cur_nodes = &state->nodes; bitset_empty (accepts); ndests = 0; @@ -3549,13 +3600,13 @@ group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) } #ifdef RE_ENABLE_I18N else if (type == OP_UTF8_PERIOD) - { - memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2); + { + memset (accepts, '\xff', sizeof (bitset_t) / 2); if (!(dfa->syntax & RE_DOT_NEWLINE)) bitset_clear (accepts, '\n'); if (dfa->syntax & RE_DOT_NOT_NULL) bitset_clear (accepts, '\0'); - } + } #endif else continue; @@ -3566,7 +3617,7 @@ group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) { if (constraint & NEXT_NEWLINE_CONSTRAINT) { - int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); + bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); bitset_empty (accepts); if (accepts_newline) bitset_set (accepts, NEWLINE_CHAR); @@ -3581,7 +3632,7 @@ group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) if (constraint & NEXT_WORD_CONSTRAINT) { - unsigned int any_set = 0; + bitset_word_t any_set = 0; if (type == CHARACTER && !node->word_char) { bitset_empty (accepts); @@ -3589,18 +3640,18 @@ group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) } #ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); else #endif - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= dfa->word_char[j]); if (!any_set) continue; } if (constraint & NEXT_NOTWORD_CONSTRAINT) { - unsigned int any_set = 0; + bitset_word_t any_set = 0; if (type == CHARACTER && node->word_char) { bitset_empty (accepts); @@ -3608,11 +3659,11 @@ group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) } #ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); else #endif - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= ~dfa->word_char[j]); if (!any_set) continue; @@ -3623,10 +3674,10 @@ group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) state. Above, we make sure that accepts is not empty. */ for (j = 0; j < ndests; ++j) { - bitset intersec; /* Intersection sets, see below. */ - bitset remains; + bitset_t intersec; /* Intersection sets, see below. */ + bitset_t remains; /* Flags, see below. */ - int has_intersec, not_subset, not_consumed; + bitset_word_t has_intersec, not_subset, not_consumed; /* Optimization, skip if this state doesn't accept the character. */ if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) @@ -3634,7 +3685,7 @@ group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) /* Enumerate the intersection set of this state and `accepts'. */ has_intersec = 0; - for (k = 0; k < BITSET_UINTS; ++k) + for (k = 0; k < BITSET_WORDS; ++k) has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; /* And skip if the intersection set is empty. */ if (!has_intersec) @@ -3642,7 +3693,7 @@ group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) /* Then check if this state is a subset of `accepts'. */ not_subset = not_consumed = 0; - for (k = 0; k < BITSET_UINTS; ++k) + for (k = 0; k < BITSET_WORDS; ++k) { not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; @@ -3697,10 +3748,9 @@ group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) can only accept one byte. */ static int -check_node_accept_bytes (dfa, node_idx, input, str_idx) - re_dfa_t *dfa; - int node_idx, str_idx; - const re_string_t *input; +internal_function +check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, + const re_string_t *input, int str_idx) { const re_token_t *node = dfa->nodes + node_idx; int char_len, elem_len; @@ -3761,7 +3811,7 @@ check_node_accept_bytes (dfa, node_idx, input, str_idx) if (node->type == OP_PERIOD) { if (char_len <= 1) - return 0; + return 0; /* FIXME: I don't think this if is needed, as both '\n' and '\0' are char_len == 1. */ /* '.' accepts any one character except the following two cases. */ @@ -3816,7 +3866,6 @@ check_node_accept_bytes (dfa, node_idx, input, str_idx) const int32_t *table, *indirect; const unsigned char *weights, *extra; const char *collseqwc; - int32_t idx; /* This #include defines a local function! */ # include @@ -3874,15 +3923,20 @@ check_node_accept_bytes (dfa, node_idx, input, str_idx) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - idx = findidx (&cp); + int32_t idx = findidx (&cp, elem_len); if (idx > 0) for (i = 0; i < cset->nequiv_classes; ++i) { int32_t equiv_class_idx = cset->equiv_classes[i]; - size_t weight_len = weights[idx]; - if (weight_len == weights[equiv_class_idx]) + size_t weight_len = weights[idx & 0xffffff]; + if (weight_len == weights[equiv_class_idx & 0xffffff] + && (idx >> 24) == (equiv_class_idx >> 24)) { int cnt = 0; + + idx &= 0xffffff; + equiv_class_idx &= 0xffffff; + while (cnt <= weight_len && (weights[equiv_class_idx + 1 + cnt] == weights[idx + 1 + cnt])) @@ -3934,9 +3988,8 @@ check_node_accept_bytes (dfa, node_idx, input, str_idx) # ifdef _LIBC static unsigned int -find_collation_sequence_value (mbs, mbs_len) - const unsigned char *mbs; - size_t mbs_len; +internal_function +find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) { uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules == 0) @@ -3981,7 +4034,7 @@ find_collation_sequence_value (mbs, mbs_len) /* Skip the collation sequence value. */ idx += sizeof (uint32_t); /* Skip the wide char sequence of the collating element. */ - idx = idx + sizeof (uint32_t) * (extra[idx] + 1); + idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1); /* If we found the entry, return the sequence value. */ if (found) return *(uint32_t *) (extra + idx); @@ -3998,10 +4051,9 @@ find_collation_sequence_value (mbs, mbs_len) byte of the INPUT. */ static int -check_node_accept (mctx, node, idx) - const re_match_context_t *mctx; - const re_token_t *node; - int idx; +internal_function +check_node_accept (const re_match_context_t *mctx, const re_token_t *node, + int idx) { unsigned char ch; ch = re_string_byte_at (&mctx->input, idx); @@ -4009,18 +4061,18 @@ check_node_accept (mctx, node, idx) { case CHARACTER: if (node->opr.c != ch) - return 0; + return 0; break; case SIMPLE_BRACKET: if (!bitset_contain (node->opr.sbcset, ch)) - return 0; + return 0; break; #ifdef RE_ENABLE_I18N case OP_UTF8_PERIOD: if (ch >= 0x80) - return 0; + return 0; /* FALLTHROUGH */ #endif case OP_PERIOD: @@ -4049,14 +4101,18 @@ check_node_accept (mctx, node, idx) /* Extend the buffers, if the buffers have run out. */ static reg_errcode_t -extend_buffers (mctx) - re_match_context_t *mctx; +internal_function __attribute_warn_unused_result__ +extend_buffers (re_match_context_t *mctx) { reg_errcode_t ret; re_string_t *pstr = &mctx->input; + /* Avoid overflow. */ + if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0)) + return REG_ESPACE; + /* Double the lengthes of the buffers. */ - ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); + ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2)); if (BE (ret != REG_NOERROR, 0)) return ret; @@ -4108,9 +4164,8 @@ extend_buffers (mctx) /* Initialize MCTX. */ static reg_errcode_t -match_ctx_init (mctx, eflags, n) - re_match_context_t *mctx; - int eflags, n; +internal_function __attribute_warn_unused_result__ +match_ctx_init (re_match_context_t *mctx, int eflags, int n) { mctx->eflags = eflags; mctx->match_last = -1; @@ -4137,8 +4192,8 @@ match_ctx_init (mctx, eflags, n) of the input, or changes the input string. */ static void -match_ctx_clean (mctx) - re_match_context_t *mctx; +internal_function +match_ctx_clean (re_match_context_t *mctx) { int st_idx; for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) @@ -4167,8 +4222,8 @@ match_ctx_clean (mctx) /* Free all the memory associated with MCTX. */ static void -match_ctx_free (mctx) - re_match_context_t *mctx; +internal_function +match_ctx_free (re_match_context_t *mctx) { /* First, free all the memory associated with MCTX->SUB_TOPS. */ match_ctx_clean (mctx); @@ -4182,9 +4237,9 @@ match_ctx_free (mctx) */ static reg_errcode_t -match_ctx_add_entry (mctx, node, str_idx, from, to) - re_match_context_t *mctx; - int node, str_idx, from, to; +internal_function __attribute_warn_unused_result__ +match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from, + int to) { if (mctx->nbkref_ents >= mctx->abkref_ents) { @@ -4231,9 +4286,8 @@ match_ctx_add_entry (mctx, node, str_idx, from, to) found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ static int -search_cur_bkref_entry (mctx, str_idx) - re_match_context_t *mctx; - int str_idx; +internal_function +search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) { int left, right, mid, last; last = right = mctx->nbkref_ents; @@ -4255,9 +4309,8 @@ search_cur_bkref_entry (mctx, str_idx) at STR_IDX. */ static reg_errcode_t -match_ctx_add_subtop (mctx, node, str_idx) - re_match_context_t *mctx; - int node, str_idx; +internal_function __attribute_warn_unused_result__ +match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx) { #ifdef DEBUG assert (mctx->sub_tops != NULL); @@ -4286,9 +4339,8 @@ match_ctx_add_subtop (mctx, node, str_idx) at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ static re_sub_match_last_t * -match_ctx_add_sublast (subtop, node, str_idx) - re_sub_match_top_t *subtop; - int node, str_idx; +internal_function +match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx) { re_sub_match_last_t *new_entry; if (BE (subtop->nlasts == subtop->alasts, 0)) @@ -4314,10 +4366,9 @@ match_ctx_add_sublast (subtop, node, str_idx) } static void -sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx) - re_sift_context_t *sctx; - re_dfastate_t **sifted_sts, **limited_sts; - int last_node, last_str_idx; +internal_function +sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, + re_dfastate_t **limited_sts, int last_node, int last_str_idx) { sctx->sifted_states = sifted_sts; sctx->limited_states = limited_sts; -- 2.45.0