1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static void re_string_construct_common (const char *str, int len,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 wint_t *last_wc) internal_function;
28 #endif /* RE_ENABLE_I18N */
29 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
30 unsigned int hash) internal_function;
31 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
32 const re_node_set *nodes,
33 unsigned int hash) internal_function;
34 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
35 const re_node_set *nodes,
37 unsigned int hash) internal_function;
38 static unsigned int inline calc_state_hash (const re_node_set *nodes,
39 unsigned int context) internal_function;
41 /* Functions for string operation. */
43 /* This function allocate the buffers. It is necessary to call
44 re_string_reconstruct before using the object. */
47 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
50 int len, init_len, icase;
51 RE_TRANSLATE_TYPE trans;
57 /* Ensure at least one character fits into the buffers. */
58 if (init_len < dfa->mb_cur_max)
59 init_len = dfa->mb_cur_max;
60 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
61 re_string_construct_common (str, len, pstr, trans, icase, dfa);
63 ret = re_string_realloc_buffers (pstr, init_buf_len);
64 if (BE (ret != REG_NOERROR, 0))
67 pstr->word_char = dfa->word_char;
68 pstr->word_ops_used = dfa->word_ops_used;
69 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
70 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
71 pstr->valid_raw_len = pstr->valid_len;
75 /* This function allocate the buffers, and initialize them. */
78 re_string_construct (pstr, str, len, trans, icase, dfa)
82 RE_TRANSLATE_TYPE trans;
86 memset (pstr, '\0', sizeof (re_string_t));
87 re_string_construct_common (str, len, pstr, trans, icase, dfa);
91 ret = re_string_realloc_buffers (pstr, len + 1);
92 if (BE (ret != REG_NOERROR, 0))
95 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
100 if (dfa->mb_cur_max > 1)
104 ret = build_wcs_upper_buffer (pstr);
105 if (BE (ret != REG_NOERROR, 0))
107 if (pstr->valid_raw_len >= len)
109 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
111 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
112 if (BE (ret != REG_NOERROR, 0))
117 #endif /* RE_ENABLE_I18N */
118 build_upper_buffer (pstr);
122 #ifdef RE_ENABLE_I18N
123 if (dfa->mb_cur_max > 1)
124 build_wcs_buffer (pstr);
126 #endif /* RE_ENABLE_I18N */
129 re_string_translate_buffer (pstr);
132 pstr->valid_len = pstr->bufs_len;
133 pstr->valid_raw_len = pstr->bufs_len;
141 /* Helper functions for re_string_allocate, and re_string_construct. */
144 re_string_realloc_buffers (pstr, new_buf_len)
148 #ifdef RE_ENABLE_I18N
149 if (pstr->mb_cur_max > 1)
151 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
152 if (BE (new_array == NULL, 0))
154 pstr->wcs = new_array;
155 if (pstr->offsets != NULL)
157 int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
158 if (BE (new_array == NULL, 0))
160 pstr->offsets = new_array;
163 #endif /* RE_ENABLE_I18N */
164 if (pstr->mbs_allocated)
166 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
168 if (BE (new_array == NULL, 0))
170 pstr->mbs = new_array;
172 pstr->bufs_len = new_buf_len;
178 re_string_construct_common (str, len, pstr, trans, icase, dfa)
182 RE_TRANSLATE_TYPE trans;
186 pstr->raw_mbs = (const unsigned char *) str;
189 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
190 pstr->icase = icase ? 1 : 0;
191 pstr->mbs_allocated = (trans != NULL || icase);
192 pstr->mb_cur_max = dfa->mb_cur_max;
193 pstr->is_utf8 = dfa->is_utf8;
194 pstr->map_notascii = dfa->map_notascii;
195 pstr->stop = pstr->len;
196 pstr->raw_stop = pstr->stop;
199 #ifdef RE_ENABLE_I18N
201 /* Build wide character buffer PSTR->WCS.
202 If the byte sequence of the string are:
203 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
204 Then wide character buffer will be:
205 <wc1> , WEOF , <wc2> , WEOF , <wc3>
206 We use WEOF for padding, they indicate that the position isn't
207 a first byte of a multibyte character.
209 Note that this function assumes PSTR->VALID_LEN elements are already
210 built and starts from PSTR->VALID_LEN. */
213 build_wcs_buffer (pstr)
217 unsigned char buf[MB_CUR_MAX];
218 assert (MB_CUR_MAX >= pstr->mb_cur_max);
220 unsigned char buf[64];
223 int byte_idx, end_idx, remain_len;
226 /* Build the buffers from pstr->valid_len to either pstr->len or
228 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
229 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
234 remain_len = end_idx - byte_idx;
235 prev_st = pstr->cur_state;
236 /* Apply the translation if we need. */
237 if (BE (pstr->trans != NULL, 0))
241 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
243 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
244 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
246 p = (const char *) buf;
249 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
250 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
251 if (BE (mbclen == (size_t) -2, 0))
253 /* The buffer doesn't have enough space, finish to build. */
254 pstr->cur_state = prev_st;
257 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
259 /* We treat these cases as a singlebyte character. */
261 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
262 if (BE (pstr->trans != NULL, 0))
263 wc = pstr->trans[wc];
264 pstr->cur_state = prev_st;
267 /* Write wide character and padding. */
268 pstr->wcs[byte_idx++] = wc;
269 /* Write paddings. */
270 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
271 pstr->wcs[byte_idx++] = WEOF;
273 pstr->valid_len = byte_idx;
274 pstr->valid_raw_len = byte_idx;
277 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
278 but for REG_ICASE. */
281 build_wcs_upper_buffer (pstr)
285 int src_idx, byte_idx, end_idx, remain_len;
288 char buf[MB_CUR_MAX];
289 assert (MB_CUR_MAX >= pstr->mb_cur_max);
294 byte_idx = pstr->valid_len;
295 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
297 /* The following optimization assumes that ASCII characters can be
298 mapped to wide characters with a simple cast. */
299 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
301 while (byte_idx < end_idx)
305 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
306 && mbsinit (&pstr->cur_state))
308 /* In case of a singlebyte character. */
310 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
311 /* The next step uses the assumption that wchar_t is encoded
312 ASCII-safe: all ASCII values can be converted like this. */
313 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
318 remain_len = end_idx - byte_idx;
319 prev_st = pstr->cur_state;
320 mbclen = mbrtowc (&wc,
321 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
322 + byte_idx), remain_len, &pstr->cur_state);
323 if (BE (mbclen + 2 > 2, 1))
331 mbcdlen = wcrtomb (buf, wcu, &prev_st);
332 if (BE (mbclen == mbcdlen, 1))
333 memcpy (pstr->mbs + byte_idx, buf, mbclen);
341 memcpy (pstr->mbs + byte_idx,
342 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
343 pstr->wcs[byte_idx++] = wcu;
344 /* Write paddings. */
345 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346 pstr->wcs[byte_idx++] = WEOF;
348 else if (mbclen == (size_t) -1 || mbclen == 0)
350 /* It is an invalid character or '\0'. Just use the byte. */
351 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
352 pstr->mbs[byte_idx] = ch;
353 /* And also cast it to wide char. */
354 pstr->wcs[byte_idx++] = (wchar_t) ch;
355 if (BE (mbclen == (size_t) -1, 0))
356 pstr->cur_state = prev_st;
360 /* The buffer doesn't have enough space, finish to build. */
361 pstr->cur_state = prev_st;
365 pstr->valid_len = byte_idx;
366 pstr->valid_raw_len = byte_idx;
370 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
375 remain_len = end_idx - byte_idx;
376 prev_st = pstr->cur_state;
377 if (BE (pstr->trans != NULL, 0))
381 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
383 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
384 buf[i] = pstr->trans[ch];
386 p = (const char *) buf;
389 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
390 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
391 if (BE (mbclen + 2 > 2, 1))
399 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
400 if (BE (mbclen == mbcdlen, 1))
401 memcpy (pstr->mbs + byte_idx, buf, mbclen);
402 else if (mbcdlen != (size_t) -1)
406 if (byte_idx + mbcdlen > pstr->bufs_len)
408 pstr->cur_state = prev_st;
412 if (pstr->offsets == NULL)
414 pstr->offsets = re_malloc (int, pstr->bufs_len);
416 if (pstr->offsets == NULL)
419 if (!pstr->offsets_needed)
421 for (i = 0; i < (size_t) byte_idx; ++i)
422 pstr->offsets[i] = i;
423 pstr->offsets_needed = 1;
426 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
427 pstr->wcs[byte_idx] = wcu;
428 pstr->offsets[byte_idx] = src_idx;
429 for (i = 1; i < mbcdlen; ++i)
431 pstr->offsets[byte_idx + i]
432 = src_idx + (i < mbclen ? i : mbclen - 1);
433 pstr->wcs[byte_idx + i] = WEOF;
435 pstr->len += mbcdlen - mbclen;
436 if (pstr->raw_stop > src_idx)
437 pstr->stop += mbcdlen - mbclen;
438 end_idx = (pstr->bufs_len > pstr->len)
439 ? pstr->len : pstr->bufs_len;
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
448 memcpy (pstr->mbs + byte_idx, p, mbclen);
450 if (BE (pstr->offsets_needed != 0, 0))
453 for (i = 0; i < mbclen; ++i)
454 pstr->offsets[byte_idx + i] = src_idx + i;
458 pstr->wcs[byte_idx++] = wcu;
459 /* Write paddings. */
460 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
461 pstr->wcs[byte_idx++] = WEOF;
463 else if (mbclen == (size_t) -1 || mbclen == 0)
465 /* It is an invalid character or '\0'. Just use the byte. */
466 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
468 if (BE (pstr->trans != NULL, 0))
469 ch = pstr->trans [ch];
470 pstr->mbs[byte_idx] = ch;
472 if (BE (pstr->offsets_needed != 0, 0))
473 pstr->offsets[byte_idx] = src_idx;
476 /* And also cast it to wide char. */
477 pstr->wcs[byte_idx++] = (wchar_t) ch;
478 if (BE (mbclen == (size_t) -1, 0))
479 pstr->cur_state = prev_st;
483 /* The buffer doesn't have enough space, finish to build. */
484 pstr->cur_state = prev_st;
488 pstr->valid_len = byte_idx;
489 pstr->valid_raw_len = src_idx;
493 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
497 re_string_skip_chars (pstr, new_raw_idx, last_wc)
507 /* Skip the characters which are not necessary to check. */
508 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
509 rawbuf_idx < new_raw_idx;)
512 remain_len = pstr->len - rawbuf_idx;
513 prev_st = pstr->cur_state;
514 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
515 remain_len, &pstr->cur_state);
516 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
518 /* We treat these cases as a singlebyte character. */
520 pstr->cur_state = prev_st;
522 /* Then proceed the next character. */
523 rawbuf_idx += mbclen;
525 *last_wc = (wint_t) wc;
528 #endif /* RE_ENABLE_I18N */
530 /* Build the buffer PSTR->MBS, and apply the translation if we need.
531 This function is used in case of REG_ICASE. */
534 build_upper_buffer (pstr)
537 int char_idx, end_idx;
538 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
540 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
542 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
543 if (BE (pstr->trans != NULL, 0))
544 ch = pstr->trans[ch];
546 pstr->mbs[char_idx] = toupper (ch);
548 pstr->mbs[char_idx] = ch;
550 pstr->valid_len = char_idx;
551 pstr->valid_raw_len = char_idx;
554 /* Apply TRANS to the buffer in PSTR. */
557 re_string_translate_buffer (pstr)
560 int buf_idx, end_idx;
561 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
563 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
565 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
566 pstr->mbs[buf_idx] = pstr->trans[ch];
569 pstr->valid_len = buf_idx;
570 pstr->valid_raw_len = buf_idx;
573 /* This function re-construct the buffers.
574 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
575 convert to upper case in case of REG_ICASE, apply translation. */
578 re_string_reconstruct (pstr, idx, eflags)
582 int offset = idx - pstr->raw_mbs_idx;
583 if (BE (offset < 0, 0))
586 #ifdef RE_ENABLE_I18N
587 if (pstr->mb_cur_max > 1)
588 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589 #endif /* RE_ENABLE_I18N */
590 pstr->len = pstr->raw_len;
591 pstr->stop = pstr->raw_stop;
593 pstr->raw_mbs_idx = 0;
594 pstr->valid_raw_len = 0;
595 pstr->offsets_needed = 0;
596 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598 if (!pstr->mbs_allocated)
599 pstr->mbs = (unsigned char *) pstr->raw_mbs;
603 if (BE (offset != 0, 1))
605 /* Are the characters which are already checked remain? */
606 if (BE (offset < pstr->valid_raw_len, 1)
607 #ifdef RE_ENABLE_I18N
608 /* Handling this would enlarge the code too much.
609 Accept a slowdown in that case. */
610 && pstr->offsets_needed == 0
614 /* Yes, move them to the front of the buffer. */
615 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
616 #ifdef RE_ENABLE_I18N
617 if (pstr->mb_cur_max > 1)
618 memmove (pstr->wcs, pstr->wcs + offset,
619 (pstr->valid_len - offset) * sizeof (wint_t));
620 #endif /* RE_ENABLE_I18N */
621 if (BE (pstr->mbs_allocated, 0))
622 memmove (pstr->mbs, pstr->mbs + offset,
623 pstr->valid_len - offset);
624 pstr->valid_len -= offset;
625 pstr->valid_raw_len -= offset;
627 assert (pstr->valid_len > 0);
632 /* No, skip all characters until IDX. */
633 #ifdef RE_ENABLE_I18N
634 if (BE (pstr->offsets_needed, 0))
636 pstr->len = pstr->raw_len - idx + offset;
637 pstr->stop = pstr->raw_stop - idx + offset;
638 pstr->offsets_needed = 0;
642 pstr->valid_raw_len = 0;
643 #ifdef RE_ENABLE_I18N
644 if (pstr->mb_cur_max > 1)
651 const unsigned char *raw, *p, *q, *end;
653 /* Special case UTF-8. Multi-byte chars start with any
654 byte other than 0x80 - 0xbf. */
655 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
656 end = raw + (offset - pstr->mb_cur_max);
657 for (p = raw + offset - 1; p >= end; --p)
658 if ((*p & 0xc0) != 0x80)
662 int mlen = raw + pstr->len - p;
663 unsigned char buf[6];
666 if (BE (pstr->trans != NULL, 0))
668 int i = mlen < 6 ? mlen : 6;
670 buf[i] = pstr->trans[p[i]];
673 /* XXX Don't use mbrtowc, we know which conversion
674 to use (UTF-8 -> UCS4). */
675 memset (&cur_state, 0, sizeof (cur_state));
676 mlen = (mbrtowc (&wc2, (const char *) p, mlen,
678 - (raw + offset - p));
681 memset (&pstr->cur_state, '\0',
683 pstr->valid_len = mlen;
691 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
692 if (BE (pstr->valid_len, 0))
694 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
695 pstr->wcs[wcs_idx] = WEOF;
696 if (pstr->mbs_allocated)
697 memset (pstr->mbs, 255, pstr->valid_len);
699 pstr->valid_raw_len = pstr->valid_len;
700 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
701 && IS_WIDE_WORD_CHAR (wc))
703 : ((IS_WIDE_NEWLINE (wc)
704 && pstr->newline_anchor)
705 ? CONTEXT_NEWLINE : 0));
708 #endif /* RE_ENABLE_I18N */
710 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
713 pstr->tip_context = (bitset_contain (pstr->word_char, c)
715 : ((IS_NEWLINE (c) && pstr->newline_anchor)
716 ? CONTEXT_NEWLINE : 0));
719 if (!BE (pstr->mbs_allocated, 0))
722 pstr->raw_mbs_idx = idx;
724 pstr->stop -= offset;
726 /* Then build the buffers. */
727 #ifdef RE_ENABLE_I18N
728 if (pstr->mb_cur_max > 1)
732 int ret = build_wcs_upper_buffer (pstr);
733 if (BE (ret != REG_NOERROR, 0))
737 build_wcs_buffer (pstr);
740 #endif /* RE_ENABLE_I18N */
741 if (BE (pstr->mbs_allocated, 0))
744 build_upper_buffer (pstr);
745 else if (pstr->trans != NULL)
746 re_string_translate_buffer (pstr);
749 pstr->valid_len = pstr->len;
756 re_string_peek_byte_case (pstr, idx)
757 const re_string_t *pstr;
762 /* Handle the common (easiest) cases first. */
763 if (BE (!pstr->mbs_allocated, 1))
764 return re_string_peek_byte (pstr, idx);
766 #ifdef RE_ENABLE_I18N
767 if (pstr->mb_cur_max > 1
768 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
769 return re_string_peek_byte (pstr, idx);
772 off = pstr->cur_idx + idx;
773 #ifdef RE_ENABLE_I18N
774 if (pstr->offsets_needed)
775 off = pstr->offsets[off];
778 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
780 #ifdef RE_ENABLE_I18N
781 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
782 this function returns CAPITAL LETTER I instead of first byte of
783 DOTLESS SMALL LETTER I. The latter would confuse the parser,
784 since peek_byte_case doesn't advance cur_idx in any way. */
785 if (pstr->offsets_needed && !isascii (ch))
786 return re_string_peek_byte (pstr, idx);
793 re_string_fetch_byte_case (pstr)
796 if (BE (!pstr->mbs_allocated, 1))
797 return re_string_fetch_byte (pstr);
799 #ifdef RE_ENABLE_I18N
800 if (pstr->offsets_needed)
804 /* For tr_TR.UTF-8 [[:islower:]] there is
805 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
806 in that case the whole multi-byte character and return
807 the original letter. On the other side, with
808 [[: DOTLESS SMALL LETTER I return [[:I, as doing
809 anything else would complicate things too much. */
811 if (!re_string_first_byte (pstr, pstr->cur_idx))
812 return re_string_fetch_byte (pstr);
814 off = pstr->offsets[pstr->cur_idx];
815 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
818 return re_string_fetch_byte (pstr);
820 re_string_skip_bytes (pstr,
821 re_string_char_size_at (pstr, pstr->cur_idx));
826 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
830 re_string_destruct (pstr)
833 #ifdef RE_ENABLE_I18N
835 re_free (pstr->offsets);
836 #endif /* RE_ENABLE_I18N */
837 if (pstr->mbs_allocated)
841 /* Return the context at IDX in INPUT. */
844 re_string_context_at (input, idx, eflags)
845 const re_string_t *input;
850 /* In this case, we use the value stored in input->tip_context,
851 since we can't know the character in input->mbs[-1] here. */
852 return input->tip_context;
853 if (BE (idx == input->len, 0))
854 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
855 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
856 #ifdef RE_ENABLE_I18N
857 if (input->mb_cur_max > 1)
861 while(input->wcs[wc_idx] == WEOF)
864 /* It must not happen. */
865 assert (wc_idx >= 0);
869 return input->tip_context;
871 wc = input->wcs[wc_idx];
872 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
874 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
875 ? CONTEXT_NEWLINE : 0);
880 c = re_string_byte_at (input, idx);
881 if (bitset_contain (input->word_char, c))
883 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
887 /* Functions for set operation. */
890 re_node_set_alloc (set, size)
896 set->elems = re_malloc (int, size);
897 if (BE (set->elems == NULL, 0))
903 re_node_set_init_1 (set, elem)
909 set->elems = re_malloc (int, 1);
910 if (BE (set->elems == NULL, 0))
912 set->alloc = set->nelem = 0;
915 set->elems[0] = elem;
920 re_node_set_init_2 (set, elem1, elem2)
925 set->elems = re_malloc (int, 2);
926 if (BE (set->elems == NULL, 0))
931 set->elems[0] = elem1;
938 set->elems[0] = elem1;
939 set->elems[1] = elem2;
943 set->elems[0] = elem2;
944 set->elems[1] = elem1;
951 re_node_set_init_copy (dest, src)
953 const re_node_set *src;
955 dest->nelem = src->nelem;
958 dest->alloc = dest->nelem;
959 dest->elems = re_malloc (int, dest->alloc);
960 if (BE (dest->elems == NULL, 0))
962 dest->alloc = dest->nelem = 0;
965 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
968 re_node_set_init_empty (dest);
972 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
973 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
974 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
977 re_node_set_add_intersect (dest, src1, src2)
979 const re_node_set *src1, *src2;
981 int i1, i2, is, id, delta, sbase;
982 if (src1->nelem == 0 || src2->nelem == 0)
985 /* We need dest->nelem + 2 * elems_in_intersection; this is a
986 conservative estimate. */
987 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
989 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
990 int *new_elems = re_realloc (dest->elems, int, new_alloc);
991 if (BE (new_elems == NULL, 0))
993 dest->elems = new_elems;
994 dest->alloc = new_alloc;
997 /* Find the items in the intersection of SRC1 and SRC2, and copy
998 into the top of DEST those that are not already in DEST itself. */
999 sbase = dest->nelem + src1->nelem + src2->nelem;
1000 i1 = src1->nelem - 1;
1001 i2 = src2->nelem - 1;
1002 id = dest->nelem - 1;
1005 if (src1->elems[i1] == src2->elems[i2])
1007 /* Try to find the item in DEST. Maybe we could binary search? */
1008 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1011 if (id < 0 || dest->elems[id] != src1->elems[i1])
1012 dest->elems[--sbase] = src1->elems[i1];
1014 if (--i1 < 0 || --i2 < 0)
1018 /* Lower the highest of the two items. */
1019 else if (src1->elems[i1] < src2->elems[i2])
1031 id = dest->nelem - 1;
1032 is = dest->nelem + src1->nelem + src2->nelem - 1;
1033 delta = is - sbase + 1;
1035 /* Now copy. When DELTA becomes zero, the remaining
1036 DEST elements are already in place; this is more or
1037 less the same loop that is in re_node_set_merge. */
1038 dest->nelem += delta;
1039 if (delta > 0 && id >= 0)
1042 if (dest->elems[is] > dest->elems[id])
1044 /* Copy from the top. */
1045 dest->elems[id + delta--] = dest->elems[is--];
1051 /* Slide from the bottom. */
1052 dest->elems[id + delta] = dest->elems[id];
1058 /* Copy remaining SRC elements. */
1059 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1064 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1065 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1067 static reg_errcode_t
1068 re_node_set_init_union (dest, src1, src2)
1070 const re_node_set *src1, *src2;
1073 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1075 dest->alloc = src1->nelem + src2->nelem;
1076 dest->elems = re_malloc (int, dest->alloc);
1077 if (BE (dest->elems == NULL, 0))
1082 if (src1 != NULL && src1->nelem > 0)
1083 return re_node_set_init_copy (dest, src1);
1084 else if (src2 != NULL && src2->nelem > 0)
1085 return re_node_set_init_copy (dest, src2);
1087 re_node_set_init_empty (dest);
1090 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1092 if (src1->elems[i1] > src2->elems[i2])
1094 dest->elems[id++] = src2->elems[i2++];
1097 if (src1->elems[i1] == src2->elems[i2])
1099 dest->elems[id++] = src1->elems[i1++];
1101 if (i1 < src1->nelem)
1103 memcpy (dest->elems + id, src1->elems + i1,
1104 (src1->nelem - i1) * sizeof (int));
1105 id += src1->nelem - i1;
1107 else if (i2 < src2->nelem)
1109 memcpy (dest->elems + id, src2->elems + i2,
1110 (src2->nelem - i2) * sizeof (int));
1111 id += src2->nelem - i2;
1117 /* Calculate the union set of the sets DEST and SRC. And store it to
1118 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1120 static reg_errcode_t
1121 re_node_set_merge (dest, src)
1123 const re_node_set *src;
1125 int is, id, sbase, delta;
1126 if (src == NULL || src->nelem == 0)
1128 if (dest->alloc < 2 * src->nelem + dest->nelem)
1130 int new_alloc = 2 * (src->nelem + dest->alloc);
1131 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1132 if (BE (new_buffer == NULL, 0))
1134 dest->elems = new_buffer;
1135 dest->alloc = new_alloc;
1138 if (BE (dest->nelem == 0, 0))
1140 dest->nelem = src->nelem;
1141 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1145 /* Copy into the top of DEST the items of SRC that are not
1146 found in DEST. Maybe we could binary search in DEST? */
1147 for (sbase = dest->nelem + 2 * src->nelem,
1148 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1150 if (dest->elems[id] == src->elems[is])
1152 else if (dest->elems[id] < src->elems[is])
1153 dest->elems[--sbase] = src->elems[is--];
1154 else /* if (dest->elems[id] > src->elems[is]) */
1160 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1162 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1165 id = dest->nelem - 1;
1166 is = dest->nelem + 2 * src->nelem - 1;
1167 delta = is - sbase + 1;
1171 /* Now copy. When DELTA becomes zero, the remaining
1172 DEST elements are already in place. */
1173 dest->nelem += delta;
1176 if (dest->elems[is] > dest->elems[id])
1178 /* Copy from the top. */
1179 dest->elems[id + delta--] = dest->elems[is--];
1185 /* Slide from the bottom. */
1186 dest->elems[id + delta] = dest->elems[id];
1189 /* Copy remaining SRC elements. */
1190 memcpy (dest->elems, dest->elems + sbase,
1191 delta * sizeof (int));
1200 /* Insert the new element ELEM to the re_node_set* SET.
1201 SET should not already have ELEM.
1202 return -1 if an error is occured, return 1 otherwise. */
1205 re_node_set_insert (set, elem)
1210 /* In case the set is empty. */
1211 if (set->alloc == 0)
1213 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1219 if (BE (set->nelem, 0) == 0)
1221 /* We already guaranteed above that set->alloc != 0. */
1222 set->elems[0] = elem;
1227 /* Realloc if we need. */
1228 if (set->alloc == set->nelem)
1231 set->alloc = set->alloc * 2;
1232 new_array = re_realloc (set->elems, int, set->alloc);
1233 if (BE (new_array == NULL, 0))
1235 set->elems = new_array;
1238 /* Move the elements which follows the new element. Test the
1239 first element separately to skip a check in the inner loop. */
1240 if (elem < set->elems[0])
1243 for (idx = set->nelem; idx > 0; idx--)
1244 set->elems[idx] = set->elems[idx - 1];
1248 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1249 set->elems[idx] = set->elems[idx - 1];
1252 /* Insert the new element. */
1253 set->elems[idx] = elem;
1258 /* Insert the new element ELEM to the re_node_set* SET.
1259 SET should not already have any element greater than or equal to ELEM.
1260 Return -1 if an error is occured, return 1 otherwise. */
1263 re_node_set_insert_last (set, elem)
1267 /* Realloc if we need. */
1268 if (set->alloc == set->nelem)
1271 set->alloc = (set->alloc + 1) * 2;
1272 new_array = re_realloc (set->elems, int, set->alloc);
1273 if (BE (new_array == NULL, 0))
1275 set->elems = new_array;
1278 /* Insert the new element. */
1279 set->elems[set->nelem++] = elem;
1283 /* Compare two node sets SET1 and SET2.
1284 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1287 re_node_set_compare (set1, set2)
1288 const re_node_set *set1, *set2;
1291 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1293 for (i = set1->nelem ; --i >= 0 ; )
1294 if (set1->elems[i] != set2->elems[i])
1299 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1302 re_node_set_contains (set, elem)
1303 const re_node_set *set;
1306 unsigned int idx, right, mid;
1307 if (set->nelem <= 0)
1310 /* Binary search the element. */
1312 right = set->nelem - 1;
1315 mid = (idx + right) / 2;
1316 if (set->elems[mid] < elem)
1321 return set->elems[idx] == elem ? idx + 1 : 0;
1325 re_node_set_remove_at (set, idx)
1329 if (idx < 0 || idx >= set->nelem)
1332 for (; idx < set->nelem; idx++)
1333 set->elems[idx] = set->elems[idx + 1];
1337 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1338 Or return -1, if an error will be occured. */
1341 re_dfa_add_node (dfa, token)
1345 int type = token.type;
1346 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1348 int new_nodes_alloc = dfa->nodes_alloc * 2;
1349 int *new_nexts, *new_indices;
1350 re_node_set *new_edests, *new_eclosures;
1352 re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1354 if (BE (new_array == NULL, 0))
1356 dfa->nodes = new_array;
1357 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1358 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1359 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1360 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1361 if (BE (new_nexts == NULL || new_indices == NULL
1362 || new_edests == NULL || new_eclosures == NULL, 0))
1364 dfa->nexts = new_nexts;
1365 dfa->org_indices = new_indices;
1366 dfa->edests = new_edests;
1367 dfa->eclosures = new_eclosures;
1368 dfa->nodes_alloc = new_nodes_alloc;
1370 dfa->nodes[dfa->nodes_len] = token;
1371 dfa->nodes[dfa->nodes_len].constraint = 0;
1372 #ifdef RE_ENABLE_I18N
1373 dfa->nodes[dfa->nodes_len].accept_mb =
1374 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1376 dfa->nexts[dfa->nodes_len] = -1;
1377 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1378 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1379 return dfa->nodes_len++;
1382 static unsigned int inline
1383 calc_state_hash (nodes, context)
1384 const re_node_set *nodes;
1385 unsigned int context;
1387 unsigned int hash = nodes->nelem + context;
1389 for (i = 0 ; i < nodes->nelem ; i++)
1390 hash += nodes->elems[i];
1394 /* Search for the state whose node_set is equivalent to NODES.
1395 Return the pointer to the state, if we found it in the DFA.
1396 Otherwise create the new one and return it. In case of an error
1397 return NULL and set the error code in ERR.
1398 Note: - We assume NULL as the invalid state, then it is possible that
1399 return value is NULL and ERR is REG_NOERROR.
1400 - We never return non-NULL value in case of any errors, it is for
1403 static re_dfastate_t*
1404 re_acquire_state (err, dfa, nodes)
1407 const re_node_set *nodes;
1410 re_dfastate_t *new_state;
1411 struct re_state_table_entry *spot;
1413 if (BE (nodes->nelem == 0, 0))
1418 hash = calc_state_hash (nodes, 0);
1419 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1421 for (i = 0 ; i < spot->num ; i++)
1423 re_dfastate_t *state = spot->array[i];
1424 if (hash != state->hash)
1426 if (re_node_set_compare (&state->nodes, nodes))
1430 /* There are no appropriate state in the dfa, create the new one. */
1431 new_state = create_ci_newstate (dfa, nodes, hash);
1432 if (BE (new_state != NULL, 1))
1441 /* Search for the state whose node_set is equivalent to NODES and
1442 whose context is equivalent to CONTEXT.
1443 Return the pointer to the state, if we found it in the DFA.
1444 Otherwise create the new one and return it. In case of an error
1445 return NULL and set the error code in ERR.
1446 Note: - We assume NULL as the invalid state, then it is possible that
1447 return value is NULL and ERR is REG_NOERROR.
1448 - We never return non-NULL value in case of any errors, it is for
1451 static re_dfastate_t*
1452 re_acquire_state_context (err, dfa, nodes, context)
1455 const re_node_set *nodes;
1456 unsigned int context;
1459 re_dfastate_t *new_state;
1460 struct re_state_table_entry *spot;
1462 if (nodes->nelem == 0)
1467 hash = calc_state_hash (nodes, context);
1468 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1470 for (i = 0 ; i < spot->num ; i++)
1472 re_dfastate_t *state = spot->array[i];
1473 if (state->hash == hash
1474 && state->context == context
1475 && re_node_set_compare (state->entrance_nodes, nodes))
1478 /* There are no appropriate state in `dfa', create the new one. */
1479 new_state = create_cd_newstate (dfa, nodes, context, hash);
1480 if (BE (new_state != NULL, 1))
1489 /* Finish initialization of the new state NEWSTATE, and using its hash value
1490 HASH put in the appropriate bucket of DFA's state table. Return value
1491 indicates the error code if failed. */
1493 static reg_errcode_t
1494 register_state (dfa, newstate, hash)
1496 re_dfastate_t *newstate;
1499 struct re_state_table_entry *spot;
1503 newstate->hash = hash;
1504 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1505 if (BE (err != REG_NOERROR, 0))
1507 for (i = 0; i < newstate->nodes.nelem; i++)
1509 int elem = newstate->nodes.elems[i];
1510 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1511 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1514 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1515 if (BE (spot->alloc <= spot->num, 0))
1517 int new_alloc = 2 * spot->num + 2;
1518 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1520 if (BE (new_array == NULL, 0))
1522 spot->array = new_array;
1523 spot->alloc = new_alloc;
1525 spot->array[spot->num++] = newstate;
1529 /* Create the new state which is independ of contexts.
1530 Return the new state if succeeded, otherwise return NULL. */
1532 static re_dfastate_t *
1533 create_ci_newstate (dfa, nodes, hash)
1535 const re_node_set *nodes;
1540 re_dfastate_t *newstate;
1542 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1543 if (BE (newstate == NULL, 0))
1545 err = re_node_set_init_copy (&newstate->nodes, nodes);
1546 if (BE (err != REG_NOERROR, 0))
1552 newstate->entrance_nodes = &newstate->nodes;
1553 for (i = 0 ; i < nodes->nelem ; i++)
1555 re_token_t *node = dfa->nodes + nodes->elems[i];
1556 re_token_type_t type = node->type;
1557 if (type == CHARACTER && !node->constraint)
1559 #ifdef RE_ENABLE_I18N
1560 newstate->accept_mb |= node->accept_mb;
1561 #endif /* RE_ENABLE_I18N */
1563 /* If the state has the halt node, the state is a halt state. */
1564 if (type == END_OF_RE)
1566 else if (type == OP_BACK_REF)
1567 newstate->has_backref = 1;
1568 else if (type == ANCHOR || node->constraint)
1569 newstate->has_constraint = 1;
1571 err = register_state (dfa, newstate, hash);
1572 if (BE (err != REG_NOERROR, 0))
1574 free_state (newstate);
1580 /* Create the new state which is depend on the context CONTEXT.
1581 Return the new state if succeeded, otherwise return NULL. */
1583 static re_dfastate_t *
1584 create_cd_newstate (dfa, nodes, context, hash)
1586 const re_node_set *nodes;
1587 unsigned int context, hash;
1589 int i, nctx_nodes = 0;
1591 re_dfastate_t *newstate;
1593 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1594 if (BE (newstate == NULL, 0))
1596 err = re_node_set_init_copy (&newstate->nodes, nodes);
1597 if (BE (err != REG_NOERROR, 0))
1603 newstate->context = context;
1604 newstate->entrance_nodes = &newstate->nodes;
1606 for (i = 0 ; i < nodes->nelem ; i++)
1608 unsigned int constraint = 0;
1609 re_token_t *node = dfa->nodes + nodes->elems[i];
1610 re_token_type_t type = node->type;
1611 if (node->constraint)
1612 constraint = node->constraint;
1614 if (type == CHARACTER && !constraint)
1616 #ifdef RE_ENABLE_I18N
1617 newstate->accept_mb |= node->accept_mb;
1618 #endif /* RE_ENABLE_I18N */
1620 /* If the state has the halt node, the state is a halt state. */
1621 if (type == END_OF_RE)
1623 else if (type == OP_BACK_REF)
1624 newstate->has_backref = 1;
1625 else if (type == ANCHOR)
1626 constraint = node->opr.ctx_type;
1630 if (newstate->entrance_nodes == &newstate->nodes)
1632 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1633 if (BE (newstate->entrance_nodes == NULL, 0))
1635 free_state (newstate);
1638 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1640 newstate->has_constraint = 1;
1643 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1645 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1650 err = register_state (dfa, newstate, hash);
1651 if (BE (err != REG_NOERROR, 0))
1653 free_state (newstate);
1661 re_dfastate_t *state;
1663 re_node_set_free (&state->non_eps_nodes);
1664 re_node_set_free (&state->inveclosure);
1665 if (state->entrance_nodes != &state->nodes)
1667 re_node_set_free (state->entrance_nodes);
1668 re_free (state->entrance_nodes);
1670 re_node_set_free (&state->nodes);
1671 re_free (state->word_trtable);
1672 re_free (state->trtable);