diff options
author | ksaito <ksaito@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-11-04 14:31:26 +0000 |
---|---|---|
committer | ksaito <ksaito@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-11-04 14:31:26 +0000 |
commit | afd0a23874626055f6d14f206e1d787ff80c110a (patch) | |
tree | 68652bc90625975ffce4dc7cebb5d85f2f43bc93 | |
parent | c2bfb31fd654333ae51ca50e01f0b911881b56e7 (diff) | |
download | ruby-afd0a23874626055f6d14f206e1d787ff80c110a.tar.gz |
imported Oni Guruma 3.4.0.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/KOSAKO@7203 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ascii.c | 79 | ||||
-rw-r--r-- | euc_jp.c | 161 | ||||
-rw-r--r-- | oniggnu.h | 45 | ||||
-rw-r--r-- | oniguruma.h | 296 | ||||
-rw-r--r-- | regcomp.c | 822 | ||||
-rw-r--r-- | regenc.c | 614 | ||||
-rw-r--r-- | regenc.h | 85 | ||||
-rw-r--r-- | regerror.c | 71 | ||||
-rw-r--r-- | regexec.c | 914 | ||||
-rw-r--r-- | reggnu.c | 38 | ||||
-rw-r--r-- | regint.h | 93 | ||||
-rw-r--r-- | regparse.c | 948 | ||||
-rw-r--r-- | regparse.h | 101 | ||||
-rw-r--r-- | sjis.c | 158 | ||||
-rw-r--r-- | utf8.c | 450 |
15 files changed, 3061 insertions, 1814 deletions
@@ -1,14 +1,36 @@ /********************************************************************** - ascii.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regenc.h" static int -ascii_code_is_ctype(OnigCodePoint code, unsigned int ctype) +ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); @@ -17,38 +39,29 @@ ascii_code_is_ctype(OnigCodePoint code, unsigned int ctype) } OnigEncodingType OnigEncodingASCII = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "US-ASCII", /* name */ 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* min byte length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - onigenc_ascii_mbc_to_lower, - onigenc_ascii_mbc_is_case_ambig, - ascii_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + onigenc_ascii_mbc_to_normalize, + onigenc_ascii_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + ascii_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; @@ -1,23 +1,69 @@ /********************************************************************** - euc_jp.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) +static int EncLen_EUCJP[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 +}; + +static int +eucjp_mbc_enc_len(UChar* p) +{ + return EncLen_EUCJP[*p]; +} + static OnigCodePoint eucjp_mbc_to_code(UChar* p, UChar* end) { int c, i, len; OnigCodePoint n; - c = *p++; - len = enc_len(ONIG_ENCODING_EUC_JP, c); - n = c; + len = enc_len(ONIG_ENCODING_EUC_JP, p); + n = (OnigCodePoint )*p++; if (len == 1) return n; for (i = 1; i < len; i++) { @@ -31,11 +77,13 @@ eucjp_mbc_to_code(UChar* p, UChar* end) static int eucjp_code_to_mbclen(OnigCodePoint code) { - if ((code & 0xff0000) != 0) return 3; + if 
(ONIGENC_IS_CODE_ASCII(code)) return 1; + else if ((code & 0xff0000) != 0) return 3; else if ((code & 0xff00) != 0) return 2; - else return 1; + else return 0; } +#if 0 static int eucjp_code_to_mbc_first(OnigCodePoint code) { @@ -43,27 +91,16 @@ eucjp_code_to_mbc_first(OnigCodePoint code) if ((code & 0xff0000) != 0) { first = (code >> 16) & 0xff; - /* - if (enc_len(ONIG_ENCODING_EUC_JP, first) != 3) - return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; - */ } else if ((code & 0xff00) != 0) { first = (code >> 8) & 0xff; - /* - if (enc_len(ONIG_ENCODING_EUC_JP, first) != 2) - return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; - */ } else { - /* - if (enc_len(ONIG_ENCODING_EUC_JP, code) != 1) - return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; - */ return (int )code; } return first; } +#endif static int eucjp_code_to_mbc(OnigCodePoint code, UChar *buf) @@ -75,44 +112,57 @@ eucjp_code_to_mbc(OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 1 - if (enc_len(ONIG_ENCODING_EUC_JP, buf[0]) != (p - buf)) + if (enc_len(ONIG_ENCODING_EUC_JP, buf) != (p - buf)) return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } static int -eucjp_mbc_to_lower(UChar* p, UChar* lower) +eucjp_mbc_to_normalize(OnigAmbigType flag, UChar** pp, UChar* end, + UChar* lower) { int len; + UChar* p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + + (*pp)++; return 1; } else { - len = enc_len(ONIG_ENCODING_EUC_JP, *p); + len = enc_len(ONIG_ENCODING_EUC_JP, p); if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted char to lower */ } } static int -eucjp_code_is_ctype(OnigCodePoint code, unsigned int ctype) +eucjp_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end) +{ + return 
onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_JP, flag, pp, end); +} + +static int +eucjp_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - int first = eucjp_code_to_mbc_first(code); - return (enc_len(ONIG_ENCODING_EUC_JP, first) > 1 ? TRUE : FALSE); - } + else + return (eucjp_code_to_mbclen(code) > 1 ? TRUE : FALSE); ctype &= ~ONIGENC_CTYPE_WORD; if (ctype == 0) return FALSE; @@ -137,7 +187,7 @@ eucjp_left_adjust_char_head(UChar* start, UChar* s) p = s; while (!eucjp_islead(*p) && p > start) p--; - len = enc_len(ONIG_ENCODING_EUC_JP, *p); + len = enc_len(ONIG_ENCODING_EUC_JP, p); if (p + len > s) return p; p += len; return p + ((s - p) & ~1); @@ -154,38 +204,29 @@ eucjp_is_allowed_reverse_match(UChar* s, UChar* end) } OnigEncodingType OnigEncodingEUC_JP = { + eucjp_mbc_enc_len, + "EUC-JP", /* name */ + 3, /* max enc length */ + 1, /* min enc length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ }, - "EUC-JP", /* name */ - 3, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + onigenc_is_mbc_newline_0x0a, eucjp_mbc_to_code, eucjp_code_to_mbclen, eucjp_code_to_mbc, - eucjp_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - eucjp_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + eucjp_mbc_to_normalize, + eucjp_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + eucjp_is_code_ctype, + onigenc_not_support_get_ctype_code_range, eucjp_left_adjust_char_head, - eucjp_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + eucjp_is_allowed_reverse_match }; @@ -1,12 +1,33 @@ +#ifndef ONIGGNU_H +#define ONIGGNU_H /********************************************************************** - oniggnu.h - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ -#ifndef ONIGGNU_H -#define ONIGGNU_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #include "oniguruma.h" @@ -27,6 +48,18 @@ #define RE_OPTION_POSIXLINE (RE_OPTION_MULTILINE|RE_OPTION_SINGLELINE) #ifdef RUBY_PLATFORM + +#ifndef ONIG_RUBY_M17N + +ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; + +#undef ismbchar +#define ismbchar(c) (mbclen((c)) != 1) +#define mbclen(c) \ + ONIGENC_MBC_ENC_LEN(OnigEncDefaultCharEncoding, (UChar* )(&c)) + +#endif /* ifndef ONIG_RUBY_M17N */ + #define re_mbcinit ruby_re_mbcinit #define re_compile_pattern ruby_re_compile_pattern #define re_recompile_pattern ruby_re_recompile_pattern diff --git a/oniguruma.h b/oniguruma.h index 3fd9f4c395..c10f3b4d18 100644 --- a/oniguruma.h +++ b/oniguruma.h @@ -1,17 +1,38 @@ +#ifndef ONIGURUMA_H +#define ONIGURUMA_H /********************************************************************** - oniguruma.h - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ -#ifndef ONIGURUMA_H 
-#define ONIGURUMA_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ #define ONIGURUMA -#define ONIGURUMA_VERSION_MAJOR 2 -#define ONIGURUMA_VERSION_MINOR 2 -#define ONIGURUMA_VERSION_TEENY 8 +#define ONIGURUMA_VERSION_MAJOR 3 +#define ONIGURUMA_VERSION_MINOR 4 +#define ONIGURUMA_VERSION_TEENY 0 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -56,12 +77,56 @@ typedef struct { OnigCodePoint to; } OnigCodePointRange; -#define ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE 16 + +/* ambiguous match flag */ +#define ONIGENC_AMBIGUOUS_MATCH_NONE 0 +#define ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE (1<<0) +#define ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE (1<<1) +/* #define ONIGENC_AMBIGUOUS_MATCH_ACCENT (1<<2) */ +/* #define ONIGENC_AMBIGUOUS_MATCH_HIRAGANA_KATAKANA (1<<3) */ +/* #define ONIGENC_AMBIGUOUS_MATCH_KATAKANA_WIDTH (1<<4) */ + +#define ONIGENC_AMBIGUOUS_MATCH_LIMIT (1<<1) +#define ONIGENC_AMBIGUOUS_MATCH_COMPOUND (1<<30) + +#define ONIGENC_AMBIGUOUS_MATCH_FULL \ + ( ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_COMPOUND ) +#define ONIGENC_AMBIGUOUS_MATCH_DEFAULT \ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_COMPOUND ) + +typedef unsigned int OnigAmbigType; + +#define ONIGENC_MAX_COMP_AMBIG_CODE_LEN 3 +#define ONIGENC_MAX_COMP_AMBIG_CODE_ITEM_NUM 4 + +typedef struct { + int len; + OnigCodePoint code[ONIGENC_MAX_COMP_AMBIG_CODE_LEN]; +} OnigCompAmbigCodeItem; + typedef struct { - int target_num; - int target_byte_len[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE]; - UChar* target_str[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE]; -} OnigEncFoldMatchInfo; + int n; + OnigCodePoint code; + OnigCompAmbigCodeItem items[ONIGENC_MAX_COMP_AMBIG_CODE_ITEM_NUM]; +} OnigCompAmbigCodes; + +typedef struct { + OnigCodePoint from; + OnigCodePoint to; +} OnigPairAmbigCodes; + +typedef struct { + OnigCodePoint esc; + OnigCodePoint anychar; + OnigCodePoint anytime; + OnigCodePoint zero_or_one_time; + OnigCodePoint one_or_more_time; + 
OnigCodePoint anychar_anytime; +} OnigMetaCharTableType; #if defined(RUBY_PLATFORM) && defined(M17N_H) @@ -72,23 +137,24 @@ typedef m17n_encoding* OnigEncoding; #else typedef struct { - const char len_table[256]; - const char* name; - int max_enc_len; - int is_fold_match; - int ctype_support_level; /* sb-only/full */ - int is_continuous_sb_mb; /* code point is continuous from sb to mb */ + int (*mbc_enc_len)(UChar* p); + const char* name; + int max_enc_len; + int min_enc_len; + OnigAmbigType support_ambig_flag; + OnigMetaCharTableType meta_char_table; + int (*is_mbc_newline)(UChar* p, UChar* end); OnigCodePoint (*mbc_to_code)(UChar* p, UChar* end); int (*code_to_mbclen)(OnigCodePoint code); int (*code_to_mbc)(OnigCodePoint code, UChar *buf); - int (*mbc_to_lower)(UChar* p, UChar* lower); - int (*mbc_is_case_ambig)(UChar* p); - int (*code_is_ctype)(OnigCodePoint code, unsigned int ctype); + int (*mbc_to_normalize)(OnigAmbigType flag, UChar** pp, UChar* end, UChar* to); + int (*is_mbc_ambiguous)(OnigAmbigType flag, UChar** pp, UChar* end); + int (*get_all_pair_ambig_codes)(OnigAmbigType flag, OnigPairAmbigCodes** acs); + int (*get_all_comp_ambig_codes)(OnigAmbigType flag, OnigCompAmbigCodes** acs); + int (*is_code_ctype)(OnigCodePoint code, unsigned int ctype); int (*get_ctype_code_range)(int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]); - UChar* (*left_adjust_char_head)(UChar* start, UChar* s); - int (*is_allowed_reverse_match)(UChar* p, UChar* e); - int (*get_all_fold_match_code)(OnigCodePoint** codes); - int (*get_fold_match_info)(UChar* p, UChar* end, OnigEncFoldMatchInfo** info); + UChar* (*left_adjust_char_head)(UChar* start, UChar* p); + int (*is_allowed_reverse_match)(UChar* p, UChar* end); } OnigEncodingType; typedef OnigEncodingType* OnigEncoding; @@ -110,6 +176,10 @@ ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_14; ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_15; ONIG_EXTERN OnigEncodingType 
OnigEncodingISO_8859_16; ONIG_EXTERN OnigEncodingType OnigEncodingUTF8; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_BE; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_LE; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_BE; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_LE; ONIG_EXTERN OnigEncodingType OnigEncodingEUC_JP; ONIG_EXTERN OnigEncodingType OnigEncodingEUC_TW; ONIG_EXTERN OnigEncodingType OnigEncodingEUC_KR; @@ -136,6 +206,10 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; #define ONIG_ENCODING_ISO_8859_15 (&OnigEncodingISO_8859_15) #define ONIG_ENCODING_ISO_8859_16 (&OnigEncodingISO_8859_16) #define ONIG_ENCODING_UTF8 (&OnigEncodingUTF8) +#define ONIG_ENCODING_UTF16_BE (&OnigEncodingUTF16_BE) +#define ONIG_ENCODING_UTF16_LE (&OnigEncodingUTF16_LE) +#define ONIG_ENCODING_UTF32_BE (&OnigEncodingUTF32_BE) +#define ONIG_ENCODING_UTF32_LE (&OnigEncodingUTF32_LE) #define ONIG_ENCODING_EUC_JP (&OnigEncodingEUC_JP) #define ONIG_ENCODING_EUC_TW (&OnigEncodingEUC_TW) #define ONIG_ENCODING_EUC_KR (&OnigEncodingEUC_KR) @@ -151,35 +225,32 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; /* work size */ -#define ONIGENC_CODE_TO_MBC_MAXLEN 7 -#define ONIGENC_MBC_TO_LOWER_MAXLEN ONIGENC_CODE_TO_MBC_MAXLEN +#define ONIGENC_CODE_TO_MBC_MAXLEN 7 +#define ONIGENC_MBC_NORMALIZE_MAXLEN ONIGENC_CODE_TO_MBC_MAXLEN /* character types */ -#define ONIGENC_CTYPE_ALPHA (1<< 0) -#define ONIGENC_CTYPE_BLANK (1<< 1) -#define ONIGENC_CTYPE_CNTRL (1<< 2) -#define ONIGENC_CTYPE_DIGIT (1<< 3) -#define ONIGENC_CTYPE_GRAPH (1<< 4) -#define ONIGENC_CTYPE_LOWER (1<< 5) -#define ONIGENC_CTYPE_PRINT (1<< 6) -#define ONIGENC_CTYPE_PUNCT (1<< 7) -#define ONIGENC_CTYPE_SPACE (1<< 8) -#define ONIGENC_CTYPE_UPPER (1<< 9) -#define ONIGENC_CTYPE_XDIGIT (1<<10) -#define ONIGENC_CTYPE_WORD (1<<11) -#define ONIGENC_CTYPE_ASCII (1<<12) +#define ONIGENC_CTYPE_NEWLINE (1<< 0) +#define ONIGENC_CTYPE_ALPHA (1<< 1) +#define ONIGENC_CTYPE_BLANK (1<< 2) +#define ONIGENC_CTYPE_CNTRL (1<< 3) +#define 
ONIGENC_CTYPE_DIGIT (1<< 4) +#define ONIGENC_CTYPE_GRAPH (1<< 5) +#define ONIGENC_CTYPE_LOWER (1<< 6) +#define ONIGENC_CTYPE_PRINT (1<< 7) +#define ONIGENC_CTYPE_PUNCT (1<< 8) +#define ONIGENC_CTYPE_SPACE (1<< 9) +#define ONIGENC_CTYPE_UPPER (1<<10) +#define ONIGENC_CTYPE_XDIGIT (1<<11) +#define ONIGENC_CTYPE_WORD (1<<12) +#define ONIGENC_CTYPE_ASCII (1<<13) #define ONIGENC_CTYPE_ALNUM (ONIGENC_CTYPE_ALPHA | ONIGENC_CTYPE_DIGIT) -/* ctype support level */ -#define ONIGENC_CTYPE_SUPPORT_LEVEL_SB 0 -#define ONIGENC_CTYPE_SUPPORT_LEVEL_FULL 1 - -#define enc_len(enc,byte) ONIGENC_MBC_LEN_BY_HEAD(enc,byte) +#define enc_len(enc,p) ONIGENC_MBC_ENC_LEN(enc,p) #define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF) #define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1) -#define ONIGENC_IS_MBC_HEAD(enc,byte) (ONIGENC_MBC_LEN_BY_HEAD(enc,byte) != 1) +#define ONIGENC_IS_MBC_HEAD(enc,p) (ONIGENC_MBC_ENC_LEN(enc,p) != 1) #define ONIGENC_IS_MBC_ASCII(p) (*(p) < 128) #define ONIGENC_IS_CODE_ASCII(code) ((code) < 128) #define ONIGENC_IS_CODE_SB_WORD(enc,code) \ @@ -192,31 +263,33 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; #include <ctype.h> /* for isblank(), isgraph() */ -#define ONIGENC_MBC_TO_LOWER(enc,p,buf) onigenc_mbc_to_lower(enc,p,buf) -#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) onigenc_mbc_is_case_ambig(enc,p) +#define ONIGENC_MBC_TO_NORMALIZE(enc,flag,pp,end,buf) \ + onigenc_mbc_to_normalize(enc,flag,pp,end,buf) +#define ONIGENC_IS_MBC_AMBIGUOUS(enc,flag,pp,end) \ + onigenc_is_mbc_ambiguous(enc,flag,pp,end) -#define ONIGENC_IS_FOLD_MATCH(enc) FALSE -#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) FALSE -#define ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ONIGENC_CTYPE_SUPPORT_LEVEL_SB +#define ONIGENC_SUPPORT_AMBIG_FLAG(enc) ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE #define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ onigenc_is_allowed_reverse_match(enc, s, end) #define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ onigenc_get_left_adjust_char_head(enc, start, s) -#define 
ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) 0 -#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) ONIG_NO_SUPPORT_CONFIG +#define ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, ambig_flag, acs) 0 +#define ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, ambig_flag, acs) 0 #define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,nsb,nmb,sbr,mbr) \ ONIG_NO_SUPPORT_CONFIG -#define ONIGENC_MBC_LEN_BY_HEAD(enc,b) m17n_mbclen(enc,(int )b) +#define ONIGENC_MBC_ENC_LEN(enc,p) m17n_mbclen(enc,(int )(*p)) #define ONIGENC_MBC_MAXLEN(enc) m17n_mbmaxlen(enc) #define ONIGENC_MBC_MAXLEN_DIST(enc) \ (ONIGENC_MBC_MAXLEN(enc) > 0 ? ONIGENC_MBC_MAXLEN(enc) \ : ONIG_INFINITE_DISTANCE) +#define ONIGENC_MBC_MINLEN(enc) 1 #define ONIGENC_MBC_TO_CODE(enc,p,e) m17n_codepoint((enc),(p),(e)) #define ONIGENC_CODE_TO_MBCLEN(enc,code) m17n_codelen((enc),(code)) #define ONIGENC_CODE_TO_MBC(enc,code,buf) onigenc_code_to_mbc(enc, code, buf) -#if 0 -#define ONIGENC_STEP_BACK(enc,start,s,n) /* !! not supported !! */ +#if 0 /* !! not supported !! */ +#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) +#define ONIGENC_STEP_BACK(enc,start,s,n) #endif #define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) \ @@ -251,9 +324,9 @@ int onigenc_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, int ctype)); ONIG_EXTERN int onigenc_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); ONIG_EXTERN -int onigenc_mbc_to_lower P_((OnigEncoding enc, UChar* p, UChar* buf)); +int onigenc_mbc_to_normalize P_((OnigEncoding enc, OnigAmbigType flag, UChar** pp, UChar* end, UChar* buf)); ONIG_EXTERN -int onigenc_mbc_is_case_ambig P_((OnigEncoding enc, UChar* p)); +int onigenc_is_mbc_ambiguous P_((OnigEncoding enc, OnigAmbigType flag, UChar** pp, UChar* end)); ONIG_EXTERN int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, UChar* s, UChar* end)); @@ -261,32 +334,35 @@ int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, UChar* s, UChar* end) #define ONIGENC_NAME(enc) ((enc)->name) -#define ONIGENC_MBC_TO_LOWER(enc,p,buf) 
(enc)->mbc_to_lower(p,buf) -#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) (enc)->mbc_is_case_ambig(p) - -#define ONIGENC_IS_FOLD_MATCH(enc) ((enc)->is_fold_match) -#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) ((enc)->is_continuous_sb_mb) -#define ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ((enc)->ctype_support_level) +#define ONIGENC_MBC_TO_NORMALIZE(enc,flag,pp,end,buf) \ + (enc)->mbc_to_normalize(flag,pp,end,buf) +#define ONIGENC_IS_MBC_AMBIGUOUS(enc,flag,pp,end) \ + (enc)->is_mbc_ambiguous(flag,pp,end) +#define ONIGENC_SUPPORT_AMBIG_FLAG(enc) ((enc)->support_ambig_flag) #define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ (enc)->is_allowed_reverse_match(s,end) #define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ (enc)->left_adjust_char_head(start, s) -#define ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) \ - (enc)->get_all_fold_match_code(codes) -#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) \ - (enc)->get_fold_match_info(p,end,info) +#define ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc,ambig_flag,acs) \ + (enc)->get_all_pair_ambig_codes(ambig_flag,acs) +#define ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc,ambig_flag,acs) \ + (enc)->get_all_comp_ambig_codes(ambig_flag,acs) #define ONIGENC_STEP_BACK(enc,start,s,n) \ onigenc_step_back((enc),(start),(s),(n)) -#define ONIGENC_MBC_LEN_BY_HEAD(enc,byte) ((enc)->len_table[(int )(byte)]) +#define ONIGENC_MBC_ENC_LEN(enc,p) (enc)->mbc_enc_len(p) #define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len) #define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) -#define ONIGENC_MBC_TO_CODE(enc,p,e) (enc)->mbc_to_code((p),(e)) +#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len) +#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) (enc)->is_mbc_newline((p),(end)) +#define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end)) #define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code) #define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf) -#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->code_is_ctype(code,ctype) +#define 
ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->is_code_ctype(code,ctype) +#define ONIGENC_IS_CODE_NEWLINE(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_NEWLINE) #define ONIGENC_IS_CODE_GRAPH(enc,code) \ ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_GRAPH) #define ONIGENC_IS_CODE_PRINT(enc,code) \ @@ -340,6 +416,12 @@ ONIG_EXTERN UChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); ONIG_EXTERN UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); +ONIG_EXTERN +int onigenc_strlen P_((OnigEncoding enc, UChar* p, UChar* end)); +ONIG_EXTERN +int onigenc_strlen_null P_((OnigEncoding enc, UChar* p)); +ONIG_EXTERN +int onigenc_str_bytelen_null P_((OnigEncoding enc, UChar* p)); @@ -353,13 +435,6 @@ UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, UChar* start, UC /* constants */ #define ONIG_MAX_ERROR_MESSAGE_LEN 90 -#if defined(RUBY_PLATFORM) && !defined(ONIG_RUBY_M17N) -ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; -#undef ismbchar -#define ismbchar(c) (mbclen((c)) != 1) -#define mbclen(c) (OnigEncDefaultCharEncoding->len_table[(unsigned char )(c)]) -#endif - typedef unsigned int OnigOptionType; #define ONIG_OPTION_DEFAULT ONIG_OPTION_NONE @@ -467,6 +542,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (1<<16) /* \p{...}, \P{...} */ #define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1<<17) /* \p{^..}, \P{^..} */ #define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1<<18) /* \p{IsXDigit} */ +#define ONIG_SYN_OP2_ESC_H_XDIGIT (1<<19) /* \h, \H */ /* syntax (behavior) */ #define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1<<31) /* not implemented */ @@ -479,6 +555,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1<<6) /* (?<=a|bc) */ #define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1<<7) /* see doc/RE */ #define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1<<8) /* (?<x>)(?<x>) */ +#define 
ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1<<9) /* a{n}?=(?:a{n})? */ /* syntax (behavior) in char class [...] */ #define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1<<20) /* [^...] */ @@ -565,6 +642,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_INVALID_CHAR_PROPERTY_NAME -223 #define ONIGERR_INVALID_WIDE_CHAR_VALUE -400 #define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401 +#define ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION -402 /* errors related to thread */ #define ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001 @@ -575,6 +653,15 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_IS_CAPTURE_HISTORY_GROUP(r, i) \ ((i) <= ONIG_MAX_CAPTURE_HISTORY_GROUP && (r)->list && (r)->list[i]) +typedef struct OnigCaptureTreeNodeStruct { + int group; /* group number */ + int beg; + int end; + int allocated; + int num_childs; + struct OnigCaptureTreeNodeStruct** childs; +} OnigCaptureTreeNode; + /* match result region type */ struct re_registers { int allocated; @@ -582,9 +669,16 @@ struct re_registers { int* beg; int* end; /* extended */ - struct re_registers** list; /* capture history. 
list[1]-list[31] */ + OnigCaptureTreeNode* history_root; /* capture history tree root */ }; +/* capture tree traverse */ +#define ONIG_TRAVERSE_CALLBACK_AT_FIRST 1 +#define ONIG_TRAVERSE_CALLBACK_AT_LAST 2 +#define ONIG_TRAVERSE_CALLBACK_AT_BOTH \ + ( ONIG_TRAVERSE_CALLBACK_AT_FIRST | ONIG_TRAVERSE_CALLBACK_AT_LAST ) + + #define ONIG_REGION_NOTPOS -1 typedef struct re_registers OnigRegion; @@ -635,6 +729,7 @@ typedef struct re_pattern_buffer { OnigEncoding enc; OnigOptionType options; OnigSyntaxType* syntax; + OnigAmbigType ambig_flag; void* name_table; /* optimization info (string search, char-map and anchors) */ @@ -646,7 +741,7 @@ typedef struct re_pattern_buffer { int sub_anchor; /* start-anchor for exact or map */ unsigned char *exact; unsigned char *exact_end; - unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ + unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ int *int_map; /* BM skip for exact_len > 255 */ int *int_map_backward; /* BM skip for backward search */ OnigDistance dmin; /* min-distance of exact or map */ @@ -657,6 +752,15 @@ typedef struct re_pattern_buffer { } regex_t; +typedef struct { + int num_of_elements; + OnigEncoding pattern_enc; + OnigEncoding target_enc; + OnigSyntaxType* syntax; + OnigOptionType option; + OnigAmbigType ambig_flag; +} OnigCompileInfo; + /* Oniguruma Native API */ ONIG_EXTERN int onig_init P_((void)); @@ -669,10 +773,14 @@ void onig_set_verb_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN int onig_new P_((regex_t**, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); ONIG_EXTERN +int onig_new_deluxe P_((regex_t** reg, UChar* pattern, UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); +ONIG_EXTERN void onig_free P_((regex_t*)); ONIG_EXTERN int onig_recompile P_((regex_t*, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* 
einfo)); ONIG_EXTERN +int onig_recompile_deluxe P_((regex_t* reg, UChar* pattern, UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); +ONIG_EXTERN int onig_search P_((regex_t*, UChar* str, UChar* end, UChar* start, UChar* range, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN int onig_match P_((regex_t*, UChar* str, UChar* end, UChar* at, OnigRegion* region, OnigOptionType option)); @@ -696,16 +804,34 @@ int onig_foreach_name P_((regex_t* reg, int (*func)(UChar*,UChar*,int,int*,regex ONIG_EXTERN int onig_number_of_names P_((regex_t* reg)); ONIG_EXTERN +int onig_number_of_captures P_((regex_t* reg)); +ONIG_EXTERN +int onig_number_of_capture_histories P_((regex_t* reg)); +ONIG_EXTERN +OnigCaptureTreeNode* onig_get_capture_tree P_((OnigRegion* region)); +ONIG_EXTERN +int onig_capture_tree_traverse P_((OnigRegion* region, int at, int(*callback_func)(int,int,int,int,int,void*), void* arg)); +ONIG_EXTERN OnigEncoding onig_get_encoding P_((regex_t* reg)); ONIG_EXTERN OnigOptionType onig_get_options P_((regex_t* reg)); ONIG_EXTERN +OnigAmbigType onig_get_ambig_flag P_((regex_t* reg)); +ONIG_EXTERN OnigSyntaxType* onig_get_syntax P_((regex_t* reg)); ONIG_EXTERN int onig_set_default_syntax P_((OnigSyntaxType* syntax)); ONIG_EXTERN void onig_copy_syntax P_((OnigSyntaxType* to, OnigSyntaxType* from)); ONIG_EXTERN +unsigned int onig_get_syntax_op P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +unsigned int onig_get_syntax_op2 P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +unsigned int onig_get_syntax_behavior P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +OnigOptionType onig_get_syntax_options P_((OnigSyntaxType* syntax)); +ONIG_EXTERN void onig_set_syntax_op P_((OnigSyntaxType* syntax, unsigned int op)); ONIG_EXTERN void onig_set_syntax_op2 P_((OnigSyntaxType* syntax, unsigned int op2)); @@ -714,7 +840,9 @@ void onig_set_syntax_behavior P_((OnigSyntaxType* syntax, unsigned int behavior) ONIG_EXTERN void onig_set_syntax_options P_((OnigSyntaxType* syntax, 
OnigOptionType options)); ONIG_EXTERN -int onig_set_meta_char P_((unsigned int what, OnigCodePoint code)); +int onig_set_meta_char P_((OnigEncoding enc, unsigned int what, OnigCodePoint code)); +ONIG_EXTERN +void onig_copy_encoding P_((OnigEncoding to, OnigEncoding from)); ONIG_EXTERN unsigned int onig_get_match_stack_limit_size P_((void)); ONIG_EXTERN @@ -723,5 +851,7 @@ ONIG_EXTERN int onig_end P_((void)); ONIG_EXTERN const char* onig_version P_((void)); +ONIG_EXTERN +const char* onig_copyright P_((void)); #endif /* ONIGURUMA_H */ @@ -1,16 +1,42 @@ /********************************************************************** - regcomp.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regparse.h" #ifndef PLATFORM_UNALIGNED_WORD_ACCESS static unsigned char PadBuf[WORD_ALIGNMENT_SIZE]; #endif +/* + Caution: node should not be a string node. + (s and end member address break) +*/ static void swap_node(Node* a, Node* b) { @@ -120,33 +146,6 @@ unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node) #endif /* USE_SUBEXP_CALL */ -#if 0 -static int -bitset_mbmaxlen(BitSetRef bs, int negative, OnigEncoding enc) -{ - int i; - int len, maxlen = 0; - - if (negative) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (! 
BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - return maxlen; -} -#endif - static int add_opcode(regex_t* reg, int opcode) { @@ -293,15 +292,15 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) { int op; - switch (mb_len) { - case 1: - if (ignore_case) { - switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; - } + if (ignore_case) { + switch (str_len) { + case 1: op = OP_EXACT1_IC; break; + default: op = OP_EXACTN_IC; break; } - else { + } + else { + switch (mb_len) { + case 1: switch (str_len) { case 1: op = OP_EXACT1; break; case 2: op = OP_EXACT2; break; @@ -310,25 +309,25 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) case 5: op = OP_EXACT5; break; default: op = OP_EXACTN; break; } - } - break; + break; - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; + case 2: + switch (str_len) { + case 1: op = OP_EXACTMB2N1; break; + case 2: op = OP_EXACTMB2N2; break; + case 3: op = OP_EXACTMB2N3; break; + default: op = OP_EXACTMB2N; break; + } + break; - case 3: - op = OP_EXACTMB3N; - break; + case 3: + op = OP_EXACTMB3N; + break; - default: - op = OP_EXACTMBN; - break; + default: + op = OP_EXACTMBN; + break; + } } return op; } @@ -373,7 +372,7 @@ compile_call(CallNode* node, regex_t* reg) r = add_opcode(reg, OP_CALL); if (r) return r; r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg), - node->target); + node->target); if (r) return r; r = add_abs_addr(reg, 0 /*dummy addr.*/); return r; @@ -394,15 +393,14 @@ compile_tree_n_times(Node* node, int n, regex_t* reg) static int add_compile_string_length(UChar* s, int mb_len, int str_len, - regex_t* reg, int 
ignore_case) + regex_t* reg, int ignore_case) { int len; int op = select_str_opcode(mb_len, str_len, ignore_case); len = SIZE_OPCODE; - if (op == OP_EXACTMBN) - len += SIZE_LENGTH; + if (op == OP_EXACTMBN) len += SIZE_LENGTH; if (IS_NEED_STR_LEN_OP_EXACT(op)) len += SIZE_LENGTH; @@ -412,7 +410,7 @@ add_compile_string_length(UChar* s, int mb_len, int str_len, static int add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) + regex_t* reg, int ignore_case) { int op = select_str_opcode(mb_len, str_len, ignore_case); add_opcode(reg, op); @@ -420,8 +418,12 @@ add_compile_string(UChar* s, int mb_len, int str_len, if (op == OP_EXACTMBN) add_length(reg, mb_len); - if (IS_NEED_STR_LEN_OP_EXACT(op)) - add_length(reg, str_len); + if (IS_NEED_STR_LEN_OP_EXACT(op)) { + if (op == OP_EXACTN_IC) + add_length(reg, mb_len * str_len); + else + add_length(reg, str_len); + } add_bytes(reg, s, mb_len * str_len); return 0; @@ -429,49 +431,37 @@ add_compile_string(UChar* s, int mb_len, int str_len, static int -compile_length_string_node(StrNode* sn, regex_t* reg) +compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig, ic; + int rlen, r, len, prev_len, slen, ambig; OnigEncoding enc = reg->enc; UChar *p, *prev; + StrNode* sn; + sn = &(NSTRING(node)); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - else - ambig = 0; - + prev_len = enc_len(enc, p); p += prev_len; slen = 1; rlen = 0; for (; p < sn->end; ) { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (len == prev_len) { slen++; - if (ic != 0 && ambig == 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); } else { r = add_compile_string_length(prev, prev_len, slen, reg, ambig); rlen += r; - - if (ic != 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, 
p); - else - ambig = 0; - prev = p; slen = 1; prev_len = len; } - p += len; } r = add_compile_string_length(prev, prev_len, slen, reg, ambig); @@ -489,49 +479,33 @@ compile_length_string_raw_node(StrNode* sn, regex_t* reg) } static int -compile_string_node(StrNode* sn, regex_t* reg) +compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig, ic; + int r, len, prev_len, slen, ambig; OnigEncoding enc = reg->enc; - UChar *p, *prev; + UChar *p, *prev, *end; + StrNode* sn; + sn = &(NSTRING(node)); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + end = sn->end; + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) - ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; - + prev_len = enc_len(enc, p); p += prev_len; slen = 1; - for (; p < sn->end; ) { - len = enc_len(enc, *p); + for (; p < end; ) { + len = enc_len(enc, p); if (len == prev_len) { slen++; - if (ic != 0 && len == 1) { - if (ambig == 0) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } } else { r = add_compile_string(prev, prev_len, slen, reg, ambig); if (r) return r; - if (ic != 0 && len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; prev = p; slen = 1; @@ -584,8 +558,7 @@ compile_length_cclass_node(CClassNode* cc, regex_t* reg) len = SIZE_OPCODE + SIZE_BITSET; } else { - if (bitset_is_empty(cc->bs)) { - /* SIZE_BITSET is included in mbuf->used. 
*/ + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { len = SIZE_OPCODE; } else { @@ -613,7 +586,7 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) r = add_bitset(reg, cc->bs); } else { - if (bitset_is_empty(cc->bs)) { + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { if (cc->not) add_opcode(reg, OP_CCLASS_MB_NOT); else add_opcode(reg, OP_CCLASS_MB); @@ -649,7 +622,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; p = (OnigRepeatRange* )xrealloc(reg->repeat_range, - sizeof(OnigRepeatRange) * n); + sizeof(OnigRepeatRange) * n); CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); reg->repeat_range = p; reg->repeat_range_alloc = n; @@ -665,7 +638,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) static int compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, - regex_t* reg) + regex_t* reg) { int r; int num_repeat = reg->num_repeat; @@ -684,7 +657,16 @@ compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; - r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); + if ( +#ifdef USE_SUBEXP_CALL + reg->num_call > 0 || +#endif + IS_QUALIFIER_IN_REPEAT(qn)) { + r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); + } + else { + r = add_opcode(reg, qn->greedy ? 
OP_REPEAT_INC : OP_REPEAT_INC_NG); + } if (r) return r; r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ return r; @@ -706,9 +688,9 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) if (NTYPE(qn->target) == N_ANYCHAR) { if (qn->greedy && infinite) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; } } @@ -741,7 +723,8 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) len = SIZE_OP_JUMP + tlen; } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUALIFIER_EXPAND_LIMIT_SIZE)) { len = tlen * qn->lower; len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); } @@ -865,7 +848,8 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) r = compile_tree(qn->target, reg); } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUALIFIER_EXPAND_LIMIT_SIZE)) { int n = qn->upper - qn->lower; r = compile_tree_n_times(qn->target, qn->lower, reg); @@ -925,18 +909,16 @@ compile_option_node(EffectNode* node, regex_t* reg) if (r) return r; r = add_opcode(reg, OP_FAIL); if (r) return r; + } - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; + reg->options = node->option; + r = compile_tree(node->target, reg); + reg->options = prev; + + if (IS_DYNAMIC_OPTION(prev ^ node->option)) { if (r) return r; r = add_opcode_option(reg, OP_SET_OPTION, prev); } - else { - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; - } return r; } @@ -983,7 +965,7 @@ compile_length_effect_node(EffectNode* node, regex_t* reg) break; case 
EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { + if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) { QualifierNode* qn = &NQUALIFIER(node->target); tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; @@ -1073,7 +1055,7 @@ compile_effect_node(EffectNode* node, regex_t* reg) break; case EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { + if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) { QualifierNode* qn = &NQUALIFIER(node->target); r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; @@ -1258,7 +1240,7 @@ compile_length_tree(Node* node, regex_t* reg) if (NSTRING_IS_RAW(node)) r = compile_length_string_raw_node(&(NSTRING(node)), reg); else - r = compile_length_string_node(&(NSTRING(node)), reg); + r = compile_length_string_node(node, reg); break; case N_CCLASS: @@ -1356,7 +1338,7 @@ compile_tree(Node* node, regex_t* reg) if (NSTRING_IS_RAW(node)) r = compile_string_raw_node(&(NSTRING(node)), reg); else - r = compile_string_node(&(NSTRING(node)), reg); + r = compile_string_node(node, reg); break; case N_CCLASS: @@ -1412,8 +1394,14 @@ compile_tree(Node* node, regex_t* reg) } else { int* p; - add_opcode(reg, (IS_IGNORECASE(reg->options) ? 
- OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI)); + + if (IS_IGNORECASE(reg->options)) { + add_opcode(reg, OP_BACKREF_MULTI_IC); + } + else { + add_opcode(reg, OP_BACKREF_MULTI); + } + if (r) return r; add_length(reg, br->back_num); if (r) return r; @@ -2044,7 +2032,7 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) StrNode* sn = &(NSTRING(node)); UChar *s = sn->s; while (s < sn->end) { - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); (*len)++; } } @@ -2135,7 +2123,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) { int found; - if (code >= SINGLE_BYTE_SIZE) { + if (ONIGENC_MBC_MINLEN(enc) > 1 || (code >= SINGLE_BYTE_SIZE)) { if (IS_NULL(cc->mbuf)) { found = 0; } @@ -2300,7 +2288,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) CClassNode* cc = &(NCCLASS(y)); code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, - xs->s + enc_len(reg->enc, c)); + xs->s + ONIGENC_MBC_MAXLEN(reg->enc)); return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1); } break; @@ -2311,18 +2299,9 @@ is_not_included(Node* x, Node* y, regex_t* reg) StrNode* ys = &(NSTRING(y)); len = NSTRING_LEN(x); if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); - if (NSTRING_IS_CASE_AMBIG(x) || NSTRING_IS_CASE_AMBIG(y)) { - UChar plow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar qlow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int plen, qlen; - for (p = ys->s, q = xs->s; q < xs->end; ) { - plen = ONIGENC_MBC_TO_LOWER(reg->enc, p, plow); - qlen = ONIGENC_MBC_TO_LOWER(reg->enc, q, qlow); - if (plen != qlen || onig_strncmp(plow, qlow, plen) != 0) - return 1; - p += enc_len(reg->enc, *p); - q += enc_len(reg->enc, *q); - } + if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) { + /* tiny version */ + return 0; } else { for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { @@ -2379,8 +2358,12 @@ get_head_value_node(Node* node, int exact, regex_t* reg) if (exact != 0 && !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { - if (! 
ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, sn->s)) +#if 0 + UChar* tmp = sn->s; + if (! ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, + &tmp, sn->end)) n = node; +#endif } else { n = node; @@ -2937,7 +2920,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) { Node* en = onig_node_new_effect(EFFECT_STOP_BACKTRACK); CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY); - SET_EFFECT_STATUS(en, NST_SIMPLE_REPEAT); + SET_EFFECT_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT); swap_node(node, en); NEFFECT(node).target = en; } @@ -2956,9 +2939,114 @@ next_setup(Node* node, Node* next_node, regex_t* reg) return 0; } -#define IN_ALT (1<<0) -#define IN_NOT (1<<1) -#define IN_REPEAT (1<<2) +static int +divide_ambig_string_node(Node* node, regex_t* reg) +{ + StrNode* sn = &NSTRING(node); + int ambig, prev_ambig; + UChar *prev, *p, *end, *prev_start, *start, *tmp, *wp; + Node *snode; + Node *root = NULL_NODE; + Node **tailp = (Node** )0; + + start = prev_start = p = sn->s; + end = sn->end; + if (p >= end) return 0; + + prev_ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end); + + while (p < end) { + prev = p; + if (prev_ambig != (ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, + reg->ambig_flag, &p, end))) { + + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < prev) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, prev); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (tailp == (Node** )0) { + root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); + tailp = &(NCONS(root).right); + } + else { + *tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); + tailp = 
&(NCONS(*tailp).right); + } + + prev_ambig = ambig; + prev_start = prev; + } + } + + if (prev_start == start) { + if (prev_ambig != 0) { + NSTRING_SET_AMBIG(node); + tmp = start; + wp = start; + while (tmp < end) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + if (wp != sn->end) NSTRING_SET_AMBIG_REDUCE(node); + sn->end = wp; + } + } + else { + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < end) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != end) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, end); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (tailp == (Node** )0) { + root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); + tailp = &(NCONS(node).right); + } + else { + *tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); + tailp = &(NCONS(*tailp).right); + } + + swap_node(node, root); + onig_node_str_clear(root); /* should be after swap! */ + onig_node_free(root); /* free original string node */ + } + + return 0; +} + +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) +#define IN_VAR_REPEAT (1<<3) /* setup_tree does the following work. 1. check empty loop. 
(set qn->target_empty_info) @@ -2996,33 +3084,11 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case N_CCLASS: - if (IS_IGNORECASE(reg->options)) { - int i; - UChar c, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - BitSetRef bs = NCCLASS(node).bs; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - c = (UChar )i; - ONIGENC_MBC_TO_LOWER(reg->enc, &c, lowbuf); - if (*lowbuf != c) { - if (BITSET_AT(bs, c)) BITSET_SET_BIT(bs, *lowbuf); - if (BITSET_AT(bs, *lowbuf)) BITSET_SET_BIT(bs, c); - } - } - } break; case N_STRING: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { - StrNode* sn = &NSTRING(node); - UChar* p = sn->s; - - while (p < sn->end) { - if (ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p)) { - NSTRING_SET_CASE_AMBIG(node); - break; - } - p += enc_len(reg->enc, *p); - } + r = divide_ambig_string_node(node, reg); } break; @@ -3057,6 +3123,10 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) QualifierNode* qn = &(NQUALIFIER(node)); Node* target = qn->target; + if ((state & IN_REPEAT) != 0) { + qn->state |= NST_IN_REPEAT; + } + if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { r = get_min_match_length(target, &d, env); if (r) break; @@ -3083,8 +3153,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) } } + state |= IN_REPEAT; if (qn->lower != qn->upper) - state |= IN_REPEAT; + state |= IN_VAR_REPEAT; r = setup_tree(target, reg, state, env); if (r) break; @@ -3141,11 +3212,13 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case EFFECT_MEMORY: - if ((state & (IN_ALT | IN_NOT | IN_REPEAT)) != 0) { + if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) { BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum); /* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */ } - /* fall */ + r = setup_tree(en->target, reg, state, env); + break; + case EFFECT_STOP_BACKTRACK: { Node* target = en->target; @@ -3156,7 +3229,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) tqn->greedy != 0) { /* (?>a*), a*+ 
etc... */ int qtype = NTYPE(tqn->target); if (IS_NODE_TYPE_SIMPLE(qtype)) - SET_EFFECT_STATUS(node, NST_SIMPLE_REPEAT); + SET_EFFECT_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT); } } } @@ -3228,26 +3301,17 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) /* set skip map for Boyer-Moor search */ static int -set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, +set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, UChar skip[], int** int_skip) { int i, len; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; len = end - s; if (len < ONIG_CHAR_TABLE_SIZE) { for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len; - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - skip[*lowbuf] = len - 1 - i; - } - } - else { - for (i = 0; i < len - 1; i++) - skip[s[i]] = len - 1 - i; - } + for (i = 0; i < len - 1; i++) + skip[s[i]] = len - 1 - i; } else { if (IS_NULL(*int_skip)) { @@ -3256,16 +3320,8 @@ set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, } for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len; - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - (*int_skip)[*lowbuf] = len - 1 - i; - } - } - else { - for (i = 0; i < len - 1; i++) - (*int_skip)[s[i]] = len - 1 - i; - } + for (i = 0; i < len - 1; i++) + (*int_skip)[s[i]] = len - 1 - i; } return 0; } @@ -3278,11 +3334,12 @@ typedef struct { } MinMaxLen; typedef struct { - MinMaxLen mmd; - BitStatusType backrefed_status; - OnigEncoding enc; - OnigOptionType options; - ScanEnv* scan_env; + MinMaxLen mmd; + BitStatusType backrefed_status; + OnigEncoding enc; + OnigOptionType options; + OnigAmbigType ambig_flag; + ScanEnv* scan_env; } OptEnv; typedef struct { @@ -3319,31 +3376,31 @@ typedef struct { OptMapInfo map; /* boundary */ } NodeOptInfo; +static short int ByteValTable[] = { + 14, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, + 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1 +}; static int map_position_value(int i) { - static int vals[] = { - 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 10, 10, 1, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 1, 6, 3, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10, - }; - - if (i < sizeof(vals)/sizeof(vals[0])) return vals[i]; - - return 7; /* Take it easy. */ + if (i < sizeof(ByteValTable)/sizeof(ByteValTable[0])) + return (int )ByteValTable[i]; + else + return 4; /* Take it easy. 
*/ } static int distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ - static int dist_vals[] = { + static short int dist_vals[] = { 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, @@ -3363,7 +3420,7 @@ distance_value(MinMaxLen* mm) d = mm->max - mm->min; if (d < sizeof(dist_vals)/sizeof(dist_vals[0])) /* return dist_vals[d] * 16 / (mm->min + 12); */ - return dist_vals[d]; + return (int )dist_vals[d]; else return 1; } @@ -3419,12 +3476,14 @@ add_mml(MinMaxLen* to, MinMaxLen* from) to->max = distance_add(to->max, from->max); } +#if 0 static void add_len_mml(MinMaxLen* to, OnigDistance len) { to->min = distance_add(to->min, len); to->max = distance_add(to->max, len); } +#endif static void alt_merge_mml(MinMaxLen* to, MinMaxLen* from) @@ -3571,7 +3630,7 @@ concat_opt_exact_info_str(OptExactInfo* to, to->s[i++] = *p++; } else { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (i + len > OPT_EXACT_MAXLEN) break; for (j = 0; j < len; j++) to->s[i++] = *p++; @@ -3598,7 +3657,7 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) for (i = 0; i < to->len && i < add->len; ) { if (to->s[i] != add->s[i]) break; - len = enc_len(env->enc, to->s[i]); + len = enc_len(env->enc, to->s + i); for (j = 1; j < len; j++) { if (to->s[i+j] != add->s[i+j]) break; @@ -3620,12 +3679,24 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) static void select_opt_exact_info(OptExactInfo* now, OptExactInfo* alt) { - int vlen1, vlen2; + int v1, v2; + + v1 = now->len; + v2 = alt->len; - vlen1 = now->len * (now->ignore_case ? 1 : 2); - vlen2 = alt->len * (alt->ignore_case ? 
1 : 2); + if (v1 <= 2 && v2 <= 2) { + /* ByteValTable[x] is big value --> low price */ + v2 = map_position_value(now->s[0]); + v1 = map_position_value(alt->s[0]); - if (comp_distance_value(&now->mmd, &alt->mmd, vlen1, vlen2) > 0) + if (now->len > 1) v1 += 5; + if (alt->len > 1) v2 += 5; + } + + if (now->ignore_case == 0) v1 *= 2; + if (alt->ignore_case == 0) v2 *= 2; + + if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0) copy_opt_exact_info(now, alt); } @@ -3648,7 +3719,7 @@ copy_opt_map_info(OptMapInfo* to, OptMapInfo* from) } static void -add_char_opt_map_info(OptMapInfo* map, int c) +add_char_opt_map_info(OptMapInfo* map, UChar c) { if (map->map[c] == 0) { map->map[c] = 1; @@ -3656,26 +3727,48 @@ add_char_opt_map_info(OptMapInfo* map, int c) } } -static void -add_char_amb_opt_map_info(OptMapInfo* map, int c, OnigEncoding enc) +static int +add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end, + OnigEncoding enc, OnigAmbigType ambig_flag) { - UChar x, low[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int i, j, n, len; + UChar buf[ONIGENC_MBC_NORMALIZE_MAXLEN]; + OnigCodePoint code, ccode; + OnigCompAmbigCodes* ccs; + OnigPairAmbigCodes* pccs; + OnigAmbigType amb; - add_char_opt_map_info(map, c); + add_char_opt_map_info(map, p[0]); + code = ONIGENC_MBC_TO_CODE(enc, p, end); - x = (UChar )c; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if (*low != x) { - add_char_opt_map_info(map, (int )(*low)); - } - else { - int i; - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { - x = (UChar )i; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if ((int )(*low) == c) add_char_opt_map_info(map, i); + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & ambig_flag) == 0) continue; + + n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, amb, &pccs); + for (i = 0; i < n; i++) { + if (pccs[i].from == code) { + len = ONIGENC_CODE_TO_MBC(enc, pccs[i].to, buf); + if (len < 0) return len; + add_char_opt_map_info(map, buf[0]); + } + } + + if ((ambig_flag & 
ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + n = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs); + for (i = 0; i < n; i++) { + if (ccs[i].code == code) { + for (j = 0; j < ccs[i].n; j++) { + ccode = ccs[i].items[j].code[0]; + len = ONIGENC_CODE_TO_MBC(enc, ccode, buf); + if (len < 0) return len; + add_char_opt_map_info(map, buf[0]); + } + break; + } + } } } + return 0; } static void @@ -3881,143 +3974,110 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case N_STRING: { - UChar *p; - int len, plen; StrNode* sn = &(NSTRING(node)); int slen = sn->end - sn->s; int is_raw = NSTRING_IS_RAW(node); - if ((! IS_IGNORECASE(env->options)) || is_raw) { + if (! NSTRING_IS_AMBIG(node)) { concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, NSTRING_IS_RAW(node), env->enc); if (slen > 0) { add_char_opt_map_info(&opt->map, *(sn->s)); } + set_mml(&opt->len, slen, slen); } else { - for (p = sn->s; p < sn->end; ) { - len = enc_len(env->enc, *p); - if (len == 1 && ONIGENC_IS_MBC_CASE_AMBIG(env->enc, p)) { - break; - } - p += len; - } + int n, max; - plen = p - sn->s; - if (plen > slen / 5) { - concat_opt_exact_info_str(&opt->exb, sn->s, p, is_raw, env->enc); - concat_opt_exact_info_str(&opt->exm, p, sn->end, is_raw, env->enc); - opt->exm.ignore_case = 1; - if (opt->exm.len == sn->end - p) - opt->exm.reach_end = 1; - - copy_mml(&(opt->exm.mmd), &(opt->exb.mmd)); - add_len_mml(&(opt->exm.mmd), plen); - } - else { - concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, - is_raw, env->enc); - opt->exb.ignore_case = 1; - } + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + is_raw, env->enc); + opt->exb.ignore_case = 1; if (slen > 0) { - if (p == sn->s) - add_char_amb_opt_map_info(&opt->map, *(sn->s), env->enc); - else - add_char_opt_map_info(&opt->map, *(sn->s)); + r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end, + env->enc, env->ambig_flag); + if (r != 0) break; } + + if (NSTRING_IS_AMBIG_REDUCE(node)) { + n = onigenc_strlen(env->enc, sn->s, sn->end); + max = 
ONIGENC_MBC_MAXLEN_DIST(env->enc) * n; + } + else { + max = slen; + } + set_mml(&opt->len, slen, max); } if (opt->exb.len == slen) opt->exb.reach_end = 1; - - set_mml(&opt->len, slen, slen); } break; case N_CCLASS: { - int i, z, len, found, mb_found; + int i, z; CClassNode* cc = &(NCCLASS(node)); /* no need to check ignore case. (setted in setup_tree()) */ - found = mb_found = 0; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = BITSET_AT(cc->bs, i); - if ((z && !cc->not) || (!z && cc->not)) { - found = 1; - add_char_opt_map_info(&opt->map, i); - } - } - if (! ONIGENC_IS_SINGLEBYTE(env->enc)) { - if (! IS_NULL(cc->mbuf) || - (cc->not != 0 && found != 0)) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = ONIGENC_IS_MBC_HEAD(env->enc, i); - if (z) { - mb_found = 1; - add_char_opt_map_info(&opt->map, i); - } - } - } - } + if (IS_NOT_NULL(cc->mbuf) || cc->not != 0) { + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); - if (mb_found) { - len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); + set_mml(&opt->len, min, max); } - else if (found) { - len = 1; - set_mml(&opt->len, 1, len); + else { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = BITSET_AT(cc->bs, i); + if ((z && !cc->not) || (!z && cc->not)) { + add_char_opt_map_info(&opt->map, (UChar )i); + } + } + set_mml(&opt->len, 1, 1); } } break; case N_CTYPE: { - int c; - int len, min, max; + int i, min, max; - min = ONIGENC_MBC_MAXLEN_DIST(env->enc); - max = 0; + max = ONIGENC_MBC_MAXLEN_DIST(env->enc); -#define IS_WORD_HEAD_BYTE(enc,b) \ - (ONIGENC_IS_MBC_ASCII(&b) ? 
ONIGENC_IS_CODE_WORD(enc,((OnigCodePoint )b)) \ - : ONIGENC_IS_MBC_HEAD(enc,b)) + if (max == 1) { + min = 1; - switch (NCTYPE(node).type) { - case CTYPE_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } - } - break; + switch (NCTYPE(node).type) { + case CTYPE_NOT_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (! ONIGENC_IS_CODE_WORD(env->enc, i)) { + add_char_opt_map_info(&opt->map, (UChar )i); + } + } + break; - case CTYPE_NOT_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } + case CTYPE_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (ONIGENC_IS_CODE_WORD(env->enc, i)) { + add_char_opt_map_info(&opt->map, (UChar )i); + } + } + break; } - break; } - + else { + min = ONIGENC_MBC_MINLEN(env->enc); + } set_mml(&opt->len, min, max); } break; case N_ANYCHAR: { - OnigDistance len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + set_mml(&opt->len, min, max); } break; @@ -4218,36 +4278,20 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) if (e->len == 0) return 0; - reg->exact = onig_strdup(e->s, e->s + e->len); - CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); - - reg->exact_end = reg->exact + e->len; - if (e->ignore_case) { - UChar buf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int len, low_len, i, j, alloc_size; - - alloc_size = e->len; - i = j = 0; - while (i < e->len) { - low_len = ONIGENC_MBC_TO_LOWER(reg->enc, &(e->s[i]), buf); - len = enc_len(reg->enc, e->s[i]); - if (low_len > alloc_size - i) { - reg->exact = xrealloc(reg->exact, alloc_size * 2); - CHECK_NULL_RETURN_VAL(reg->exact, 
ONIGERR_MEMORY); - alloc_size *= 2; - } - - xmemcpy(&(reg->exact[j]), buf, low_len); - i += len; - j += low_len; - } - reg->exact_end = reg->exact + j; + reg->exact = (UChar* )xmalloc(e->len); + CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + xmemcpy(reg->exact, e->s, e->len); + reg->exact_end = reg->exact + e->len; reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } else { int allow_reverse; + reg->exact = onig_strdup(e->s, e->s + e->len); + CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + reg->exact_end = reg->exact + e->len; + if (e->anc.left_anchor & ANCHOR_BEGIN_LINE) allow_reverse = 1; else @@ -4255,7 +4299,7 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, 0, + r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, reg->map, &(reg->int_map)); if (r) return r; @@ -4315,6 +4359,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) env.enc = reg->enc; env.options = reg->options; + env.ambig_flag = reg->ambig_flag; env.scan_env = scan_env; clear_mml(&env.mmd); @@ -4469,17 +4514,26 @@ print_optimize_info(FILE* f, regex_t* reg) fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact)); } else if (reg->optimize & ONIG_OPTIMIZE_MAP) { - int i, n = 0; + int c, i, n = 0; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) if (reg->map[i]) n++; fprintf(f, "map: n=%d\n", n); if (n > 0) { + c = 0; fputc('[', f); - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) - if (reg->map[i] && enc_len(reg->enc, i) == 1 && - ONIGENC_IS_CODE_PRINT(reg->enc, i)) - fputc(i, f); + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { + if (reg->map[i] != 0) { + if (c > 0) fputs(", ", f); + c++; + if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 && + ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i)) + fputc(i, f); + else + fprintf(f, "%d", i); + } + } fprintf(f, "]\n"); } } @@ -4518,7 +4572,7 @@ 
onig_free(regex_t* reg) xfree(from);\ } while (0) -static void +extern void onig_transfer(regex_t* to, regex_t* from) { THREAD_ATOMIC_START; @@ -4532,7 +4586,7 @@ onig_transfer(regex_t* to, regex_t* from) }\ } while (0) -static void +extern void onig_chain_link_add(regex_t* to, regex_t* add) { THREAD_ATOMIC_START; @@ -4585,7 +4639,8 @@ onig_clone(regex_t** to, regex_t* from) from->state++; /* increment as search counter */ } - r = onig_alloc_init(&reg, ONIG_OPTION_NONE, from->enc, ONIG_SYNTAX_DEFAULT); + r = onig_alloc_init(&reg, ONIG_OPTION_NONE, ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + from->enc, ONIG_SYNTAX_DEFAULT); if (r != 0) { from->state--; return r; @@ -4816,8 +4871,8 @@ onig_recompile(regex_t* reg, UChar* pattern, UChar* pattern_end, static int onig_inited = 0; extern int -onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, - OnigSyntaxType* syntax) +onig_alloc_init(regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, + OnigEncoding enc, OnigSyntaxType* syntax) { if (! 
onig_inited) onig_init(); @@ -4850,6 +4905,9 @@ onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, (*reg)->used = 0; (*reg)->name_table = (void* )NULL; + (*reg)->ambig_flag = ambig_flag; + (*reg)->ambig_flag &= ONIGENC_SUPPORT_AMBIG_FLAG(enc); + return 0; } @@ -4862,7 +4920,8 @@ onig_new(regex_t** reg, UChar* pattern, UChar* pattern_end, if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; - r = onig_alloc_init(reg, option, enc, syntax); + r = onig_alloc_init(reg, option, ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + enc, syntax); if (r) return r; r = onig_compile(*reg, pattern, pattern_end, einfo); @@ -4959,7 +5018,7 @@ OnigOpInfoType OnigOpInfo[] = { { OP_BACKREF2, "backref2", ARG_NON }, { OP_BACKREF3, "backref3", ARG_NON }, { OP_BACKREFN, "backrefn", ARG_MEMNUM }, - { OP_BACKREFN_IC, "backrefn-ic", ARG_MEMNUM }, + { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, { OP_BACKREF_MULTI_IC, "backref_multi-ic",ARG_SPECIAL }, { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, @@ -4980,6 +5039,8 @@ OnigOpInfoType OnigOpInfo[] = { { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, + { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, + { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, { OP_NULL_CHECK_START, "null-check-start",ARG_MEMNUM }, { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, @@ -5046,7 +5107,8 @@ p_len_string(FILE* f, LengthType len, int mb_len, UChar* s) } extern void -onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) +onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, + OnigEncoding enc) { int i, n, arg_type; RelAddrType addr; @@ -5138,7 +5200,9 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) break; case OP_EXACT1_IC: - p_string(f, 1, bp++); + len = enc_len(enc, bp); + p_string(f, len, 
bp); + bp += len; break; case OP_EXACTN_IC: GET_LENGTH_INC(len, bp); @@ -5184,8 +5248,14 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) fprintf(f, ":%d:%d:%d", n, (int )code, len); break; - case OP_BACKREF_MULTI: + case OP_BACKREFN_IC: + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + fprintf(f, ":%d", mem); + break; + case OP_BACKREF_MULTI_IC: + case OP_BACKREF_MULTI: fputs(" ", f); GET_LENGTH_INC(len, bp); for (i = 0; i < len; i++) { @@ -5253,7 +5323,7 @@ print_compiled_byte_code_list(FILE* f, regex_t* reg) else fputs(" ", f); } - onig_print_compiled_byte_code(f, bp, &bp); + onig_print_compiled_byte_code(f, bp, &bp, reg->enc); } fprintf(f, "\n"); @@ -5313,12 +5383,6 @@ print_indent_tree(FILE* f, Node* node, int indent) fprintf(f, "%0x", bbuf->p[i]); } } -#if 0 - fprintf(f, "\n"); - Indent(f, indent); - for (i = 0; i < SINGLE_BYTE_SIZE; i++) - fputc((BITSET_AT(NCCLASS(node).bs, i) ? '1' : '0'), f); -#endif break; case N_CTYPE: @@ -1,10 +1,32 @@ /********************************************************************** - regenc.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT; @@ -33,7 +55,7 @@ onigenc_get_right_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) { UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); if (p < s) { - p += enc_len(enc, *p); + p += enc_len(enc, p); } return p; } @@ -46,7 +68,7 @@ onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, if (p < s) { if (prev) *prev = p; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { if (prev) *prev = (UChar* )NULL; /* Sorry */ @@ -75,11 +97,114 @@ onigenc_step_back(OnigEncoding enc, UChar* start, UChar* s, int n) return s; } +extern UChar* +onigenc_step(OnigEncoding enc, UChar* p, UChar* end, int n) +{ + while (n-- > 0) { + p += ONIGENC_MBC_ENC_LEN(enc, p); + } + return (p <= end ? 
p : (UChar* )NULL); +} + +extern int +onigenc_strlen(OnigEncoding enc, UChar* p, UChar* end) +{ + int n = 0; + + while (p < end) { + p += ONIGENC_MBC_ENC_LEN(enc, p); + n++; + } + return n; +} + +extern int +onigenc_strlen_null(OnigEncoding enc, UChar* p) +{ + int n = 0; + + while (1) { + if (*p == '\0') { + UChar* q; + int len = ONIGENC_MBC_MINLEN(enc); + + if (len == 1) return n; + q = p + 1; + while (len > 1) { + if (*q != '\0') break; + q++; + len--; + } + if (len == 1) return n; + } + p += ONIGENC_MBC_ENC_LEN(enc, p); + n++; + } +} + +extern int +onigenc_str_bytelen_null(OnigEncoding enc, UChar* p) +{ + UChar* start = p; + + while (1) { + if (*p == '\0') { + UChar* q; + int len = ONIGENC_MBC_MINLEN(enc); + + if (len == 1) return (int )(p - start); + q = p + 1; + while (len > 1) { + if (*q != '\0') break; + q++; + len--; + } + if (len == 1) return (int )(p - start); + } + p += ONIGENC_MBC_ENC_LEN(enc, p); + } +} #ifndef ONIG_RUBY_M17N #ifndef NOT_RUBY + #define USE_APPLICATION_TO_LOWER_CASE_TABLE + +unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0, + 0x21a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 
0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0, + 0x00a0, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 +}; #endif UChar* OnigEncAsciiToLowerCaseTable = (UChar* )0; @@ -121,23 +246,61 @@ static UChar BuiltInAsciiToLowerCaseTable[] = { }; #endif /* not USE_APPLICATION_TO_LOWER_CASE_TABLE */ +#ifdef USE_UPPER_CASE_TABLE +UChar OnigEncAsciiToUpperCaseTable[256] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', 
'\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137', + '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', +}; +#endif + unsigned short OnigEncAsciiCtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 
0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0, + 0x21a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, @@ -156,6 +319,78 @@ unsigned short 
OnigEncAsciiCtypeTable[256] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; +UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', + '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337', + 
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' +}; + +#ifdef USE_UPPER_CASE_TABLE +UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137', + '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + 
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377', +}; +#endif + extern void onigenc_set_default_caseconv_table(UChar* table) { @@ -178,38 +413,230 @@ onigenc_get_left_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); } +OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = { + { 0x41, 0x61 }, + { 0x42, 0x62 }, + { 0x43, 0x63 }, + { 0x44, 0x64 }, + { 0x45, 0x65 }, + { 0x46, 0x66 }, + { 0x47, 0x67 }, + { 0x48, 0x68 }, + { 0x49, 0x69 }, + { 0x4a, 0x6a }, + { 0x4b, 0x6b }, + { 0x4c, 0x6c }, + { 0x4d, 0x6d }, + { 0x4e, 0x6e }, + { 0x4f, 0x6f }, + { 0x50, 0x70 }, + { 0x51, 0x71 }, + { 0x52, 0x72 }, + { 0x53, 0x73 }, + { 0x54, 0x74 }, + { 0x55, 0x75 }, + { 0x56, 0x76 }, + { 0x57, 0x77 }, + { 0x58, 0x78 }, + { 0x59, 0x79 }, + { 0x5a, 0x7a }, + + { 0x61, 0x41 }, + { 0x62, 0x42 }, + { 0x63, 0x43 }, + { 0x64, 0x44 }, + { 0x65, 0x45 }, + { 0x66, 0x46 }, + { 0x67, 0x47 }, + { 0x68, 0x48 }, + { 0x69, 0x49 }, + { 0x6a, 0x4a }, + { 0x6b, 0x4b }, + { 0x6c, 0x4c }, + { 0x6d, 0x4d }, + { 0x6e, 0x4e }, + { 0x6f, 0x4f }, + { 0x70, 0x50 }, + { 0x71, 0x51 }, + { 0x72, 0x52 }, + { 0x73, 0x53 }, + { 0x74, 0x54 }, + { 0x75, 0x55 }, + { 0x76, 0x56 }, + { 0x77, 0x57 }, + { 0x78, 0x58 }, + { 0x79, 0x59 }, + { 0x7a, 0x5a } +}; + +extern int +onigenc_ascii_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return (sizeof(OnigAsciiPairAmbigCodes) / sizeof(OnigPairAmbigCodes)); + } + else { + return 0; + } +} + extern int 
-onigenc_nothing_get_all_fold_match_code(OnigCodePoint** codes) +onigenc_nothing_get_all_comp_ambig_codes(OnigAmbigType flag, + OnigCompAmbigCodes** ccs) { return 0; } extern int -onigenc_nothing_get_fold_match_info(UChar* p, UChar* end, - OnigEncFoldMatchInfo** info) +onigenc_iso_8859_1_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return (sizeof(OnigAsciiPairAmbigCodes) / sizeof(OnigPairAmbigCodes)); + } + else if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + +extern int +onigenc_ess_tsett_get_all_comp_ambig_codes(OnigAmbigType flag, + OnigCompAmbigCodes** ccs) { - return -1; + static OnigCompAmbigCodes folds[] = { + { 2, 0xdf, {{ 2, { 0x53, 0x53 
} }, { 2, { 0x73, 0x73} } } } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = folds; + return sizeof(folds) / sizeof(OnigCompAmbigCodes); + } + else + return 0; } extern int -onigenc_nothing_get_ctype_code_range(int ctype, int* nsb, int* nmb, +onigenc_not_support_get_ctype_code_range(int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]) { - return -1; + return ONIG_NO_SUPPORT_CONFIG; +} + +extern int +onigenc_is_mbc_newline_0x0a(UChar* p, UChar* end) +{ + if (p < end) { + if (*p == 0x0a) return 1; + } + return 0; } /* for single byte encodings */ extern int -onigenc_ascii_mbc_to_lower(UChar* p, UChar* lower) +onigenc_ascii_mbc_to_normalize(OnigAmbigType flag, UChar** p, UChar*end, + UChar* lower) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p); + } + else { + *lower = **p; + } + + (*p)++; return 1; /* return byte length of converted char to lower */ } extern int -onigenc_ascii_mbc_is_case_ambig(UChar* p) +onigenc_ascii_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end) +{ + UChar* p = *pp; + + (*pp)++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + } + else { + return FALSE; + } +} + +extern int +onigenc_single_byte_mbc_enc_len(UChar* p) { - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + return 1; } extern OnigCodePoint @@ -244,20 +671,25 @@ onigenc_single_byte_left_adjust_char_head(UChar* start, UChar* s) } extern int -onigenc_single_byte_is_allowed_reverse_match(UChar* s, UChar* end) +onigenc_always_true_is_allowed_reverse_match(UChar* s, UChar* end) { return TRUE; } +extern int +onigenc_always_false_is_allowed_reverse_match(UChar* s, UChar* end) +{ + return FALSE; +} + extern OnigCodePoint onigenc_mbn_mbc_to_code(OnigEncoding enc, UChar* p, UChar* end) { int c, i, len; OnigCodePoint n; - c = *p++; - len = enc_len(enc, c); - n = c; + 
len = enc_len(enc, p); + n = (OnigCodePoint )(*p++); if (len == 1) return n; for (i = 1; i < len; i++) { @@ -269,33 +701,52 @@ onigenc_mbn_mbc_to_code(OnigEncoding enc, UChar* p, UChar* end) } extern int -onigenc_mbn_mbc_to_lower(OnigEncoding enc, UChar* p, UChar* lower) +onigenc_mbn_mbc_to_normalize(OnigEncoding enc, OnigAmbigType flag, + UChar** pp, UChar* end, UChar* lower) { int len; + UChar *p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; } else { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted to lower char */ } } extern int -onigenc_mbn_mbc_is_case_ambig(UChar* p) +onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigAmbigType flag, + UChar** pp, UChar* end) { - if (ONIGENC_IS_MBC_ASCII(p)) - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + UChar* p = *pp; + if (ONIGENC_IS_MBC_ASCII(p)) { + (*pp)++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + } + else { + return FALSE; + } + } + + (*pp) += enc_len(enc, p); return FALSE; } @@ -360,7 +811,7 @@ onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 1 - if (enc_len(enc, buf[0]) != (p - buf)) + if (enc_len(enc, buf) != (p - buf)) return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; @@ -383,23 +834,21 @@ onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 1 - if (enc_len(enc, buf[0]) != (p - buf)) + if (enc_len(enc, buf) != (p - buf)) return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } extern int -onigenc_mb2_code_is_ctype(OnigEncoding enc, OnigCodePoint 
code, +onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - int first = onigenc_mb2_code_to_mbc_first(code); - return (enc_len(enc, first) > 1 ? TRUE : FALSE); - } + else + return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE); ctype &= ~ONIGENC_CTYPE_WORD; if (ctype == 0) return FALSE; @@ -412,16 +861,14 @@ onigenc_mb2_code_is_ctype(OnigEncoding enc, OnigCodePoint code, } extern int -onigenc_mb4_code_is_ctype(OnigEncoding enc, OnigCodePoint code, +onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - int first = onigenc_mb4_code_to_mbc_first(code); - return (enc_len(enc, first) > 1 ? TRUE : FALSE); - } + else + return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE); ctype &= ~ONIGENC_CTYPE_WORD; if (ctype == 0) return FALSE; @@ -433,6 +880,7 @@ onigenc_mb4_code_is_ctype(OnigEncoding enc, OnigCodePoint code, return FALSE; } +#if 0 extern int onigenc_get_all_fold_match_code_ss_0xdf(OnigCodePoint** codes) { @@ -440,33 +888,25 @@ onigenc_get_all_fold_match_code_ss_0xdf(OnigCodePoint** codes) *codes = list; return 1; } +#endif extern int -onigenc_get_fold_match_info_ss_0xdf(UChar* p, UChar* end, - OnigEncFoldMatchInfo** info) +onigenc_with_ascii_strncmp(OnigEncoding enc, UChar* p, UChar* end, + UChar* sascii /* ascii */, int n) { - /* German alphabet ess-tsett(U+00DF) */ - static OnigEncFoldMatchInfo ss = { - 3, - { 1, 2, 2 }, - { "\337", "ss", "SS" } /* 0337: 0xdf */ - }; + int x, c; - if (p >= end) return -1; + while (n-- > 0) { + if (p >= end) return (int )(*sascii); - if (*p == 0xdf) { - *info = &ss; - return 1; - } - else if (p + 1 < end) { - if ((*p == 'S' && *(p+1) == 'S') || - (*p == 's' && *(p+1) == 's')) { - *info = &ss; - return 2; - } - } + c = (int 
)ONIGENC_MBC_TO_CODE(enc, p, end); + x = *sascii - c; + if (x) return x; - return -1; /* is not a fold string. */ + sascii++; + p += enc_len(enc, p); + } + return 0; } #else /* ONIG_RUBY_M17N */ @@ -475,6 +915,10 @@ extern int onigenc_is_code_ctype(OnigEncoding enc, OnigCodePoint code, int ctype) { switch (ctype) { + case ONIGENC_CTYPE_NEWLINE: + if (code == 0x0a) return 1; + break; + case ONIGENC_CTYPE_ALPHA: return m17n_isalpha(enc, code); break; @@ -548,12 +992,22 @@ onigenc_mbc_to_lower(OnigEncoding enc, UChar* p, UChar* buf) } extern int -onigenc_mbc_is_case_ambig(OnigEncoding enc, UChar* p) +onigenc_is_mbc_ambiguous(OnigEncoding enc, OnigAmbigType flag, + UChar** pp, UChar* end) { - unsigned int c = m17n_codepoint(enc, p, p + enc_len(enc, *p)); + int len; + unsigned int c; + UChar* p = *pp; + + len = enc_len(enc, *p); + (*pp) += len; + c = m17n_codepoint(enc, p, p + len); + + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + if (m17n_isupper(enc, c) || m17n_islower(enc, c)) + return TRUE; + } - if (m17n_isupper(enc, c) || m17n_islower(enc, c)) - return TRUE; return FALSE; } @@ -1,12 +1,33 @@ +#ifndef REGENC_H +#define REGENC_H /********************************************************************** - regenc.h - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ -#ifndef REGENC_H -#define REGENC_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #ifndef RUBY_PLATFORM #include "config.h" @@ -31,8 +52,6 @@ #define ONIGENCERR_INVALID_WIDE_CHAR_VALUE -400 #define ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE -401 -#define ONIG_NEWLINE '\n' -#define ONIG_IS_NEWLINE(c) ((c) == ONIG_NEWLINE) #define ONIG_IS_NULL(p) (((void*)(p)) == (void*)0) #define ONIG_IS_NOT_NULL(p) (((void*)(p)) != (void*)0) #define ONIG_CHECK_NULL_RETURN(p) if (ONIG_IS_NULL(p)) return NULL @@ -48,44 +67,72 @@ #define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII /* for encoding system implementation (internal) */ -ONIG_EXTERN int onigenc_nothing_get_all_fold_match_code P_((OnigCodePoint** codes)); -ONIG_EXTERN int onigenc_nothing_get_fold_match_info P_((UChar* p, UChar* end, OnigEncFoldMatchInfo** info)); -ONIG_EXTERN int onigenc_nothing_get_ctype_code_range P_((int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[])); +ONIG_EXTERN int onigenc_ascii_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); +ONIG_EXTERN int 
onigenc_nothing_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_iso_8859_1_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); +ONIG_EXTERN int onigenc_ess_tsett_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[])); +ONIG_EXTERN int onigenc_is_mbc_newline_0x0a P_((UChar* p, UChar* end)); /* methods for single byte encoding */ -ONIG_EXTERN int onigenc_ascii_mbc_to_lower P_((UChar* p, UChar* lower)); -ONIG_EXTERN int onigenc_ascii_mbc_is_case_ambig P_((UChar* p)); +ONIG_EXTERN int onigenc_ascii_mbc_to_normalize P_((OnigAmbigType flag, UChar** p, UChar* end, UChar* lower)); +ONIG_EXTERN int onigenc_ascii_is_mbc_ambiguous P_((OnigAmbigType flag, UChar** p, UChar* end)); +ONIG_EXTERN int onigenc_single_byte_mbc_enc_len P_((UChar* p)); ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((UChar* p, UChar* end)); ONIG_EXTERN int onigenc_single_byte_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_single_byte_code_to_mbc_first P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *buf)); ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((UChar* start, UChar* s)); -ONIG_EXTERN int onigenc_single_byte_is_allowed_reverse_match P_((UChar* s, UChar* end)); +ONIG_EXTERN int onigenc_always_true_is_allowed_reverse_match P_((UChar* s, UChar* end)); +ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match P_((UChar* s, UChar* end)); /* methods for multi byte encoding */ ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, UChar* p, UChar* end)); -ONIG_EXTERN int onigenc_mbn_mbc_to_lower P_((OnigEncoding enc, UChar* p, UChar* lower)); -ONIG_EXTERN int onigenc_mbn_mbc_is_case_ambig P_((UChar* p)); +ONIG_EXTERN int 
onigenc_mbn_mbc_to_normalize P_((OnigEncoding enc, OnigAmbigType flag, UChar** p, UChar* end, UChar* lower)); +ONIG_EXTERN int onigenc_mbn_is_mbc_ambiguous P_((OnigEncoding enc, OnigAmbigType flag, UChar** p, UChar* end)); ONIG_EXTERN int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb2_code_to_mbc_first P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); -ONIG_EXTERN int onigenc_mb2_code_is_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_mb2_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); ONIG_EXTERN int onigenc_mb4_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb4_code_to_mbc_first P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb4_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); -ONIG_EXTERN int onigenc_mb4_code_is_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_mb4_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); ONIG_EXTERN int onigenc_get_all_fold_match_code_ss_0xdf P_((OnigCodePoint** codes)); -ONIG_EXTERN int onigenc_get_fold_match_info_ss_0xdf P_((UChar* p, UChar* end, OnigEncFoldMatchInfo** info)); + +/* in enc/unicode.c */ +ONIG_EXTERN int onigenc_unicode_is_code_ctype P_((OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_unicode_get_ctype_code_range P_((int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[])); + + +#define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c) \ + OnigEncISO_8859_1_ToLowerCaseTable[c] +#define ONIGENC_ISO_8859_1_TO_UPPER_CASE(c) \ + OnigEncISO_8859_1_ToUpperCaseTable[c] +#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \ + ((OnigEnc_Unicode_ISO_8859_1_CtypeTable[code] & ctype) != 0) + +ONIG_EXTERN UChar OnigEncISO_8859_1_ToLowerCaseTable[]; +ONIG_EXTERN UChar OnigEncISO_8859_1_ToUpperCaseTable[]; 
+ONIG_EXTERN unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[]; +ONIG_EXTERN OnigPairAmbigCodes OnigAsciiPairAmbigCodes[]; #endif /* is not ONIG_RUBY_M17N */ +ONIG_EXTERN int +onigenc_with_ascii_strncmp P_((OnigEncoding enc, UChar* p, UChar* end, UChar* sascii /* ascii */, int n)); +ONIG_EXTERN UChar* +onigenc_step P_((OnigEncoding enc, UChar* p, UChar* end, int n)); + ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; ONIG_EXTERN UChar* OnigEncAsciiToLowerCaseTable; +ONIG_EXTERN UChar OnigEncAsciiToUpperCaseTable[]; ONIG_EXTERN unsigned short OnigEncAsciiCtypeTable[]; #define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) OnigEncAsciiToLowerCaseTable[c] +#define ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) OnigEncAsciiToUpperCaseTable[c] #define ONIGENC_IS_ASCII_CODE_CTYPE(code,ctype) \ ((OnigEncAsciiCtypeTable[code] & ctype) != 0) #define ONIGENC_IS_ASCII_CODE_CASE_AMBIG(code) \ diff --git a/regerror.c b/regerror.c index c7a2a7b7ea..50ce8fd9fe 100644 --- a/regerror.c +++ b/regerror.c @@ -1,10 +1,32 @@ /********************************************************************** - regerror.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regint.h" #include <stdio.h> /* for vsnprintf() */ @@ -146,6 +168,8 @@ onig_error_code_to_format(int code) p = "group number is too big for capture history"; break; case ONIGERR_INVALID_CHAR_PROPERTY_NAME: p = "invalid character property name {%n}"; break; + case ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION: + p = "not supported encoding combination"; break; case ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT: p = "over thread pass limit count"; break; @@ -219,7 +243,7 @@ onig_error_code_to_str(s, code, va_alist) default: q = onig_error_code_to_format(code); - len = strlen(q); + len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, q); xmemcpy(s, q, len); s[len] = '\0'; break; @@ -246,7 +270,8 @@ onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) #endif { int n, need, len; - UChar *p, *s; + UChar *p, *s, *bp; + char bs[6]; va_list args; va_init_list(args, fmt); @@ -257,29 +282,41 @@ onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) if (n + need < bufsize) { strcat(buf, ": /"); - s = buf + strlen(buf); + s = buf + onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, buf); p = pat; while (p < (UChar* )pat_end) { - if (*p == MC_ESC) { + if (*p == 
MC_ESC(enc)) { *s++ = *p++; - len = enc_len(enc, *p); + len = enc_len(enc, p); while (len-- > 0) *s++ = *p++; } else if (*p == '/') { - *s++ = MC_ESC; + *s++ = (unsigned char )MC_ESC(enc); *s++ = *p++; } - else if (ONIGENC_IS_MBC_HEAD(enc, *p)) { - len = enc_len(enc, *p); - while (len-- > 0) *s++ = *p++; + else if (ONIGENC_IS_MBC_HEAD(enc, p)) { + len = enc_len(enc, p); + if (ONIGENC_MBC_MINLEN(enc) == 1) { + while (len-- > 0) *s++ = *p++; + } + else { /* for UTF16 */ + int blen; + + while (len-- > 0) { + sprintf(bs, "\\%03o", *p++ & 0377); + blen = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); + bp = bs; + while (blen-- > 0) *s++ = *bp++; + } + } } else if (!ONIGENC_IS_CODE_PRINT(enc, *p) && !ONIGENC_IS_CODE_SPACE(enc, *p)) { - char b[5]; - sprintf(b, "\\%03o", *p & 0377); - len = strlen(b); - while (len-- > 0) *s++ = *p++; + sprintf(bs, "\\%03o", *p++ & 0377); + len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); + bp = bs; + while (len-- > 0) *s++ = *bp++; } else { *s++ = *p++; @@ -1,52 +1,151 @@ /********************************************************************** - regexec.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regint.h" +#ifdef USE_CAPTURE_HISTORY +static void history_tree_free(OnigCaptureTreeNode* node); + static void -region_list_clear(OnigRegion** list) +history_tree_clear(OnigCaptureTreeNode* node) { int i; - if (IS_NOT_NULL(list)) { - for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - if (IS_NOT_NULL(list[i])) { - xfree(list[i]); - list[i] = (OnigRegion* )0; + if (IS_NOT_NULL(node)) { + for (i = 0; i < node->num_childs; i++) { + if (IS_NOT_NULL(node->childs[i])) { + history_tree_free(node->childs[i]); } } + for (i = 0; i < node->allocated; i++) { + node->childs[i] = (OnigCaptureTreeNode* )0; + } + node->num_childs = 0; + node->beg = ONIG_REGION_NOTPOS; + node->end = ONIG_REGION_NOTPOS; + node->group = -1; } } static void -region_list_free(OnigRegion* r) +history_tree_free(OnigCaptureTreeNode* node) { - if (IS_NOT_NULL(r->list)) { - region_list_clear(r->list); - xfree(r->list); - r->list = (OnigRegion** )0; + history_tree_clear(node); + xfree(node); +} + +static void +history_root_free(OnigRegion* r) +{ + if (IS_NOT_NULL(r->history_root)) { + history_tree_free(r->history_root); + r->history_root = (OnigCaptureTreeNode* )0; } } -static OnigRegion** -region_list_new() +static 
OnigCaptureTreeNode* +history_node_new() { - int i; - OnigRegion** list; + OnigCaptureTreeNode* node; + + node = (OnigCaptureTreeNode* )xmalloc(sizeof(OnigCaptureTreeNode)); + CHECK_NULL_RETURN(node); + node->childs = (OnigCaptureTreeNode** )0; + node->allocated = 0; + node->num_childs = 0; + node->group = -1; + node->beg = ONIG_REGION_NOTPOS; + node->end = ONIG_REGION_NOTPOS; + + return node; +} - list = (OnigRegion** )xmalloc(sizeof(OnigRegion*) - * (ONIG_MAX_CAPTURE_HISTORY_GROUP + 1)); - CHECK_NULL_RETURN(list); - for (i = 0; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - list[i] = (OnigRegion* )0; +static int +history_tree_add_child(OnigCaptureTreeNode* parent, OnigCaptureTreeNode* child) +{ +#define HISTORY_TREE_INIT_ALLOC_SIZE 8 + + if (parent->num_childs >= parent->allocated) { + int n, i; + + if (IS_NULL(parent->childs)) { + n = HISTORY_TREE_INIT_ALLOC_SIZE; + parent->childs = + (OnigCaptureTreeNode** )xmalloc(sizeof(OnigCaptureTreeNode*) * n); + } + else { + n = parent->allocated * 2; + parent->childs = + (OnigCaptureTreeNode** )xrealloc(parent->childs, + sizeof(OnigCaptureTreeNode*) * n); + } + CHECK_NULL_RETURN_VAL(parent->childs, ONIGERR_MEMORY); + for (i = parent->allocated; i < n; i++) { + parent->childs[i] = (OnigCaptureTreeNode* )0; + } + parent->allocated = n; + } + + parent->childs[parent->num_childs] = child; + parent->num_childs++; + return 0; +} + +static OnigCaptureTreeNode* +history_tree_clone(OnigCaptureTreeNode* node) +{ + int i; + OnigCaptureTreeNode *clone, *child; + + clone = history_node_new(); + CHECK_NULL_RETURN(clone); + + clone->beg = node->beg; + clone->end = node->end; + for (i = 0; i < node->num_childs; i++) { + child = history_tree_clone(node->childs[i]); + if (IS_NULL(child)) { + history_tree_free(clone); + return (OnigCaptureTreeNode* )0; + } + history_tree_add_child(clone, child); } - return list; + return clone; +} + +extern OnigCaptureTreeNode* +onig_get_capture_tree(OnigRegion* region) +{ + return region->history_root; } 
+#endif /* USE_CAPTURE_HISTORY */ extern void onig_region_clear(OnigRegion* region) @@ -56,7 +155,9 @@ onig_region_clear(OnigRegion* region) for (i = 0; i < region->num_regs; i++) { region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; } - region_list_clear(region->list); +#ifdef USE_CAPTURE_HISTORY + history_root_free(region); +#endif } extern int @@ -92,88 +193,20 @@ onig_region_resize(OnigRegion* region, int n) region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; } - if (IS_NOT_NULL(region->list)) - region_list_clear(region->list); - - return 0; -} - -static int -region_ensure_size(OnigRegion* region, int n) -{ - int i, new_size; - - if (region->allocated >= n) - return 0; - - new_size = region->allocated; - if (new_size == 0) - new_size = ONIG_NREGION; - while (new_size < n) - new_size *= 2; - - if (region->allocated == 0) { - region->beg = (int* )xmalloc(new_size * sizeof(int)); - region->end = (int* )xmalloc(new_size * sizeof(int)); - if (region->beg == 0 || region->end == 0) - return ONIGERR_MEMORY; - - region->allocated = new_size; - } - else if (region->allocated < new_size) { - region->beg = (int* )xrealloc(region->beg, new_size * sizeof(int)); - region->end = (int* )xrealloc(region->end, new_size * sizeof(int)); - if (region->beg == 0 || region->end == 0) - return ONIGERR_MEMORY; - - region->allocated = new_size; - } - - for (i = region->num_regs; i < n; i++) { - region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; - } - return 0; -} - -static int -region_list_add_entry(OnigRegion* region, int group, int start, int end) -{ - int r, pos; - OnigRegion** list; - - if (group > ONIG_MAX_CAPTURE_HISTORY_GROUP) - return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; - - if (IS_NULL(region->list)) { - region->list = region_list_new(); - CHECK_NULL_RETURN_VAL(region->list, ONIGERR_MEMORY); - } - - list = region->list; - if (IS_NULL(list[group])) { - list[group] = onig_region_new(); - CHECK_NULL_RETURN_VAL(list[group], ONIGERR_MEMORY); - } - - r = 
region_ensure_size(list[group], list[group]->num_regs + 1); - if (r != 0) return r; - - pos = list[group]->num_regs; - list[group]->beg[pos] = start; - list[group]->end[pos] = end; - list[group]->num_regs++; - +#ifdef USE_CAPTURE_HISTORY + history_root_free(region); +#endif return 0; } static void onig_region_init(OnigRegion* region) { - region->num_regs = 0; - region->allocated = 0; - region->beg = (int* )0; - region->end = (int* )0; - region->list = (OnigRegion** )0; + region->num_regs = 0; + region->allocated = 0; + region->beg = (int* )0; + region->end = (int* )0; + region->history_root = (OnigCaptureTreeNode* )0; } extern OnigRegion* @@ -195,7 +228,9 @@ onig_region_free(OnigRegion* r, int free_self) if (r->end) xfree(r->end); r->allocated = 0; } - region_list_free(r); +#ifdef USE_CAPTURE_HISTORY + history_root_free(r); +#endif if (free_self) xfree(r); } } @@ -227,28 +262,13 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) } to->num_regs = from->num_regs; - if (IS_NOT_NULL(from->list)) { - if (IS_NULL(to->list)) { - to->list = region_list_new(); - } - - for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - if (IS_NOT_NULL(from->list[i])) { - if (IS_NULL(to->list[i])) - to->list[i] = onig_region_new(); +#ifdef USE_CAPTURE_HISTORY + history_root_free(to); - onig_region_copy(to->list[i], from->list[i]); - } - else { - if (IS_NOT_NULL(to->list[i])) { - xfree(to->list[i]); - to->list[i] = (OnigRegion* )0; - } - } - } + if (IS_NOT_NULL(from->history_root)) { + to->history_root = history_tree_clone(from->history_root); } - else - region_list_free(to); +#endif } @@ -851,24 +871,25 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, }\ } while(0) -#define STRING_CMP_IC(s1,ps2,len) do {\ - if (string_cmp_ic(encode, s1, ps2, len) == 0) \ +#define STRING_CMP_IC(ambig_flag,s1,ps2,len) do {\ + if (string_cmp_ic(encode, ambig_flag, s1, ps2, len) == 0) \ goto fail; \ } while(0) -static int string_cmp_ic(OnigEncoding enc, +static int 
string_cmp_ic(OnigEncoding enc, int ambig_flag, UChar* s1, UChar** ps2, int mblen) { - UChar buf1[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar buf2[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar *p1, *p2, *end, *s2; + UChar buf1[ONIGENC_MBC_NORMALIZE_MAXLEN]; + UChar buf2[ONIGENC_MBC_NORMALIZE_MAXLEN]; + UChar *p1, *p2, *end, *s2, *end2; int len1, len2; - s2 = *ps2; - end = s1 + mblen; + s2 = *ps2; + end = s1 + mblen; + end2 = s2 + mblen; while (s1 < end) { - len1 = ONIGENC_MBC_TO_LOWER(enc, s1, buf1); - len2 = ONIGENC_MBC_TO_LOWER(enc, s2, buf2); + len1 = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s1, end, buf1); + len2 = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s2, end2, buf2); if (len1 != len2) return 0; p1 = buf1; p2 = buf2; @@ -877,9 +898,6 @@ static int string_cmp_ic(OnigEncoding enc, p1++; p2++; } - - s1 += enc_len(enc, *s1); - s2 += enc_len(enc, *s2); } *ps2 = s2; @@ -895,8 +913,8 @@ static int string_cmp_ic(OnigEncoding enc, }\ } while(0) -#define STRING_CMP_VALUE_IC(s1,ps2,len,is_fail) do {\ - if (string_cmp_ic(encode, s1, ps2, len) == 0) \ +#define STRING_CMP_VALUE_IC(ambig_flag,s1,ps2,len,is_fail) do {\ + if (string_cmp_ic(encode, ambig_flag, s1, ps2, len) == 0) \ is_fail = 1; \ else \ is_fail = 0; \ @@ -911,6 +929,110 @@ static int string_cmp_ic(OnigEncoding enc, #define DATA_ENSURE_CHECK(n) (s + (n) <= end) +#ifdef USE_CAPTURE_HISTORY +static int +make_capture_history_tree(OnigCaptureTreeNode* node, StackType** kp, + StackType* stk_top, UChar* str, regex_t* reg) +{ + int n, r; + OnigCaptureTreeNode* child; + StackType* k = *kp; + + while (k < stk_top) { + if (k->type == STK_MEM_START) { + n = k->u.mem.num; + if (n <= ONIG_MAX_CAPTURE_HISTORY_GROUP && + BIT_STATUS_AT(reg->capture_history, n) != 0) { + child = history_node_new(); + CHECK_NULL_RETURN_VAL(child, ONIGERR_MEMORY); + child->group = n; + child->beg = (int )(k->u.mem.pstr - str); + r = history_tree_add_child(node, child); + if (r != 0) return r; + *kp = (k + 1); + r = make_capture_history_tree(child, kp, 
stk_top, str, reg); + if (r != 0) return r; + + k = *kp; + child->end = (int )(k->u.mem.pstr - str); + } + } + else if (k->type == STK_MEM_END) { + if (k->u.mem.num == node->group) { + node->end = (int )(k->u.mem.pstr - str); + *kp = k; + return 0; + } + } + k++; + } + + return 1; /* 1: root node ending. */ +} +#endif + +#ifdef RUBY_PLATFORM + +typedef struct { + int state; + regex_t* reg; + MatchArg* msa; + StackType* stk_base; +} TrapEnsureArg; + +static VALUE +trap_ensure(VALUE arg) +{ + TrapEnsureArg* ta = (TrapEnsureArg* )arg; + + if (ta->state == 0) { /* trap_exec() is not normal return */ + ta->reg->state--; + if (! IS_NULL(ta->msa->stack_p) && ta->stk_base != ta->msa->stack_p) + xfree(ta->stk_base); + + MATCH_ARG_FREE(*(ta->msa)); + } + + return Qnil; +} + +static VALUE +trap_exec(VALUE arg) +{ + TrapEnsureArg* ta; + + rb_trap_exec(); + + ta = (TrapEnsureArg* )arg; + ta->state = 1; /* normal return */ + return Qnil; +} + +extern void +onig_exec_trap(regex_t* reg, MatchArg* msa, StackType* stk_base) +{ + VALUE arg; + TrapEnsureArg ta; + + ta.state = 0; + ta.reg = reg; + ta.msa = msa; + ta.stk_base = stk_base; + arg = (VALUE )(&ta); + rb_ensure(trap_exec, arg, trap_ensure, arg); +} + +#define CHECK_INTERRUPT_IN_MATCH_AT do {\ + if (rb_trap_pending) {\ + if (! rb_prohibit_interrupt) {\ + onig_exec_trap(reg, msa, stk_base);\ + }\ + }\ +} while (0) +#else +#define CHECK_INTERRUPT_IN_MATCH_AT +#endif /* RUBY_PLATFORM */ + #ifdef ONIG_DEBUG_STATISTICS #define USE_TIMEOFDAY @@ -955,6 +1077,7 @@ static int MaxStackDepth = 0; } while (0) #ifdef RUBY_PLATFORM + /* * :nodoc: */ @@ -1047,18 +1170,18 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, RelAddrType addr; OnigOptionType option = reg->options; OnigEncoding encode = reg->enc; - int ignore_case; + OnigAmbigType ambig_flag = reg->ambig_flag; UChar *s, *q, *sbegin; UChar *p = reg->p; char *alloca_base; StackType *stk_alloc, *stk_base, *stk, *stk_end; StackType *stkp; /* used as any purpose. 
*/ + StackIndex si; StackIndex *repeat_stk; StackIndex *mem_start_stk, *mem_end_stk; n = reg->num_repeat + reg->num_mem * 2; STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); - ignore_case = IS_IGNORECASE(option); pop_level = reg->stack_pop_level; num_mem = reg->num_mem; repeat_stk = (StackIndex* )alloca_base; @@ -1091,7 +1214,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, fprintf(stderr, "%4d> \"", (int )(s - str)); bp = buf; for (i = 0, q = s; i < 7 && q < end; i++) { - len = enc_len(encode, *q); + len = enc_len(encode, q); while (len-- > 0) *bp++ = *q++; } if (q < end) { xmemcpy(bp, "...\"", 4); bp += 4; } @@ -1099,7 +1222,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, *bp = 0; fputs(buf, stderr); for (i = 0; i < 20 - (bp - buf); i++) fputc(' ', stderr); - onig_print_compiled_byte_code(stderr, p, NULL); + onig_print_compiled_byte_code(stderr, p, NULL, encode); fprintf(stderr, "\n"); } #endif @@ -1154,27 +1277,33 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } } +#ifdef USE_CAPTURE_HISTORY if (reg->capture_history != 0) { - UChar *pstart, *pend; - for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - if (BIT_STATUS_AT(reg->capture_history, i) != 0) { - stkp = stk_base; - do { - STACK_GET_MEM_RANGE(stkp, i, pstart, pend); - if (stkp < stk) { - int r; - r = region_list_add_entry(region, i, - pstart - str, pend - str); - if (r) { - STACK_SAVE; - return r; - } - } - stkp++; - } while (stkp < stk); - } - } - } /* list of captures */ + int r; + OnigCaptureTreeNode* node; + + if (IS_NULL(region->history_root)) { + region->history_root = node = history_node_new(); + CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY); + } + else { + node = region->history_root; + history_tree_clear(node); + } + + node->group = 0; + node->beg = sstart - str; + node->end = s - str; + + stkp = stk_base; + r = make_capture_history_tree(region->history_root, &stkp, + stk, str, reg); + if (r < 0) { + best_len = r; /* error code */ + goto 
finish; + } + } +#endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_REGION_OPTION } /* else IS_POSIX_REGION() */ #endif @@ -1211,12 +1340,12 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_EXACT1_IC: STAT_OP_IN(OP_EXACT1_IC); { int len; - UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *q, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; - len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf); - DATA_ENSURE(len); + DATA_ENSURE(1); + len = ONIGENC_MBC_TO_NORMALIZE(encode, ambig_flag, &s, end, lowbuf); + DATA_ENSURE(0); q = lowbuf; - s += enc_len(encode, *s); while (len-- > 0) { if (*p != *q) goto fail; p++; q++; @@ -1295,16 +1424,16 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_EXACTN_IC: STAT_OP_IN(OP_EXACTN_IC); { int len; - UChar *q, *endp, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *q, *endp, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; GET_LENGTH_INC(tlen, p); endp = p + tlen; while (p < endp) { - len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf); - DATA_ENSURE(len); sprev = s; - s += enc_len(encode, *s); + DATA_ENSURE(1); + len = ONIGENC_MBC_TO_NORMALIZE(encode, ambig_flag, &s, end, lowbuf); + DATA_ENSURE(0); q = lowbuf; while (len-- > 0) { if (*p != *q) goto fail; @@ -1408,20 +1537,22 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(1); if (BITSET_AT(((BitSetRef )p), *s) == 0) goto fail; p += SIZE_BITSET; - s += enc_len(encode, *s); /* OP_CCLASS can match mb-code. \D, \S */ + s += enc_len(encode, s); /* OP_CCLASS can match mb-code. \D, \S */ STAT_OP_OUT; break; case OP_CCLASS_MB: STAT_OP_IN(OP_CCLASS_MB); - if (! ONIGENC_IS_MBC_HEAD(encode, *s)) goto fail; + if (! 
ONIGENC_IS_MBC_HEAD(encode, s)) goto fail; cclass_mb: GET_LENGTH_INC(tlen, p); { OnigCodePoint code; UChar *ss; - int mb_len = enc_len(encode, *s); + int mb_len; + DATA_ENSURE(1); + mb_len = enc_len(encode, s); DATA_ENSURE(mb_len); ss = s; s += mb_len; @@ -1441,7 +1572,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_CCLASS_MIX: STAT_OP_IN(OP_CCLASS_MIX); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_HEAD(encode, *s)) { + if (ONIGENC_IS_MBC_HEAD(encode, s)) { p += SIZE_BITSET; goto cclass_mb; } @@ -1461,13 +1592,13 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(1); if (BITSET_AT(((BitSetRef )p), *s) != 0) goto fail; p += SIZE_BITSET; - s += enc_len(encode, *s); + s += enc_len(encode, s); STAT_OP_OUT; break; case OP_CCLASS_MB_NOT: STAT_OP_IN(OP_CCLASS_MB_NOT); - if (! ONIGENC_IS_MBC_HEAD(encode, *s)) { - DATA_ENSURE(1); + DATA_ENSURE(1); + if (! ONIGENC_IS_MBC_HEAD(encode, s)) { s++; GET_LENGTH_INC(tlen, p); p += tlen; @@ -1479,7 +1610,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, { OnigCodePoint code; UChar *ss; - int mb_len = enc_len(encode, *s); + int mb_len = enc_len(encode, s); if (s + mb_len > end) { DATA_ENSURE(1); @@ -1508,7 +1639,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_CCLASS_MIX_NOT: STAT_OP_IN(OP_CCLASS_MIX_NOT); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_HEAD(encode, *s)) { + if (ONIGENC_IS_MBC_HEAD(encode, s)) { p += SIZE_BITSET; goto cclass_mb_not; } @@ -1525,21 +1656,17 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, break; case OP_ANYCHAR: STAT_OP_IN(OP_ANYCHAR); - n = enc_len(encode, *s); - if (n > 1) { - DATA_ENSURE(n); - s += n; - } - else { - DATA_ENSURE(1); - if (ONIG_IS_NEWLINE(*s)) goto fail; - s++; - } + DATA_ENSURE(1); + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + s += n; STAT_OP_OUT; break; case OP_ANYCHAR_ML: STAT_OP_IN(OP_ANYCHAR_ML); - n = enc_len(encode, *s); + 
DATA_ENSURE(1); + n = enc_len(encode, s); DATA_ENSURE(n); s += n; STAT_OP_OUT; @@ -1548,17 +1675,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_ANYCHAR_STAR: STAT_OP_IN(OP_ANYCHAR_STAR); while (s < end) { STACK_PUSH_ALT(p, s, sprev); - n = enc_len(encode, *s); - if (n > 1) { - DATA_ENSURE(n); - sprev = s; - s += n; - } - else { - if (ONIG_IS_NEWLINE(*s)) goto fail; - sprev = s; - s++; - } + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; } STAT_OP_OUT; break; @@ -1566,7 +1687,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_ANYCHAR_ML_STAR: STAT_OP_IN(OP_ANYCHAR_ML_STAR); while (s < end) { STACK_PUSH_ALT(p, s, sprev); - n = enc_len(encode, *s); + n = enc_len(encode, s); if (n > 1) { DATA_ENSURE(n); sprev = s; @@ -1585,17 +1706,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (*p == *s) { STACK_PUSH_ALT(p + 1, s, sprev); } - n = enc_len(encode, *s); - if (n > 1) { - DATA_ENSURE(n); - sprev = s; - s += n; - } - else { - if (ONIG_IS_NEWLINE(*s)) goto fail; - sprev = s; - s++; - } + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; } p++; STAT_OP_OUT; @@ -1606,7 +1721,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (*p == *s) { STACK_PUSH_ALT(p + 1, s, sprev); } - n = enc_len(encode, *s); + n = enc_len(encode, s); if (n >1) { DATA_ENSURE(n); sprev = s; @@ -1626,7 +1741,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (! 
ONIGENC_IS_MBC_WORD(encode, s, end)) goto fail; - s += enc_len(encode, *s); + s += enc_len(encode, s); STAT_OP_OUT; break; @@ -1635,7 +1750,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (ONIGENC_IS_MBC_WORD(encode, s, end)) goto fail; - s += enc_len(encode, *s); + s += enc_len(encode, s); STAT_OP_OUT; break; @@ -1718,7 +1833,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, STAT_OP_OUT; continue; } - else if (ONIG_IS_NEWLINE(*sprev) && !ON_STR_END(s)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) && !ON_STR_END(s)) { STAT_OP_OUT; continue; } @@ -1728,7 +1843,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_END_LINE: STAT_OP_IN(OP_END_LINE); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif if (IS_NOTEOL(msa->options)) goto fail; STAT_OP_OUT; @@ -1737,7 +1852,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } #endif } - else if (ONIG_IS_NEWLINE(*s)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) { STAT_OP_OUT; continue; } @@ -1747,7 +1862,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_SEMI_END_BUF: STAT_OP_IN(OP_SEMI_END_BUF); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif if (IS_NOTEOL(msa->options)) goto fail; /* Is it needed? 
*/ STAT_OP_OUT; @@ -1756,7 +1871,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } #endif } - if (ONIG_IS_NEWLINE(*s) && ON_STR_END(s+1)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) && + ON_STR_END(s + enc_len(encode, s))) { STAT_OP_OUT; continue; } @@ -1865,7 +1981,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(n); sprev = s; STRING_CMP(pstart, s, n); - while (sprev + (len = enc_len(encode, *sprev)) < s) + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; STAT_OP_OUT; @@ -1896,8 +2012,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, n = pend - pstart; DATA_ENSURE(n); sprev = s; - STRING_CMP_IC(pstart, &s, n); - while (sprev + (len = enc_len(encode, *sprev)) < s) + STRING_CMP_IC(ambig_flag, pstart, &s, n); + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; STAT_OP_OUT; @@ -1932,7 +2048,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, STRING_CMP_VALUE(pstart, swork, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enc_len(encode, *sprev)) < s) + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; p += (SIZE_MEMNUM * (tlen - i - 1)); @@ -1968,10 +2084,10 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(n); sprev = s; swork = s; - STRING_CMP_VALUE_IC(pstart, &swork, n, is_fail); + STRING_CMP_VALUE_IC(ambig_flag, pstart, &swork, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enc_len(encode, *sprev)) < s) + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; p += (SIZE_MEMNUM * (tlen - i - 1)); @@ -1985,7 +2101,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_SET_OPTION_PUSH: STAT_OP_IN(OP_SET_OPTION_PUSH); GET_OPTION_INC(option, p); - ignore_case = IS_IGNORECASE(option); STACK_PUSH_ALT(p, s, sprev); p += SIZE_OP_SET_OPTION + SIZE_OP_FAIL; STAT_OP_OUT; @@ -1994,7 +2109,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, 
UChar* sstart, case OP_SET_OPTION: STAT_OP_IN(OP_SET_OPTION); GET_OPTION_INC(option, p); - ignore_case = IS_IGNORECASE(option); STAT_OP_OUT; continue; break; @@ -2026,6 +2140,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, break; case OP_REPEAT_INC: case OP_REPEAT_INC_NG: + case OP_REPEAT_INC_SG: + case OP_REPEAT_INC_NG_SG: p += SIZE_MEMNUM; break; default: @@ -2092,6 +2208,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, GET_RELADDR_INC(addr, p); p += addr; STAT_OP_OUT; + CHECK_INTERRUPT_IN_MATCH_AT; continue; break; @@ -2170,79 +2287,70 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, break; case OP_REPEAT_INC: STAT_OP_IN(OP_REPEAT_INC); - { - StackIndex si; + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + si = repeat_stk[mem]; + stkp = STACK_AT(si); - GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ -#ifdef USE_SUBEXP_CALL - if (reg->num_call > 0) { - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - } - else { - si = repeat_stk[mem]; - stkp = STACK_AT(si); - } -#else - si = repeat_stk[mem]; - stkp = STACK_AT(si); -#endif - stkp->u.repeat.count++; - if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { - /* end of repeat. Nothing to do. */ - } - else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - STACK_PUSH_ALT(p, s, sprev); - p = stkp->u.repeat.pcode; - } - else { - p = stkp->u.repeat.pcode; - } - STACK_PUSH_REPEAT_INC(si); + repeat_inc: + stkp->u.repeat.count++; + if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + /* end of repeat. Nothing to do. */ + } + else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + STACK_PUSH_ALT(p, s, sprev); + p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. 
*/ + } + else { + p = stkp->u.repeat.pcode; } + STACK_PUSH_REPEAT_INC(si); STAT_OP_OUT; + CHECK_INTERRUPT_IN_MATCH_AT; continue; break; - case OP_REPEAT_INC_NG: STAT_OP_IN(OP_REPEAT_INC_NG); - { - StackIndex si; - - GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ -#ifdef USE_SUBEXP_CALL - if (reg->num_call > 0) { - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - } - else { - si = repeat_stk[mem]; - stkp = STACK_AT(si); - } -#else - si = repeat_stk[mem]; - stkp = STACK_AT(si); -#endif - stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { - if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - UChar* pcode = stkp->u.repeat.pcode; + case OP_REPEAT_INC_SG: STAT_OP_IN(OP_REPEAT_INC_SG); + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + STACK_GET_REPEAT(mem, stkp); + si = GET_STACK_INDEX(stkp); + goto repeat_inc; + break; - STACK_PUSH_REPEAT_INC(si); - STACK_PUSH_ALT(pcode, s, sprev); - } - else { - p = stkp->u.repeat.pcode; - STACK_PUSH_REPEAT_INC(si); - } - } - else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { - STACK_PUSH_REPEAT_INC(si); - } - } - STAT_OP_OUT; + case OP_REPEAT_INC_NG: STAT_OP_IN(OP_REPEAT_INC_NG); + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + si = repeat_stk[mem]; + stkp = STACK_AT(si); + + repeat_inc_ng: + stkp->u.repeat.count++; + if (stkp->u.repeat.count < reg->repeat_range[mem].upper || + IS_REPEAT_INFINITE(reg->repeat_range[mem].upper)) { + if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + UChar* pcode = stkp->u.repeat.pcode; + + STACK_PUSH_REPEAT_INC(si); + STACK_PUSH_ALT(pcode, s, sprev); + } + else { + p = stkp->u.repeat.pcode; + STACK_PUSH_REPEAT_INC(si); + } + } + else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + STACK_PUSH_REPEAT_INC(si); + } + STAT_OP_OUT; + CHECK_INTERRUPT_IN_MATCH_AT; continue; break; + case OP_REPEAT_INC_NG_SG: STAT_OP_IN(OP_REPEAT_INC_NG_SG); + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + STACK_GET_REPEAT(mem, 
stkp); + si = GET_STACK_INDEX(stkp); + goto repeat_inc_ng; + break; + case OP_PUSH_POS: STAT_OP_IN(OP_PUSH_POS); STACK_PUSH_POS(s, sprev); STAT_OP_OUT; @@ -2400,73 +2508,39 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, if (t == target_end) return s; } - s += enc_len(enc, *s); + s += enc_len(enc, s); } return (UChar* )NULL; } -#if 0 static int -str_trans_match_after_head_byte(OnigEncoding enc, - int len, UChar* t, UChar* tend, UChar* p) +str_lower_case_match(OnigEncoding enc, int ambig_flag, + UChar* t, UChar* tend, UChar* p, UChar* end) { - while (--len > 0) { - if (*t != *p) break; - t++; p++; - } - - if (len == 0) { - int lowlen; - UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - - while (t < tend) { - len = enc_len(enc, *p); - lowlen = ONIGENC_MBC_TO_LOWER(enc, p, lowbuf); - q = lowbuf; - while (lowlen > 0) { - if (*t++ != *q++) break; - lowlen--; - } - if (lowlen > 0) break; - p += len; - } - if (t == tend) - return 1; - } - - return 0; -} -#endif - -static int -str_lower_case_match(OnigEncoding enc, UChar* t, UChar* tend, UChar* p) -{ - int len, lowlen; - UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int lowlen; + UChar *q, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; while (t < tend) { - len = enc_len(enc, *p); - lowlen = ONIGENC_MBC_TO_LOWER(enc, p, lowbuf); + lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &p, end, lowbuf); q = lowbuf; while (lowlen > 0) { if (*t++ != *q++) return 0; lowlen--; } - p += len; } return 1; } static UChar* -slow_search_ic(OnigEncoding enc, +slow_search_ic(OnigEncoding enc, int ambig_flag, UChar* target, UChar* target_end, UChar* text, UChar* text_end, UChar* text_range) { - int len, lowlen; - UChar *t, *p, *s, *end; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int lowlen; + UChar *t, *p, *s, *end, *z; + UChar lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; end = text_end - (target_end - target) + 1; if (end > text_range) @@ -2475,22 +2549,21 @@ slow_search_ic(OnigEncoding enc, s = text; while (s < end) { - len = 
enc_len(enc, *s); - lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf); + z = s; + lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s, text_end, lowbuf); if (*target == *lowbuf) { p = lowbuf + 1; t = target + 1; while (--lowlen > 0) { if (*p != *t) break; - p++; *t++; + p++; t++; } if (lowlen == 0) { - if (str_lower_case_match(enc, t, target_end, s + len)) - return s; + if (str_lower_case_match(enc, ambig_flag, + t, target_end, s, text_end)) + return z; } } - - s += len; } return (UChar* )NULL; @@ -2527,14 +2600,14 @@ slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end, } static UChar* -slow_search_backward_ic(OnigEncoding enc, +slow_search_backward_ic(OnigEncoding enc, int ambig_flag, UChar* target,UChar* target_end, UChar* text, UChar* adjust_text, UChar* text_end, UChar* text_start) { int len, lowlen; - UChar *t, *p, *s; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *t, *p, *s, *z; + UChar lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; s = text_end - (target_end - target); if (s > text_start) @@ -2543,22 +2616,24 @@ slow_search_backward_ic(OnigEncoding enc, s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); while (s >= text) { - len = enc_len(enc, *s); - lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf); + len = enc_len(enc, s); + z = s; + lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s, text_end, lowbuf); if (*target == *lowbuf) { p = lowbuf + 1; t = target + 1; while (--lowlen > 0) { if (*p != *t) break; - p++; *t++; + p++; t++; } if (lowlen == 0) { - if (str_lower_case_match(enc, t, target_end, s + len)) - return s; + if (str_lower_case_match(enc, ambig_flag, + t, target_end, s, text_end)) + return z; } } - s = onigenc_get_prev_char_head(enc, adjust_text, s); + s = onigenc_get_prev_char_head(enc, adjust_text, z); } return (UChar* )NULL; @@ -2572,6 +2647,11 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, UChar *tail; int skip; +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "bm_search_notrev: text: %d, text_end: %d, 
text_range: %d\n", + (int )text, (int )text_end, (int )text_range); +#endif + end = text_range + (target_end - target) - 1; if (end > text_end) end = text_end; @@ -2579,7 +2659,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, tail = target_end - 1; s = text; while ((s - text) < target_end - target) { - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } s--; /* set to text check tail position. */ @@ -2597,7 +2677,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, if (p >= text_end) return (UChar* )NULL; t = p; do { - p += enc_len(reg->enc, *p); + p += enc_len(reg->enc, p); } while ((p - t) < skip && p < text_end); s += (p - t); @@ -2617,7 +2697,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, if (p >= text_end) return (UChar* )NULL; t = p; do { - p += enc_len(reg->enc, *p); + p += enc_len(reg->enc, p); } while ((p - t) < skip && p < text_end); s += (p - t); @@ -2665,11 +2745,10 @@ bm_search(regex_t* reg, UChar* target, UChar* target_end, } static int -set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, - int ignore_case, int** skip) +set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, int** skip) + { int i, len; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; if (IS_NULL(*skip)) { *skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); @@ -2680,16 +2759,9 @@ set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*skip)[i] = len; - if (ignore_case) { - for (i = len - 1; i > 0; i--) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - (*skip)[*lowbuf] = i; - } - } - else { - for (i = len - 1; i > 0; i--) - (*skip)[s[i]] = i; - } + for (i = len - 1; i > 0; i--) + (*skip)[s[i]] = i; + return 0; } @@ -2729,7 +2801,7 @@ map_search(OnigEncoding enc, UChar map[], UChar* text, UChar* text_range) while (s < text_range) { if (map[*s]) return s; - s += enc_len(enc, *s); + s += enc_len(enc, s); } return (UChar* )NULL; } @@ -2756,6 +2828,23 @@ 
onig_match(regex_t* reg, UChar* str, UChar* end, UChar* at, OnigRegion* region, UChar *prev; MatchArg msa; + if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) { + reg->state++; /* increment as search counter */ + if (IS_NOT_NULL(reg->chain)) { + onig_chain_reduce(reg); + reg->state++; + } + } + else { + int n = 0; + while (ONIG_STATE(reg) < ONIG_STATE_NORMAL) { + if (++n > THREAD_PASS_LIMIT_COUNT) + return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; + THREAD_PASS; + } + reg->state++; /* increment as search counter */ + } + MATCH_ARG_INIT(msa, option, region, at); if (region @@ -2772,7 +2861,9 @@ onig_match(regex_t* reg, UChar* str, UChar* end, UChar* at, OnigRegion* region, prev = onigenc_get_prev_char_head(reg->enc, str, at); r = match_at(reg, str, end, at, prev, &msa); } + MATCH_ARG_FREE(msa); + reg->state--; /* decrement as search counter */ return r; } @@ -2794,7 +2885,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, } else { UChar *q = p + reg->dmin; - while (p < q) p += enc_len(reg->enc, *p); + while (p < q) p += enc_len(reg->enc, p); } } @@ -2804,7 +2895,8 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, p = slow_search(reg->enc, reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_IC: - p = slow_search_ic(reg->enc, reg->exact, reg->exact_end, p, end, range); + p = slow_search_ic(reg->enc, reg->ambig_flag, + reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_BM: @@ -2824,7 +2916,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (p - reg->dmin < s) { retry_gate: pprev = p; - p += enc_len(reg->enc, *p); + p += enc_len(reg->enc, p); goto retry; } @@ -2836,7 +2928,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (!ON_STR_BEGIN(p)) { prev = onigenc_get_prev_char_head(reg->enc, (pprev ? 
pprev : str), p); - if (!ONIG_IS_NEWLINE(*prev)) + if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } break; @@ -2845,10 +2937,10 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (ON_STR_END(p)) { prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); - if (prev && ONIG_IS_NEWLINE(*prev)) + if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } - else if (!ONIG_IS_NEWLINE(*p)) + else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) goto retry_gate; break; } @@ -2896,7 +2988,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, } static int set_bm_backward_skip P_((UChar* s, UChar* end, OnigEncoding enc, - int ignore_case, int** skip)); + int** skip)); #define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100 @@ -2919,8 +3011,9 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, break; case ONIG_OPTIMIZE_EXACT_IC: - p = slow_search_backward_ic(reg->enc, reg->exact, - reg->exact_end, range, adjrange, end, p); + p = slow_search_backward_ic(reg->enc, reg->ambig_flag, + reg->exact, reg->exact_end, + range, adjrange, end, p); break; case ONIG_OPTIMIZE_EXACT_BM: @@ -2929,7 +3022,7 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (s - range < BM_BACKWARD_SEARCH_LENGTH_THRESHOLD) goto exact_method; - r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, 0, + r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, &(reg->int_map_backward)); if (r) return r; } @@ -2950,7 +3043,7 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, case ANCHOR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); - if (!ONIG_IS_NEWLINE(*prev)) { + if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { p = prev; goto retry; } @@ -2961,12 +3054,12 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (ON_STR_END(p)) { prev = onigenc_get_prev_char_head(reg->enc, 
adjrange, p); if (IS_NULL(prev)) goto fail; - if (ONIG_IS_NEWLINE(*prev)) { + if (ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { p = prev; goto retry; } } - else if (!ONIG_IS_NEWLINE(*p)) { + else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) { p = onigenc_get_prev_char_head(reg->enc, adjrange, p); if (IS_NULL(p)) goto fail; goto retry; @@ -3106,8 +3199,10 @@ onig_search(regex_t* reg, UChar* str, UChar* end, } } else if (reg->anchor & ANCHOR_SEMI_END_BUF) { - if (ONIG_IS_NEWLINE(end[-1])) { - semi_end = end - 1; + UChar* pre_end = ONIGENC_STEP_BACK(reg->enc, start, end, 1); + + if (ONIGENC_IS_MBC_NEWLINE(reg->enc, pre_end, end)) { + semi_end = pre_end; if (semi_end > str && start <= semi_end) { goto end_buf; } @@ -3177,13 +3272,14 @@ onig_search(regex_t* reg, UChar* str, UChar* end, while (s <= high) { MATCH_AND_RETURN_CHECK; prev = s; - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { if (IS_NOT_NULL(prev)) { - while (!ONIG_IS_NEWLINE(*prev) && s < range) { + while (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end) && + s < range) { prev = s; - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } } } @@ -3200,14 +3296,18 @@ onig_search(regex_t* reg, UChar* str, UChar* end, do { MATCH_AND_RETURN_CHECK; prev = s; - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } while (s <= range); /* exec s == range, because empty match with /$/. 
*/ } else { /* backward search */ if (reg->optimize != ONIG_OPTIMIZE_NONE) { UChar *low, *high, *adjrange, *sch_start; - adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); + if (range < end) + adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); + else + adjrange = end; + if (reg->dmax != ONIG_INFINITE_DISTANCE && (end - range) >= reg->threshold_len) { do { @@ -3306,8 +3406,44 @@ onig_get_options(regex_t* reg) return reg->options; } +extern OnigAmbigType +onig_get_ambig_flag(regex_t* reg) +{ + return reg->ambig_flag; +} + extern OnigSyntaxType* onig_get_syntax(regex_t* reg) { return reg->syntax; } + +extern int +onig_number_of_captures(regex_t* reg) +{ + return reg->num_mem; +} + +extern int +onig_number_of_capture_histories(regex_t* reg) +{ +#ifdef USE_CAPTURE_HISTORY + int i, n; + + n = 0; + for (i = 0; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { + if (BIT_STATUS_AT(reg->capture_history, i) != 0) + n++; + } + return n; +#else + return 0; +#endif +} + +extern void +onig_copy_encoding(OnigEncoding to, OnigEncoding from) +{ + *to = *from; +} + @@ -1,10 +1,32 @@ /********************************************************************** - reggnu.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regint.h" #ifndef ONIGGNU_H /* name changes from oniggnu.h to regex.h in ruby. */ @@ -17,10 +39,6 @@ #endif #endif -#ifndef NULL -#define NULL ((void* )0) -#endif - extern void re_free_registers(OnigRegion* r) { @@ -111,7 +129,9 @@ re_free_pattern(regex_t* reg) extern int re_alloc_pattern(regex_t** reg) { - return onig_alloc_init(reg, ONIG_OPTION_DEFAULT, OnigEncDefaultCharEncoding, + return onig_alloc_init(reg, ONIG_OPTION_DEFAULT, + ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + OnigEncDefaultCharEncoding, OnigDefaultSyntax); } @@ -1,12 +1,33 @@ +#ifndef REGINT_H +#define REGINT_H /********************************************************************** - regint.h - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ -#ifndef REGINT_H -#define REGINT_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ /* for debug */ /* #define ONIG_DEBUG_PARSE_TREE */ @@ -19,7 +40,8 @@ /* #define ONIG_DEBUG_STATISTICS */ #if defined(ONIG_DEBUG_PARSE_TREE) || defined(ONIG_DEBUG_MATCH) || \ - defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_STATISTICS) + defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \ + defined(ONIG_DEBUG_STATISTICS) #ifndef ONIG_DEBUG #define ONIG_DEBUG #endif @@ -36,7 +58,6 @@ /* spec. config */ #define USE_NAMED_GROUP #define USE_SUBEXP_CALL -#define USE_FOLD_MATCH /* ess-tsett etc... 
*/ #define USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR @@ -51,12 +72,14 @@ /* interface to external system */ #ifdef NOT_RUBY /* gived from Makefile */ #include "config.h" +#define USE_CAPTURE_HISTORY #define USE_VARIABLE_META_CHARS #define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */ #define USE_POSIX_REGION_OPTION /* needed for POSIX API support */ #define THREAD_ATOMIC_START /* depend on thread system */ #define THREAD_ATOMIC_END /* depend on thread system */ #define THREAD_PASS /* depend on thread system */ +#define CHECK_INTERRUPT /* depend on application */ #define xmalloc malloc #define xrealloc realloc #define xfree free @@ -67,6 +90,14 @@ #define THREAD_ATOMIC_START DEFER_INTS #define THREAD_ATOMIC_END ENABLE_INTS #define THREAD_PASS rb_thread_schedule() +#define CHECK_INTERRUPT do {\ + if (rb_trap_pending) {\ + if (! rb_prohibit_interrupt) {\ + rb_trap_exec();\ + }\ + }\ +} while (0) + #define DEFAULT_WARN_FUNCTION rb_warn #define DEFAULT_VERB_WARN_FUNCTION rb_warning @@ -108,7 +139,9 @@ #endif #include <ctype.h> +#ifndef __BORLANDC__ #include <sys/types.h> +#endif #ifdef ONIG_DEBUG # include <stdio.h> @@ -291,6 +324,8 @@ typedef unsigned int BitStatusType; /* ignore-case and multibyte status are included in compiled code. */ #define IS_DYNAMIC_OPTION(option) 0 +#define REPEAT_INFINITE -1 +#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) /* bitset */ #define BITS_PER_BYTE 8 @@ -500,6 +535,8 @@ enum OpCode { OP_REPEAT_NG, /* {n,m}? 
(non greedy) */ OP_REPEAT_INC, OP_REPEAT_INC_NG, /* non greedy */ + OP_REPEAT_INC_SG, /* search and get in stack */ + OP_REPEAT_INC_NG_SG, /* search and get in stack (non greedy) */ OP_NULL_CHECK_START, /* null loop checker start */ OP_NULL_CHECK_END, /* null loop checker end */ OP_NULL_CHECK_END_MEMST, /* null loop checker end (with capture status) */ @@ -528,11 +565,11 @@ enum OpCode { #define ARG_MEMNUM 4 #define ARG_OPTION 5 -typedef short int RelAddrType; -typedef short int AbsAddrType; -typedef short int LengthType; -typedef short int MemNumType; -typedef int RepeatNumType; +typedef int RelAddrType; +typedef int AbsAddrType; +typedef int LengthType; +typedef int RepeatNumType; +typedef short int MemNumType; #define SIZE_OPCODE 1 #define SIZE_RELADDR sizeof(RelAddrType) @@ -573,6 +610,7 @@ typedef int RepeatNumType; option = *((OnigOptionType* )(p));\ (p) += SIZE_OPTION;\ } while(0) + #else #define GET_RELADDR_INC(addr,p) GET_SHORT_INC(addr,p) @@ -635,23 +673,12 @@ typedef int RepeatNumType; #define SIZE_OP_RETURN SIZE_OPCODE -typedef struct { - UChar esc; - UChar anychar; - UChar anytime; - UChar zero_or_one_time; - UChar one_or_more_time; - UChar anychar_anytime; -} OnigMetaCharTableType; - -extern OnigMetaCharTableType OnigMetaCharTable; - -#define MC_ESC OnigMetaCharTable.esc -#define MC_ANYCHAR OnigMetaCharTable.anychar -#define MC_ANYTIME OnigMetaCharTable.anytime -#define MC_ZERO_OR_ONE_TIME OnigMetaCharTable.zero_or_one_time -#define MC_ONE_OR_MORE_TIME OnigMetaCharTable.one_or_more_time -#define MC_ANYCHAR_ANYTIME OnigMetaCharTable.anychar_anytime +#define MC_ESC(enc) (enc)->meta_char_table.esc +#define MC_ANYCHAR(enc) (enc)->meta_char_table.anychar +#define MC_ANYTIME(enc) (enc)->meta_char_table.anytime +#define MC_ZERO_OR_ONE_TIME(enc) (enc)->meta_char_table.zero_or_one_time +#define MC_ONE_OR_MORE_TIME(enc) (enc)->meta_char_table.one_or_more_time +#define MC_ANYCHAR_ANYTIME(enc) (enc)->meta_char_table.anychar_anytime #define SYN_POSIX_COMMON_OP \ ( 
ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \ @@ -689,7 +716,7 @@ typedef struct { extern OnigOpInfoType OnigOpInfo[]; -extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp)); +extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp, OnigEncoding enc)); #ifdef ONIG_DEBUG_STATISTICS extern void onig_statistics_init P_((void)); @@ -701,9 +728,11 @@ extern char* onig_error_code_to_format P_((int code)); extern void onig_snprintf_with_pattern PV_((char buf[], int bufsize, OnigEncoding enc, char* pat, char* pat_end, char *fmt, ...)); extern UChar* onig_strdup P_((UChar* s, UChar* end)); extern int onig_bbuf_init P_((BBuf* buf, int size)); -extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax)); +extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, OnigEncoding enc, OnigSyntaxType* syntax)); extern int onig_compile P_((regex_t* reg, UChar* pattern, UChar* pattern_end, OnigErrorInfo* einfo)); extern void onig_chain_reduce P_((regex_t* reg)); +extern void onig_chain_link_add P_((regex_t* to, regex_t* add)); +extern void onig_transfer P_((regex_t* to, regex_t* from)); extern int onig_is_in_code_range P_((UChar* p, OnigCodePoint code)); #endif /* REGINT_H */ diff --git a/regparse.c b/regparse.c index 632e15c30a..e6fea8e68a 100644 --- a/regparse.c +++ b/regparse.c @@ -1,10 +1,32 @@ /********************************************************************** - regparse.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regparse.h" #define WARN_BUFSIZE 256 @@ -21,12 +43,14 @@ OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | - ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB ) + ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | + ONIG_SYN_OP2_ESC_H_XDIGIT ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | + ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE @@ -34,15 +58,6 @@ OnigSyntaxType OnigSyntaxRuby = { OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; -OnigMetaCharTableType OnigMetaCharTable = { - (OnigCodePoint )'\\' /* esc */ - , (OnigCodePoint )0 /* anychar '.' */ - , (OnigCodePoint )0 /* anytime '*' */ - , (OnigCodePoint )0 /* zero or one time '?' */ - , (OnigCodePoint )0 /* one or more time '+' */ - , (OnigCodePoint )0 /* anychar anytime */ -}; - extern void onig_null_warn(char* s) { } #ifdef DEFAULT_WARN_FUNCTION @@ -93,12 +108,15 @@ bbuf_clone(BBuf** rto, BBuf* from) #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) -#define SET_ALL_MULTI_BYTE_RANGE(pbuf) \ - add_code_range_to_buf(pbuf, (OnigCodePoint )0x80, ~((OnigCodePoint )0)) +#define MBCODE_START_POS(enc) \ + (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80) -#define ADD_ALL_MULTI_BYTE_RANGE(code, mbuf) do {\ - if (! ONIGENC_IS_SINGLEBYTE(code)) {\ - r = SET_ALL_MULTI_BYTE_RANGE(&(mbuf));\ +#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \ + add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0)) + +#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\ + if (! 
ONIGENC_IS_SINGLEBYTE(enc)) {\ + r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\ if (r) return r;\ }\ } while (0) @@ -217,14 +235,23 @@ onig_strdup(UChar* s, UChar* end) } /* scan pattern methods */ -#define PEND_VALUE -1 - -#define PFETCH(c) do { (c) = *p++; } while (0) -#define PUNFETCH p-- -#define PINC p++ -#define PPEEK (p < end ? *p : PEND_VALUE) -#define PEND (p < end ? 0 : 1) +#define PEND_VALUE 0 + +#define PFETCH_READY UChar* pfetch_prev +#define PEND (p < end ? 0 : 1) +#define PUNFETCH p = pfetch_prev +#define PINC do { \ + pfetch_prev = p; \ + p += ONIGENC_MBC_ENC_LEN(enc, p); \ +} while (0) +#define PFETCH(c) do { \ + c = ONIGENC_MBC_TO_CODE(enc, p, end); \ + pfetch_prev = p; \ + p += ONIGENC_MBC_ENC_LEN(enc, p); \ +} while (0) +#define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) +#define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) static UChar* k_strcat_capa(UChar* dest, UChar* dest_end, UChar* src, UChar* src_end, @@ -388,12 +415,15 @@ typedef struct { regex_t* reg; void* arg; int ret; + OnigEncoding enc; } INamesArg; static int i_names(UChar* key, NameEntry* e, INamesArg* arg) { - int r = (*(arg->func))(e->name, e->name + strlen(e->name), e->back_num, + int r = (*(arg->func))(e->name, + e->name + onigenc_str_bytelen_null(arg->enc, e->name), + e->back_num, (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), arg->reg, arg->arg); if (r != 0) { @@ -416,6 +446,7 @@ onig_foreach_name(regex_t* reg, narg.func = func; narg.reg = reg; narg.arg = arg; + narg.enc = reg->enc; /* should be pattern encoding. 
*/ st_foreach(t, i_names, (HashDataType )&narg); } return narg.ret; @@ -973,6 +1004,12 @@ node_new_list(Node* left, Node* right) return node; } +extern Node* +onig_node_new_list(Node* left, Node* right) +{ + return node_new_list(left, right); +} + static Node* node_new_alt(Node* left, Node* right) { @@ -1058,6 +1095,7 @@ node_new_qualifier(int lower, int upper, int by_number) Node* node = node_new(); CHECK_NULL_RETURN(node); node->type = N_QUALIFIER; + NQUALIFIER(node).state = 0; NQUALIFIER(node).target = NULL; NQUALIFIER(node).lower = lower; NQUALIFIER(node).upper = upper; @@ -1171,6 +1209,20 @@ onig_node_conv_to_str_node(Node* node, int flag) NSTRING(node).end = NSTRING(node).buf; } +extern void +onig_node_str_clear(Node* node) +{ + if (NSTRING(node).capa != 0 && + IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) { + xfree(NSTRING(node).s); + } + + NSTRING(node).capa = 0; + NSTRING(node).flag = 0; + NSTRING(node).s = NSTRING(node).buf; + NSTRING(node).end = NSTRING(node).buf; +} + static Node* node_new_str(UChar* s, UChar* end) { @@ -1189,6 +1241,12 @@ node_new_str(UChar* s, UChar* end) return node; } +extern Node* +onig_node_new_str(UChar* s, UChar* end) +{ + return node_new_str(s, end); +} + static Node* node_new_str_raw(UChar* s, UChar* end) { @@ -1204,15 +1262,6 @@ node_new_empty() } static Node* -node_new_str_char(UChar c) -{ - UChar p[1]; - - p[0] = c; - return node_new_str(p, p + 1); -} - -static Node* node_new_str_raw_char(UChar c) { UChar p[1]; @@ -1243,7 +1292,7 @@ static int str_node_can_be_split(StrNode* sn, OnigEncoding enc) { if (sn->end > sn->s) { - return ((enc_len(enc, *(sn->s)) < sn->end - sn->s) ? 1 : 0); + return ((enc_len(enc, sn->s) < sn->end - sn->s) ? 
1 : 0); } return 0; } @@ -1252,8 +1301,9 @@ extern int onig_scan_unsigned_number(UChar** src, UChar* end, OnigEncoding enc) { unsigned int num, val; - int c; + OnigCodePoint c; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND) { @@ -1278,9 +1328,10 @@ static int scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, OnigEncoding enc) { - int c; + OnigCodePoint c; unsigned int num, val; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND && maxlen-- != 0) { @@ -1305,9 +1356,10 @@ static int scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, OnigEncoding enc) { - int c; + OnigCodePoint c; unsigned int num, val; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND && maxlen-- != 0) { @@ -1443,15 +1495,15 @@ add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) } static int -not_code_range_buf(BBuf* bbuf, BBuf** pbuf) +not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf) { int r, i, n; - OnigCodePoint pre, from, to, *data; + OnigCodePoint pre, from, *data, to = 0; *pbuf = (BBuf* )NULL; if (IS_NULL(bbuf)) { set_all: - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); } data = (OnigCodePoint* )(bbuf->p); @@ -1460,7 +1512,7 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf) if (n <= 0) goto set_all; r = 0; - pre = 0x80; + pre = MBCODE_START_POS(enc); for (i = 0; i < n; i++) { from = data[i*2]; to = data[i*2+1]; @@ -1485,7 +1537,8 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf) } while (0) static int -or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) +or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, + BBuf* bbuf2, int not2, BBuf** pbuf) { int r; OnigCodePoint i, n1, *data1; @@ -1494,7 +1547,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) *pbuf = (BBuf* )NULL; if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { if (not1 != 0 || not2 != 0) - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return 
SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); return 0; } @@ -1504,14 +1557,14 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) if (IS_NULL(bbuf1)) { if (not1 != 0) { - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); } else { if (not2 == 0) { return bbuf_clone(pbuf, bbuf2); } else { - return not_code_range_buf(bbuf2, pbuf); + return not_code_range_buf(enc, bbuf2, pbuf); } } } @@ -1527,7 +1580,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) r = bbuf_clone(pbuf, bbuf2); } else if (not1 == 0) { /* 1 OR (not 2) */ - r = not_code_range_buf(bbuf2, pbuf); + r = not_code_range_buf(enc, bbuf2, pbuf); } if (r != 0) return r; @@ -1638,6 +1691,29 @@ and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) } static int +clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) +{ + BBuf *tbuf; + int r; + + if (cc->not != 0) { + bitset_invert(cc->bs); + + if (! ONIGENC_IS_SINGLEBYTE(enc)) { + r = not_code_range_buf(enc, cc->mbuf, &tbuf); + if (r != 0) return r; + + bbuf_free(cc->mbuf); + cc->mbuf = tbuf; + } + + cc->not = 0; + } + + return 0; +} + +static int and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) { int r, not1, not2; @@ -1671,13 +1747,13 @@ and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) if (! 
ONIGENC_IS_SINGLEBYTE(enc)) { if (not1 != 0 && not2 != 0) { - r = or_code_range_buf(buf1, 0, buf2, 0, &pbuf); + r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf); } else { r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf); if (r == 0 && not1 != 0) { BBuf *tbuf; - r = not_code_range_buf(pbuf, &tbuf); + r = not_code_range_buf(enc, pbuf, &tbuf); if (r != 0) { bbuf_free(pbuf); return r; @@ -1732,10 +1808,10 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf); } else { - r = or_code_range_buf(buf1, not1, buf2, not2, &pbuf); + r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf); if (r == 0 && not1 != 0) { BBuf *tbuf; - r = not_code_range_buf(pbuf, &tbuf); + r = not_code_range_buf(enc, pbuf, &tbuf); if (r != 0) { bbuf_free(pbuf); return r; @@ -1835,6 +1911,7 @@ popular_qualifier_num(QualifierNode* qf) return -1; } + enum ReduceType { RQ_ASIS = 0, /* as is */ RQ_DEL = 1, /* delete parent */ @@ -1854,7 +1931,6 @@ static enum ReduceType ReduceTypeTable[6][6] = { {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' 
*/ }; - extern void onig_reduce_nested_qualifier(Node* pnode, Node* cnode) { @@ -1907,8 +1983,9 @@ onig_reduce_nested_qualifier(Node* pnode, Node* cnode) enum TokenSyms { TK_EOT = 0, /* end of token */ - TK_BYTE = 1, - TK_RAW_BYTE = 2, + TK_RAW_BYTE = 1, + TK_CHAR, + TK_STRING, TK_CODE_POINT, TK_ANYCHAR, TK_CHAR_TYPE, @@ -1938,6 +2015,7 @@ typedef struct { int base; /* is number: 8, 16 (used in [....]) */ UChar* backp; union { + UChar* s; int c; OnigCodePoint code; int anchor; @@ -1969,8 +2047,11 @@ static int fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) { int low, up, syn_allow, non_low = 0; - int c; + int r = 0; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar* p = *src; + PFETCH_READY; syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL); @@ -2024,12 +2105,13 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) PUNFETCH; up = low; /* {n} : exact n times */ + r = 2; /* fixed */ } if (PEND) goto invalid; PFETCH(c); if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { - if (c != MC_ESC) goto invalid; + if (c != MC_ESC(enc)) goto invalid; PFETCH(c); } if (c != '}') goto invalid; @@ -2042,7 +2124,7 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) tok->u.repeat.lower = low; tok->u.repeat.upper = up; *src = p; - return 0; + return r; /* 0: normal {n,m}, 2: fixed {n} */ invalid: if (syn_allow) @@ -2055,8 +2137,11 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) static int fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) { - int c; + int v; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar* p = *src; + PFETCH_READY; if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; @@ -2069,9 +2154,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) if (c != '-') return ONIGERR_META_CODE_SYNTAX; if (PEND) return ONIGERR_END_PATTERN_AT_META; PFETCH(c); - if (c == MC_ESC) { - c = fetch_escaped_value(&p, 
end, env); - if (c < 0) return c; + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; } c = ((c & 0xff) | 0x80); } @@ -2094,9 +2180,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) control: if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; PFETCH(c); - if (c == MC_ESC) { - c = fetch_escaped_value(&p, end, env); - if (c < 0) return c; + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; } else if (c == '?') c = 0177; @@ -2128,11 +2215,13 @@ static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); static int fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) { - int r, len, is_num; - int c = 0; - OnigCodePoint code, first_code; + int r, is_num; + OnigCodePoint c = 0; + OnigCodePoint first_code; + OnigEncoding enc = env->enc; UChar *name_end; UChar *p = *src; + PFETCH_READY; name_end = end; r = 0; @@ -2141,62 +2230,50 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) return ONIGERR_EMPTY_GROUP_NAME; } else { - first_code = ONIGENC_MBC_TO_CODE(env->enc, p, end); PFETCH(c); + first_code = c; if (c == '>') return ONIGERR_EMPTY_GROUP_NAME; - if (ONIGENC_IS_CODE_DIGIT(env->enc, first_code)) { + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { if (ref == 1) is_num = 1; else { r = ONIGERR_INVALID_GROUP_NAME; } } - else if (! ONIGENC_IS_CODE_WORD(env->enc, first_code)) { + else if (!ONIGENC_IS_CODE_WORD(enc, c)) { r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } - - len = enc_len(env->enc, c); - while (!PEND && len-- > 1) - PFETCH(c); } while (!PEND) { name_end = p; - code = ONIGENC_MBC_TO_CODE(env->enc, p, end); PFETCH(c); if (c == '>' || c == ')') break; - len = enc_len(env->enc, c); if (is_num == 1) { - if (len == 1) { - if (! 
ONIGENC_IS_CODE_DIGIT(env->enc, code)) { - if (!ONIGENC_IS_CODE_WORD(env->enc, code)) - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; - else - r = ONIGERR_INVALID_GROUP_NAME; - } - } - else { - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + if (! ONIGENC_IS_CODE_DIGIT(enc, c)) { + if (!ONIGENC_IS_CODE_WORD(enc, c)) + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + else + r = ONIGERR_INVALID_GROUP_NAME; } } else { - if (! ONIGENC_IS_CODE_WORD(env->enc, code)) { + if (!ONIGENC_IS_CODE_WORD(enc, c)) { r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } } - - while (!PEND && len-- > 1) - PFETCH(c); } + if (c != '>') { r = ONIGERR_INVALID_GROUP_NAME; name_end = end; } else { - if (ONIGENC_IS_CODE_UPPER(env->enc, first_code)) + if (ONIGENC_IS_CODE_ASCII(first_code) && + ONIGENC_IS_CODE_UPPER(enc, first_code)) r = ONIGERR_INVALID_GROUP_NAME; } @@ -2215,25 +2292,22 @@ static int fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) { int r, len; - int c = 0; - OnigCodePoint code; + OnigCodePoint c = 0; UChar *name_end; + OnigEncoding enc = env->enc; UChar *p = *src; + PFETCH_READY; r = 0; while (!PEND) { name_end = p; - code = ONIGENC_MBC_TO_CODE(env->enc, p, end); - len = enc_len(env->enc, c); - PFETCH(c); - if (len > 1) + if (enc_len(enc, p) > 1) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + PFETCH(c); if (c == '>' || c == ')') break; - if (! ONIGENC_IS_CODE_DIGIT(env->enc, code)) + if (! 
ONIGENC_IS_CODE_DIGIT(enc, c)) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; - - p += (len - 1); } if (c != '>') { r = ONIGERR_INVALID_GROUP_NAME; @@ -2293,12 +2367,12 @@ find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, *p); + q = p + enc_len(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, *q); + q += enc_len(enc, q); } if (i >= n) { if (IS_NOT_NULL(next)) @@ -2324,24 +2398,24 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { if (in_esc) { in_esc = 0; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, *p); + q = p + enc_len(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, *q); + q += enc_len(enc, q); } if (i >= n) return 1; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); if (x == bad) return 0; - else if (x == MC_ESC) in_esc = 1; + else if (x == MC_ESC(enc)) in_esc = 1; p = q; } } @@ -2352,10 +2426,13 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, static int fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int c, num; + int num; + OnigCodePoint c, c2; OnigSyntaxType* syn = env->syntax; + OnigEncoding enc = env->enc; UChar* prev; UChar* p = *src; + PFETCH_READY; if (PEND) { tok->type = TK_EOT; @@ -2363,7 +2440,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } PFETCH(c); - tok->type = TK_BYTE; + tok->type = TK_CHAR; tok->base = 0; tok->u.c = c; if (c == ']') { @@ -2372,7 +2449,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) else if (c == '-') { tok->type = TK_CC_RANGE; } - else if (c == MC_ESC) { + else if (c == MC_ESC(enc)) 
{ if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) goto end; @@ -2406,17 +2483,27 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CHAR_TYPE; tok->u.subtype = CTYPE_NOT_WHITE_SPACE; break; + case 'h': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_XDIGIT; + break; + case 'H': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_XDIGIT; + break; case 'p': case 'P': - if (PPEEK == '{' && + c2 = PPEEK; + if (c2 == '{' && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { - int c2; PFETCH(c2); if (c2 == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); @@ -2431,14 +2518,17 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (PEND) break; prev = p; - if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + if (!PEND) { + c2 = PPEEK; + if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } - if (p > prev + 1 && !PEND && PPEEK == '}') { + if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) { PINC; tok->type = TK_CODE_POINT; tok->base = 16; @@ -2450,7 +2540,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + num = 
scan_unsigned_hexadecimal_number(&p, end, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2466,12 +2556,12 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CODE_POINT; tok->base = 16; tok->u.c = num; } @@ -2482,7 +2572,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { PUNFETCH; prev = p; - num = scan_unsigned_octal_number(&p, end, 3, env->enc); + num = scan_unsigned_octal_number(&p, end, 3, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. 
*/ num = 0; /* but, it's not error */ @@ -2499,18 +2589,18 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (num < 0) return num; if (tok->u.c != num) { tok->u.c = num; - tok->type = TK_RAW_BYTE; + tok->type = TK_CODE_POINT; } break; } } else if (c == '[') { - if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && PPEEK == ':') { + if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) { OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; tok->backp = p; /* point at '[' is readed */ PINC; - if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']', - env->enc)) { + if (str_exist_check_with_esc(send, 2, p, end, + (OnigCodePoint )']', enc)) { tok->type = TK_POSIX_BRACKET_OPEN; } else { @@ -2530,7 +2620,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } else if (c == '&') { if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && - !PEND && PPEEK == '&') { + !PEND && (PPEEK_IS('&'))) { PINC; tok->type = TK_CC_AND; } @@ -2544,10 +2634,13 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int r, c, num; + int r, num; + OnigCodePoint c; + OnigEncoding enc = env->enc; OnigSyntaxType* syn = env->syntax; UChar* prev; UChar* p = *src; + PFETCH_READY; start: if (PEND) { @@ -2555,13 +2648,17 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) return tok->type; } - tok->type = TK_BYTE; - tok->base = 0; + tok->type = TK_STRING; + tok->base = 0; + tok->backp = p; + PFETCH(c); - if (c == MC_ESC) { + if (c == MC_ESC(enc)) { if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + tok->backp = p; PFETCH(c); + tok->u.c = c; tok->escaped = 1; switch (c) { @@ -2587,37 +2684,42 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.repeat.lower = 0; tok->u.repeat.upper = 1; greedy_check: - if (!PEND && PPEEK == '?' 
&& + if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { PFETCH(c); tok->u.repeat.greedy = 0; tok->u.repeat.possessive = 0; } - else if (!PEND && PPEEK == '+' && - ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && - tok->type != TK_INTERVAL) || - (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && - tok->type == TK_INTERVAL))) { - PFETCH(c); - tok->u.repeat.greedy = 1; - tok->u.repeat.possessive = 1; - } else { - tok->u.repeat.greedy = 1; - tok->u.repeat.possessive = 0; + possessive_check: + if (!PEND && PPEEK_IS('+') && + ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && + tok->type != TK_INTERVAL) || + (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && + tok->type == TK_INTERVAL))) { + PFETCH(c); + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 1; + } + else { + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 0; + } } break; case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; - tok->backp = p; r = fetch_range_qualifier(&p, end, tok, env); if (r < 0) return r; /* error */ - if (r > 0) { - /* normal char */ - } - else + if (r == 0) goto greedy_check; + else if (r == 2) { /* {n} */ + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; + goto greedy_check; + } + /* r == 1 : normal char */ break; case '|': @@ -2697,6 +2799,18 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.subtype = CTYPE_NOT_DIGIT; break; + case 'h': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_XDIGIT; + break; + + case 'H': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_XDIGIT; + break; + case 'A': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; begin_buf: @@ -2737,14 +2851,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (PEND) break; prev = p; - if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + if (!PEND) { + if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } - if (p > prev + 1 && !PEND && PPEEK == '}') { + if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; tok->u.code = (OnigCodePoint )num; @@ -2755,7 +2871,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2771,12 +2887,12 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. 
*/ num = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CODE_POINT; tok->base = 16; tok->u.c = num; } @@ -2786,9 +2902,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; - num = onig_scan_unsigned_number(&p, end, env->enc); - if (num < 0) return ONIGERR_TOO_BIG_NUMBER; - if (num > ONIG_MAX_BACKREF_NUM) return ONIGERR_TOO_BIG_BACKREF_NUMBER; + num = onig_scan_unsigned_number(&p, end, enc); + if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { + goto skip_backref; + } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ @@ -2803,7 +2920,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.by_name = 0; break; } - else if (c == '8' || c == '9') { + + skip_backref: + if (c == '8' || c == '9') { /* normal char */ p = prev; PINC; break; @@ -2814,7 +2933,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '0': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { prev = p; - num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), env->enc); + num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2900,16 +3019,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'p': case 'P': - if (PPEEK == '{' && + if (PPEEK_IS('{') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { - int c2; - PFETCH(c2); - if (c2 == '^') { + PFETCH(c); + if (c == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 
1 : 0); } else @@ -2924,9 +3042,12 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (num < 0) return num; /* set_raw: */ if (tok->u.c != num) { - tok->type = TK_RAW_BYTE; + tok->type = TK_CODE_POINT; tok->u.c = num; } + else { /* string */ + p = tok->backp + enc_len(enc, tok->backp); + } break; } } @@ -2937,15 +3058,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_VARIABLE_META_CHARS if ((c != ONIG_INEFFECTIVE_META_CHAR) && IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { - if (c == MC_ANYCHAR) + if (c == MC_ANYCHAR(enc)) goto any_char; - else if (c == MC_ANYTIME) + else if (c == MC_ANYTIME(enc)) goto anytime; - else if (c == MC_ZERO_OR_ONE_TIME) + else if (c == MC_ZERO_OR_ONE_TIME(enc)) goto zero_or_one_time; - else if (c == MC_ONE_OR_MORE_TIME) + else if (c == MC_ONE_OR_MORE_TIME(enc)) goto one_or_more_time; - else if (c == MC_ANYCHAR_ANYTIME) { + else if (c == MC_ANYCHAR_ANYTIME(enc)) { tok->type = TK_ANYCHAR_ANYTIME; goto out; } @@ -2988,14 +3109,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; - tok->backp = p; r = fetch_range_qualifier(&p, end, tok, env); if (r < 0) return r; /* error */ - if (r > 0) { - /* normal char */ - } - else + if (r == 0) goto greedy_check; + else if (r == 2) { /* {n} */ + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; + goto greedy_check; + } + /* r == 1 : normal char */ break; case '|': @@ -3004,15 +3127,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case '(': - if (PPEEK == '?' 
&& + if (PPEEK_IS('?') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { PINC; - if (PPEEK == '#') { + if (PPEEK_IS('#')) { PFETCH(c); while (1) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); - if (c == MC_ESC) { + if (c == MC_ESC(enc)) { if (!PEND) PFETCH(c); } else { @@ -3061,7 +3184,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_EXTEND(env->option)) { while (!PEND) { PFETCH(c); - if (ONIG_IS_NEWLINE(c)) + if (ONIGENC_IS_CODE_NEWLINE(enc, c)) break; } goto start; @@ -3075,6 +3198,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; default: + /* string */ break; } } @@ -3085,22 +3209,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } static int -add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, - OnigEncoding enc) +add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc, + int nsb, int nmb, + OnigCodePointRange *sbr, OnigCodePointRange *mbr) { - int i, r, nsb, nmb; - OnigCodePointRange *sbr, *mbr; + int i, r; OnigCodePoint j; - r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &nsb, &nmb, &sbr, &mbr); - if (r != 0) return r; - if (not == 0) { for (i = 0; i < nsb; i++) { for (j = sbr[i].from; j <= sbr[i].to; j++) { - BITSET_SET_BIT(cc->bs, j); + BITSET_SET_BIT(cc->bs, j); } } + for (i = 0; i < nmb; i++) { r = add_code_range_to_buf(&(cc->mbuf), mbr[i].from, mbr[i].to); if (r != 0) return r; @@ -3108,19 +3230,23 @@ add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, } else { OnigCodePoint prev = 0; - for (i = 0; i < nsb; i++) { - for (j = prev; j < sbr[i].from; j++) { - BITSET_SET_BIT(cc->bs, j); + + if (ONIGENC_MBC_MINLEN(enc) == 1) { + for (i = 0; i < nsb; i++) { + for (j = prev; j < sbr[i].from; j++) { + BITSET_SET_BIT(cc->bs, j); + } + prev = sbr[i].to + 1; } - prev = sbr[i].to + 1; - } - if (prev < 0x7f) { - for (j = prev; j < 0x7f; j++) { - BITSET_SET_BIT(cc->bs, j); + if (prev < 0x7f) { + for (j = prev; j < 0x7f; j++) { + 
BITSET_SET_BIT(cc->bs, j); + } } + + prev = 0x80; } - prev = 0x80; for (i = 0; i < nmb; i++) { if (prev < mbr[i].from) { r = add_code_range_to_buf(&(cc->mbuf), prev, mbr[i].from - 1); @@ -3134,17 +3260,23 @@ add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, } } - return r; + return 0; } static int add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) { int c, r; + int nsb, nmb; + OnigCodePointRange *sbr, *mbr; OnigEncoding enc = env->enc; - if (ONIGENC_CTYPE_SUPPORT_LEVEL(enc) != ONIGENC_CTYPE_SUPPORT_LEVEL_SB) { - r = add_ctype_to_cc_by_list(cc, ctype, not, env->enc); + r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &nsb, &nmb, &sbr, &mbr); + if (r == 0) { + return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, + nsb, nmb, sbr, mbr); + } + else if (r != ONIG_NO_SUPPORT_CONFIG) { return r; } @@ -3202,7 +3334,8 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } else { for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! ONIGENC_IS_CODE_SB_WORD(enc, c) && ! ONIGENC_IS_MBC_HEAD(enc, c)) + if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) && + ! 
ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c); } } @@ -3246,6 +3379,14 @@ parse_ctype_to_enc_ctype(int pctype, int* not) ctype = ONIGENC_CTYPE_DIGIT; *not = 1; break; + case CTYPE_XDIGIT: + ctype = ONIGENC_CTYPE_XDIGIT; + *not = 0; + break; + case CTYPE_NOT_XDIGIT: + ctype = ONIGENC_CTYPE_XDIGIT; + *not = 1; + break; default: return ONIGERR_PARSER_BUG; break; @@ -3283,23 +3424,26 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) }; PosixBracketEntryType *pb; - int not, i, c, r; + int not, i, r; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar *p = *src; + PFETCH_READY; - if (PPEEK == '^') { + if (PPEEK_IS('^')) { PINC; not = 1; } else not = 0; - if (end - p < POSIX_BRACKET_NAME_MAX_LEN + 1) + if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2) goto not_posix_bracket; for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { - if (onig_strncmp(p, pb->name, pb->len) == 0) { - p += pb->len; - if (end - p < 2 || *p != ':' || *(p+1) != ']') + if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { + p = onigenc_step(enc, p, end, pb->len); + if (onigenc_with_ascii_strncmp(enc, p, end, ":]", 2) != 0) return ONIGERR_INVALID_POSIX_BRACKET_TYPE; r = add_ctype_to_cc(cc, pb->ctype, not, env); @@ -3318,9 +3462,9 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) PINC; if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; } - if (c == ':' && !PEND) { + if (c == ':' && ! PEND) { PINC; - if (!PEND) { + if (! 
PEND) { PFETCH(c); if (c == ']') return ONIGERR_INVALID_POSIX_BRACKET_TYPE; @@ -3331,7 +3475,7 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) } static int -property_name_to_ctype(UChar* p, UChar* end) +property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc) { static PosixBracketEntryType PBS[] = { { "Alnum", ONIGENC_CTYPE_ALNUM, 5 }, @@ -3353,9 +3497,10 @@ property_name_to_ctype(UChar* p, UChar* end) PosixBracketEntryType *pb; int len; - len = end - p; + len = onigenc_strlen(enc, p, end); for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { - if (len == pb->len && onig_strncmp(p, pb->name, pb->len) == 0) + if (len == pb->len && + onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) return pb->ctype; } @@ -3366,8 +3511,10 @@ static int fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) { int ctype; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar *prev, *start, *p = *src; - int c; + PFETCH_READY; /* 'IsXXXX' => 'XXXX' */ if (!PEND && @@ -3391,7 +3538,7 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) prev = p; PFETCH(c); if (c == '}') { - ctype = property_name_to_ctype(start, prev); + ctype = property_name_to_ctype(start, prev, enc); if (ctype < 0) break; *src = p; @@ -3498,12 +3645,26 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, } } else { +#if 0 if (intype == CCV_CODE_POINT && *type == CCV_SB && ONIGENC_IS_CONTINUOUS_SB_MB(env->enc)) { bitset_set_range(cc->bs, (int )*vs, 0x7f); r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )0x80, v); if (r < 0) return r; } +#else + if (intype == CCV_CODE_POINT && *type == CCV_SB) { + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? 
v : 0xff)); + r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); + if (r < 0) return r; + } +#endif else return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE; } @@ -3527,22 +3688,24 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, } static int -char_exist_check(UChar c, UChar* from, UChar* to, int ignore_escaped, +code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, OnigEncoding enc) { int in_esc; + OnigCodePoint code; UChar* p = from; + PFETCH_READY; in_esc = 0; - while (p < to) { + while (! PEND) { if (ignore_escaped && in_esc) { in_esc = 0; } else { - if (*p == c) return 1; - if (*p == MC_ESC) in_esc = 1; + PFETCH(code); + if (code == c) return 1; + if (code == MC_ESC(enc)) in_esc = 1; } - p += enc_len(enc, *p); } return 0; } @@ -3565,7 +3728,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, prev_cc = (CClassNode* )NULL; *np = NULL_NODE; r = fetch_token_in_cc(tok, src, end, env); - if (r == TK_BYTE && tok->u.c == '^' && tok->escaped == 0) { + if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { neg = 1; r = fetch_token_in_cc(tok, src, end, env); } @@ -3575,11 +3738,12 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, if (r < 0) return r; if (r == TK_CC_CLOSE) { - if (! char_exist_check(']', *src, env->pattern_end, 1, env->enc)) + if (! code_exist_check((OnigCodePoint )']', + *src, env->pattern_end, 1, env->enc)) return ONIGERR_EMPTY_CHAR_CLASS; CC_ESC_WARN(env, "]"); - r = tok->type = TK_BYTE; /* allow []...] */ + r = tok->type = TK_CHAR; /* allow []...] 
*/ } *np = node = node_new_cclass(); @@ -3592,58 +3756,69 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, while (r != TK_CC_CLOSE) { fetched = 0; switch (r) { - case TK_BYTE: - len = enc_len(env->enc, tok->u.c); + case TK_CHAR: + len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); if (len > 1) { - PUNFETCH; - v = ONIGENC_MBC_TO_CODE(env->enc, p, end); - p += len; in_type = CCV_CODE_POINT; } else { sb_char: - v = (OnigCodePoint )tok->u.c; in_type = CCV_SB; } + v = (OnigCodePoint )tok->u.c; in_israw = 0; goto val_entry2; break; case TK_RAW_BYTE: - len = enc_len(env->enc, tok->u.c); - if (len > 1 && tok->base != 0) { /* tok->base != 0 : octal or hexadec. */ + /* tok->base != 0 : octal or hexadec. */ + if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - UChar* bufp = buf; UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; + UChar* psave = p; int i, base = tok->base; - if (len > ONIGENC_CODE_TO_MBC_MAXLEN) { - bufp = (UChar* )xmalloc(len); - if (IS_NULL(bufp)) { - r = ONIGERR_MEMORY; - goto err; + buf[0] = tok->u.c; + for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + if (r != TK_RAW_BYTE || tok->base != base) { + fetched = 1; + break; } - bufe = bufp + len; + buf[i] = tok->u.c; } - bufp[0] = tok->u.c; - for (i = 1; i < len; i++) { - r = fetch_token_in_cc(tok, &p, end, env); - if (r < 0) goto raw_byte_err; - if (r != TK_RAW_BYTE || tok->base != base) break; - bufp[i] = tok->u.c; + + if (i < ONIGENC_MBC_MINLEN(env->enc)) { + r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; + goto err; } + + len = enc_len(env->enc, buf); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - raw_byte_err: - if (bufp != buf) xfree(bufp); goto err; } - v = ONIGENC_MBC_TO_CODE(env->enc, bufp, bufe); - if (bufp != buf) xfree(bufp); - in_type = CCV_CODE_POINT; + else if (i > len) { /* fetch back */ + p = psave; + for (i = 1; i < len; i++) { + r = 
fetch_token_in_cc(tok, &p, end, env); + } + fetched = 0; + } + + if (i == 1) { + v = (OnigCodePoint )buf[0]; + goto raw_single; + } + else { + v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); + in_type = CCV_CODE_POINT; + } } else { v = (OnigCodePoint )tok->u.c; + raw_single: in_type = CCV_SB; } in_israw = 1; @@ -3837,8 +4012,17 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); if (is_empty != 0) BITSET_IS_EMPTY(cc->bs, is_empty); - if (is_empty == 0) - BITSET_SET_BIT(cc->bs, ONIG_NEWLINE); + + if (is_empty == 0) { +#define NEWLINE_CODE 0x0a + + if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { + if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) + BITSET_SET_BIT(cc->bs, NEWLINE_CODE); + else + add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); + } + } } *src = p; return 0; @@ -3857,17 +4041,20 @@ static int parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, ScanEnv* env) { + int r, num; + int list_capture; Node *target; OnigOptionType option; - int r, c, num; - int list_capture; + OnigEncoding enc = env->enc; + OnigCodePoint c; UChar* p = *src; + PFETCH_READY; *np = NULL; if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; option = env->option; - if (PPEEK == '?' && + if (PPEEK_IS('?') && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { PINC; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; @@ -4015,7 +4202,7 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, else if (c == ':') { OnigOptionType prev = env->option; - env->option = option; + env->option = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = parse_subexp(&target, tok, term, &p, end, env); @@ -4071,7 +4258,6 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, return 0; } - static char* PopularQStr[] = { "?", "*", "+", "??", "*?", "+?" 
}; @@ -4136,7 +4322,7 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) if (onig_verb_warn != onig_null_warn) { onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, env->pattern, env->pattern_end, - "nested repeat operator '%s and %s' was replaced with '%s'", + "nested repeat operator %s and %s was replaced with '%s'", PopularQStr[targetq_num], PopularQStr[nestq_num], ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); (*onig_verb_warn)(buf); @@ -4164,74 +4350,59 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) return 0; } -#ifdef USE_FOLD_MATCH -static int -make_alt_node_from_fold_info(OnigEncFoldMatchInfo* info, Node** node) -{ - int i; - UChar *s, *end; - Node *root, **ptail, *snode; - - ptail = &root; - for (i = 0; i < info->target_num; i++) { - s = info->target_str[i]; - end = s + info->target_byte_len[i]; - /* ex. - U+00DF match "ss" and "SS, but not match "Ss". - So, string nodes must be raw. - */ - snode = node_new_str_raw(s, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - - *ptail = node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - ptail = &(NCONS(*ptail).right); - } - *ptail = NULL_NODE; - *node = root; - return 0; -} - static int -make_fold_alt_node_from_cc(OnigEncoding enc, CClassNode* cc, Node** root) +make_compound_alt_node_from_cc(OnigAmbigType ambig_flag, OnigEncoding enc, + CClassNode* cc, Node** root) { - int i, j, flen, len, ncode, n; - UChar *s, *end, buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - OnigCodePoint* codes; - Node **ptail, *snode; - OnigEncFoldMatchInfo* info; + int r, i, j, k, clen, len, ncode, n; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + Node **ptail, *snode = NULL_NODE; + OnigCompAmbigCodes* ccs; + OnigCompAmbigCodeItem* ci; + OnigAmbigType amb; + n = 0; *root = NULL_NODE; ptail = root; - ncode = ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc, &codes); - n = 0; - for (i = 0; i < ncode; i++) { - if (onig_is_code_in_cc(enc, codes[i], cc)) { - len = 
ONIGENC_CODE_TO_MBC(enc, codes[i], buf); - flen = ONIGENC_GET_FOLD_MATCH_INFO(enc, buf, buf + len, &info); - if (flen > 0) { /* fold */ - for (j = 0; j < info->target_num; j++) { - s = info->target_str[j]; - end = s + info->target_byte_len[j]; - if (onig_strncmp(s, buf, enc_len(enc, *s)) == 0) - continue; /* ignore single char. */ - - snode = node_new_str_raw(s, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - - *ptail = node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - ptail = &(NCONS(*ptail).right); - n++; - } + + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & ambig_flag) == 0) continue; + + ncode = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs); + for (i = 0; i < ncode; i++) { + if (onig_is_code_in_cc(enc, ccs[i].code, cc)) { + for (j = 0; j < ccs[i].n; j++) { + ci = &(ccs[i].items[j]); + if (ci->len > 1) { /* compound only */ + if (cc->not) clear_not_flag_cclass(cc, enc); + + clen = ci->len; + for (k = 0; k < clen; k++) { + len = ONIGENC_CODE_TO_MBC(enc, ci->code[k], buf); + + if (k == 0) { + snode = node_new_str_raw(buf, buf + len); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + else { + r = onig_node_str_cat(snode, buf, buf + len); + if (r < 0) return r; + } + } + + *ptail = node_new_alt(snode, NULL_NODE); + CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); + ptail = &(NCONS(*ptail).right); + n++; + } + } } } } return n; } -#endif static int parse_exp(Node** np, OnigToken* tok, int term, @@ -4280,76 +4451,22 @@ parse_exp(Node** np, OnigToken* tok, int term, else goto tk_byte; break; - case TK_BYTE: + case TK_STRING: tk_byte: { - *np = node_new_str_char((UChar )tok->u.c); + *np = node_new_str(tok->backp, *src); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); while (1) { - len = enc_len(env->enc, tok->u.c); - if (len > 1) { - r = onig_node_str_cat(*np, *src, *src + len - 1); - if (r < 0) return r; - *src += (len - 1); - } - r = fetch_token(tok, src, end, env); if (r < 0) return r; - if 
(r != TK_BYTE) break; + if (r != TK_STRING) break; - r = node_str_cat_char(*np, (UChar )tok->u.c); + r = onig_node_str_cat(*np, tok->backp, *src); if (r < 0) return r; } - fold_entry: -#ifdef USE_FOLD_MATCH - if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { - int flen, ret; - Node *root, **ptail, *work, *snode, *anode; - UChar *p, *pprev; - OnigEncFoldMatchInfo* fold_info; - StrNode* sn = &(NSTRING(*np)); - - ptail = &root; - pprev = sn->s; - for (p = sn->s; p < sn->end; ) { - flen = ONIGENC_GET_FOLD_MATCH_INFO(env->enc, p, sn->end, &fold_info); - if (flen > 0) { /* fold */ - ret = make_alt_node_from_fold_info(fold_info, &anode); - if (ret != 0) return ret; - work = node_new_list(anode, NULL); - CHECK_NULL_RETURN_VAL(work, ONIGERR_MEMORY); - - if (pprev < p) { - snode = node_new_str(pprev, p); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - *ptail = node_new_list(snode, work); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - } - else { - *ptail = work; - } - ptail = &(NCONS(work).right); - p += flen; - pprev = p; - } - else - p += enc_len(env->enc, *p); - } - *ptail = NULL_NODE; - if (IS_NOT_NULL(root)) { - if (pprev < sn->end) { - snode = node_new_str(pprev, sn->end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - *ptail = node_new_list(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - } - onig_node_free(*np); - *np = root; - } - } -#endif + string_end: targetp = np; goto repeat; } @@ -4358,22 +4475,19 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_RAW_BYTE: tk_raw_byte: { - int expect_len; - *np = node_new_str_raw_char((UChar )tok->u.c); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - expect_len = enc_len(env->enc, tok->u.c); len = 1; while (1) { r = fetch_token(tok, src, end, env); if (r < 0) return r; if (r != TK_RAW_BYTE) { #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - if (len >= expect_len) { + if (len >= enc_len(env->enc, NSTRING(*np).s)) { NSTRING_CLEAR_RAW(*np); } #endif - goto fold_entry; + goto 
string_end; } r = node_str_cat_char(*np, (UChar )tok->u.c); @@ -4402,7 +4516,7 @@ parse_exp(Node** np, OnigToken* tok, int term, OnigCodePoint end_op[2]; UChar *qstart, *qend, *nextp; - end_op[0] = (OnigCodePoint )MC_ESC; + end_op[0] = (OnigCodePoint )MC_ESC(env->enc); end_op[1] = (OnigCodePoint )'E'; qstart = *src; qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); @@ -4428,6 +4542,8 @@ parse_exp(Node** np, OnigToken* tok, int term, case CTYPE_NOT_WHITE_SPACE: case CTYPE_DIGIT: case CTYPE_NOT_DIGIT: + case CTYPE_XDIGIT: + case CTYPE_NOT_XDIGIT: { CClassNode* cc; int ctype, not; @@ -4455,27 +4571,65 @@ parse_exp(Node** np, OnigToken* tok, int term, break; case TK_CC_OPEN: - r = parse_char_class(np, tok, src, end, env); - if (r != 0) return r; + { + CClassNode* cc; -#ifdef USE_FOLD_MATCH - if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { - int res; - Node *alt_root, *work; - CClassNode* cc = &(NCCLASS(*np)); - - res = make_fold_alt_node_from_cc(env->enc, cc, &alt_root); - if (res < 0) return res; - if (res > 0) { - work = node_new_alt(*np, alt_root); - if (IS_NULL(work)) { - onig_node_free(alt_root); - return ONIGERR_MEMORY; - } - *np = work; + r = parse_char_class(np, tok, src, end, env); + if (r != 0) return r; + + cc = &(NCCLASS(*np)); + + if (IS_IGNORECASE(env->option)) { + int i, n, in_cc; + OnigPairAmbigCodes* ccs; + BitSetRef bs = cc->bs; + OnigAmbigType amb; + + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & env->ambig_flag) == 0) continue; + + n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs); + for (i = 0; i < n; i++) { + in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc); + + if ((in_cc != 0 && cc->not == 0) || (in_cc == 0 && cc->not != 0)) { + if (ONIGENC_MBC_MINLEN(env->enc) > 1 || + ccs[i].from >= SINGLE_BYTE_SIZE) { + /* if (cc->not) clear_not_flag_cclass(cc, env->enc); */ + add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to); + } + else { + if (BITSET_AT(bs, 
ccs[i].from)) { + /* /(?i:[^A-C])/.match("a") ==> fail. */ + BITSET_SET_BIT(bs, ccs[i].to); + } + if (BITSET_AT(bs, ccs[i].to)) { + BITSET_SET_BIT(bs, ccs[i].from); + } + } + } + } + } + } + + if (IS_IGNORECASE(env->option) && + (env->ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + int res; + Node *alt_root, *work; + + res = make_compound_alt_node_from_cc(env->ambig_flag, env->enc, + cc, &alt_root); + if (res < 0) return res; + if (res > 0) { + work = node_new_alt(*np, alt_root); + if (IS_NULL(work)) { + onig_node_free(alt_root); + return ONIGERR_MEMORY; + } + *np = work; + } } } -#endif break; case TK_ANYCHAR: @@ -4521,7 +4675,6 @@ parse_exp(Node** np, OnigToken* tok, int term, *np = node_new_empty(); } else { - *src = tok->backp; goto tk_byte; } break; @@ -4684,6 +4837,7 @@ onig_parse_make_tree(Node** root, UChar* pattern, UChar* end, regex_t* reg, scan_env_clear(env); env->option = reg->options; + env->ambig_flag = reg->ambig_flag; env->enc = reg->enc; env->syntax = reg->syntax; env->pattern = pattern; diff --git a/regparse.h b/regparse.h index b2726becbd..5982ec8081 100644 --- a/regparse.h +++ b/regparse.h @@ -1,12 +1,33 @@ +#ifndef REGPARSE_H +#define REGPARSE_H /********************************************************************** - regparse.h - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ -#ifndef REGPARSE_H -#define REGPARSE_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #include "regint.h" @@ -43,7 +64,8 @@ #define CTYPE_NOT_WHITE_SPACE (1<<3) #define CTYPE_DIGIT (1<<4) #define CTYPE_NOT_DIGIT (1<<5) - +#define CTYPE_XDIGIT (1<<6) +#define CTYPE_NOT_XDIGIT (1<<7) #define ANCHOR_ANYCHAR_STAR_MASK (ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL) #define ANCHOR_END_BUF_MASK (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF) @@ -52,23 +74,23 @@ #define EFFECT_OPTION (1<<1) #define EFFECT_STOP_BACKTRACK (1<<2) -#define REPEAT_INFINITE -1 -#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) - #define NODE_STR_MARGIN 16 #define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ #define NODE_BACKREFS_SIZE 7 #define NSTR_RAW (1<<0) /* by backslashed number */ -#define NSTR_CASE_AMBIG (1<<1) - -#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s) -#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW -#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW -#define NSTRING_SET_CASE_AMBIG(node) (node)->u.str.flag |= NSTR_CASE_AMBIG 
-#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0) -#define NSTRING_IS_CASE_AMBIG(node) \ - (((node)->u.str.flag & NSTR_CASE_AMBIG) != 0) +#define NSTR_AMBIG (1<<1) +#define NSTR_AMBIG_REDUCE (1<<2) + +#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s) +#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW +#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW +#define NSTRING_SET_AMBIG(node) (node)->u.str.flag |= NSTR_AMBIG +#define NSTRING_SET_AMBIG_REDUCE(node) (node)->u.str.flag |= NSTR_AMBIG_REDUCE +#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0) +#define NSTRING_IS_AMBIG(node) (((node)->u.str.flag & NSTR_AMBIG) != 0) +#define NSTRING_IS_AMBIG_REDUCE(node) \ + (((node)->u.str.flag & NSTR_AMBIG_REDUCE) != 0) #define BACKREFS_P(br) \ (IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static); @@ -96,6 +118,7 @@ typedef struct { } CClassNode; typedef struct { + int state; struct _Node* target; int lower; int upper; @@ -108,19 +131,19 @@ typedef struct { } QualifierNode; /* status bits */ -#define NST_MIN_FIXED (1<<0) -#define NST_MAX_FIXED (1<<1) -#define NST_CLEN_FIXED (1<<2) -#define NST_MARK1 (1<<3) -#define NST_MARK2 (1<<4) -#define NST_MEM_BACKREFED (1<<5) -#define NST_SIMPLE_REPEAT (1<<6) /* for stop backtrack optimization */ - -#define NST_RECURSION (1<<7) -#define NST_CALLED (1<<8) -#define NST_ADDR_FIXED (1<<9) -#define NST_NAMED_GROUP (1<<10) -#define NST_NAME_REF (1<<11) +#define NST_MIN_FIXED (1<<0) +#define NST_MAX_FIXED (1<<1) +#define NST_CLEN_FIXED (1<<2) +#define NST_MARK1 (1<<3) +#define NST_MARK2 (1<<4) +#define NST_MEM_BACKREFED (1<<5) +#define NST_STOP_BT_SIMPLE_REPEAT (1<<6) +#define NST_RECURSION (1<<7) +#define NST_CALLED (1<<8) +#define NST_ADDR_FIXED (1<<9) +#define NST_NAMED_GROUP (1<<10) +#define NST_NAME_REF (1<<11) +#define NST_IN_REPEAT (1<<12) /* STK_REPEAT is nested in stack. 
*/ #define SET_EFFECT_STATUS(node,f) (node)->u.effect.state |= (f) #define CLEAR_EFFECT_STATUS(node,f) (node)->u.effect.state &= ~(f) @@ -133,13 +156,15 @@ typedef struct { #define IS_EFFECT_MIN_FIXED(en) (((en)->state & NST_MIN_FIXED) != 0) #define IS_EFFECT_MAX_FIXED(en) (((en)->state & NST_MAX_FIXED) != 0) #define IS_EFFECT_CLEN_FIXED(en) (((en)->state & NST_CLEN_FIXED) != 0) -#define IS_EFFECT_SIMPLE_REPEAT(en) (((en)->state & NST_SIMPLE_REPEAT) != 0) +#define IS_EFFECT_STOP_BT_SIMPLE_REPEAT(en) \ + (((en)->state & NST_STOP_BT_SIMPLE_REPEAT) != 0) #define IS_EFFECT_NAMED_GROUP(en) (((en)->state & NST_NAMED_GROUP) != 0) #define SET_CALL_RECURSION(node) (node)->u.call.state |= NST_RECURSION #define IS_CALL_RECURSION(cn) (((cn)->state & NST_RECURSION) != 0) #define IS_CALL_NAME_REF(cn) (((cn)->state & NST_NAME_REF) != 0) #define IS_BACKREF_NAME_REF(bn) (((bn)->state & NST_NAME_REF) != 0) +#define IS_QUALIFIER_IN_REPEAT(qn) (((qn)->state & NST_IN_REPEAT) != 0) typedef struct { int state; @@ -224,9 +249,10 @@ typedef struct _Node { (senv)->mem_nodes_dynamic : (senv)->mem_nodes_static) typedef struct { - OnigOptionType option; - OnigEncoding enc; - OnigSyntaxType* syntax; + OnigOptionType option; + OnigAmbigType ambig_flag; + OnigEncoding enc; + OnigSyntaxType* syntax; BitStatusType capture_history; BitStatusType bt_mem_start; BitStatusType bt_mem_end; @@ -264,6 +290,9 @@ extern int onig_node_str_cat P_((Node* node, UChar* s, UChar* end)); extern void onig_node_free P_((Node* node)); extern Node* onig_node_new_effect P_((int type)); extern Node* onig_node_new_anchor P_((int type)); +extern Node* onig_node_new_str P_((UChar* s, UChar* end)); +extern Node* onig_node_new_list P_((Node* left, Node* right)); +extern void onig_node_str_clear P_((Node* node)); extern int onig_free_node_list(); extern int onig_names_free P_((regex_t* reg)); extern int onig_parse_make_tree P_((Node** root, UChar* pattern, UChar* end, regex_t* reg, ScanEnv* env)); @@ -1,12 +1,53 @@ 
/********************************************************************** - sjis.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + #include "regenc.h" +static int EncLen_SJIS[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 +}; + static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -26,17 +67,39 @@ static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }; -#define SJIS_ISMB_FIRST(byte) (OnigEncodingSJIS.len_table[byte] > 1) +#define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1) #define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)] +static int +sjis_mbc_enc_len(UChar* p) +{ + return EncLen_SJIS[*p]; +} + +extern int +sjis_code_to_mbclen(OnigCodePoint code) +{ + if (code < 256) { + if (EncLen_SJIS[(int )code] == 1) + return 1; + else + return 0; + } + else if (code <= 0xffff) { + return 2; + } + else + return 0; +} + static OnigCodePoint sjis_mbc_to_code(UChar* p, UChar* end) { int c, i, len; OnigCodePoint n; + len = enc_len(ONIG_ENCODING_SJIS, p); c = *p++; - len = enc_len(ONIG_ENCODING_SJIS, c); n = c; if (len == 1) return n; @@ -57,43 +120,57 @@ sjis_code_to_mbc(OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 0 - if (enc_len(ONIG_ENCODING_SJIS, buf[0]) != (p - buf)) + if 
(enc_len(ONIG_ENCODING_SJIS, buf) != (p - buf)) return REGERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } static int -sjis_mbc_to_lower(UChar* p, UChar* lower) +sjis_mbc_to_normalize(OnigAmbigType flag, UChar** pp, UChar* end, UChar* lower) { - int len; + UChar* p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + + (*pp)++; return 1; } else { - len = enc_len(ONIG_ENCODING_SJIS, *p); + int len = enc_len(ONIG_ENCODING_SJIS, p); + if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted char to lower */ } } static int -sjis_code_is_ctype(OnigCodePoint code, unsigned int ctype) +sjis_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end) +{ + return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end); + +} + +static int +sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); else { - int first = onigenc_mb2_code_to_mbc_first(code); - return (enc_len(ONIG_ENCODING_SJIS, first) > 1 ? TRUE : FALSE); + return (sjis_code_to_mbclen(code) > 1 ? 
TRUE : FALSE); } ctype &= ~ONIGENC_CTYPE_WORD; @@ -123,7 +200,7 @@ sjis_left_adjust_char_head(UChar* start, UChar* s) } } } - len = enc_len(ONIG_ENCODING_SJIS, *p); + len = enc_len(ONIG_ENCODING_SJIS, p); if (p + len > s) return p; p += len; return p + ((s - p) & ~1); @@ -137,38 +214,29 @@ sjis_is_allowed_reverse_match(UChar* s, UChar* end) } OnigEncodingType OnigEncodingSJIS = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 - }, + sjis_mbc_enc_len, "Shift_JIS", /* name */ 2, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + 1, /* min byte length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, sjis_mbc_to_code, - onigenc_mb2_code_to_mbclen, + sjis_code_to_mbclen, sjis_code_to_mbc, - sjis_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - sjis_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + sjis_mbc_to_normalize, + sjis_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + sjis_is_code_ctype, + onigenc_not_support_get_ctype_code_range, sjis_left_adjust_char_head, - sjis_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + sjis_is_allowed_reverse_match }; @@ -1,60 +1,78 @@ /********************************************************************** - utf8.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" +#define USE_INVALID_CODE_SCHEME + +#ifdef USE_INVALID_CODE_SCHEME +/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ +#define INVALID_CODE_FE 0xfffffffe +#define INVALID_CODE_FF 0xffffffff +#define VALID_CODE_LIMIT 0x7fffffff +#endif + #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) -#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \ - ((EncUnicode_ISO_8859_1_CtypeTable[code] & ctype) != 0) - -static unsigned short EncUnicode_ISO_8859_1_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x1050, 0x1050, 0x1050, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x18d0, - 0x1050, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 
0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x1050, 0x10d0, 0x1050, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x1050, 0x1050, 0x1050, 0x1050, 0x1050, 0x1050, - 0x1050, 0x1050, 0x1871, 0x10d0, 0x1050, 0x10d0, 0x1050, 0x1050, - 0x1050, 0x1050, 0x1850, 0x1850, 0x1050, 0x1871, 0x1050, 0x10d0, - 0x1050, 0x1850, 0x1871, 0x10d0, 0x1850, 0x1850, 0x1850, 0x10d0, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1050, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1050, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871 +static int EncLen_UTF8[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 }; 
+static int +utf8_mbc_enc_len(UChar* p) +{ + return EncLen_UTF8[*p]; +} + static OnigCodePoint utf8_mbc_to_code(UChar* p, UChar* end) { int c, len; OnigCodePoint n; + len = enc_len(ONIG_ENCODING_UTF8, p); c = *p++; - len = enc_len(ONIG_ENCODING_UTF8, c); if (len > 1) { len--; n = c & ((1 << (6 - len)) - 1); @@ -64,8 +82,14 @@ utf8_mbc_to_code(UChar* p, UChar* end) } return n; } - else + else { +#ifdef USE_INVALID_CODE_SCHEME + if (c > 0xfd) { + return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF); + } +#endif return (OnigCodePoint )c; + } } static int @@ -81,6 +105,10 @@ utf8_code_to_mbclen(OnigCodePoint code) else if ((code & 0xffe00000) == 0) return 4; else if ((code & 0xfc000000) == 0) return 5; else if ((code & 0x80000000) == 0) return 6; +#ifdef USE_INVALID_CODE_SCHEME + else if (code == INVALID_CODE_FE) return 1; + else if (code == INVALID_CODE_FF) return 1; +#endif else return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE; } @@ -147,6 +175,16 @@ utf8_code_to_mbc(OnigCodePoint code, UChar *buf) *p++ = UTF8_TRAILS(code, 12); *p++ = UTF8_TRAILS(code, 6); } +#ifdef USE_INVALID_CODE_SCHEME + else if (code == INVALID_CODE_FE) { + *p = 0xfe; + return 1; + } + else if (code == INVALID_CODE_FF) { + *p = 0xff; + return 1; + } +#endif else { return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE; } @@ -157,49 +195,129 @@ utf8_code_to_mbc(OnigCodePoint code, UChar *buf) } static int -utf8_mbc_to_lower(UChar* p, UChar* lower) +utf8_mbc_to_normalize(OnigAmbigType flag, UChar** pp, UChar* end, UChar* lower) { - int len; + UChar* p = *pp; - /* !!! U+0080 - U+00ff is treated by fold match. !!! 
*/ if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if (end > p + 1 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S')))) { + *lower++ = '\303'; + *lower = '\237'; + (*pp) += 2; + return 2; + } + + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } else { - len = enc_len(ONIG_ENCODING_UTF8, *p); + int len; + + if (*p == 195) { /* 195 == '\303' */ + int c = *(p + 1); + if (c >= 128) { + if (c <= '\236' && /* upper */ + (flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0) { + if (c != '\227') { + *lower++ = *p; + *lower = (UChar )(c + 32); + (*pp) += 2; + return 2; + } + } +#if 0 + else if (c == '\237' && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + *lower++ = '\303'; + *lower = '\237'; + (*pp) += 2; + return 2; + } +#endif + } + } + + len = enc_len(ONIG_ENCODING_UTF8, p); if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted char to lower */ } } static int -utf8_mbc_is_case_ambig(UChar* p) +utf8_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end) { - /* !!! U+0080 - U+00ff ( 0x80[0xc2,0x80] - 0xff[0xc3,0xbf] ) - is treated by fold match. !!! 
*/ + UChar* p = *pp; - if (ONIGENC_IS_MBC_ASCII(p)) - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + if (ONIGENC_IS_MBC_ASCII(p)) { + if (end > p + 1 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S')))) { + (*pp) += 2; + return TRUE; + } + + (*pp)++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + } + } + else { + (*pp) += enc_len(ONIG_ENCODING_UTF8, p); + + if (*p == 195) { /* 195 == '\303' */ + int c = *(p + 1); + if (c >= 128) { + if ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0) { + if (c <= '\236') { /* upper */ + if (c == '\227') return FALSE; + return TRUE; + } + else if (c >= '\240' && c <= '\276') { /* lower */ + if (c == '\267') return FALSE; + return TRUE; + } + } + else if (c == '\237' && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + return TRUE; + } + } + } + } return FALSE; } static int -utf8_code_is_ctype(OnigCodePoint code, unsigned int ctype) +utf8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) { - return ENC_IS_ISO_8859_1_CTYPE(code, ctype); + return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype); } if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - return TRUE; +#ifdef USE_INVALID_CODE_SCHEME + if (code <= VALID_CODE_LIMIT) +#endif + return TRUE; } return FALSE; @@ -223,22 +341,17 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, } while (0) static OnigCodePointRange SBAlpha[] = { - { 0x41, 0x5a }, - { 0x61, 0x7a } + { 0x41, 0x5a }, { 0x61, 0x7a } }; static OnigCodePointRange MBAlpha[] = { - { 0xaa, 0xaa }, - { 0xb5, 0xb5 }, - { 0xba, 0xba }, - { 0xc0, 0xd6 }, - { 0xd8, 0xf6 }, - { 0xf8, 0x220 } + { 0xaa, 0xaa }, { 0xb5, 0xb5 }, + { 0xba, 0xba }, { 0xc0, 0xd6 }, + { 0xd8, 0xf6 }, { 0xf8, 0x220 } }; static OnigCodePointRange SBBlank[] = { - { 0x09, 0x09 }, - { 0x20, 0x20 } + { 0x09, 0x09 }, { 0x20, 0x20 } }; static 
OnigCodePointRange MBBlank[] = { @@ -246,8 +359,7 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange SBCntrl[] = { - { 0x00, 0x1f }, - { 0x7f, 0x7f } + { 0x00, 0x1f }, { 0x7f, 0x7f } }; static OnigCodePointRange MBCntrl[] = { @@ -271,10 +383,8 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange MBLower[] = { - { 0xaa, 0xaa }, - { 0xb5, 0xb5 }, - { 0xba, 0xba }, - { 0xdf, 0xf6 }, + { 0xaa, 0xaa }, { 0xb5, 0xb5 }, + { 0xba, 0xba }, { 0xdf, 0xf6 }, { 0xf8, 0xff } }; @@ -287,29 +397,21 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange SBPunct[] = { - { 0x21, 0x23 }, - { 0x25, 0x2a }, - { 0x2c, 0x2f }, - { 0x3a, 0x3b }, - { 0x3f, 0x40 }, - { 0x5b, 0x5d }, - { 0x5f, 0x5f }, - { 0x7b, 0x7b }, + { 0x21, 0x23 }, { 0x25, 0x2a }, + { 0x2c, 0x2f }, { 0x3a, 0x3b }, + { 0x3f, 0x40 }, { 0x5b, 0x5d }, + { 0x5f, 0x5f }, { 0x7b, 0x7b }, { 0x7d, 0x7d } }; static OnigCodePointRange MBPunct[] = { - { 0xa1, 0xa1 }, - { 0xab, 0xab }, - { 0xad, 0xad }, - { 0xb7, 0xb7 }, - { 0xbb, 0xbb }, - { 0xbf, 0xbf } + { 0xa1, 0xa1 }, { 0xab, 0xab }, + { 0xad, 0xad }, { 0xb7, 0xb7 }, + { 0xbb, 0xbb }, { 0xbf, 0xbf } }; static OnigCodePointRange SBSpace[] = { - { 0x09, 0x0d }, - { 0x20, 0x20 } + { 0x09, 0x0d }, { 0x20, 0x20 } }; static OnigCodePointRange MBSpace[] = { @@ -321,30 +423,23 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange MBUpper[] = { - { 0xc0, 0xd6 }, - { 0xd8, 0xde } + { 0xc0, 0xd6 }, { 0xd8, 0xde } }; static OnigCodePointRange SBXDigit[] = { - { 0x30, 0x39 }, - { 0x41, 0x46 }, + { 0x30, 0x39 }, { 0x41, 0x46 }, { 0x61, 0x66 } }; static OnigCodePointRange SBWord[] = { - { 0x30, 0x39 }, - { 0x41, 0x5a }, - { 0x5f, 0x5f }, - { 0x61, 0x7a } + { 0x30, 0x39 }, { 0x41, 0x5a }, + { 0x5f, 0x5f }, { 0x61, 0x7a } }; static OnigCodePointRange MBWord[] = { - { 0xaa, 0xaa }, - { 0xb2, 0xb3 }, - { 0xb5, 0xb5 }, - { 0xb9, 0xba }, - { 0xbc, 0xbe }, - { 
0xc0, 0xd6 }, + { 0xaa, 0xaa }, { 0xb2, 0xb3 }, + { 0xb5, 0xb5 }, { 0xb9, 0xba }, + { 0xbc, 0xbe }, { 0xc0, 0xd6 }, { 0xd8, 0xf6 }, #if 0 { 0xf8, 0x220 } @@ -358,18 +453,14 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange SBAlnum[] = { - { 0x30, 0x39 }, - { 0x41, 0x5a }, + { 0x30, 0x39 }, { 0x41, 0x5a }, { 0x61, 0x7a } }; static OnigCodePointRange MBAlnum[] = { - { 0xaa, 0xaa }, - { 0xb5, 0xb5 }, - { 0xba, 0xba }, - { 0xc0, 0xd6 }, - { 0xd8, 0xf6 }, - { 0xf8, 0x220 } + { 0xaa, 0xaa }, { 0xb5, 0xb5 }, + { 0xba, 0xba }, { 0xc0, 0xd6 }, + { 0xd8, 0xf6 }, { 0xf8, 0x220 } }; switch (ctype) { @@ -424,92 +515,6 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, return 0; } -static int -utf8_get_all_fold_match_code(OnigCodePoint** codes) -{ - static OnigCodePoint list[] = { - 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, - 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, - 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, - 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, - - 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, - 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, - 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, - 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, - }; - - *codes = list; - return sizeof(list) / sizeof(OnigCodePoint); -} - -static int -utf8_get_fold_match_info(UChar* p, UChar* end, OnigEncFoldMatchInfo** info) -{ - - static OnigEncFoldMatchInfo xc[] = { - { 2, { 2, 2 }, { "\303\200", "\303\240" } }, /* CodePoint 0xc0 */ - { 2, { 2, 2 }, { "\303\201", "\303\241" } }, - { 2, { 2, 2 }, { "\303\202", "\303\242" } }, - { 2, { 2, 2 }, { "\303\203", "\303\243" } }, - { 2, { 2, 2 }, { "\303\204", "\303\244" } }, - { 2, { 2, 2 }, { "\303\205", "\303\245" } }, - { 2, { 2, 2 }, { "\303\206", "\303\246" } }, - { 2, { 2, 2 }, { "\303\207", "\303\247" } }, - { 2, { 2, 2 }, { "\303\210", "\303\250" } }, - { 2, { 2, 2 }, { "\303\211", "\303\251" } }, - { 2, { 2, 2 }, { "\303\212", "\303\252" } }, - { 2, { 2, 2 }, { "\303\213", "\303\253" 
} }, - { 2, { 2, 2 }, { "\303\214", "\303\254" } }, - { 2, { 2, 2 }, { "\303\215", "\303\255" } }, - { 2, { 2, 2 }, { "\303\216", "\303\256" } }, - { 2, { 2, 2 }, { "\303\217", "\303\257" } }, - { 2, { 2, 2 }, { "\303\220", "\303\260" } }, /* CodePoint 0xd0 */ - { 2, { 2, 2 }, { "\303\221", "\303\261" } }, - { 2, { 2, 2 }, { "\303\222", "\303\262" } }, - { 2, { 2, 2 }, { "\303\223", "\303\263" } }, - { 2, { 2, 2 }, { "\303\224", "\303\264" } }, - { 2, { 2, 2 }, { "\303\225", "\303\265" } }, - { 2, { 2, 2 }, { "\303\226", "\303\266" } }, - { 0, { 0 }, { "" } }, - { 2, { 2, 2 }, { "\303\230", "\303\270" } }, - { 2, { 2, 2 }, { "\303\231", "\303\271" } }, - { 2, { 2, 2 }, { "\303\232", "\303\272" } }, - { 2, { 2, 2 }, { "\303\233", "\303\273" } }, - { 2, { 2, 2 }, { "\303\234", "\303\274" } }, - { 2, { 2, 2 }, { "\303\235", "\303\275" } }, - { 2, { 2, 2 }, { "\303\236", "\303\276" } }, - { 3, { 2, 2, 2 }, { "\303\237", "ss", "SS" }} /* ess-tsett(U+00DF) */ - }; - - if (p + 1 >= end) return -1; - if (*p < 0x80) { - if ((*p == 'S' && *(p+1) == 'S') || - (*p == 's' && *(p+1) == 's')) { - *info = &(xc[0xdf - 0xc0]); - return 2; - } - } - else if (*p == 195) { /* 195 == '\303' */ - int c = *(p+1); - if (c >= 128) { - if (c <= 159) { /* upper */ - if (c == 151) return -1; /* 0xd7 */ - *info = &(xc[c - 128]); - return 2; - } - else { /* lower */ - if (c == 183) return -1; /* 0xf7 */ - *info = &(xc[c - 160]); - return 2; - } - } - } - - return -1; /* is not a fold string. 
*/ -} - - static UChar* utf8_left_adjust_char_head(UChar* start, UChar* s) { @@ -522,45 +527,32 @@ utf8_left_adjust_char_head(UChar* start, UChar* s) return p; } -static int -utf8_is_allowed_reverse_match(UChar* s, UChar* end) -{ - return TRUE; -} - OnigEncodingType OnigEncodingUTF8 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 - }, + utf8_mbc_enc_len, "UTF-8", /* name */ 6, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_FULL, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* min byte length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, utf8_mbc_to_code, utf8_code_to_mbclen, utf8_code_to_mbc, - utf8_mbc_to_lower, - utf8_mbc_is_case_ambig, - utf8_code_is_ctype, + utf8_mbc_to_normalize, + utf8_is_mbc_ambiguous, + onigenc_iso_8859_1_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + utf8_is_code_ctype, utf8_get_ctype_code_range, utf8_left_adjust_char_head, - utf8_is_allowed_reverse_match, - utf8_get_all_fold_match_code, - utf8_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; |