diff options
author | Rui Hirokawa <hirokawa@php.net> | 2003-08-23 06:18:44 +0000 |
---|---|---|
committer | Rui Hirokawa <hirokawa@php.net> | 2003-08-23 06:18:44 +0000 |
commit | 5a80df26f3a66f08c376bf959e608ea80071204a (patch) | |
tree | bad99c2afb794cc1795c4be0701d1883b19ff85e /ext/mbstring/oniguruma | |
parent | d696f1267af69779aa361159de563d1886989a1a (diff) | |
download | php-git-5a80df26f3a66f08c376bf959e608ea80071204a.tar.gz |
@mbfilter is changed to the bundled version of libmbfl to avoid the LGPL licence problem. mbregex is changed to use oniguruma.
Diffstat (limited to 'ext/mbstring/oniguruma')
32 files changed, 18918 insertions, 0 deletions
diff --git a/ext/mbstring/oniguruma/COPYING b/ext/mbstring/oniguruma/COPYING new file mode 100644 index 0000000000..7913cbf23f --- /dev/null +++ b/ext/mbstring/oniguruma/COPYING @@ -0,0 +1,32 @@ +OniGuruma LICENSE +----------------- + +When this software is partly used or it is distributed with Ruby, +this of Ruby follows the license of Ruby. +It follows the BSD license in the case of the one except for it. + +/*- + * Copyright (c) 2002 K.Kosako <kosako@sofnec.co.jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ diff --git a/ext/mbstring/oniguruma/HISTORY b/ext/mbstring/oniguruma/HISTORY new file mode 100644 index 0000000000..d9627fced7 --- /dev/null +++ b/ext/mbstring/oniguruma/HISTORY @@ -0,0 +1,517 @@ +History + +2003/07/04: Version 1.9.1 + +2003/07/04: [new] add REG_OPTION_CAPTURE_ONLY_NAMED_GROUP. (thanks .NET) +2003/07/04: [spec] check mbuf member in the case of + REG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC in parse_char_class(). +2003/07/04: [impl] typo REG_SYN_WARN_FOR_CC_OP_NOT_ESCAPEED. + should be REG_SYN_WARN_FOR_CC_OP_NOT_ESCAPED. +2003/07/04: [bug] conflict values on REG_SYN_WARN_FOR_CC_OP_NOT_ESCAPEED and + REG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC. (thanks nobu) +2003/07/03: [spec] add REG_SYN_OP_ESC_CONTROL_CHAR flag. +2003/07/03: [spec] remove REG_SYN_OP_ESC_OCTAL3 and REG_SYN_OP_ESC_X_HEX2 + flag from RegSyntaxGnuRegex. +2003/07/03: [spec] remove REG_SYN_OP_NON_GREEDY flag from RegSyntaxGnuRegex. +2003/07/02: [dist] fix doc/RE. +2003/07/01: [impl] add config flag USE_VARIABLE_SYNTAX. + (turn off variable syntax on Ruby) +2003/07/01: [spec] add syntax behavior REG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND. +2003/06/30: [spec] allow different length top-level alternatives + in look-behind. ex. (?<=abc|abcd), (?<!a|bc) +2003/06/26: [spec] add option REG_OPTION_NEGATE_SINGLELINE. +2003/06/26: [spec] should default on REG_OPTION_SINGLELINE + for REG_SYNTAX_PERL and REG_SYNTAX_JAVA. +2003/06/26: [impl] add options member to RegStntaxType. +2003/06/26: [spec] don't change the meaning of '\Z' for REG_OPTION_SINGLELINE. +2003/06/25: [dist] don't use option REG_NEWLINE for sample/posix.c. +2003/06/25: [dist] modify testconv.rb. + should match and convert double quoted string data. + ex. x(/\ca/, "\001", 0, 1) +2003/06/25: [impl] add REG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL and + REG_SYN_OP2_ESC_M_BAR_META. +2003/06/25: [impl] add REG_SYN_OP_ESC_OCTAL3 and REG_SYN_OP_ESC_X_HEX2. +2003/06/24: [impl] add REG_SYN_OP2_ESC_V_VTAB. 
(\v is VTAB) +2003/06/24: [bug] should invert REG_OPTION_SINGLELINE flag + in REG_SYN_OP2_OPTION_PERL. +2003/06/24: [impl] add REG_SYN_OP2_OPTION_PERL and REG_SYN_OP2_OPTION_RUBY. + meaning of (?m) and (?s) are depend on syntax. + +2003/06/20: Version 1.9.0 + +2003/06/20: [spec] \Q...\E is not effective on REG_SYNTAX_RUBY. +2003/06/19: [inst] rename regex.h to oniguruma.h. +2003/06/18: [impl] change REG_EXTERN setting condition. (__CYGWIN__) +2003/06/18: [bug] return wrong result UTF-8 case in regex_mb2wc(). +2003/06/18: [impl] add REG_SYN_OP2_POSSESSIVE_INTERVAL. a{n,m}+ +2003/06/18: [new] add REG_SYNTAX_JAVA. +2003/06/18: [spec] add REG_SYN_OP_QUOTE. +2003/06/18: [spec] add op2 member to RegSyntaxType. + rename some REG_SYN_OP_XXX to REG_SYN_OP2. +2003/06/16: [new] Perl-like quotation operator \Q, \E. +2003/06/16: [spec] should not control ignore case mode by escaped char. + ex. /\J/i =~ "j", /[\J]/i =~ "j" (same as Perl) +2003/06/13: [bug] modify onigposix.h. +2003/06/13: [bug] should use -DIMPORT for link with DLL in win32/Makefile. +2003/06/13: [dist] add sample/names.c +2003/06/12: [bug] range should be from - 1 in not_wc_range_buf(). +2003/06/12: [spec] should warn for '-' befor '&&' operator in char-class. +2003/06/12: [new] add REG_SYNTAX_PERL. +2003/06/12: [spec] add syntax behavior REG_SYN_WARN_FOR_CC_OP_NOT_ESCAPEED. +2003/06/12: [spec] invalid POSIX bracket should be error. ex. [[:upper :]] +2003/06/11: [new] char-class in char-class (as Java(TM)). +2003/06/11: [spec] change AND operator in char-class from &&[..] to &&. +2003/06/04: [spec] {n,m}+ should not be possessive operator. + ex. a{3}+ should be (?:a{3})+ +2003/06/03: [bug] should compare strings with min-length in is_not_included(). +2003/06/03: [impl] automatic possessivate optimization. a*b ==> (?>a*)b + (thanks Jeffrey E. F. Friedl) +2003/06/02: [impl] remove multibyte-BitSet for OP_CCLASS_MB/OP_CCLASS_MB_NOT. +2003/05/30: [new] char class intersection operator &&[...] like Java(TM). 
+ (thanks akr) +2003/05/30: [bug] should use bbuf_free() for CClassNode in regex_node_free(). +2003/05/29: [bug] wrong usage of syntax REG_SYN_ALLOW_EMPTY_RANGE_IN_CC. + /[d-a]/ should be error. +2003/05/28: [impl] optimize stop-backtrack compiled code. + (/(?>a*)/, /(?>\w+)/ etc...) + add OP_POP opcode. +2003/05/28: [new] possessive repeat operator. (?+, *+, ++, {n,m}+) +2003/05/27: [spec] '-' at beginning of char-class should be warn only if + it is start of range. (ex. /[--a]/) +2003/05/27: [spec] should not warn for right bracket at beginning of pattern. + ex. /]aaa/ +2003/05/27: [spec] change CCEND_ESC_WARN() from VERB_WARNING() to WARNING(). +2003/05/27: [spec] /[]aaa/ should be empty char-class error. + /[]aaa]/ should be warn for 'without backslash'. + (add char_exist_check() in regparse.c) +2003/05/26: [bug] OP_REPEAT in recursive subexp call. + ex. /(?<n>(a|b\g<n>c){3,5})/.match("baaaaca") => "baaaaca" + was wrong result. (should be "aaaa") +2003/05/26: [impl] add num_call member to regex_t. +2003/05/26: [impl] add repeat_range member to regex_t. + (for delete upper,lower members from StackType.u.repeat) +2003/05/26: [bug] change print_names() to external regex_print_names(). +2003/05/26: [tune] change OP_NULL_CHECK_END process in match_at(). +2003/05/26: [spec] change CCEND_ESC_WARN() from WARNING() to VERB_WARNING(). +2003/05/26: [spec] remove POSIXLINE option. (?p:...) + (be made the same as Ruby.) +2003/05/22: [spec] use OP_NULL_CHECK_XXX only if repeat is infinite. + prev. /(?:()|()){0,10}\1\2/ =~ "" ==> FAIL + now /(?:()|()){0,10}\1\2/ =~ "" ==> MATCH + +2003/05/22: [impl] change target_empty setting condition in setup_tree(). +2003/05/19: [impl] avoid zero length repeat optimization. (thanks matz) + /()*/ ==> /()?/, /()+/ ==> /()/ etc... +2003/05/19: [impl] minor changes for gcc -Wall. (-DREG_DEBUG_STATISTICS case) +2003/05/19: [spec] rename regex_foreach_names() to regex_foreach_name(). +2003/05/16: [new] add --with-statistics option to configure. 
+2003/05/16: [bug] move RegOpInfo[] definition to regint.h. +2003/05/16: [new] add regex_version(). + +2003/05/14: Version 1.8.6 + +2003/05/14: [bug] use _vsnprintf() on Win32. +2003/05/14: [spec] define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE. + (/\n$/ =~ "\n", /\n\Z/ =~ "\n") [ruby-dev:20125] +2003/05/14: [impl] minor changes for gcc -Wall. +2003/05/14: [impl] add string.h check in AC_CHECK_HEADERS(). +2003/05/13: [impl] minor changes for gcc -Wall. +2003/05/13: [impl] add regex_snprintf_with_pattern(). +2003/05/13: [spec] add warning for char class meta character without escape + in Ruby mode ('[', '-', ']'). +2003/05/13: [impl] define WARNING() and VERB_WARNING() in regint.h. +2003/05/13: [bug] correct is_code_ascii() for /[[:ascii:]]/. +2003/05/12: [dist] add regular expression document (doc/RE). +2003/05/12: [spec] specification of $(END_LINE) was made the same as Ruby 1.8. + [ruby-dev:20130] (thanks matz) +2003/05/12: [memo] shifted to Subversion(version 0.21.0) from CVS. + +2003/03/19: Version 1.8.5 + +2003/03/19: [impl] change REG_EXTERN definition. (thanks nobu) +2003/03/19: [impl] abbreviation for long error_par in regex_error_code_to_str(). +2003/03/18: [dist] change re.c.XXX.patch for GNU regex API changes. +2003/03/18: [spec] change API regex_new(), regex_recompile() and + regex_error_code_to_str(). + change API re_compile_pattern() and re_recompile_pattern(). +2003/03/18: [spec] replace REGERR_END_PATTERN_AT_GROUP_{COMMENT|OPTION} to + REGERR_END_PATTERN_IN_GROUP. +2003/03/17: [impl] should free err_arg. +2003/03/17: [bug] mistake(high -> to) in add_wc_range_to_buf(). +2003/03/17: [spec] add err_arg argument to regex_new() and regex_recompile(). + for detail error message. (thanks akr) + +2003/03/12: Version 1.8.4 + +2003/03/12: [tune] use cached value of effect node in get_min_match_length(). +2003/03/12: [bug] escaped alphabet should be TK_RAW_BYTE + in fetch_token() and fetch_token_in_cc(). 
+2003/03/12: [spec] change named backref and subexp call format. + backref: \k<name>, call: \g<name> (thanks akr) +2003/03/11: [inst] add regparse.[ch] in win32/Makefile. +2003/03/11: [bug] if UNALIGNED_WORD_ACCESS isn't setted + then compile error in unset_addr_list_fix(). (thanks knu) +2003/03/10: [impl] divide regcomp.c to regcomp.c, regparse.c and regparse.h. +2003/03/10: [bug] should handle multi-byte code name in fetch_name(). +2003/03/10: [spec] remove REGERR_TABLE_FOR_IGNORE_CASE_IS_NOT_SETTED. +2003/03/10: [spec] support POSIX API option REG_NOSUB. + add comp_options member to POSIX API regex_t. + +2003/03/10: Version 1.8.3 + +2003/03/10: [bug] can not compile with Ruby 1.6.8. + (inconsistent st.h with 1.6 and 1.8) + use hash table on Ruby 1.8 only. +2003/03/10: [spec] forbid to use '\' in group name. +2003/03/08: [impl] remove check_backref_number(). +2003/03/08: [bug] called group in 0-repeat should not be eliminated from + compile code. ex. /(?*n)(?<n>){0}/ (thanks akr) + add is_refered member to QualifierNode. +2003/03/07: [impl] use hash table(st.[ch]) for implementation of name table. + (enable on Ruby in default) +2003/03/07: [new] add regex_foreach_names(). +2003/03/06: [impl] add member reg->stack_pop_level. +2003/03/06: [impl] add operator OP_MEMORY_START and member reg->backtrack_mem. +2003/03/06: [bug] if REG_OPTION_FIND_LONGEST or REG_OPTION_NOT_EMPTY, + should handle backtrack of MEM_END. + add OP_MEMORY_END_PUSH and OP_MEMORY_END_PUSH_REC. +2003/03/06: [impl] rename OP_MEMORY_END_PUSH to OP_MEMORY_END_MARK. +2003/03/06: [spec] change error messages. +2003/03/06: [tune] add tiny_pop check in STACK_POP. + +2003/03/05: Version 1.8.2 + +2003/03/05: [impl] use cache info in EFFECT_MEMORY case + in optimize_node_info(). +2003/03/05: [impl] add EFFECT_MEMORY node reference count check + in optimize_node_left(). +2003/03/05: [impl] add min-len, max-len, char-len cache in EffectNode. +2003/03/05: [spec] allow to call in look behind. ex. 
/(?<=(?*a))/ +2003/03/05: [bug] forgotten N_ANCHOR case in check_backref_number(), + subexp_inf_recursive_check_trav() etc... +2003/03/05: [impl] rename USE_ONIGURUMA_EXTENSION to USE_SBMB_CLASS. +2003/03/04: [impl] add CALL-node info in optimize_node_left(). +2003/03/04: [spec] prohibit left recursion of subexp call. ex. (?<n>|(?*n)a) + add subexp_inf_recursive_check_trav(). +2003/03/04: [spec] rename REG_SYN_STRICT_CHECK_BACKREF_NUMBER + to REG_SYN_STRICT_CHECK_BACKREF +2003/03/03: [bug] /(?<n>a(?*n)|)/ isn't infinite recursion. + fix N_LIST case in subexp_recursive_check(). (thanks akr) +2003/03/03: [bug] /(?<n>|(?*n))+/ segmentation fault. + should re-allocate in unset_addr_list_add(). (thanks akr) + +2003/03/01: Version 1.8.1 + +2003/03/01: [bug] change STACK_GET_MEM_START() and STACK_PUSH_MEM_END(). +2003/03/01: [new] add reg_name_to_group_numbers() to POSIX API. +2003/03/01: [impl] use OP_MEMORY_END_PUSH in callable subexp compiled code + only if subexp is recursive. +2003/03/01: [spec] rename regex_name_to_backrefs() to + regex_name_to_group_numbers(). +2003/02/28: [impl] use function stack_double() instead of macro. +2003/02/28: [new] subexp call. (?*name) (thanks akr) +2003/02/28: [spec] add match stack limit check. (MATCH_STACK_LIMIT_SIZE) +2003/02/28: [impl] check recursive subexp call. +2003/02/28: [impl] add opcode OP_MEMORY_END_PUSH for callable subexp. +2003/02/28: [impl] add opcode OP_CALL, OP_RETURN. + add stack type STK_CALL_FRAME, STK_RETURN, STK_MEM_END. +2003/02/26: [spec] add new syntax behavior REG_SYN_STRICT_CHECK_BACKREF_NUMBER. + if it is setted, then error /(\1)/, /\1(..)/ etc... +2003/02/26: [spec] if backref number is greater than max group number, + then return compile error. (REGERR_INVALID_BACKREF_NUMBER) +2003/02/26: [tune] bad implemented N_ALT case in get_min_match_length(). +2003/02/26: [dist] auto update testc.c and win32/testc.c in dist target. +2003/02/26: [impl] add -win option to testconv.rb. 
+2003/02/25: [spec] allow to assign same name to different group. + add OP_BACKREF_MULTI. +2003/02/24: [impl] reduce redundant repeat of empty target. + ex. /()*/ ==> /()?/, /()+/ ==> /()/, /(?:)+/ ==> // +2003/02/24: [impl] change condition in regex_is_allow_reverse_match(). +2003/02/24: [impl] convert i(/../, ...) functions in testconv.rb. +2003/02/24: [impl] change name table struct. + +2003/02/22: Version 1.8.0 + +2003/02/22: [new] named subexp, named back reference. (thanks akr) + define: (?<name>...), back-ref: \g<name> +2003/02/22: [impl] use str_node_can_be_split(). +2003/02/21: [dist] add sample/posix.c +2003/02/21: [spec] rename some error code symbols. +2003/02/21: [spec] max number of multibyte ranges(255) is small. + 255 --> 1000. (thanks MoonWolf) +2003/02/20: [new] supported Basic Regular Expression(BRE) in POSIX API. + (REG_EXTENDED option: Extended RE) +2003/02/20: [new] variable syntax. + +2003/02/12: Version 1.7.2 + +2003/02/12: [bug] mismatch /\?a/i.match('?A'). + check raw value in scan_make_node() and scan_backslash(). + (thanks Nobu) +2003/02/12: [impl] rename 'max_mem' to 'num_mem' in regex_t. +2003/02/12: [impl] rename 'code' to 'enc' in regex_t. +2003/02/12: [spec] remove transtable argument in regex_new and regex_recompile. + remove transtable member in regex_t. +2003/02/10: [inst] change backup file suffix name from '.orig' to '.ruby_orig'. + (win32/Makefile) +2003/02/10: [spec] number check in scan_char_class() ignore-case mode. + ex. /[\x58-\x64]/i +2003/02/10: [impl] don't use OP_MEMORY_END_PUSH (and STK_MEM_END). +2003/02/10: [impl] lift up head_exact value from child qualifier node to parent. +2003/02/10: [tune] change stack type values. +2003/02/10: [dist] add HISTORY. +2003/02/08: [tune] change stack type values. +2003/02/08: [tune] add STACK_BASE_CHECK(). +2003/02/08: [tune] add STACK_PUSH_ENSURED(). +2003/02/08: [dist] change contents of doc/API. +2003/02/07: [inst] change backup file suffix name from '.orig' to '.ruby_orig'. 
+2003/02/07: [spec] range in char-class should be same spec. with Ruby + in ignore-case mode. (ex. /[A-c]/i == /[a-c]/i) + (thanks MoonWolf) +2003/02/07: [spec] [!--] should be allowed. (thanks MoonWolf) +2003/02/07: [dist] refresh re.c.180.patch for re.c (2003-02-06). + +2003/02/07: Version 1.7.1 + +2003/02/07: [impl] check first byte of string in ignore-case mode. + (get_head_exact_node()) +2003/02/07: [impl] remove redundant statements in setup_tree(). +2003/02/06: [new] create Win32 DLL. +2003/02/06: [impl] use P_() macro for function prototype. +2003/02/06: [impl] add HAVE_PROTOTYPE, HAVE_STDARG_PROTOTYPES in + configure.in and config.h.in. +2003/02/06: [spec] /[0-9-a]/ is allowed as usual char '-' and 'a' in Ruby. + add USE_BETTER_COMPATIBILITY_FOR_ORIGINAL_REGEX in + regint.h. (thanks MoonWolf) +2003/02/06: [spec] rename REG_MBCTYPE_XXXX to REG_ENCODING_XXXX in onigposix.h. +2003/02/05: [spec] rename MBCTYPE_XXXX to REG_MBCTYPE_XXXX in onigposix.h. +2003/02/05: [spec] add POSIX API error REG_EONIG_THREAD to onigposix.h. +2003/02/05: [dist] add .cvsignore file. + +2003/02/04: Version 1.7 + +2003/02/04: [bug] typo miss in regex_region_copy(). +2003/02/04: [impl] change THREAD_PASS macro. (regint.h) +2003/02/04: [dist] add API document file doc/API. +2003/02/04: [tune] if sub_anchor has ANCHOR_BEGIN_LINE then + set REG_OPTIMIZE_EXACT_BM in set_optimize_exact_info(). +2003/02/04: [spec] reimplement regex_clone() and it is obsoleted. +2003/02/04: [bug] add REGERR_OVER_THREAD_PASS_LIMIT_COUNT + to regerror.c regposix.c. +2003/02/03: [bug] Hankaku-Kana may be second byte in Shift_JIS + regex_is_allow_reverse_match(). +2003/02/03: [impl] add optimization type REG_OPTIMIZE_EXACT_BM_NOT_REV. + remove exact_allow_reverse_match member in regex_t. +2003/02/03: [impl] add exact_allow_reverse_match member in regex_t. +2003/02/03: [impl] compile-search conflict in regex_search() is handled. +2003/02/01: [tune] decrease regex_region_clear() calling from regex_search(). 
+2003/02/01: [tune] remove region argument from match_at(). +2003/01/31: [tune] don't use strlen() in regexec() and regcomp(). +2003/01/31: [tune] decrease regex_reduce_chain() calling in regex_search(). +2003/01/31: [bug] STRING_CMP() in regexec.c was wrong in ignore-case. +2003/01/31: [impl] convert to lower-case char at string compile time. + change SBTRANSCMP() in regexec.c. +2003/01/31: [impl] rename TTRANS() to TOLOWER(). +2003/01/30: [bug] .c.o --> .c.obj in win32\Makefile. +2003/01/30: [impl] add -DNOT_RUBY to Makefile.in. + NOT_RUBY is refered in regint.h for escape double + including config.h. +2003/01/30: [impl] when string hasn't case ambiguity, don't compile + to ignore case opcode. +2003/01/29: [impl] add SJIS, UTF-8 test_sb() test. +2003/01/29: [dist] add INSTALL-RUBY file. +2003/01/28: [test] success in Cygwin, Ruby 1.8.0 (2003-01-27). +2003/01/24: [inst] add rback target to Makefile.in. +2003/01/24: [impl] change SBCMP() -> IS_NEWLINE() in match_at(). +2003/01/23: [impl] add encoding arg to scan_xxxx_number(). +2003/01/23: [impl] rename WCInt to WCINT. +2003/01/22: [bug] POSIX API regexec() was not thread safe. + remove region member from POSIX regex_t. + [new] add search time option REG_OPTION_POSIX_REGION. + (region argument is treated as regmatch_t[] type) + speed up regexec(). +2003/01/22: [memo] start CVS entry in my box. + +2003/01/21: Version 1.6 + +2003/01/21: [test] Mac OS X 10.1, Ruby 1.8.0 (2003-01-20) +2003/01/20: [impl] add UTF-8 check to test.rb. (thanks UENO Katsuhiro) +2003/01/18: [impl] change REGION_NOTPOS to REG_REGION_NOTPOS in regex.h. +2003/01/17: [dist] add sample/simple.c. +2003/01/17: [inst] add configure option --with-rubydir. +2003/01/17: [bug] bad implemeted POSIX API options. + default: /./ not match "\n", anchor not match "\n" + REG_NEWLINE: /./ not match "\n", anchor match "\n" +2003/01/16: [impl] rewrite POSIX API regexec() for speed up. +2003/01/16: [impl] add region member to POSIX regex_t struct. 
+2003/01/16: [inst] rename library file from 'libregex.a' to 'libonig.a'. +2003/01/15: [dist] add testc.c to distribution file. +2003/01/15: [test] success in 'make rtest/ctest/ptest' on Windows 2000. +2003/01/15: [bug] change '/' to \' in win32/Makefile. +2003/01/14: [test] success in Ruby make test on Windows 2000. + VC++6.0, Ruby 1.6.8 (2003-01-12) +2003/01/14: [inst] change Makefile.in and win32/Makefile. +2003/01/11: [inst] changes for Win32 platform. (regint.h, reggnu.c, regcomp.c) +2003/01/11: [dist] add win32 directory. (config.h, Makefile, testc.c) +2003/01/10: [inst] add onigposix.h to install target. (Makefile.in) +2003/01/10: [bug] lacked a comma in ESTRING[]. (regposerr.c) +2003/01/10: [bug] local variable name was wrong. buf -> tbuf (regerror()) +2003/01/10: [spec] remove REG_RUBY_M17N case from onigposix.h and regposix.c. + +2003/01/09: Version 1.5 + +2003/01/09: [inst] replace Ruby re.c.XXX.patch files. (166 -> 168, 172 -> 180) +2003/01/09: [new] implement POSIX API. (thanks knu) + (onigposix.h, regposix.c, regposerr.c) +2003/01/08: [spec] remove REGERR_END_PATTERN_AFTER_BACKSLASH in regex.h. +2003/01/08: [spec] region arg can be NULL in regex_search() and regex_match(). + +2003/01/08: Version 1.4 + +2003/01/08: [inst] add test program converter (test.rb -> testc.c). +2003/01/08: [bug] move GET_WCINT() from regcomp.c to regint.h. +2003/01/07: [inst] add new test script (test.rb). +2002/12/30: [bug] wrong merge in multibyte mode (alt_merge_opt_exact_info()). +2002/12/28: [inst] add rtest target to Makefile.in. +2002/12/28: [bug] /\xfe/.match("\xfe") mismatch in multibyte mode. + add "raw" flag arg to concat_opt_exact_info_str(). +2002/12/25: [bug] check condition was wrong in alt_merge_opt_map_info(). +2002/12/25: [impl] add threshold_len check in regex_search(). +2002/12/23: [bug] prec-read in alternative (/a|(?=z).f/.match("zf") => nil) +2002/12/23: [bug] \G in alternative (/a|\Gz/.match("bza") => "z"). + add start member in MatchArg. 
(regexec.c) +2002/12/21: [impl] **** rewrite all optimization process. **** +2002/12/16: [impl] remove node subtype EFFECT_EMPTY. +2002/12/12: [impl] reconstruct node types. (regcomp.c) +2002/12/11: [impl] add regerror.c +2002/12/10: [bug] [ruby-dev:19042] (thanks Nobu) + anchor(\G etc...) influenced outside of "|". (/a|\Gb/) +2002/11/30: [bug] [ruby-dev:18966] (thanks Nobu) + char-class(\S, [^\s] etc...) optimize map-info was wrong. +2002/11/29: [bug] infinite loop on NULL-pointer str search (regex_search()). + (thanks matz) +2002/11/29: [bug] change static -> extern (regex_chain_reduce()). +2002/11/29: [bug] change encoding to RegDefaultCharEncoding + in re_recompile_pattern(). (adapt to re.c) +2002/04/24: [spec] USE_ONIGURUMA_EXTENSION is disabled in default. +2002/04/24: [new] add searching time option: REG_OPTION_NOTBOL/NOTEOL. + add searching time option argument to regex_search() and + regex_match(). (prepare for POSIX API) +2002/04/20: [impl] divide regex.c file into regcomp.c, regexec.c, reggnu.c + and regint.h. +2002/04/09: [impl] move IS_MULTILINE() to outside of loop in OP_ANYCHAR_STAR. +2002/04/08: [impl] don't use OP_REPEAT operator for '??'. +2002/04/06: [impl] reduce redundant nested repeat operators(?,*,+,??,*?,+?). + ex. (?:a*)?, (?:a??)* etc.. +2002/04/06: [spec] should not warn for /(?:a?)+?/. +2002/04/04: [spec] should allow fixed length alternative and repeat pattern + in look-behind. ex. /(?<=(a|b){3})/ (thanks Guy Decoux) +2002/04/02: [spec] should warn for /(?:a+)?/ and /(?:a*)??/. (thanks akr) + +2002/04/01: Version 1.3 + +2002/04/01: [dist] add COPYING. +2002/03/30: [spec] warn redundant nested repeat operator + in Ruby verbose mode. ex. (?:a*)? +2002/03/30: [spec] nested repeat operator error check should be + same with GNU regex. (thanks Guy Decoux) +2002/03/30: [new] add \x{hexadecimal-wide-char}. (thanks matz) +2002/03/27: [bug] MBCTYPE_XXX symbol values should be same with GNU regex. 
+2002/03/27: [impl] add THREAD_ATOMIC to regex_clone(), regex_init(), regex_end(). +2002/03/25: [spec] if encoding is utf-8, allow combination of singlebyte and + multibyte code range in char class. + (cancelled 2002/04/01: for M17N compatibility) +2002/03/25: [dist] description of the license condition is added to README. +2002/03/23: [bug] should set all bits of reg->mem_stats, + if REG_OPTION_FIND_LONGEST or REG_OPTION_NOT_EMPTY. +2002/03/23: [new] add a new option REG_OPTION_NOT_EMPTY. +2002/03/20: [spec] allow incompleted left brace as an usual char. + ex. /{/, /({)/, /a{2,3/ etc... +2002/03/20: [impl] serialize integer in bytecode. + (switch by UNALIGNED_WORD_ACCESS in regex.c) +2002/03/20: [impl] change re_mbcinit() for REG_RUBY_M17N. +2002/03/19: [impl] word alignment of char class multi-byte code ranges. +2002/03/19: [impl] replace OP_EXACTMB4N with OP_EXACTMB3N. +2002/03/19: [bug] OP_CCLASS_MB_NOT process in matchAt() is wrong. +2002/03/19: [new] add re_mbctab[] for Ruby extension library compatibility. +2002/03/19: [spec] allow nested repeat operator, if operator is {n,m} type. +2002/03/19: [new] add REG_IS_PATTERN_ERROR(ecode) in regex.h +2002/03/18: [spec] /[a-b-c]/ should be error. +2002/03/18: [bug] /[\w-a]/ should be error. (thanks Guy Decoux) +2002/03/18: [bug] /[\]/ should be error. (thanks Guy Decoux) +2002/03/18: [bug] /()*/ etc.. should not be error. (thanks Guy Decoux) +2002/03/18: [spec] /a{1}*/ should not be error. (thanks Guy Decoux) +2002/03/18: [bug] ab{2}{3} was interpreded to (?:a(?:b{2})){3} + (thanks Guy Decoux) +2002/03/18: [bug] abort /(?i)*a/ etc... (thanks Guy Decoux) +2002/03/18: [bug] abort /a|*/,/a|{1}/ etc... (thanks Guy Decoux) + +2002/03/13: Version 1.2 + +2002/03/13: [test] success in rubicon/builtin/AllBuiltinTests.rb. + (thanks rubicon) +2002/03/13: [bug] OP_EXACTMBN process in matchAt() is wrong. +2002/03/13: [bug] start argument of BackwardSearchRange() is wrong. 
+2002/03/12: [spec] change function name style from CamelCase + to underline_separation. (includes API) +2002/03/12: [bug] if pattern has nested null-check, cause infinite loop. + correct STACK_NULL_CHECK() macro. (thanks Guy Decoux) +2002/03/11: [bug] it is wrong that four numbers to continue as + an octal value in scanBackSlash(). ex. /\0111/ + (thanks matz) +2002/03/11: [new] \k (single-byte word char), \K (multi-byte char). +2002/03/09: [inst] add two targets to Makefile.in (166 and 172). +2002/03/09: [spec] decrease REG_MAX_BACKREF_NUM, REG_MAX_REPEAT_NUM + values. +2002/03/08: [spec] allow use of "\A"(begin-buf) in look-behind. +2002/03/08: [impl] add a new opcode OP_PUSH_IF_PEEK_NEXT. +2002/03/08: [impl] add a new opcode OP_ANYCHAR_STAR_PEEK_NEXT. +2002/03/07: [spec] prohibit use of capture group "(...)" + in negative look-behind. +2002/03/07: [inst] add configure.in, config.h.in, Makefile.in. +2002/03/07: [impl] call Init_REGEX_STAT() in RegexInit(). +2002/03/07: [spec] less length string match with negative look-behind. + ex. /(?<!XXX)a/.match("Xa"). (thanks Nobu) +2002/03/06: [impl] expand repeated string, if expanded length <= 100. + ex. /(?:abc){10}/ +2002/03/06: [new] add a symbol REG_TRANSTABLE_USE_DEFAULT in regex.h. +2002/03/06: [impl] rename RegDefaultCharCode to RegDefaultCharEncoding. +2002/03/06: [bug] if pattern has NULL(\000) char, infinite loop happens + in ScanMakeNode(). (beware of strchr(). thanks Nobu) +2002/03/06: [bug] range argument of ForwardSearchRange() is wrong. + ex. /\A.a/, /\G.a/ mismatched with "aa". (thanks Nobu) +2002/03/05: [new] add RegexMatch() API. rename regexMatch() to matchAt(). +2002/03/05: [impl] change function definition style. +2002/03/05: [impl] abolish use of macro symbol which name begin with underline. +2002/03/04: [bug] make up a break-statement in compileTree(). + (compile error on Mac OS X 10.1.3) + +2002/03/04: Version 1.1 + +2002/03/04: [impl] replace STK_BOTTOM with STK_ALT. 
+2002/03/02: [impl] add new opcode OP_FINISH and new stack type + STK_BOTTOM for (little bit) speed up STACK_POP. +2002/03/02: [impl] add new opcode OP_EXACT1_IC, OP_EXACTN_IC + for compile time ignore case check. + remove opcode OP_EXACT1_RAW, OP_EXACTN_RAW. +2002/03/02: [impl] add OpTime info to statistical data. +2002/02/28: [bug] sub_anchor($) in ForwardSearch() and BackwardSearch(). + ex. /$\x0az/.match("\nz") +2002/02/28: [new] look-behind (?<=pattern), (?<!pattern). +2002/02/27: [bug] use StackIndex instead of StackType* for realloc problem. +2002/02/27: [impl] use m17n_codepoint() as mb2wc() in REG_RUBY_M17N. +2002/02/27: [spec] undefined POSIX bracket /[[:xyz:]]/ should be syntax error. +2002/02/26: [bug] ex. /$*/, /[a-]/, /((?i)a)b/ (thanks matz) + +2002/02/25: Version 1.0 (first release) + +-- +[bug: bug fix] +[new: new feature] +[spec: specification change] +[impl: implementation change] +[tune: tune for speed up] +[inst: changes for installation] +[dist: distribution change] +[test: test] +[memo: memo] diff --git a/ext/mbstring/oniguruma/INSTALL-RUBY b/ext/mbstring/oniguruma/INSTALL-RUBY new file mode 100644 index 0000000000..ea214b6127 --- /dev/null +++ b/ext/mbstring/oniguruma/INSTALL-RUBY @@ -0,0 +1,48 @@ +INSTALL-RUBY 2003/06/12 + +The way of installing into Ruby is shown. +First, prepare for the source of Ruby. +(http://www.ruby-lang.org/) + +A. Unix or Cygwin platform +B. Win32 platform (VC++) + + +A. Unix or Cygwin platform + + (in oniguruma directory) + 1. ./configure --with-rubydir=<ruby-source-dir> + 2. make 16 # for Ruby 1.6.8 + or + make 18 # for Ruby 1.8.0 + + Or you can specify ruby source directory. + (ex. make 16 RUBYDIR=../ruby) + + (in ruby source directory) + 3. ./configure (** If it doesn't go yet. **) + 4. make clean + 5. make + + + * test (ASCII and EUC-JP) + + (in oniguruma directory) + 6. make rtest + Or you can specify ruby program directory. + (ex. make rtest RUBYDIR=/usr/local/bin) + + +B. 
Win32 platform (VC++) + + * Requirement: Visual C++, patch.exe + + (in oniguruma directory) + 1. copy win32\Makefile Makefile + 2. nmake 16 RUBYDIR=<ruby-source-dir> # for Ruby 1.6.8 + or + nmake 18 RUBYDIR=<ruby-source-dir> # for Ruby 1.8.0 + + 3. Follow <ruby-source-dir>\win32\README.win32 description... + +// END diff --git a/ext/mbstring/oniguruma/Makefile.in b/ext/mbstring/oniguruma/Makefile.in new file mode 100644 index 0000000000..fd79cfb24f --- /dev/null +++ b/ext/mbstring/oniguruma/Makefile.in @@ -0,0 +1,188 @@ +# Oni Guruma Makefile + +product_name = oniguruma +dist_tag = `date '+%Y%m%d'` + +SHELL = /bin/sh +AUTOCONF = autoconf + +CPPFLAGS = +CFLAGS = @CFLAGS@ @STATISTICS@ +LDFLAGS = +LOADLIBES = +AR = ar +ARFLAGS = rc +RANLIB = @RANLIB@ +INSTALL = install -c +CP = cp -p +CC = @CC@ +DEFS = @DEFS@ -DNOT_RUBY +RUBYDIR = @RUBYDIR@ +WIN32 = win32 +DOC = doc + +srcdir = @srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ +libdir = $(exec_prefix)/lib +includedir = $(prefix)/include + +subdirs = + +libname = libonig.a + +onigintheaders = regint.h regparse.h +onigheaders = oniguruma.h $(onigintheaders) +posixheaders = onigposix.h +headers = $(posixheaders) $(onigheaders) + +onigobjs = regerror.o regparse.o regcomp.o regexec.o reggnu.o +posixobjs = regposix.o regposerr.o +libobjs = $(onigobjs) $(posixobjs) + +onigsources = regerror.c regparse.c regcomp.c regexec.c reggnu.c +posixsources = regposix.c regposerr.c +libsources = $(posixsources) $(onigsources) +rubysources = regex.c $(onigsources) + +patchfiles = re.c.168.patch re.c.180.patch +distfiles = README COPYING INSTALL-RUBY HISTORY \ + .cvsignore Makefile.in configure.in config.h.in configure \ + $(headers) $(libsources) regex.c $(patchfiles) \ + test.rb testconv.rb $(testc).c +win32distfiles = $(WIN32)/Makefile $(WIN32)/config.h $(WIN32)/testc.c +docfiles = $(DOC)/API $(DOC)/RE + +samplefiles = sample/*.c + +testc = testc +testp = testp + +makeargs = $(MFLAGS) CPPFLAGS='$(CPPFLAGS)' 
CFLAGS='$(CFLAGS)' CC='$(CC)' + +.SUFFIXES: +.SUFFIXES: .o .c .h .ps .dvi .info .texinfo + +.c.o: + $(CC) $(CFLAGS) $(CPPFLAGS) $(DEFS) -I. -I$(srcdir) -c $< + +# targets +default: all + +all: $(libname) + +$(libname): $(libobjs) + rm -f $(libname) + $(AR) $(ARFLAGS) $(libname) $(libobjs) + $(RANLIB) $(libname) + +regparse.o: regparse.c $(onigheaders) config.h +regcomp.o: regcomp.c $(onigheaders) config.h +regexec.o: regexec.c regint.h oniguruma.h config.h +reggnu.o: reggnu.c regint.h oniguruma.h config.h +regerror.o: regerror.c regint.h oniguruma.h config.h +regposix.o: regposix.c $(posixheaders) oniguruma.h config.h +regposerr.o: regposerr.c $(posixheaders) config.h + +install: all + test -d $(libdir) || mkdir $(libdir) + test -d $(includedir) || mkdir $(includedir) + $(INSTALL) $(libname) $(libdir)/$(libname) + $(RANLIB) $(libdir)/$(libname) + $(INSTALL) $(srcdir)/oniguruma.h $(includedir)/oniguruma.h + $(INSTALL) $(srcdir)/onigposix.h $(includedir)/onigposix.h + +uninstall: + -rm -f $(libdir)/$(libname) + -rm -f $(includedir)/oniguruma.h + +# Ruby test +rtest: + $(RUBYDIR)/ruby -w -Ke test.rb + +# C library test +ctest: $(testc) + ./$(testc) + +# POSIX C library test +ptest: $(testp) + ./$(testp) + +$(testc): $(testc).c $(libname) + $(CC) $(CFLAGS) -o $@ $(testc).c $(libname) + +$(testp): $(testc).c $(libname) + $(CC) -DPOSIX_TEST $(CFLAGS) -o $@ $(testc).c $(libname) + +$(testc).c: test.rb testconv.rb + ruby -Ke testconv.rb < test.rb > $@ + +$(WIN32)/$(testc).c: test.rb testconv.rb + ruby -Ke testconv.rb -win < test.rb | nkf -cs > $@ + +clean: + rm -f *.o $(libname) $(testc) $(testp) $(testc) *~ win32/*~ + +distclean: clean + rm -f Makefile config.status + + +16: cpruby + patch -d $(RUBYDIR) -p0 < re.c.168.patch + +18: cpruby + patch -d $(RUBYDIR) -p0 < re.c.180.patch + +# backup file suffix +SORIG = ruby_orig + +cpruby: + $(CP) $(RUBYDIR)/regex.c $(RUBYDIR)/regex.c.$(SORIG) + $(CP) $(RUBYDIR)/regex.h $(RUBYDIR)/regex.h.$(SORIG) + $(CP) $(RUBYDIR)/re.c 
$(RUBYDIR)/re.c.$(SORIG) + $(CP) $(rubysources) $(onigintheaders) $(RUBYDIR) + $(CP) oniguruma.h $(RUBYDIR)/regex.h + +rback: + $(CP) $(RUBYDIR)/regex.c.$(SORIG) $(RUBYDIR)/regex.c + $(CP) $(RUBYDIR)/regex.h.$(SORIG) $(RUBYDIR)/regex.h + $(CP) $(RUBYDIR)/re.c.$(SORIG) $(RUBYDIR)/re.c + +samples: + $(CC) $(CFLAGS) -I. -o sample/simple sample/simple.c $(libname) + $(CC) $(CFLAGS) -I. -o sample/posix sample/posix.c $(libname) + $(CC) $(CFLAGS) -I. -o sample/names sample/names.c $(libname) + +configure: configure.in + $(AUTOCONF) + +config.status: configure + $(SHELL) ./config.status --recheck + +Makefile: Makefile.in config.status + $(SHELL) ./config.status + +# Prevent GNU make 3 from overflowing arg limit on system V. +.NOEXPORT: + +manifest: + for file in $(distfiles); do echo $$file; done + + +distdir = $(product_name) + +dist_auto: $(testc).c $(WIN32)/$(testc).c + +dist: configure dist_auto + rm -rf $(distdir) + mkdir $(distdir) + mkdir $(distdir)/$(DOC) + mkdir $(distdir)/$(WIN32) + mkdir $(distdir)/sample + ln $(distfiles) $(distdir) + ln $(docfiles) $(distdir)/$(DOC) + ln $(win32distfiles) $(distdir)/$(WIN32) + ln $(samplefiles) $(distdir)/sample + tar chf - $(distdir) | gzip > onigd$(dist_tag).tar.gz + rm -rf $(distdir) diff --git a/ext/mbstring/oniguruma/config.h b/ext/mbstring/oniguruma/config.h new file mode 100644 index 0000000000..9e37608fdc --- /dev/null +++ b/ext/mbstring/oniguruma/config.h @@ -0,0 +1,67 @@ +/* ext/mbstring/oniguruma/config.h. Generated by configure. */ +/* config.h.in. Generated automatically from configure.in by autoheader. */ + +/* Define if using alloca.c. */ +/* #undef C_ALLOCA */ + +/* Define to empty if the keyword does not work. */ +/* #undef const */ + +/* Define to one of _getb67, GETB67, getb67 for Cray-2 and Cray-YMP systems. + This function is required for alloca.c support on those systems. */ +/* #undef CRAY_STACKSEG_END */ + +/* Define if you have alloca, as a function or macro. 
*/ +#define HAVE_ALLOCA 1 + +/* Define if you have <alloca.h> and it should be used (not on Ultrix). */ +#define HAVE_ALLOCA_H 1 + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at run-time. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown + */ +/* #undef STACK_DIRECTION */ + +/* Define if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define if you can safely include both <sys/time.h> and <time.h>. */ +/* #undef TIME_WITH_SYS_TIME */ + +/* The number of bytes in a int. */ +#define SIZEOF_INT 4 + +/* The number of bytes in a long. */ +#define SIZEOF_LONG 4 + +/* The number of bytes in a short. */ +/* #undef SIZEOF_SHORT */ + +/* Define if you have the <stdlib.h> header file. */ +#define HAVE_STDLIB_H 1 + +/* Define if you have the <string.h> header file. */ +#define HAVE_STRING_H 1 + +/* Define if you have the <strings.h> header file. */ +/* #undef HAVE_STRINGS_H */ + +/* Define if you have the <sys/time.h> header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define if you have the <sys/times.h> header file. */ +/* #undef HAVE_SYS_TIMES_H */ + +/* Define if you have the <unistd.h> header file. */ +#define HAVE_UNISTD_H 1 + +/* Define if you have the function argument prototype */ +/* #undef HAVE_PROTOTYPES */ + +/* Define if you have the variable length prototypes and stdarg.h */ +#define HAVE_STDARG_PROTOTYPES 1 + diff --git a/ext/mbstring/oniguruma/config.h.in b/ext/mbstring/oniguruma/config.h.in new file mode 100644 index 0000000000..1a59a45dc0 --- /dev/null +++ b/ext/mbstring/oniguruma/config.h.in @@ -0,0 +1,66 @@ +/* config.h.in. Generated automatically from configure.in by autoheader. */ + +/* Define if using alloca.c. */ +#undef C_ALLOCA + +/* Define to empty if the keyword does not work. 
*/ +#undef const + +/* Define to one of _getb67, GETB67, getb67 for Cray-2 and Cray-YMP systems. + This function is required for alloca.c support on those systems. */ +#undef CRAY_STACKSEG_END + +/* Define if you have alloca, as a function or macro. */ +#undef HAVE_ALLOCA + +/* Define if you have <alloca.h> and it should be used (not on Ultrix). */ +#undef HAVE_ALLOCA_H + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at run-time. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown + */ +#undef STACK_DIRECTION + +/* Define if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Define if you can safely include both <sys/time.h> and <time.h>. */ +#undef TIME_WITH_SYS_TIME + +/* The number of bytes in a int. */ +#undef SIZEOF_INT + +/* The number of bytes in a long. */ +#undef SIZEOF_LONG + +/* The number of bytes in a short. */ +#undef SIZEOF_SHORT + +/* Define if you have the <stdlib.h> header file. */ +#undef HAVE_STDLIB_H + +/* Define if you have the <string.h> header file. */ +#undef HAVE_STRING_H + +/* Define if you have the <strings.h> header file. */ +#undef HAVE_STRINGS_H + +/* Define if you have the <sys/time.h> header file. */ +#undef HAVE_SYS_TIME_H + +/* Define if you have the <sys/times.h> header file. */ +#undef HAVE_SYS_TIMES_H + +/* Define if you have the <unistd.h> header file. 
*/ +#undef HAVE_UNISTD_H + +/* Define if you have the function argument prototype */ +#undef HAVE_PROTOTYPES + +/* Define if you have the variable length prototypes and stdarg.h */ +#undef HAVE_STDARG_PROTOTYPES + diff --git a/ext/mbstring/oniguruma/configure.in b/ext/mbstring/oniguruma/configure.in new file mode 100644 index 0000000000..84af3fbdb8 --- /dev/null +++ b/ext/mbstring/oniguruma/configure.in @@ -0,0 +1,70 @@ +dnl Process this file with autoconf to produce a configure script. +AC_INIT(regex.c) + +AC_CONFIG_HEADER(config.h) + +dnl default value for RUBYDIR +RUBYDIR=".." +AC_ARG_WITH(rubydir, + [ --with-rubydir=RUBYDIR specify value for RUBYDIR (default ..)], + [ RUBYDIR=$withval ]) +AC_SUBST(RUBYDIR) + +dnl default value for STATISTICS +STATISTICS="" +AC_ARG_WITH(statistics, + [ --with-statistics take matching time statistical data], + [ STATISTICS=-DREG_DEBUG_STATISTICS ]) +AC_SUBST(STATISTICS) + +dnl Checks for programs. +AC_PROG_CC +AC_PROG_RANLIB +dnl AC_PROG_INSTALL + +dnl Checks for libraries. + +dnl Checks for header files. +AC_HEADER_STDC +AC_CHECK_HEADERS(stdlib.h string.h strings.h sys/time.h unistd.h sys/times.h) + +dnl Checks for typedefs, structures, and compiler characteristics. +AC_CHECK_SIZEOF(int, 4) +AC_CHECK_SIZEOF(short, 2) +AC_CHECK_SIZEOF(long, 4) +AC_C_CONST +AC_HEADER_TIME + +dnl Checks for library functions. +AC_FUNC_ALLOCA +AC_FUNC_MEMCMP + +AC_CACHE_CHECK(for prototypes, cv_have_prototypes, + [AC_TRY_COMPILE([int foo(int x) { return 0; }], [return foo(10);], + cv_have_prototypes=yes, + cv_have_prototypes=no)]) +if test "$cv_have_prototypes" = yes; then + AC_DEFINE(HAVE_PROTOTYPES) +fi + +AC_CACHE_CHECK(for variable length prototypes and stdarg.h, cv_stdarg, + [AC_TRY_COMPILE([ +#include <stdarg.h> +int foo(int x, ...) 
{ + va_list va; + va_start(va, x); + va_arg(va, int); + va_arg(va, char *); + va_arg(va, double); + return 0; +} +], [return foo(10, "", 3.14);], + cv_stdarg=yes, + cv_stdarg=no)]) +if test "$cv_stdarg" = yes; then + AC_DEFINE(HAVE_STDARG_PROTOTYPES) +fi + +AC_SUBST() + +AC_OUTPUT(Makefile) diff --git a/ext/mbstring/oniguruma/doc/API b/ext/mbstring/oniguruma/doc/API new file mode 100644 index 0000000000..96f53ae9b8 --- /dev/null +++ b/ext/mbstring/oniguruma/doc/API @@ -0,0 +1,279 @@ +Oniguruma API 2003/07/04 + +declared in regex.h. + + +# int regex_init(void) + + Initialize library. + + You don't have to call it explicitly, because it is called in regex_new(). + + +# int regex_error_code_to_str(UChar* err_buf, int err_code, ...) + + Return error message string length. + + arguments + 1 err_buf: error message buffer. + (required size: REG_MAX_ERROR_MESSAGE_LEN) + 2 err_code: error code returned from other API functions. + 3 err_info (optional): error info returned from regex_new() + and regex_recompile(). + + +# int regex_new(regex_t** reg, UChar* pattern, UChar* pattern_end, + RegOptionType option, RegCharEncoding code, RegSyntaxType* syntax, + RegErrorInfo* err_info) + + Create new regex object(regex_t). + + normal return: REG_NORMAL + + arguments + 1 reg: return regex object's address. + 2 pattern: regex pattern string. + 3 pattern_end: terminate address of pattern. (pattern + pattern length) + 4 option: compile time options. + + REG_OPTION_NONE no option + REG_OPTION_SINGLELINE '^' -> '\A', '$' -> '\z', '\Z' -> '\z' + REG_OPTION_MULTILINE '.' match with newline + REG_OPTION_IGNORECASE ignore case (case-insensitive) + REG_OPTION_EXTEND extended pattern form + REG_OPTION_FIND_LONGEST find longest match + REG_OPTION_FIND_NOT_EMPTY ignore empty match + REG_OPTION_NEGATE_SINGLELINE + clear REG_OPTION_SINGLELINE which is default on + in REG_SYNTAX_POSIX_XXX, REG_SYNTAX_PERL and REG_SYNTAX_JAVA. + REG_OPTION_CAPTURE_ONLY_NAMED_GROUP named group only captured. 
+ + 5 code: character encoding. + + REGCODE_ASCII ASCII + REGCODE_UTF8 UTF-8 + REGCODE_EUCJP EUC-JP + REGCODE_SJIS Shift_JIS + REGCODE_DEFAULT ASCII + + 6 syntax: pointer to pattern syntax definition. + + REG_SYNTAX_POSIX_BASIC POSIX Basic RE + REG_SYNTAX_POSIX_EXTENDED POSIX Extended RE + REG_SYNTAX_EMACS Emacs + REG_SYNTAX_GREP grep + REG_SYNTAX_GNU_REGEX GNU regex + REG_SYNTAX_JAVA Java (Sun java.util.regex) + REG_SYNTAX_PERL Perl + REG_SYNTAX_RUBY Ruby + REG_SYNTAX_DEFAULT default (== Ruby) + regex_set_default_syntax() + + or any RegSyntaxType data pointer defined by user. + + 7 err_info: address for return optional error info. + use this value as 3rd argument of regex_error_code_to_str(). + + +# void regex_free(regex_t* reg) + + Free memory used by regex object. + + arguments + 1 reg: regex object. + + +# int regex_recompile(regex_t* reg, UChar* pattern, UChar* pattern_end, + RegOptionType option, RegCharEncoding code, RegSyntaxType* syntax, + RegErrorInfo* err_info) + + Recompile regex object. + + normal return: REG_NORMAL + + arguments + 1 reg: regex object. + + Another arguments are same with regex_new(). + + +# int regex_search(regex_t* reg, UChar* str, UChar* end, UChar* start, + UChar* range, RegRegion* region, RegOptionType option) + + Search string and return search result and matching region. + + normal return: match position offset (i.e. p - str >= 0) + not found: REG_MISMATCH (< 0) + + arguments + 1 reg: regex object + 2 str: target string + 3 end: terminate address of target string + 4 start: search start address of target string + 5 range: search terminate address of target string + 6 region: address for return group match range info (NULL is allowed) + 7 option: search time option + + REG_OPTION_NOTBOL string head(str) isn't considered as begin of line + REG_OPTION_NOTEOL string end (end) isn't considered as end of line + REG_OPTION_POSIX_REGION region argument is regmatch_t[] of POSIX API. 
+ + +# int regex_match(regex_t* reg, UChar* str, UChar* end, UChar* at, + RegRegion* region, RegOptionType option) + + Match string and return result and matching region. + + normal return: match length (i.e. p - at >= 0) + not match: REG_MISMATCH (< 0) + + arguments + 1 reg: regex object + 2 str: target string + 3 end: terminate address of target string + 4 at: match address of target string + 5 region: address for return group match range info (NULL is allowed) + 6 option: search time option + + REG_OPTION_NOTBOL string head(str) isn't considered as begin of line + REG_OPTION_NOTEOL string end (end) isn't considered as end of line + REG_OPTION_POSIX_REGION region argument is regmatch_t[] of POSIX API. + + +# RegRegion* regex_region_new(void) + + Create a region. + + +# void regex_region_free(RegRegion* region, int free_self) + + Free memory used by region. + + arguments + 1 region: target region + 2 free_self: [1: free all, 0: free memory used in region but not self] + + +# void regex_region_copy(RegRegion* to, RegRegion* from) + + Copy contents of region. + + arguments + 1 to: target region + 2 from: source region + + +# void regex_region_clear(RegRegion* region) + + Clear contents of region. + + arguments + 1 region: target region + + +# int regex_region_resize(RegRegion* region, int n) + + Resize group range area of region. + + normal return: REG_NORMAL + + arguments + 1 region: target region + 2 n: new size + + +# int regex_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, + int** num_list) + + Return group number list of name. + Named subexp is defined by (?<name>....). + + normal return: number of groups for the name. + (ex. /(?<x>..)...(?<x>..)/ ==> 2) + name not found: -1 + + arguments + 1 reg: regex object. + 2 name: subexp-name. + 3 name_end: terminate address of subexp-name. + 4 num_list: return list of group number. 
+ + +# int regex_foreach_names(regex_t* reg, int (*func)(UChar*,int,int*,void*), + void* arg) + + Iterate function call for all names. + + normal return: 0 + error: func's return value. + + arguments + 1 reg: regex object. + 2 func: called function. + func(name, <number of groups>, <group number's list>, arg); + if func return non 0 value, iteration is stopped. + 3 arg: argument for func. + + +# UChar* regex_get_prev_char_head(RegCharEncoding code, UChar* start, UChar* s) + + Return previous character head address. + + arguments + 1 code: character encoding + 2 start: string address + 3 s: target address of string + + +# UChar* regex_get_left_adjust_char_head(RegCharEncoding code, + UChar* start, UChar* s) + + Return left-adjusted head address of a character. + + arguments + 1 code: character encoding + 2 start: string address + 3 s: target address of string + + +# UChar* regex_get_right_adjust_char_head(RegCharEncoding code, + UChar* start, UChar* s) + + Return right-adjusted head address of a character. + + arguments + 1 code: character encoding + 2 start: string address + 3 s: target address of string + + +# int regex_set_default_syntax(RegSyntaxType* syntax) + + Set default syntax. + + arguments + 1 syntax: pointer to pattern syntax definition. + + +# void regex_set_default_trans_table(UChar* table) + + Set default case transformation table. + + arguments + 1 table: case transformation table + + (* this function will be obsoleted in future version) + + +# int regex_end(void) + + The use of this library is finished. + + normal return: REG_NORMAL + + +# const char* regex_version(void) + + Return version string. (ex. "1.8.6") + +// END diff --git a/ext/mbstring/oniguruma/doc/RE b/ext/mbstring/oniguruma/doc/RE new file mode 100644 index 0000000000..3527b4556f --- /dev/null +++ b/ext/mbstring/oniguruma/doc/RE @@ -0,0 +1,224 @@ +Oniguruma Regular Expressions 2003/07/04 + +syntax: REG_SYNTAX_RUBY (default) + + +1. Syntax elements + + \ escape + | alternation + (...) 
group + [...] character class + + +2. Characters + + \t horizontal tab (0x09) + \v vertical tab (0x0B) + \n newline (0x0A) + \r return (0x0D) + \b back space (0x08) (* in character class only) + \f form feed (0x0C) + \a bell (0x07) + \e escape (0x1B) + \nnn octal char + \xHH hexadecimal char + \x{7HHHHHHH} wide hexadecimal char + \cx control char + \C-x control char + \M-x meta (x|0x80) + \M-\C-x meta control char + + +3. Character types + + . any character (except newline) + \w word character (alphanumeric, "_" and multibyte char) + \W non-word char + \s whitespace char (\t, \n, \v, \f, \r, \x20) + \S non-whitespace char + \d digit char + \D non-digit char + + +4. Quantifier + + greedy + + ? 1 or 0 times + * 0 or more times + + 1 or more times + {n,m} at least n but not more than m times + {n,} at least n times + {n} n times + + reluctant + + ?? 1 or 0 times + *? 0 or more times + +? 1 or more times + {n,m}? at least n but not more than m times + {n,}? at least n times + + possessive (greedy and does not backtrack after repeated) + + ?+ 1 or 0 times + *+ 0 or more times + ++ 1 or more times + + +5. Anchors + + ^ beginning of the line + $ end of the line + \b word boundary + \B not word boundary + \A beginning of string + \Z end of string, or before newline at the end + \z end of string + \G previous end-of-match position + + +6. POSIX character class ([:xxxxx:], negate [:^xxxxx:]) + + alnum alphabet or digit char + alpha alphabet + ascii code value: [0 - 127] + blank \t, \x20 + cntrl + digit 0-9 + graph + lower + print + punct + space \t, \n, \v, \f, \r, \x20 + upper + xdigit 0-9, a-f, A-F + + +7. Operators in character class + + [...] group (character class in character class) + && intersection + (lowest precedence operator in character class) + + ex. [a-w&&[^c-g]z] ==> ([a-w] and ([^c-g] or z)) ==> [abh-w] + + +8. Extended expressions + + (?#...) comment + (?imx-imx) option on/off + i: ignore case + m: multi-line (dot(.) 
match newline) + x: extended form + (?imx-imx:subexp) option on/off for subexp + (?:subexp) not captured + (?=subexp) look-ahead + (?!subexp) negative look-ahead + (?<=subexp) look-behind + (?<!subexp) negative look-behind + + Subexp of look-behind must be fixed character length. + But different character length is allowed in top level + alternatives only. + ex. (?<=a|bc) is OK. (?<=aaa(?:b|cd)) is not allowed. + + (?>subexp) don't backtrack + (?<name>subexp) define named group + (name can not include '>', ')', '\' and NUL character) + + +9. Back reference + + \n back reference by group number (n >= 1) + \k<name> back reference by group name + + +10. Subexp call ("Tanaka Akira special") + + \g<name> call by group name + \g<n> call by group number (only if 'n' is not defined as name) + + +----------------------------- +11. Original extensions + + + named group (?<name>...) + + named backref \k<name> + + subexp call \g<name>, \g<group-num> + + +12. Lacked features compare with perl 5.8.0 + + + [:word:] + + \N{name} + + \l,\u,\L,\U, \P, \X, \C + + (?{code}) + + (??{code}) + + (?(condition)yes-pat|no-pat) + + + \Q...\E (* This is effective on REG_SYNTAX_PERL and REG_SYNTAX_JAVA) + + +13. Syntax depend options + + + REG_SYNTAX_RUBY (default) + (?m): dot(.) match newline + + + REG_SYNTAX_PERL, REG_SYNTAX_JAVA + (?s): dot(.) match newline + (?m): ^ match after newline, $ match before newline + + +14. Differences with Japanized GNU regex(version 0.12) of Ruby + + + add look behind + (?<=fixed-char-length-pattern), (?<!fixed-char-length-pattern) + (in negative-look-behind, capture group isn't allowed, + shy group(?:) is allowed.) + + add possessive quantifier. ?+, *+, ++ + + add operations in character class. [], && + + add named group and subexp call. + + octal or hexadecimal number sequence can be treated as + a multibyte code char in char-class, if multibyte encoding is specified. + (ex. [\xa1\xa2], [\xa1\xa7-\xa4\xa1]) + + effect range of isolated option is to next ')'. 
+ ex. (?:(?i)a|b) is interpreted as (?:(?i:a|b)), not (?:(?i:a)|b). + isolated option is not transparent to previous pattern. + ex. a(?i)* is a syntax error pattern. + an incomplete left brace is allowed as a usual char. + ex. /{/, /({)/, /a{2,3/ etc... + negative POSIX bracket [:^xxxx:] is supported. + POSIX bracket [:ascii:] is added. + repeat of look-ahead is not allowed. + ex. /(?=a)*/, /(?!b){5}/ + + +15. Problems + + + Invalid first byte in UTF-8 is allowed. + (which is the same as GNU regex of Ruby) + + /./u =~ "\xa3" + + Of course, it is possible to validate it, + but matching would become slower than now. + + + Zero-length match in infinite repeat stops the repeat, + and captured group status isn't checked as stop condition. + + /()*\1/ =~ "" #=> match + /(?:()|())*\1\2/ =~ "" #=> fail + + /(?:\1a|())*/ =~ "a" #=> match with "" + + + Ignore case option has no effect on an octal or hexadecimal + numbered char, but it becomes effective if it appears in the char class. + This is inconsistent, but it is the same specification + as GNU regex of Ruby. 
+ + /\x61/i.match("A") # => nil + /[\x61]/i.match("A") # => match + +// END diff --git a/ext/mbstring/oniguruma/onigposix.h b/ext/mbstring/oniguruma/onigposix.h new file mode 100644 index 0000000000..ea93c6f9f7 --- /dev/null +++ b/ext/mbstring/oniguruma/onigposix.h @@ -0,0 +1,135 @@ +/********************************************************************** + + onigposix.h - Oniguruma (regular expression library) + + Copyright (C) 2003 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#ifndef ONIGPOSIX_H +#define ONIGPOSIX_H + +/* options */ +#define REG_ICASE (1<<0) +#define REG_NEWLINE (1<<1) +#define REG_NOTBOL (1<<2) +#define REG_NOTEOL (1<<3) +#define REG_EXTENDED (1<<4) /* if not setted, Basic Regular Expression */ +#define REG_NOSUB (1<<5) + +/* POSIX error codes */ +#define REG_NOMATCH 1 +#define REG_BADPAT 2 +#define REG_ECOLLATE 3 +#define REG_ECTYPE 4 +#define REG_EESCAPE 5 +#define REG_ESUBREG 6 +#define REG_EBRACK 7 +#define REG_EPAREN 8 +#define REG_EBRACE 9 +#define REG_BADBR 10 +#define REG_ERANGE 11 +#define REG_ESPACE 12 +#define REG_BADRPT 13 + +/* extended error codes */ +#define REG_EONIG_INTERNAL 14 +#define REG_EONIG_BADWC 15 +#define REG_EONIG_BADARG 16 +#define REG_EONIG_THREAD 17 + +/* character encodings (for reg_set_encoding()) */ +/* These value must be same with MBCTYPE_XXXX in oniguruma.h.*/ +#define REG_ENCODING_ASCII 0 +#define REG_ENCODING_EUC_JP 1 +#define REG_ENCODING_SJIS 2 +#define REG_ENCODING_UTF8 3 + +#include <stdlib.h> + + +typedef int regoff_t; + +typedef struct { + regoff_t rm_so; + regoff_t rm_eo; +} regmatch_t; + +/* POSIX regex_t */ +typedef struct { + void* onig; /* Oniguruma regex_t* */ + size_t re_nsub; + int comp_options; +} regex_t; + + +#ifndef P_ +#ifdef __STDC__ +# define P_(args) args +#else +# define P_(args) () +#endif +#endif + +#ifndef REG_EXTERN +#if defined(_WIN32) && !defined(__CYGWIN__) +#if defined(EXPORT) || defined(RUBY_EXPORT) +#define 
REG_EXTERN extern __declspec(dllexport) +#elif defined(IMPORT) +#define REG_EXTERN extern __declspec(dllimport) +#endif +#endif +#endif + +#ifndef REG_EXTERN +#define REG_EXTERN extern +#endif + +#ifndef ONIGURUMA_H +typedef unsigned int RegOptionType; + +/* syntax */ +typedef struct { + unsigned int op; + unsigned int op2; + unsigned int behavior; + RegOptionType options; /* default option */ +} RegSyntaxType; + +REG_EXTERN RegSyntaxType RegSyntaxPosixBasic; +REG_EXTERN RegSyntaxType RegSyntaxPosixExtended; +REG_EXTERN RegSyntaxType RegSyntaxEmacs; +REG_EXTERN RegSyntaxType RegSyntaxGrep; +REG_EXTERN RegSyntaxType RegSyntaxGnuRegex; +REG_EXTERN RegSyntaxType RegSyntaxJava; +REG_EXTERN RegSyntaxType RegSyntaxPerl; +REG_EXTERN RegSyntaxType RegSyntaxRuby; + +/* predefined syntaxes (see regcomp.c) */ +#define REG_SYNTAX_POSIX_BASIC (&RegSyntaxPosixBasic) +#define REG_SYNTAX_POSIX_EXTENDED (&RegSyntaxPosixExtended) +#define REG_SYNTAX_EMACS (&RegSyntaxEmacs) +#define REG_SYNTAX_GREP (&RegSyntaxGrep) +#define REG_SYNTAX_GNU_REGEX (&RegSyntaxGnuRegex) +#define REG_SYNTAX_JAVA (&RegSyntaxJava) +#define REG_SYNTAX_PERL (&RegSyntaxPerl) +#define REG_SYNTAX_RUBY (&RegSyntaxRuby) +/* default syntax */ +#define REG_SYNTAX_DEFAULT RegDefaultSyntax + +REG_EXTERN RegSyntaxType* RegDefaultSyntax; + +REG_EXTERN int regex_set_default_syntax P_((RegSyntaxType* syntax)); +#endif /* ONIGURUMA_H */ + + +REG_EXTERN int regcomp P_((regex_t* reg, const char* pat, int options)); +REG_EXTERN int regexec P_((regex_t* reg, const char* str, size_t nmatch, regmatch_t* matches, int options)); +REG_EXTERN void regfree P_((regex_t* reg)); +REG_EXTERN size_t regerror P_((int code, const regex_t* reg, char* buf, size_t size)); + +/* extended API */ +REG_EXTERN void reg_set_encoding P_((int enc)); +REG_EXTERN int reg_name_to_group_numbers P_((regex_t* reg, unsigned char* name, unsigned char* name_end, int** nums)); +REG_EXTERN int reg_foreach_name P_((regex_t* reg, int (*func)(unsigned 
char*,int,int*,void*), void* arg)); + +#endif /* ONIGPOSIX_H */ diff --git a/ext/mbstring/oniguruma/oniguruma.h b/ext/mbstring/oniguruma/oniguruma.h new file mode 100644 index 0000000000..e5236a80a7 --- /dev/null +++ b/ext/mbstring/oniguruma/oniguruma.h @@ -0,0 +1,447 @@ +/********************************************************************** + + oniguruma.h - Oniguruma (regular expression library) + + Copyright (C) 2002-2003 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#ifndef ONIGURUMA_H +#define ONIGURUMA_H + +#include "php_compat.h" + +#define ONIGURUMA +#define ONIGURUMA_VERSION_MAJOR 1 +#define ONIGURUMA_VERSION_MINOR 9 +#define ONIGURUMA_VERSION_TEENY 1 + +/* config parameters */ +#ifndef RE_NREGS +#define RE_NREGS 10 +#endif +#define REG_NREGION RE_NREGS +#define REG_MAX_BACKREF_NUM 1000 +#define REG_MAX_REPEAT_NUM 100000 +#define REG_MAX_MULTI_BYTE_RANGES_NUM 1000 +/* constants */ +#define REG_MAX_ERROR_MESSAGE_LEN 90 + +#ifndef P_ +#ifdef __STDC__ +# define P_(args) args +#else +# define P_(args) () +#endif +#endif + +#ifndef PV_ +#ifdef HAVE_STDARG_PROTOTYPES +# define PV_(args) args +#else +# define PV_(args) () +#endif +#endif + +#ifndef REG_EXTERN +#if defined(_WIN32) && !defined(__CYGWIN__) +#if defined(EXPORT) || defined(RUBY_EXPORT) +#define REG_EXTERN extern __declspec(dllexport) +#elif defined(IMPORT) +#define REG_EXTERN extern __declspec(dllimport) +#endif +#endif +#endif + +#ifndef REG_EXTERN +#define REG_EXTERN extern +#endif + +#define REG_CHAR_TABLE_SIZE 256 + +#define REGCODE_UNDEF ((RegCharEncoding )0) + +#if defined(RUBY_PLATFORM) && defined(M17N_H) +#define REG_RUBY_M17N +typedef m17n_encoding* RegCharEncoding; +#define REGCODE_DEFAULT REGCODE_UNDEF +#else +typedef const char* RegCharEncoding; +#define MBCTYPE_ASCII 0 +#define MBCTYPE_EUC 1 +#define MBCTYPE_SJIS 2 +#define MBCTYPE_UTF8 3 + +#define REGCODE_ASCII REG_MBLEN_TABLE[MBCTYPE_ASCII] +#define REGCODE_UTF8 
REG_MBLEN_TABLE[MBCTYPE_UTF8] +#define REGCODE_EUCJP REG_MBLEN_TABLE[MBCTYPE_EUC] +#define REGCODE_SJIS REG_MBLEN_TABLE[MBCTYPE_SJIS] +#define REGCODE_DEFAULT REGCODE_ASCII + +REG_EXTERN const char REG_MBLEN_TABLE[][REG_CHAR_TABLE_SIZE]; +#endif /* else RUBY && M17N */ + +REG_EXTERN RegCharEncoding RegDefaultCharEncoding; + +#if defined(RUBY_PLATFORM) && !defined(M17N_H) +#undef ismbchar +#define ismbchar(c) (mbclen((c)) != 1) +#define mbclen(c) RegDefaultCharEncoding[(unsigned char )(c)] +#endif + +typedef unsigned int RegOptionType; +typedef unsigned char* RegTransTableType; +typedef unsigned int RegDistance; +typedef unsigned char UChar; + +#define REG_OPTION_DEFAULT REG_OPTION_NONE + +/* GNU regex options */ +#define RE_OPTION_IGNORECASE (1L) +#define RE_OPTION_EXTENDED (RE_OPTION_IGNORECASE << 1) +#define RE_OPTION_MULTILINE (RE_OPTION_EXTENDED << 1) +#define RE_OPTION_SINGLELINE (RE_OPTION_MULTILINE << 1) +#define RE_OPTION_POSIXLINE (RE_OPTION_MULTILINE|RE_OPTION_SINGLELINE) +#define RE_OPTION_LONGEST (RE_OPTION_SINGLELINE << 1) + +/* options */ +#define REG_OPTION_NONE 0 +#define REG_OPTION_SINGLELINE RE_OPTION_SINGLELINE +#define REG_OPTION_MULTILINE RE_OPTION_MULTILINE +#define REG_OPTION_IGNORECASE RE_OPTION_IGNORECASE +#define REG_OPTION_EXTEND RE_OPTION_EXTENDED +#define REG_OPTION_FIND_LONGEST RE_OPTION_LONGEST +#define REG_OPTION_FIND_NOT_EMPTY (REG_OPTION_FIND_LONGEST << 1) +#define REG_OPTION_NEGATE_SINGLELINE (REG_OPTION_FIND_NOT_EMPTY << 1) +#define REG_OPTION_CAPTURE_ONLY_NAMED_GROUP (REG_OPTION_NEGATE_SINGLELINE << 1) +/* options (search time) */ +#define REG_OPTION_NOTBOL (REG_OPTION_CAPTURE_ONLY_NAMED_GROUP << 1) +#define REG_OPTION_NOTEOL (REG_OPTION_NOTBOL << 1) +#define REG_OPTION_POSIX_REGION (REG_OPTION_NOTEOL << 1) + +#define REG_OPTION_ON(options,regopt) ((options) |= (regopt)) +#define REG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) +#define IS_REG_OPTION_ON(options,option) ((options) & (option)) + +/* syntax */ +typedef 
struct { + unsigned int op; + unsigned int op2; + unsigned int behavior; + RegOptionType options; /* default option */ +} RegSyntaxType; + +REG_EXTERN RegSyntaxType RegSyntaxPosixBasic; +REG_EXTERN RegSyntaxType RegSyntaxPosixExtended; +REG_EXTERN RegSyntaxType RegSyntaxEmacs; +REG_EXTERN RegSyntaxType RegSyntaxGrep; +REG_EXTERN RegSyntaxType RegSyntaxGnuRegex; +REG_EXTERN RegSyntaxType RegSyntaxJava; +REG_EXTERN RegSyntaxType RegSyntaxPerl; +REG_EXTERN RegSyntaxType RegSyntaxRuby; + +/* predefined syntaxes (see regcomp.c) */ +#define REG_SYNTAX_POSIX_BASIC (&RegSyntaxPosixBasic) +#define REG_SYNTAX_POSIX_EXTENDED (&RegSyntaxPosixExtended) +#define REG_SYNTAX_EMACS (&RegSyntaxEmacs) +#define REG_SYNTAX_GREP (&RegSyntaxGrep) +#define REG_SYNTAX_GNU_REGEX (&RegSyntaxGnuRegex) +#define REG_SYNTAX_JAVA (&RegSyntaxJava) +#define REG_SYNTAX_PERL (&RegSyntaxPerl) +#define REG_SYNTAX_RUBY (&RegSyntaxRuby) + +/* default syntax */ +#define REG_SYNTAX_DEFAULT RegDefaultSyntax + +REG_EXTERN RegSyntaxType* RegDefaultSyntax; + +/* syntax (operators) */ +#define REG_SYN_OP_ANYCHAR 1 /* . */ +#define REG_SYN_OP_0INF (1<<1) /* * */ +#define REG_SYN_OP_ESC_0INF (1<<2) +#define REG_SYN_OP_1INF (1<<3) /* + */ +#define REG_SYN_OP_ESC_1INF (1<<4) +#define REG_SYN_OP_01 (1<<5) /* ? */ +#define REG_SYN_OP_ESC_01 (1<<6) +#define REG_SYN_OP_INTERVAL (1<<7) /* {lower,upper} */ +#define REG_SYN_OP_ESC_INTERVAL (1<<8) +#define REG_SYN_OP_ALT (1<<9) /* | */ +#define REG_SYN_OP_ESC_ALT (1<<10) +#define REG_SYN_OP_SUBEXP (1<<11) /* (...) */ +#define REG_SYN_OP_ESC_SUBEXP (1<<12) +#define REG_SYN_OP_ESC_BUF_ANCHOR (1<<13) /* \A, \Z, \z */ +#define REG_SYN_OP_ESC_GNU_BUF_ANCHOR (1<<14) /* \`, \' */ +#define REG_SYN_OP_BACK_REF (1<<15) /* \num */ +#define REG_SYN_OP_CC (1<<16) /* [...] */ +#define REG_SYN_OP_ESC_WORD (1<<17) /* \w, \W */ +#define REG_SYN_OP_ESC_WORD_BEGIN_END (1<<18) /* \<. 
\> */ +#define REG_SYN_OP_ESC_WORD_BOUND (1<<19) /* \b, \B */ +#define REG_SYN_OP_ESC_WHITE_SPACE (1<<20) /* \s, \S */ +#define REG_SYN_OP_ESC_DIGIT (1<<21) /* \d, \D */ +#define REG_SYN_OP_LINE_ANCHOR (1<<22) /* ^, $ */ +#define REG_SYN_OP_POSIX_BRACKET (1<<23) /* [:xxxx:] */ +#define REG_SYN_OP_NON_GREEDY (1<<24) /* ??,*?,+?,{n,m}? */ +#define REG_SYN_OP_ESC_CONTROL_CHAR (1<<25) /* \n,\r,\t,\a ... */ +#define REG_SYN_OP_ESC_C_CONTROL (1<<26) /* \cx */ +#define REG_SYN_OP_ESC_OCTAL3 (1<<27) /* \OOO */ +#define REG_SYN_OP_ESC_X_HEX2 (1<<28) /* \xHH */ +#define REG_SYN_OP_ESC_X_BRACE_HEX8 (1<<29) /* \x{7HHHHHHH} */ +#define REG_SYN_OP_SUBEXP_EFFECT (1<<30) /* (?...) */ +#define REG_SYN_OP_QUOTE (1<<31) /* \Q...\E */ + +#define REG_SYN_OP2_OPTION_PERL (1<<0) /* (?imsx), (?-imsx) */ +#define REG_SYN_OP2_OPTION_RUBY (1<<1) /* (?imx), (?-imx) */ +#define REG_SYN_OP2_POSSESSIVE_REPEAT (1<<2) /* ?+,*+,++ */ +#define REG_SYN_OP2_POSSESSIVE_INTERVAL (1<<3) /* {n,m}+ */ +#define REG_SYN_OP2_CCLASS_SET (1<<4) /* [...&&..[..].] */ +#define REG_SYN_OP2_NAMED_SUBEXP (1<<5) /*(?<name>.),\k<name>*/ +#define REG_SYN_OP2_SUBEXP_CALL (1<<6) /* \g<name> */ +#define REG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (1<<7) /* \C-x */ +#define REG_SYN_OP2_ESC_M_BAR_META (1<<8) /* \M-x */ +#define REG_SYN_OP2_ESC_V_VTAB (1<<9) /* \v as VTAB */ +#define REG_SYN_OP2_ESC_U_HEX4 (1<<10) /* \uHHHH */ + +/* syntax (behavior) */ +#define REG_SYN_CONTEXT_INDEP_ANCHORS (1<<0) /* not implemented */ +#define REG_SYN_CONTEXT_INDEP_OPS (1<<1) /* ?, *, +, {n,m} */ +#define REG_SYN_CONTEXT_INVALID_OPS (1<<2) /* error or ignore */ +#define REG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (1<<3) /* ...)... */ +#define REG_SYN_ALLOW_INVALID_INTERVAL (1<<4) /* {??? */ +#define REG_SYN_STRICT_CHECK_BACKREF (1<<5) /* /(\1)/,/\1()/ etc.*/ +#define REG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1<<6) /* (?<=a|bc) */ + +/* syntax in char class [...] 
*/ +#define REG_SYN_WARN_FOR_CC_OP_NOT_ESCAPED (1<<10) /* [,-,] */ +#define REG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1<<11) +#define REG_SYN_ESCAPE_IN_CC (1<<12) /* [...\w..] etc.. */ +#define REG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1<<13) +#define REG_SYN_ALLOW_RANGE_OP_IN_CC (1<<14) /* [0-9-a] */ + + +/* error codes */ +#define REG_IS_PATTERN_ERROR(ecode) ((ecode) <= -100 && (ecode) > -300) +/* normal return */ +#define REG_NORMAL 0 +#define REG_MISMATCH -1 +#define REG_NO_SUPPORT_CONFIG -2 +/* internal error */ +#define REGERR_MEMORY -5 +#define REGERR_MATCH_STACK_LIMIT_OVER -6 +#define REGERR_TYPE_BUG -10 +#define REGERR_PARSER_BUG -11 +#define REGERR_STACK_BUG -12 +#define REGERR_UNDEFINED_BYTECODE -13 +#define REGERR_UNEXPECTED_BYTECODE -14 +#define REGERR_DEFAULT_ENCODING_IS_NOT_SETTED -21 +#define REGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR -22 +/* syntax error */ +#define REGERR_END_PATTERN_AT_LEFT_BRACE -100 +#define REGERR_END_PATTERN_AT_LEFT_BRACKET -101 +#define REGERR_EMPTY_CHAR_CLASS -102 +#define REGERR_PREMATURE_END_OF_CHAR_CLASS -103 +#define REGERR_END_PATTERN_AT_BACKSLASH -104 +#define REGERR_END_PATTERN_AT_META -105 +#define REGERR_END_PATTERN_AT_CONTROL -106 +#define REGERR_META_CODE_SYNTAX -108 +#define REGERR_CONTROL_CODE_SYNTAX -109 +#define REGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE -110 +#define REGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE -111 +#define REGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS -112 +#define REGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED -113 +#define REGERR_TARGET_OF_REPEAT_OPERATOR_INVALID -114 +#define REGERR_NESTED_REPEAT_OPERATOR -115 +#define REGERR_UNMATCHED_CLOSE_PARENTHESIS -116 +#define REGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS -117 +#define REGERR_END_PATTERN_IN_GROUP -118 +#define REGERR_UNDEFINED_GROUP_OPTION -119 +#define REGERR_INVALID_POSIX_BRACKET_TYPE -121 +#define REGERR_INVALID_LOOK_BEHIND_PATTERN -122 +#define REGERR_INVALID_REPEAT_RANGE_PATTERN -123 +/* values error (syntax error) */ +#define 
REGERR_TOO_BIG_NUMBER -200 +#define REGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE -201 +#define REGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE -202 +#define REGERR_EMPTY_RANGE_IN_CHAR_CLASS -203 +#define REGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE -204 +#define REGERR_TOO_MANY_MULTI_BYTE_RANGES -205 +#define REGERR_TOO_SHORT_MULTI_BYTE_STRING -206 +#define REGERR_TOO_BIG_BACKREF_NUMBER -207 +#define REGERR_INVALID_BACKREF -208 +#define REGERR_TOO_BIG_WIDE_CHAR_VALUE -209 +#define REGERR_TOO_LONG_WIDE_CHAR_VALUE -210 +#define REGERR_INVALID_WIDE_CHAR_VALUE -211 +#define REGERR_INVALID_SUBEXP_NAME -212 +#define REGERR_UNDEFINED_NAME_REFERENCE -213 +#define REGERR_UNDEFINED_GROUP_REFERENCE -214 +#define REGERR_MULTIPLEX_DEFINITION_NAME_CALL -215 +#define REGERR_NEVER_ENDING_RECURSION -216 +/* errors related to thread */ +#define REGERR_OVER_THREAD_PASS_LIMIT_COUNT -301 + + +/* match result region type */ +struct re_registers { + int allocated; + int num_regs; + int* beg; + int* end; +}; + +#define REG_REGION_NOTPOS -1 + +typedef struct re_registers RegRegion; + +typedef struct { + UChar* par; + UChar* par_end; +} RegErrorInfo; + +typedef struct { + int lower; + int upper; +} RegRepeatRange; + +/* regex_t state */ +#define REG_STATE_NORMAL 0 +#define REG_STATE_SEARCHING 1 +#define REG_STATE_COMPILING -1 +#define REG_STATE_MODIFY -2 + +#define REG_STATE(regex) \ + ((regex)->state > 0 ? REG_STATE_SEARCHING : (regex)->state) + +typedef struct re_pattern_buffer { + /* common members in BBuf(bytes-buffer) type */ + unsigned char* p; /* compiled pattern */ + unsigned int used; /* used space for p */ + unsigned int alloc; /* allocated space for p */ + + int state; /* normal, searching, compiling */ + int num_mem; /* used memory(...) 
num counted from 1 */ + int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ + int num_null_check; /* OP_NULL_CHECK_START/END id counter */ + int num_call; /* number of subexp call */ + unsigned int backtrack_mem; + int stack_pop_level; + int repeat_range_alloc; + RegRepeatRange* repeat_range; + + RegCharEncoding enc; + RegOptionType options; + RegSyntaxType* syntax; + void* name_table; + + /* optimize info (string search and char-map and anchor) */ + int optimize; /* optimize flag */ + int threshold_len; /* search str-length for apply optimize */ + int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ + RegDistance anchor_dmin; /* (SEMI_)END_BUF anchor distance */ + RegDistance anchor_dmax; /* (SEMI_)END_BUF anchor distance */ + int sub_anchor; /* start-anchor for exact or map */ + unsigned char *exact; + unsigned char *exact_end; + unsigned char map[REG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ + int *int_map; /* BM skip for exact_len > 255 */ + int *int_map_backward; /* BM skip for backward search */ + RegDistance dmin; /* min-distance of exact or map */ + RegDistance dmax; /* max-distance of exact or map */ + + /* regex_t link chain */ + struct re_pattern_buffer* chain; /* escape compile-conflict on multi-thread */ +} regex_t; + +#ifdef RUBY_PLATFORM +#define re_mbcinit ruby_re_mbcinit +#define re_compile_pattern ruby_re_compile_pattern +#define re_recompile_pattern ruby_re_recompile_pattern +#define re_free_pattern ruby_re_free_pattern +#define re_adjust_startpos ruby_re_adjust_startpos +#define re_search ruby_re_search +#define re_match ruby_re_match +#define re_set_casetable ruby_re_set_casetable +#define re_copy_registers ruby_re_copy_registers +#define re_free_registers ruby_re_free_registers +#define register_info_type ruby_register_info_type +#define re_error_code_to_str ruby_error_code_to_str + +#define ruby_error_code_to_str regex_error_code_to_str +#define ruby_re_copy_registers regex_region_copy +#else +#define re_error_code_to_str 
regex_error_code_to_str +#define re_copy_registers regex_region_copy +#endif + +/* Oniguruma Native API */ +REG_EXTERN +int regex_init P_((void)); +REG_EXTERN +int regex_error_code_to_str PV_((UChar* s, int err_code, ...)); +REG_EXTERN +int regex_new P_((regex_t**, UChar* pattern, UChar* pattern_end, RegOptionType option, RegCharEncoding code, RegSyntaxType* syntax, RegErrorInfo* einfo)); +REG_EXTERN +void regex_free P_((regex_t*)); +REG_EXTERN +int regex_recompile P_((regex_t*, UChar* pattern, UChar* pattern_end, RegOptionType option, RegCharEncoding code, RegSyntaxType* syntax, RegErrorInfo* einfo)); +REG_EXTERN +int regex_search P_((regex_t*, UChar* str, UChar* end, UChar* start, UChar* range, RegRegion* region, RegOptionType option)); +REG_EXTERN +int regex_match P_((regex_t*, UChar* str, UChar* end, UChar* at, RegRegion* region, RegOptionType option)); +REG_EXTERN +RegRegion* regex_region_new P_((void)); +REG_EXTERN +void regex_region_free P_((RegRegion* region, int free_self)); +REG_EXTERN +void regex_region_copy P_((RegRegion* to, RegRegion* from)); +REG_EXTERN +void regex_region_clear P_((RegRegion* region)); +REG_EXTERN +int regex_region_resize P_((RegRegion* region, int n)); +REG_EXTERN +int regex_name_to_group_numbers P_((regex_t* reg, UChar* name, UChar* name_end, + int** nums)); +REG_EXTERN +int regex_foreach_name P_((regex_t* reg, int (*func)(UChar*,int,int*,void*), void* arg)); +REG_EXTERN +UChar* regex_get_prev_char_head P_((RegCharEncoding code, UChar* start, UChar* s)); +REG_EXTERN +UChar* regex_get_left_adjust_char_head P_((RegCharEncoding code, UChar* start, UChar* s)); +REG_EXTERN +UChar* regex_get_right_adjust_char_head P_((RegCharEncoding code, UChar* start, UChar* s)); +REG_EXTERN +void regex_set_default_trans_table P_((UChar* table)); +REG_EXTERN +int regex_set_default_syntax P_((RegSyntaxType* syntax)); +REG_EXTERN +int regex_end P_((void)); +REG_EXTERN +const char* regex_version P_((void)); + + +/* GNU regex API */ +#ifdef REG_RUBY_M17N 
+REG_EXTERN +void re_mbcinit P_((RegCharEncoding)); +#else +REG_EXTERN +void re_mbcinit P_((int)); +#endif + +REG_EXTERN +int re_compile_pattern P_((const char*, int, struct re_pattern_buffer*, char* err_buf)); +REG_EXTERN +int re_recompile_pattern P_((const char*, int, struct re_pattern_buffer*, char* err_buf)); +REG_EXTERN +void re_free_pattern P_((struct re_pattern_buffer*)); +REG_EXTERN +int re_adjust_startpos P_((struct re_pattern_buffer*, const char*, int, int, int)); +REG_EXTERN +int re_search P_((struct re_pattern_buffer*, const char*, int, int, int, struct re_registers*)); +REG_EXTERN +int re_match P_((struct re_pattern_buffer*, const char *, int, int, struct re_registers*)); +REG_EXTERN +void re_set_casetable P_((const char*)); +REG_EXTERN +void re_free_registers P_((struct re_registers*)); +REG_EXTERN +int re_alloc_pattern P_((struct re_pattern_buffer**)); /* added */ + +#endif /* ONIGURUMA_H */ diff --git a/ext/mbstring/oniguruma/php_compat.h b/ext/mbstring/oniguruma/php_compat.h new file mode 100644 index 0000000000..c261cc2bb0 --- /dev/null +++ b/ext/mbstring/oniguruma/php_compat.h @@ -0,0 +1,46 @@ +#ifndef _PHP_MBREGEX_COMPAT_H +#define _PHP_MBREGEX_COMPAT_H + +#define RegCharEncoding php_mb_reg_char_encoding +#define RegRegion php_mb_reg_region +#define RegDefaultCharEncoding php_mb_reg_default_char_encoding +#define REG_MBLEN_TABLE PHP_MBSTR_REG_MBLEN_TABLE +#define RegSyntaxType php_mb_reg_syntax_type +#define RegOptionType php_mb_reg_option_type +#define re_registers php_mb_re_registers +#define RegErrorInfo php_mb_reg_error_info +#define re_pattern_buffer php_mb_re_pattern_buffer +#define regex_t php_mb_regex_t +#define regex_init php_mb_regex_init +#define regex_new php_mb_regex_new +#define regex_free php_mb_regex_free +#define regex_recompile php_mb_regex_recompile +#define regex_search php_mb_regex_search +#define regex_match php_mb_regex_match +#define regex_region_new php_mb_regex_region_new +#define regex_region_free 
php_mb_regex_region_free +#define regex_region_copy php_mb_regex_region_copy +#define regex_region_clear php_mb_regex_region_clear +#define regex_region_resize php_mb_regex_region_resize +#define regex_name_to_group_numbers php_mb_regex_name_to_group_numbers +#define regex_foreach_name php_mb_regex_foreach_name /* singular: must match regex_foreach_name() as declared in oniguruma.h */ +#define regex_get_prev_char_head php_mb_regex_get_prev_char_head +#define regex_get_left_adjust_char_head php_mb_get_left_adjust_char_head +#define regex_get_right_adjust_char_head php_mb_get_right_adjust_char_head +#define regex_set_default_trans_table php_mb_get_default_trans_table +#define regex_set_default_syntax php_mb_regex_set_default_syntax +#define regex_end php_mb_regex_end +#define re_mbcinit php_mb_re_mbcinit +#define re_compile_pattern php_mb_re_compile_pattern +#define re_recompile_pattern php_mb_re_recompile_pattern +#define re_free_pattern php_mb_re_free_pattern +#define re_adjust_startpos php_mb_re_adjust_startpos +#define re_search php_mb_re_search +#define re_match php_mb_re_match +#define re_set_casetable php_mb_re_set_casetable +#define php_mbregex_region_copy php_mb_re_copy_registers +#define re_free_registers php_mb_re_free_registers +#define register_info_type php_mb_register_info_type +#define regex_error_code_to_str php_mb_regex_error_code_to_str + +#endif /* _PHP_MBREGEX_COMPAT_H */ diff --git a/ext/mbstring/oniguruma/re.c.168.patch b/ext/mbstring/oniguruma/re.c.168.patch new file mode 100644 index 0000000000..fd1c1bf5d8 --- /dev/null +++ b/ext/mbstring/oniguruma/re.c.168.patch @@ -0,0 +1,56 @@ +--- re.c.ruby_orig Tue Feb 4 15:52:29 2003 ++++ re.c Tue Mar 18 19:37:49 2003 +@@ -380,7 +380,8 @@ make_regexp(s, len, flag) + int len, flag; + { + Regexp *rp; +- char *err; ++ char err[REG_MAX_ERROR_MESSAGE_LEN]; ++ int r; + + /* Handle escaped characters first. */ + +@@ -389,16 +390,17 @@ make_regexp(s, len, flag) + from that.
+ */ + +- rp = ALLOC(Regexp); +- MEMZERO((char *)rp, Regexp, 1); +- rp->buffer = ALLOC_N(char, 16); +- rp->allocated = 16; +- rp->fastmap = ALLOC_N(char, 256); ++ r = re_alloc_pattern(&rp); ++ if (r) { ++ re_error_code_to_str(err, r); ++ rb_reg_raise(s, len, err, 0); ++ } ++ + if (flag) { + rp->options = flag; + } +- err = re_compile_pattern(s, len, rp); +- if (err != NULL) { ++ r = re_compile_pattern(s, len, rp, err); ++ if (r != 0) { + rb_reg_raise(s, len, err, 0); + } + +@@ -532,14 +534,14 @@ rb_reg_prepare_re(re) + } + + if (need_recompile) { +- char *err; ++ char err[REG_MAX_ERROR_MESSAGE_LEN]; ++ int r; + + if (FL_TEST(re, KCODE_FIXED)) + kcode_set_option(re); + rb_reg_check(re); +- RREGEXP(re)->ptr->fastmap_accurate = 0; +- err = re_compile_pattern(RREGEXP(re)->str, RREGEXP(re)->len, RREGEXP(re)->ptr); +- if (err != NULL) { ++ r = re_recompile_pattern(RREGEXP(re)->str, RREGEXP(re)->len, RREGEXP(re)->ptr, err); ++ if (r != 0) { + rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len, err, re); + } + } diff --git a/ext/mbstring/oniguruma/re.c.180.patch b/ext/mbstring/oniguruma/re.c.180.patch new file mode 100644 index 0000000000..08ef2397fb --- /dev/null +++ b/ext/mbstring/oniguruma/re.c.180.patch @@ -0,0 +1,66 @@ +--- re.c.ruby_orig Fri Feb 7 15:35:26 2003 ++++ re.c Tue Mar 18 18:51:21 2003 +@@ -444,7 +444,7 @@ rb_reg_to_s(re) + kcode_set_option(re); + rp = ALLOC(Regexp); + MEMZERO((char *)rp, Regexp, 1); +- err = re_compile_pattern(++ptr, len -= 2, rp) != 0; ++ err = (re_compile_pattern(++ptr, len -= 2, rp, NULL) != 0); + kcode_reset_option(); + re_free_pattern(rp); + } +@@ -538,7 +538,8 @@ make_regexp(s, len, flags) + int flags; + { + Regexp *rp; +- char *err; ++ char err[REG_MAX_ERROR_MESSAGE_LEN]; ++ int r; + + /* Handle escaped characters first. */ + +@@ -547,17 +548,18 @@ make_regexp(s, len, flags) + from that. 
+ */ + +- rp = ALLOC(Regexp); +- MEMZERO((char *)rp, Regexp, 1); +- rp->buffer = ALLOC_N(char, 16); +- rp->allocated = 16; +- rp->fastmap = ALLOC_N(char, 256); ++ r = re_alloc_pattern(&rp); ++ if (r) { ++ re_error_code_to_str((UChar* )err, r); ++ rb_reg_raise(s, len, err, 0); ++ } ++ + if (flags) { + rp->options = flags; + } +- err = re_compile_pattern(s, len, rp); ++ r = re_compile_pattern(s, len, rp, err); + +- if (err != NULL) { ++ if (r != 0) { + rb_reg_raise(s, len, err, 0); + } + return rp; +@@ -692,14 +694,14 @@ rb_reg_prepare_re(re) + } + + if (need_recompile) { +- char *err; ++ char err[REG_MAX_ERROR_MESSAGE_LEN]; ++ int r; + + if (FL_TEST(re, KCODE_FIXED)) + kcode_set_option(re); + rb_reg_check(re); +- RREGEXP(re)->ptr->fastmap_accurate = 0; +- err = re_compile_pattern(RREGEXP(re)->str, RREGEXP(re)->len, RREGEXP(re)->ptr); +- if (err != NULL) { ++ r = re_recompile_pattern(RREGEXP(re)->str, RREGEXP(re)->len, RREGEXP(re)->ptr, err); ++ if (r != 0) { + rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len, err, re); + } + } diff --git a/ext/mbstring/oniguruma/regcomp.c b/ext/mbstring/oniguruma/regcomp.c new file mode 100644 index 0000000000..fd8e56a7a7 --- /dev/null +++ b/ext/mbstring/oniguruma/regcomp.c @@ -0,0 +1,5282 @@ +/********************************************************************** + + regcomp.c - Oniguruma (regular expression library) + + Copyright (C) 2002-2003 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regparse.h" + +#ifndef UNALIGNED_WORD_ACCESS +static unsigned char PadBuf[WORD_ALIGNMENT_SIZE]; +#endif + +static void +swap_node(Node* a, Node* b) +{ + Node c; + c = *a; *a = *b; *b = c; +} + +static RegDistance +distance_add(RegDistance d1, RegDistance d2) +{ + if (d1 == INFINITE_DISTANCE || d2 == INFINITE_DISTANCE) + return INFINITE_DISTANCE; + else { + if (d1 <= INFINITE_DISTANCE - d2) return d1 + d2; + else return INFINITE_DISTANCE; + } +} + +static RegDistance 
+distance_multiply(RegDistance d, int m) +{ + if (m == 0) return 0; + + if (d < INFINITE_DISTANCE / m) + return d * m; + else + return INFINITE_DISTANCE; +} + +#if 0 +static RegDistance +distance_distance(RegDistance d1, RegDistance d2) +{ + if (d1 == INFINITE_DISTANCE || d2 == INFINITE_DISTANCE) + return INFINITE_DISTANCE; + + if (d1 > d2) return d1 - d2; + else return d2 - d1; +} +#endif + +RegCharEncoding RegDefaultCharEncoding = REGCODE_DEFAULT; +static UChar AmbiguityTable[REG_CHAR_TABLE_SIZE]; + +#define IS_AMBIGUITY_CHAR(enc, c) (AmbiguityTable[(c)] >= 2) + +#ifdef DEFAULT_TRANSTABLE_EXIST + +static UChar DTT[] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', + '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', 
'\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', +}; +#endif + +static int +bitset_is_empty(BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + if (bs[i] != 0) return 0; + } + return 1; +} + +#ifdef REG_DEBUG +static int +bitset_on_num(BitSetRef bs) +{ + int i, n; + + n = 0; + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (BITSET_AT(bs, i)) n++; + } + return n; +} +#endif + +extern int +regex_bbuf_init(BBuf* buf, int size) +{ + buf->p = (UChar* )xmalloc(size); + if (IS_NULL(buf->p)) return(REGERR_MEMORY); + + buf->alloc = size; + buf->used = 0; + return 0; +} + + +#ifdef USE_SUBEXP_CALL + +static int +unset_addr_list_init(UnsetAddrList* uslist, int size) +{ + UnsetAddr* p; + + p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size); + CHECK_NULL_RETURN_VAL(p, REGERR_MEMORY); + uslist->num = 0; + uslist->alloc = size; + uslist->us = p; + return 0; +} + +static void +unset_addr_list_end(UnsetAddrList* uslist) +{ + if (IS_NOT_NULL(uslist->us)) + xfree(uslist->us); +} + +static int +unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node) +{ + UnsetAddr* p; + int size; + + if (uslist->num >= uslist->alloc) { + size = uslist->alloc * 2; + p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size); + CHECK_NULL_RETURN_VAL(p, REGERR_MEMORY); + uslist->alloc = 
size; + uslist->us = p; + } + + uslist->us[uslist->num].offset = offset; + uslist->us[uslist->num].target = node; + uslist->num++; + return 0; +} +#endif /* USE_SUBEXP_CALL */ + + +#ifdef REG_RUBY_M17N + +extern int +regex_is_allow_reverse_match(RegCharEncoding enc, UChar* s, UChar* end) +{ + return IS_INDEPENDENT_TRAIL(enc); +} + +#else /* REG_RUBY_M17N */ + +const char REG_MBLEN_TABLE[][REG_CHAR_TABLE_SIZE] = { + { /* ascii */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + }, + { /* euc-jp */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 + }, + { /* sjis */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + }, + { /* utf8 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 + } +}; + +extern int +regex_mb_max_length(RegCharEncoding code) +{ + /* can't use switch statement, code isn't int type. 
*/ + if (code == REGCODE_ASCII) return 1; + else if (code == REGCODE_EUCJP) return 3; + else if (code == REGCODE_SJIS) return 2; + else return 6; /* REGCODE_UTF8 */ +} + +extern int +regex_is_allow_reverse_match(RegCharEncoding enc, UChar* s, UChar* end) +{ + UChar c; + + if (IS_INDEPENDENT_TRAIL(enc)) return 1; + + c = *s; + if (enc == REGCODE_EUCJP) { + if (c <= 0x7e || c == 0x8e || c == 0x8f) return 1; + } + else if (enc == REGCODE_SJIS) { + if (c <= 0x3f || c == 0x7f) return 1; + } + return 0; +} + +#endif /* not REG_RUBY_M17N */ + +static int +bitset_mbmaxlen(BitSetRef bs, int negative, RegCharEncoding enc) +{ + int i; + int len, maxlen = 0; + + if (negative) { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (! BITSET_AT(bs, i)) { + len = mblen(enc, i); + if (len > maxlen) maxlen = len; + } + } + } + else { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (BITSET_AT(bs, i)) { + len = mblen(enc, i); + if (len > maxlen) maxlen = len; + } + } + } + return maxlen; +} + + +static int +add_opcode(regex_t* reg, int opcode) +{ + BBUF_ADD1(reg, opcode); + return 0; +} + +static int +add_rel_addr(regex_t* reg, int addr) +{ + RelAddrType ra = (RelAddrType )addr; + +#ifdef UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &ra, SIZE_RELADDR); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_RELADDR(ra, buf); + BBUF_ADD(reg, buf, SIZE_RELADDR); +#endif + return 0; +} + +static int +add_abs_addr(regex_t* reg, int addr) +{ + AbsAddrType ra = (AbsAddrType )addr; + +#ifdef UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &ra, SIZE_ABSADDR); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_ABSADDR(ra, buf); + BBUF_ADD(reg, buf, SIZE_ABSADDR); +#endif + return 0; +} + +static int +add_length(regex_t* reg, int len) +{ + LengthType l = (LengthType )len; + +#ifdef UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &l, SIZE_LENGTH); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_LENGTH(l, buf); + BBUF_ADD(reg, buf, SIZE_LENGTH); +#endif + return 0; +} + +static int +add_mem_num(regex_t* reg, int num) +{ + 
MemNumType n = (MemNumType )num; + +#ifdef UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &n, SIZE_MEMNUM); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_MEMNUM(n, buf); + BBUF_ADD(reg, buf, SIZE_MEMNUM); +#endif + return 0; +} + +#if 0 +static int +add_repeat_num(regex_t* reg, int num) +{ + RepeatNumType n = (RepeatNumType )num; + +#ifdef UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &n, SIZE_REPEATNUM); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_REPEATNUM(n, buf); + BBUF_ADD(reg, buf, SIZE_REPEATNUM); +#endif + return 0; +} +#endif + +static int +add_option(regex_t* reg, RegOptionType option) +{ +#ifdef UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &option, SIZE_OPTION); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_OPTION(option, buf); + BBUF_ADD(reg, buf, SIZE_OPTION); +#endif + return 0; +} + +static int +add_opcode_rel_addr(regex_t* reg, int opcode, int addr) +{ + int r; + + r = add_opcode(reg, opcode); + if (r) return r; + r = add_rel_addr(reg, addr); + return r; +} + +static int +add_bytes(regex_t* reg, UChar* bytes, int len) +{ + BBUF_ADD(reg, bytes, len); + return 0; +} + +static int +add_bitset(regex_t* reg, BitSetRef bs) +{ + BBUF_ADD(reg, bs, SIZE_BITSET); + return 0; +} + +static int +add_opcode_option(regex_t* reg, int opcode, RegOptionType option) +{ + int r; + + r = add_opcode(reg, opcode); + if (r) return r; + r = add_option(reg, option); + return r; +} + +static int compile_length_tree(Node* node, regex_t* reg); +static int compile_tree(Node* node, regex_t* reg); + + +#define IS_NEED_STR_LEN_OP_EXACT(op) \ + ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\ + (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) + +static int +select_str_opcode(int mb_len, int str_len, int ignore_case) +{ + int op; + + switch (mb_len) { + case 1: + if (ignore_case) { + switch (str_len) { + case 1: op = OP_EXACT1_IC; break; + default: op = OP_EXACTN_IC; break; + } + } + else { + switch (str_len) { + case 1: op = OP_EXACT1; break; + case 2: op = OP_EXACT2; 
break; + case 3: op = OP_EXACT3; break; + case 4: op = OP_EXACT4; break; + case 5: op = OP_EXACT5; break; + default: op = OP_EXACTN; break; + } + } + break; + + case 2: + switch (str_len) { + case 1: op = OP_EXACTMB2N1; break; + case 2: op = OP_EXACTMB2N2; break; + case 3: op = OP_EXACTMB2N3; break; + default: op = OP_EXACTMB2N; break; + } + break; + + case 3: + op = OP_EXACTMB3N; + break; + + default: + op = OP_EXACTMBN; + break; + } + return op; +} + +static int +compile_tree_empty_check(Node* node, regex_t* reg, int empty_check) +{ + int r; + int saved_num_null_check = reg->num_null_check; + + if (empty_check) { + r = add_opcode(reg, OP_NULL_CHECK_START); + if (r) return r; + r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */ + if (r) return r; + reg->num_null_check++; + } + + r = compile_tree(node, reg); + if (r) return r; + + if (empty_check) { + r = add_opcode(reg, OP_NULL_CHECK_END); + if (r) return r; + r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */ + } + return r; +} + +#ifdef USE_SUBEXP_CALL +static int +compile_call(CallNode* node, regex_t* reg) +{ + int r; + + r = add_opcode(reg, OP_CALL); + if (r) return r; + r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg), + node->target); + if (r) return r; + r = add_abs_addr(reg, 0 /*dummy addr.*/); + return r; +} +#endif + +static int +compile_tree_n_times(Node* node, int n, regex_t* reg) +{ + int i, r; + + for (i = 0; i < n; i++) { + r = compile_tree(node, reg); + if (r) return r; + } + return 0; +} + +static int +add_compile_string_length(UChar* s, int mb_len, int str_len, + regex_t* reg, int ignore_case) +{ + int len; + int op = select_str_opcode(mb_len, str_len, ignore_case); + + len = SIZE_OPCODE; + if (op == OP_EXACTMBN) + len += SIZE_LENGTH; + + if (IS_NEED_STR_LEN_OP_EXACT(op)) + len += SIZE_LENGTH; + + len += mb_len * str_len; + return len; +} + +static int +add_compile_string(UChar* s, int mb_len, int str_len, + regex_t* reg, int ignore_case) +{ + int 
op = select_str_opcode(mb_len, str_len, ignore_case); + int r; + + /* Emits: opcode, [mb_len when OP_EXACTMBN], [str_len when the opcode needs a length], then the string bytes. + * Every add_* result is propagated; previously they were dropped and 0 was always returned. */ + r = add_opcode(reg, op); + if (r) return r; + + if (op == OP_EXACTMBN) { + r = add_length(reg, mb_len); + if (r) return r; + } + + if (IS_NEED_STR_LEN_OP_EXACT(op)) { + r = add_length(reg, str_len); + if (r) return r; + } + + return add_bytes(reg, s, mb_len * str_len); +} + +/* Returns the size that compile_string_node would emit for `sn`: the string is split into runs of characters with equal mblen() and the add_compile_string_length() results are summed. Returns 0 for an empty string. */ +static int +compile_length_string_node(StrNode* sn, regex_t* reg) +{ + int rlen, r, len, prev_len, slen, ambig, ic; + RegCharEncoding code = reg->enc; + UChar *p, *prev; + + if (sn->end <= sn->s) + return 0; + + ic = IS_IGNORECASE(reg->options); + + p = prev = sn->s; + prev_len = mblen(code, *p); + if (ic != 0 && prev_len == 1) + ambig = IS_AMBIGUITY_CHAR(reg->enc, *p); + else + ambig = 0; + + p += prev_len; + slen = 1; + rlen = 0; + + for (; p < sn->end; ) { + len = mblen(code, *p); + if (len == prev_len) { + slen++; + if (ic != 0 && ambig == 0 && len == 1) + ambig = IS_AMBIGUITY_CHAR(reg->enc, *p); + } + else { + r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + rlen += r; + + if (ic != 0 && len == 1) + ambig = IS_AMBIGUITY_CHAR(reg->enc, *p); + else + ambig = 0; + + prev = p; + slen = 1; + prev_len = len; + } + + p += len; + } + r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + rlen += r; + return rlen; +} + +/* Size of a raw string node: treated as single-byte characters, never case-insensitive. */ +static int +compile_length_string_raw_node(StrNode* sn, regex_t* reg) +{ + if (sn->end <= sn->s) + return 0; + + return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0); +} + +/* Emits `sn` as one exact-string instruction per run of characters with equal mblen(). With IGNORECASE set, single-byte characters are lower-cased in place once the run is flagged ambiguous. */ +static int +compile_string_node(StrNode* sn, regex_t* reg) +{ + int r, len, prev_len, slen, ambig, ic; + RegCharEncoding code = reg->enc; + UChar *p, *prev; + + if (sn->end <= sn->s) + return 0; + + ic = IS_IGNORECASE(reg->options); + + p = prev = sn->s; + prev_len = mblen(code, *p); + if (ic != 0 && prev_len == 1) { + ambig = IS_AMBIGUITY_CHAR(reg->enc, *p); + if (ambig != 0) *p = TOLOWER(reg->enc, *p); + } + else + ambig = 0; + + p += prev_len; + slen = 1; + + for (; p < sn->end; ) { + len = mblen(code, *p); + if (len == prev_len) { + slen++; + if (ic != 0 && len == 1) { + if 
(ambig == 0) + ambig = IS_AMBIGUITY_CHAR(reg->enc, *p); + if (ambig != 0) *p = TOLOWER(reg->enc, *p); + } + } + else { + r = add_compile_string(prev, prev_len, slen, reg, ambig); + if (r) return r; + if (ic != 0 && len == 1) { + ambig = IS_AMBIGUITY_CHAR(reg->enc, *p); + if (ambig != 0) *p = TOLOWER(reg->enc, *p); + } + else + ambig = 0; + + prev = p; + slen = 1; + prev_len = len; + } + + p += len; + } + return add_compile_string(prev, prev_len, slen, reg, ambig); +} + +/* Emits a raw string node as single-byte characters, never case-insensitive. */ +static int +compile_string_raw_node(StrNode* sn, regex_t* reg) +{ + if (sn->end <= sn->s) + return 0; + + return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0); +} + +/* Emits a length field followed by the bytes of `mbuf` starting at `offset`. + * Without UNALIGNED_WORD_ACCESS the data is preceded by alignment padding and followed by the + * complementary padding, so the total is always (WORD_ALIGNMENT_SIZE - 1) pad bytes. + * Returns the first non-zero result of add_length/add_bytes; these were previously ignored. */ +static int +add_multi_byte_cclass_offset(BBuf* mbuf, regex_t* reg, int offset) +{ +#ifdef UNALIGNED_WORD_ACCESS + int r; + + r = add_length(reg, mbuf->used - offset); + if (r) return r; + return add_bytes(reg, mbuf->p + offset, mbuf->used - offset); +#else + int r, pad_size; + UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH; + + GET_ALIGNMENT_PAD_SIZE(p, pad_size); + r = add_length(reg, mbuf->used - offset + (WORD_ALIGNMENT_SIZE - 1)); + if (r) return r; + if (pad_size != 0) { + r = add_bytes(reg, PadBuf, pad_size); + if (r) return r; + } + + r = add_bytes(reg, mbuf->p + offset, mbuf->used - offset); + if (r) return r; + + /* Trailing padding keeps the emitted size equal to the fixed size reported by compile_length_cclass_node(). */ + pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size; + if (pad_size != 0) r = add_bytes(reg, PadBuf, pad_size); + return r; +#endif +} + +static int +compile_length_cclass_node(CClassNode* cc, regex_t* reg) +{ + int len; + + if (IS_NULL(cc->mbuf)) { + len = SIZE_OPCODE + SIZE_BITSET; + } + else { + if (bitset_is_empty(cc->bs)) { + /* SIZE_BITSET is included in mbuf->used. 
*/ + len = SIZE_OPCODE - SIZE_BITSET; + } + else { + len = SIZE_OPCODE; + } +#ifdef UNALIGNED_WORD_ACCESS + len += SIZE_LENGTH + cc->mbuf->used; +#else + len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1); +#endif + } + + return len; +} + +/* Emits a character class: + * - no mbuf: OP_CCLASS[_NOT] + bitset + * - mbuf, empty bitset: OP_CCLASS_MB[_NOT] + multi-byte ranges + * - mbuf, non-empty bitset: OP_CCLASS_MIX[_NOT] + bitset + multi-byte ranges + * The add_opcode() result is now checked on every path; it was previously discarded. */ +static int +compile_cclass_node(CClassNode* cc, regex_t* reg) +{ + int r; + + if (IS_NULL(cc->mbuf)) { + r = add_opcode(reg, cc->not ? OP_CCLASS_NOT : OP_CCLASS); + if (r) return r; + + r = add_bitset(reg, cc->bs); + } + else { + if (bitset_is_empty(cc->bs)) { + r = add_opcode(reg, cc->not ? OP_CCLASS_MB_NOT : OP_CCLASS_MB); + if (r) return r; + + r = add_multi_byte_cclass_offset(cc->mbuf, reg, SIZE_BITSET); + } + else { + r = add_opcode(reg, cc->not ? OP_CCLASS_MIX_NOT : OP_CCLASS_MIX); + if (r) return r; + + r = add_bitset(reg, cc->bs); + if (r) return r; + r = add_multi_byte_cclass_offset(cc->mbuf, reg, SIZE_BITSET); + } + } + + return r; +} + +/* Stores (lower, upper) in reg->repeat_range[id]. The array is allocated on first use and grown by + * REPEAT_RANGE_ALLOC entries when id reaches the current allocation. Returns REGERR_MEMORY when the allocation fails. */ +static int +entry_repeat_range(regex_t* reg, int id, int lower, int upper) +{ +#define REPEAT_RANGE_ALLOC 4 + + RegRepeatRange* p; + + if (reg->repeat_range_alloc == 0) { + p = (RegRepeatRange* )xmalloc(sizeof(RegRepeatRange) * REPEAT_RANGE_ALLOC); + CHECK_NULL_RETURN_VAL(p, REGERR_MEMORY); + reg->repeat_range = p; + reg->repeat_range_alloc = REPEAT_RANGE_ALLOC; + } + else if (reg->repeat_range_alloc <= id) { + int n; + n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; + p = (RegRepeatRange* )xrealloc(reg->repeat_range, + sizeof(RegRepeatRange) * n); + CHECK_NULL_RETURN_VAL(p, REGERR_MEMORY); + reg->repeat_range = p; + reg->repeat_range_alloc = n; + } + else { + p = reg->repeat_range; + } + + p[id].lower = lower; + p[id].upper = upper; + return 0; +} + +static int +compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_check, + regex_t* reg) +{ + int r; + int num_repeat = reg->num_repeat; + + r = add_opcode(reg, qn->greedy ? 
OP_REPEAT : OP_REPEAT_NG); + if (r) return r; + r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ + reg->num_repeat++; + if (r) return r; + r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC); + if (r) return r; + + r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); + if (r) return r; + + r = compile_tree_empty_check(qn->target, reg, empty_check); + if (r) return r; + + r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); + if (r) return r; + r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ + return r; +} +/* Size threshold compared against the expanded target size in the qualifier length/compile routines below. */ +#define QUALIFIER_EXPAND_LIMIT_SIZE 50 +/* Computes the code size for a qualifier node, or returns the negative result of compile_length_tree() on the target. The branch structure parallels compile_qualifier_node. */ +static int +compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) +{ + int len, mod_tlen; + int infinite = IS_REPEAT_INFINITE(qn->upper); + int empty_check = (infinite && qn->target_may_empty); + int tlen = compile_length_tree(qn->target, reg); + + if (tlen < 0) return tlen; + + /* anychar repeat */ + if (NTYPE(qn->target) == N_ANYCHAR) { + if (qn->greedy && infinite) { + if (IS_NOT_NULL(qn->next_head_exact)) + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + else + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + } + } + + if (empty_check) + mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); + else + mod_tlen = tlen; + + if (infinite && + (qn->lower <= 1 || tlen * qn->lower <= QUALIFIER_EXPAND_LIMIT_SIZE)) { + if (qn->lower == 1 && tlen > QUALIFIER_EXPAND_LIMIT_SIZE) { + len = SIZE_OP_JUMP; + } + else { + len = tlen * qn->lower; + } + + if (qn->greedy) { + if (IS_NOT_NULL(qn->head_exact)) + len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP; + else if (IS_NOT_NULL(qn->next_head_exact)) + len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP; + else + len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP; + } + else + len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH; + } + else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */ + len = SIZE_OP_JUMP + tlen; + } + else if (!infinite && qn->greedy && + (tlen + SIZE_OP_PUSH) * 
qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + len = tlen * qn->lower; + len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ + len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen; + } + else { + len = SIZE_OP_REPEAT_INC + + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM; + } + + return len; +} +/* Returns 1 for a greedy qualifier with an infinite upper bound whose target is N_ANYCHAR, else 0. */ +static int +is_anychar_star_qualifier(QualifierNode* qn) +{ + if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + NTYPE(qn->target) == N_ANYCHAR) + return 1; + else + return 0; +} +/* Emits code for a qualifier node. Returns a negative compile_length_tree() result or the first non-zero result from the emit helpers. */ +static int +compile_qualifier_node(QualifierNode* qn, regex_t* reg) +{ + int i, r, mod_tlen; + int infinite = IS_REPEAT_INFINITE(qn->upper); + int empty_check = (infinite && qn->target_may_empty); + int tlen = compile_length_tree(qn->target, reg); + + if (tlen < 0) return tlen; + + if (is_anychar_star_qualifier(qn)) { + r = compile_tree_n_times(qn->target, qn->lower, reg); + if (r) return r; + if (IS_NOT_NULL(qn->next_head_exact)) { + r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT); + if (r) return r; + return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1); + } + else + return add_opcode(reg, OP_ANYCHAR_STAR); + } + + if (empty_check) + mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); + else + mod_tlen = tlen; + + if (infinite && + (qn->lower <= 1 || tlen * qn->lower <= QUALIFIER_EXPAND_LIMIT_SIZE)) { + if (qn->lower == 1 && tlen > QUALIFIER_EXPAND_LIMIT_SIZE) { + if (qn->greedy) { + if (IS_NOT_NULL(qn->head_exact)) + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1); + else if (IS_NOT_NULL(qn->next_head_exact)) + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT); + else + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH); + } + else { + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP); + } + if (r) return r; + } + else { + r = compile_tree_n_times(qn->target, qn->lower, reg); + if (r) return r; + } + + if (qn->greedy) { + if (IS_NOT_NULL(qn->head_exact)) { + 
r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1, + mod_tlen + SIZE_OP_JUMP); + if (r) return r; + /* first byte of head_exact is the operand of OP_PUSH_OR_JUMP_EXACT1; its add_bytes result is now checked */ + r = add_bytes(reg, NSTRING(qn->head_exact).s, 1); + if (r) return r; + r = compile_tree_empty_check(qn->target, reg, empty_check); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -(mod_tlen + SIZE_OP_JUMP + SIZE_OP_PUSH_OR_JUMP_EXACT1)); + } + else if (IS_NOT_NULL(qn->next_head_exact)) { + r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT, + mod_tlen + SIZE_OP_JUMP); + if (r) return r; + /* first byte of next_head_exact is the operand of OP_PUSH_IF_PEEK_NEXT; its add_bytes result is now checked */ + r = add_bytes(reg, NSTRING(qn->next_head_exact).s, 1); + if (r) return r; + r = compile_tree_empty_check(qn->target, reg, empty_check); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -(mod_tlen + SIZE_OP_JUMP + SIZE_OP_PUSH_IF_PEEK_NEXT)); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP); + if (r) return r; + r = compile_tree_empty_check(qn->target, reg, empty_check); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -(mod_tlen + SIZE_OP_JUMP + SIZE_OP_PUSH)); + } + } + else { + r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen); + if (r) return r; + r = compile_tree_empty_check(qn->target, reg, empty_check); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + SIZE_OP_PUSH)); + } + } + else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */ + r = add_opcode_rel_addr(reg, OP_JUMP, tlen); + if (r) return r; + r = compile_tree(qn->target, reg); + } + else if (!infinite && qn->greedy && + (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + int n = qn->upper - qn->lower; + + r = compile_tree_n_times(qn->target, qn->lower, reg); + if (r) return r; + + /* each optional copy is guarded by an OP_PUSH that skips all remaining copies */ + for (i = 0; i < n; i++) { + r = add_opcode_rel_addr(reg, OP_PUSH, + (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH); + if (r) return r; + r = compile_tree(qn->target, reg); + if (r) return r; + } + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' 
 */ + r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, tlen); + if (r) return r; + r = compile_tree(qn->target, reg); + } + else { + r = compile_range_repeat_node(qn, mod_tlen, empty_check, reg); + } + return r; +} +/* Size of an option effect node. The target is measured with reg->options temporarily set to node->option; the previous value is restored before returning. */ +static int +compile_length_option_node(EffectNode* node, regex_t* reg) +{ + int tlen; + RegOptionType prev = reg->options; + + reg->options = node->option; + tlen = compile_length_tree(node->target, reg); + reg->options = prev; + + if (tlen < 0) return tlen; + + return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL + + tlen + SIZE_OP_SET_OPTION; +} +/* Emits OP_SET_OPTION_PUSH(node->option), OP_SET_OPTION(prev), OP_FAIL, the target compiled under node->option, then OP_SET_OPTION(prev). reg->options is restored even when compiling the target fails. */ +static int +compile_option_node(EffectNode* node, regex_t* reg) +{ + int r; + RegOptionType prev = reg->options; + + r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option); + if (r) return r; + r = add_opcode_option(reg, OP_SET_OPTION, prev); + if (r) return r; + r = add_opcode(reg, OP_FAIL); + if (r) return r; + + reg->options = node->option; + r = compile_tree(node->target, reg); + reg->options = prev; + if (r) return r; + + r = add_opcode_option(reg, OP_SET_OPTION, prev); + return r; +} +/* Size of an effect node; EFFECT_OPTION is delegated to compile_length_option_node. Returns REGERR_TYPE_BUG for an unknown effect type. */ +static int +compile_length_effect_node(EffectNode* node, regex_t* reg) +{ + int len; + int tlen; + + if (node->type == EFFECT_OPTION) + return compile_length_option_node(node, reg); + + if (node->target) { + tlen = compile_length_tree(node->target, reg); + if (tlen < 0) return tlen; + } + else + tlen = 0; + + switch (node->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_CALLED(node)) { + len = SIZE_OP_MEMORY_START_PUSH + tlen + + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; + if (IS_FIND_CONDITION(reg->options)) + len += (IS_EFFECT_RECURSION(node) + ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + else + len += (IS_EFFECT_RECURSION(node) + ? 
SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + } + else +#endif + { + if (BIT_STATUS_AT(reg->backtrack_mem, node->regnum)) + len = SIZE_OP_MEMORY_START_PUSH; + else + len = SIZE_OP_MEMORY_START; + + len += tlen + (IS_FIND_CONDITION(reg->options) + ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END); + } + break; + + case EFFECT_STOP_BACKTRACK: + if (IS_EFFECT_SIMPLE_REPEAT(node)) { + QualifierNode* qn = &NQUALIFIER(node->target); + tlen = compile_length_tree(qn->target, reg); + if (tlen < 0) return tlen; + + len = tlen * qn->lower + + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP; + } + else { + len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT; + } + break; + + default: + return REGERR_TYPE_BUG; + break; + } + + return len; +} + +static int get_char_length_tree(Node* node, regex_t* reg, int* len); + +static int +compile_effect_node(EffectNode* node, regex_t* reg) +{ + int r, len; + + if (node->type == EFFECT_OPTION) + return compile_option_node(node, reg); + + switch (node->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_CALLED(node)) { + r = add_opcode(reg, OP_CALL); + if (r) return r; + node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP; + node->state |= NST_ADDR_FIXED; + r = add_abs_addr(reg, (int )node->call_addr); + if (r) return r; + len = compile_length_tree(node->target, reg); + len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN); + if (IS_FIND_CONDITION(reg->options)) + len += (IS_EFFECT_RECURSION(node) + ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + else + len += (IS_EFFECT_RECURSION(node) + ? 
SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + + r = add_opcode_rel_addr(reg, OP_JUMP, len); + if (r) return r; + } +#endif + if (BIT_STATUS_AT(reg->backtrack_mem, node->regnum)) + r = add_opcode(reg, OP_MEMORY_START_PUSH); + else + r = add_opcode(reg, OP_MEMORY_START); + if (r) return r; + r = add_mem_num(reg, node->regnum); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_CALLED(node)) { + if (IS_FIND_CONDITION(reg->options)) + r = add_opcode(reg, (IS_EFFECT_RECURSION(node) + ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH)); + else + r = add_opcode(reg, (IS_EFFECT_RECURSION(node) + ? OP_MEMORY_END_REC : OP_MEMORY_END)); + + if (r) return r; + r = add_mem_num(reg, node->regnum); + if (r) return r; + r = add_opcode(reg, OP_RETURN); + } + else +#endif + { + if (IS_FIND_CONDITION(reg->options)) + r = add_opcode(reg, OP_MEMORY_END_PUSH); + else + r = add_opcode(reg, OP_MEMORY_END); + if (r) return r; + r = add_mem_num(reg, node->regnum); + } + break; + + case EFFECT_STOP_BACKTRACK: + if (IS_EFFECT_SIMPLE_REPEAT(node)) { + QualifierNode* qn = &NQUALIFIER(node->target); + r = compile_tree_n_times(qn->target, qn->lower, reg); + if (r) return r; + + len = compile_length_tree(qn->target, reg); + if (len < 0) return len; + + r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP); + if (r) return r; + r = compile_tree(qn->target, reg); + if (r) return r; + r = add_opcode(reg, OP_POP); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -(SIZE_OP_PUSH + len + SIZE_OP_POP + SIZE_OP_JUMP)); + } + else { + r = add_opcode(reg, OP_PUSH_STOP_BT); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_POP_STOP_BT); + } + break; + + default: + return REGERR_TYPE_BUG; + break; + } + + return r; +} +/* Size of an anchor node: the look-ahead/look-behind types add their wrapper opcode sizes to the target size; every other type is SIZE_OPCODE. */ +static int +compile_length_anchor_node(AnchorNode* node, regex_t* reg) +{ + int len; + int tlen = 0; + + if (node->target) { + tlen = 
compile_length_tree(node->target, reg); + if (tlen < 0) return tlen; + } + + switch (node->type) { + case ANCHOR_PREC_READ: + len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS; + break; + case ANCHOR_PREC_READ_NOT: + len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS; + break; + case ANCHOR_LOOK_BEHIND: + len = SIZE_OP_LOOK_BEHIND + tlen; + break; + case ANCHOR_LOOK_BEHIND_NOT: + len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT; + break; + + default: + len = SIZE_OPCODE; + break; + } + + return len; +} +/* Emits code for an anchor node. Simple anchors become a single opcode; look-ahead and look-behind anchors wrap the compiled target. Returns REGERR_TYPE_BUG for an unknown anchor type. */ +static int +compile_anchor_node(AnchorNode* node, regex_t* reg) +{ + int r, len; + + switch (node->type) { + case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break; + case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break; + case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break; + case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break; + case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break; + case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break; + + case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break; + case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break; +#ifdef USE_WORD_BEGIN_END + case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break; + case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break; +#endif + + case ANCHOR_PREC_READ: + r = add_opcode(reg, OP_PUSH_POS); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_POP_POS); + break; + + case ANCHOR_PREC_READ_NOT: + len = compile_length_tree(node->target, reg); + if (len < 0) return len; + r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_FAIL_POS); + break; + + case ANCHOR_LOOK_BEHIND: + { + int n; + r = add_opcode(reg, OP_LOOK_BEHIND); + if (r) return r; + if (node->char_len < 0) { + r = 
get_char_length_tree(node->target, reg, &n); + if (r) return REGERR_INVALID_LOOK_BEHIND_PATTERN; + } + else + n = node->char_len; + r = add_length(reg, n); + if (r) return r; + r = compile_tree(node->target, reg); + } + break; + + case ANCHOR_LOOK_BEHIND_NOT: + { + int n; + len = compile_length_tree(node->target, reg); + r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT, + len + SIZE_OP_FAIL_LOOK_BEHIND_NOT); + if (r) return r; + if (node->char_len < 0) { + r = get_char_length_tree(node->target, reg, &n); + if (r) return REGERR_INVALID_LOOK_BEHIND_PATTERN; + } + else + n = node->char_len; + r = add_length(reg, n); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT); + } + break; + + default: + return REGERR_TYPE_BUG; + break; + } + + return r; +} + +static int +compile_length_tree(Node* node, regex_t* reg) +{ + int len, type, r; + + type = NTYPE(node); + switch (type) { + case N_LIST: + len = 0; + do { + r = compile_length_tree(NCONS(node).left, reg); + if (r < 0) return r; + len += r; + } while (IS_NOT_NULL(node = NCONS(node).right)); + r = len; + break; + + case N_ALT: + { + int n; + + n = r = 0; + do { + r += compile_length_tree(NCONS(node).left, reg); + n++; + } while (IS_NOT_NULL(node = NCONS(node).right)); + r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1); + } + break; + + case N_STRING: + if (NSTRING_IS_RAW(node)) + r = compile_length_string_raw_node(&(NSTRING(node)), reg); + else + r = compile_length_string_node(&(NSTRING(node)), reg); + break; + + case N_CCLASS: + r = compile_length_cclass_node(&(NCCLASS(node)), reg); + break; + + case N_CTYPE: + case N_ANYCHAR: + r = SIZE_OPCODE; + break; + + case N_BACKREF: + { + BackrefNode* br = &(NBACKREF(node)); + + if (br->back_num == 1) { + r = (br->back_static[0] <= 3 + ? 
SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM)); + } + else { + r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num); + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + r = SIZE_OP_CALL; + break; +#endif + + case N_QUALIFIER: + r = compile_length_qualifier_node(&(NQUALIFIER(node)), reg); + break; + + case N_EFFECT: + r = compile_length_effect_node(&NEFFECT(node), reg); + break; + + case N_ANCHOR: + r = compile_length_anchor_node(&(NANCHOR(node)), reg); + break; + + default: + return REGERR_TYPE_BUG; + break; + } + + return r; +} + +static int +compile_tree(Node* node, regex_t* reg) +{ + int n, type, len, pos, r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + do { + r = compile_tree(NCONS(node).left, reg); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + { + Node* x = node; + len = 0; + do { + len += compile_length_tree(NCONS(x).left, reg); + if (NCONS(x).right != NULL) { + len += SIZE_OP_PUSH + SIZE_OP_JUMP; + } + } while (IS_NOT_NULL(x = NCONS(x).right)); + pos = reg->used + len; /* goal position */ + + do { + len = compile_length_tree(NCONS(node).left, reg); + if (IS_NOT_NULL(NCONS(node).right)) { + r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP); + if (r) break; + } + r = compile_tree(NCONS(node).left, reg); + if (r) break; + if (IS_NOT_NULL(NCONS(node).right)) { + len = pos - (reg->used + SIZE_OP_JUMP); + r = add_opcode_rel_addr(reg, OP_JUMP, len); + if (r) break; + } + } while (IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_STRING: + if (NSTRING_IS_RAW(node)) + r = compile_string_raw_node(&(NSTRING(node)), reg); + else + r = compile_string_node(&(NSTRING(node)), reg); + break; + + case N_CCLASS: + r = compile_cclass_node(&(NCCLASS(node)), reg); + break; + + case N_CTYPE: + { + int op; + + switch (NCTYPE(node).type) { + case CTYPE_WORD: op = OP_WORD; break; + case CTYPE_NOT_WORD: op = OP_NOT_WORD; break; +#ifdef USE_SBMB_CLASS + case CTYPE_WORD_SB: op = OP_WORD_SB; 
break; + case CTYPE_WORD_MB: op = OP_WORD_MB; break; +#endif + default: + return REGERR_TYPE_BUG; + break; + } + r = add_opcode(reg, op); + } + break; + + case N_ANYCHAR: + r = add_opcode(reg, OP_ANYCHAR); + break; + + case N_BACKREF: + { + int i; + BackrefNode* br = &(NBACKREF(node)); + + if (br->back_num == 1) { + n = br->back_static[0]; + switch (n) { + case 1: r = add_opcode(reg, OP_BACKREF1); break; + case 2: r = add_opcode(reg, OP_BACKREF2); break; + case 3: r = add_opcode(reg, OP_BACKREF3); break; + default: + r = add_opcode(reg, OP_BACKREFN); + if (r) return r; + r = add_mem_num(reg, n); + break; + } + } + else { + int* p; + add_opcode(reg, OP_BACKREF_MULTI); + if (r) return r; + add_length(reg, br->back_num); + if (r) return r; + p = BACKREFS_P(br); + for (i = br->back_num - 1; i >= 0; i--) { + r = add_mem_num(reg, p[i]); + if (r) return r; + } + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + r = compile_call(&(NCALL(node)), reg); + break; +#endif + + case N_QUALIFIER: + r = compile_qualifier_node(&(NQUALIFIER(node)), reg); + break; + + case N_EFFECT: + r = compile_effect_node(&NEFFECT(node), reg); + break; + + case N_ANCHOR: + r = compile_anchor_node(&(NANCHOR(node)), reg); + break; + + default: +#ifdef REG_DEBUG + fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node)); +#endif + break; + } + + return r; +} + +#ifdef USE_SUBEXP_CALL +static int +unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg) +{ + int i, offset; + EffectNode* en; + AbsAddrType addr; +#ifndef UNALIGNED_WORD_ACCESS + UChar buf[SERIALIZE_BUFSIZE]; +#endif + + for (i = 0; i < uslist->num; i++) { + en = &(NEFFECT(uslist->us[i].target)); + if (! 
IS_EFFECT_ADDR_FIXED(en)) return REGERR_PARSER_BUG; + addr = en->call_addr; + offset = uslist->us[i].offset; + +#ifdef UNALIGNED_WORD_ACCESS + BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR); +#else + SERIALIZE_ABSADDR(addr, buf); + BBUF_WRITE(reg, offset, buf, SIZE_ABSADDR); +#endif + } + return 0; +} +#endif +/* Stores in *min a lower bound on the match length of `node` (0 when nothing is known) and returns 0, or returns an error code such as REGERR_INVALID_BACKREF. For alternatives and multi-target back references the smallest branch value is kept. */ +static int +get_min_match_length(Node* node, RegDistance *min, ScanEnv* env) +{ + RegDistance tmin; + int r = 0; + + *min = 0; + switch (NTYPE(node)) { + case N_BACKREF: + { + int i; + int* backs; + Node** nodes = SCANENV_MEM_NODES(env); + BackrefNode* br = &(NBACKREF(node)); + if (br->state & NST_RECURSION) break; + + backs = BACKREFS_P(br); + if (backs[0] > env->num_mem) return REGERR_INVALID_BACKREF; + r = get_min_match_length(nodes[backs[0]], min, env); + if (r != 0) break; + for (i = 1; i < br->back_num; i++) { + if (backs[i] > env->num_mem) return REGERR_INVALID_BACKREF; + r = get_min_match_length(nodes[backs[i]], &tmin, env); + if (r != 0) break; + if (*min > tmin) *min = tmin; + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (IS_CALL_RECURSION(&NCALL(node))) { + EffectNode* en = &(NEFFECT(NCALL(node).target)); + if (IS_EFFECT_MIN_FIXED(en)) + *min = en->min_len; + } + else + r = get_min_match_length(NCALL(node).target, min, env); + break; +#endif + + case N_LIST: + do { + r = get_min_match_length(NCONS(node).left, &tmin, env); + if (r == 0) *min += tmin; + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + { + Node *x, *y; + y = node; + do { + x = NCONS(y).left; + r = get_min_match_length(x, &tmin, env); + if (r != 0) break; + if (y == node) *min = tmin; + else if (*min > tmin) *min = tmin; + } while (r == 0 && IS_NOT_NULL(y = NCONS(y).right)); + } + break; + + case N_STRING: + { + StrNode* sn = &(NSTRING(node)); + *min = sn->end - sn->s; + } + break; + + case N_CTYPE: + switch (NCTYPE(node).type) { + case CTYPE_WORD: *min = 1; break; + case CTYPE_NOT_WORD: *min = 1; break; +#ifdef USE_SBMB_CLASS + case 
CTYPE_WORD_SB: *min = 1; break; + case CTYPE_WORD_MB: *min = 2; break; +#endif + default: + break; + } + break; + + case N_CCLASS: + case N_ANYCHAR: + *min = 1; + break; + + case N_QUALIFIER: + { + QualifierNode* qn = &(NQUALIFIER(node)); + + if (qn->lower > 0) { + r = get_min_match_length(qn->target, min, env); + if (r == 0) + *min = distance_multiply(*min, qn->lower); + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + switch (en->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_MIN_FIXED(en)) + *min = en->min_len; + else { + r = get_min_match_length(en->target, min, env); + if (r == 0) { + en->min_len = *min; + SET_EFFECT_STATUS(node, NST_MIN_FIXED); + } + } + break; +#endif + case EFFECT_OPTION: + case EFFECT_STOP_BACKTRACK: + r = get_min_match_length(en->target, min, env); + break; + } + } + break; + + case N_ANCHOR: + default: + break; + } + + return r; +} +/* Stores in *max an upper bound on the match length of `node` and returns 0, or returns an error code. Recursive calls/back references and infinite repeats of a non-empty target yield INFINITE_DISTANCE. */ +static int +get_max_match_length(Node* node, RegDistance *max, ScanEnv* env) +{ + RegDistance tmax; + int r = 0; + + *max = 0; + switch (NTYPE(node)) { + case N_LIST: + do { + r = get_max_match_length(NCONS(node).left, &tmax, env); + if (r == 0) + *max = distance_add(*max, tmax); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + do { + r = get_max_match_length(NCONS(node).left, &tmax, env); + if (r == 0 && *max < tmax) *max = tmax; + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_STRING: + { + StrNode* sn = &(NSTRING(node)); + *max = sn->end - sn->s; + } + break; + + case N_CTYPE: + switch (NCTYPE(node).type) { + case CTYPE_WORD: + case CTYPE_NOT_WORD: +#ifdef USE_SBMB_CLASS + case CTYPE_WORD_MB: +#endif + *max = mbmaxlen_dist(env->enc); + break; + +#ifdef USE_SBMB_CLASS + case CTYPE_WORD_SB: + *max = 1; + break; +#endif + + default: + break; + } + break; + + case N_CCLASS: + case N_ANYCHAR: + *max = mbmaxlen_dist(env->enc); + break; + + case N_BACKREF: + { + int i; + int* backs; + Node** 
nodes = SCANENV_MEM_NODES(env); + BackrefNode* br = &(NBACKREF(node)); + if (br->state & NST_RECURSION) { + *max = INFINITE_DISTANCE; + break; + } + backs = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + if (backs[i] > env->num_mem) return REGERR_INVALID_BACKREF; + r = get_max_match_length(nodes[backs[i]], &tmax, env); + if (r != 0) break; + if (*max < tmax) *max = tmax; + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (! IS_CALL_RECURSION(&(NCALL(node)))) + r = get_max_match_length(NCALL(node).target, max, env); + else + *max = INFINITE_DISTANCE; + break; +#endif + + case N_QUALIFIER: + { + QualifierNode* qn = &(NQUALIFIER(node)); + + if (qn->upper != 0) { + r = get_max_match_length(qn->target, max, env); + if (r == 0 && *max != 0) { + if (! IS_REPEAT_INFINITE(qn->upper)) + *max = distance_multiply(*max, qn->upper); + else + *max = INFINITE_DISTANCE; + } + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + switch (en->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_MAX_FIXED(en)) + *max = en->max_len; + else { + r = get_max_match_length(en->target, max, env); + if (r == 0) { + en->max_len = *max; + SET_EFFECT_STATUS(node, NST_MAX_FIXED); + } + } + break; +#endif + case EFFECT_OPTION: + case EFFECT_STOP_BACKTRACK: + r = get_max_match_length(en->target, max, env); + break; + } + } + break; + + case N_ANCHOR: + default: + break; + } + + return r; +} +/* Non-zero results of get_char_length_tree1: the pattern has no fixed character length; the TOP_ALT variant is returned when the varying alternation is seen at level 1. */ +#define GET_CHAR_LEN_VARLEN -1 +#define GET_CHAR_LEN_TOP_ALT_VARLEN -2 + +/* fixed size pattern node only */ +static int +get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) +{ + int tlen; + int r = 0; + + level++; + *len = 0; + switch (NTYPE(node)) { + case N_LIST: + do { + r = get_char_length_tree1(NCONS(node).left, reg, &tlen, level); + if (r == 0) + *len = distance_add(*len, tlen); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + { + int tlen2; + int varlen = 0; + + r = 
get_char_length_tree1(NCONS(node).left, reg, &tlen, level); + while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)) { + r = get_char_length_tree1(NCONS(node).left, reg, &tlen2, level); + if (r == 0) { + if (tlen != tlen2) + varlen = 1; + } + } + if (r == 0) { + if (varlen != 0) { + if (level == 1) + r = GET_CHAR_LEN_TOP_ALT_VARLEN; + else + r = GET_CHAR_LEN_VARLEN; + } + else + *len = tlen; + } + } + break; + + case N_STRING: + { + StrNode* sn = &(NSTRING(node)); + UChar *s = sn->s; + while (s < sn->end) { + s += mblen(reg->enc, *s); + (*len)++; + } + } + break; + + case N_QUALIFIER: + { + QualifierNode* qn = &(NQUALIFIER(node)); + if (qn->lower == qn->upper) { + r = get_char_length_tree1(qn->target, reg, &tlen, level); + if (r == 0) + *len = distance_multiply(tlen, qn->lower); + } + else + r = GET_CHAR_LEN_VARLEN; + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (! IS_CALL_RECURSION(&(NCALL(node)))) + r = get_char_length_tree1(NCALL(node).target, reg, len, level); + else + r = GET_CHAR_LEN_VARLEN; + break; +#endif + + case N_CTYPE: + switch (NCTYPE(node).type) { + case CTYPE_WORD: + case CTYPE_NOT_WORD: +#ifdef USE_SBMB_CLASS + case CTYPE_WORD_SB: + case CTYPE_WORD_MB: +#endif + *len = 1; + break; + } + break; + + case N_CCLASS: + case N_ANYCHAR: + *len = 1; + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + switch (en->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_CLEN_FIXED(en)) + *len = en->char_len; + else { + r = get_char_length_tree1(en->target, reg, len, level); + if (r == 0) { + en->char_len = *len; + SET_EFFECT_STATUS(node, NST_CLEN_FIXED); + } + } + break; +#endif + case EFFECT_OPTION: + case EFFECT_STOP_BACKTRACK: + r = get_char_length_tree1(en->target, reg, len, level); + break; + default: + break; + } + } + break; + + case N_ANCHOR: + break; + + default: + r = GET_CHAR_LEN_VARLEN; + break; + } + + return r; +} +/* Entry point: computes the fixed character length of `node` into *len, starting the recursion at level 0. Returns 0 on success or one of the GET_CHAR_LEN_* codes. */ +static int +get_char_length_tree(Node* node, regex_t* reg, int* len) +{ + return 
get_char_length_tree1(node, reg, len, 0); +} + +/* x is not included y ==> 1 : 0 */ +static int +is_not_included(Node* x, Node* y, regex_t* reg) +{ + int i, len; + WCINT wc; + UChar *p, c; + int ytype; + + retry: + ytype = NTYPE(y); + switch (NTYPE(x)) { + case N_CTYPE: + { + switch (ytype) { + case N_CTYPE: + switch (NCTYPE(x).type) { + case CTYPE_WORD: + if (NCTYPE(y).type == CTYPE_NOT_WORD) + return 1; + else + return 0; + break; + case CTYPE_NOT_WORD: + if (NCTYPE(y).type == CTYPE_WORD) + return 1; + else + return 0; + break; +#ifdef USE_SBMB_CLASS + case CTYPE_WORD_SB: + case CTYPE_WORD_MB: + break; +#endif + default: + break; + } + break; + + case N_CCLASS: + swap: + { + Node* tmp; + tmp = x; x = y; y = tmp; + goto retry; + } + break; + + case N_STRING: + goto swap; + break; + + default: + break; + } + } + break; + + case N_CCLASS: + { + CClassNode* xc = &(NCCLASS(x)); + switch (ytype) { + case N_CTYPE: + switch (NCTYPE(y).type) { + case CTYPE_WORD: + if (IS_NULL(xc->mbuf) && xc->not == 0) { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (BITSET_AT(xc->bs, i)) { + if (IS_SB_WORD(reg->enc, i)) return 0; + } + } + return 1; + } + return 0; + break; + case CTYPE_NOT_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (! IS_SB_WORD(reg->enc, i)) { + if (xc->not == 0) { + if (BITSET_AT(xc->bs, i)) + return 0; + } + else { + if (! 
BITSET_AT(xc->bs, i)) + return 0; + } + } + } + return 1; + break; + +#ifdef USE_SBMB_CLASS + case CTYPE_WORD_SB: + case CTYPE_WORD_MB: + break; +#endif + default: + break; + } + break; + + case N_CCLASS: + { + int v; + CClassNode* yc = &(NCCLASS(y)); + + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + v = BITSET_AT(xc->bs, i); + if ((v != 0 && xc->not == 0) || (v == 0 && xc->not)) { + v = BITSET_AT(yc->bs, i); + if ((v != 0 && yc->not == 0) || (v == 0 && yc->not)) + return 0; + } + } + if ((IS_NULL(xc->mbuf) && xc->not == 0) || + (IS_NULL(yc->mbuf) && yc->not == 0)) + return 1; + return 0; + } + break; + + case N_STRING: + goto swap; + break; + + default: + break; + } + } + break; + + case N_STRING: + { + StrNode* xs = &(NSTRING(x)); + if (NSTRING_LEN(x) == 0) + break; + + c = *(xs->s); + switch (ytype) { + case N_CTYPE: + switch (NCTYPE(y).type) { + case CTYPE_WORD: + return (IS_WORD_STR(reg->enc, xs->s, xs->end) ? 0 : 1); + break; + case CTYPE_NOT_WORD: + return (IS_WORD_STR(reg->enc, xs->s, xs->end) ? 1 : 0); + break; +#ifdef USE_SBMB_CLASS + case CTYPE_WORD_SB: + return (ismb(reg->enc, c) ? 1 : 0); + break; + case CTYPE_WORD_MB: + return (ismb(reg->enc, c) ? 0 : 1); + break; +#endif + default: + break; + } + break; + + case N_CCLASS: + { + CClassNode* cc = &(NCCLASS(y)); + if (ismb(reg->enc, c)) { + if (IS_NULL(cc->mbuf)) + return (cc->not == 0 ? 1 : 0); + else { + len = mblen(reg->enc, c); + wc = MB2WC(xs->s, xs->s + len, reg->enc); + p = cc->mbuf->p + SIZE_BITSET; + if (regex_is_in_wc_range(p, wc)) + return (cc->not == 0 ? 0 : 1); + else + return (cc->not == 0 ? 1 : 0); + } + } + else { + if (BITSET_AT(cc->bs, c) == 0) + return (cc->not == 0 ? 1 : 0); + else + return (cc->not == 0 ? 
0 : 1); + } + } + break; + + case N_STRING: + { + UChar *q; + StrNode* ys = &(NSTRING(y)); + len = NSTRING_LEN(x); + if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); + if (NSTRING_IS_CASE_AMBIG(x) || NSTRING_IS_CASE_AMBIG(y)) { + for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { + if (TOLOWER(reg->enc, *p) != TOLOWER(reg->enc, *q)) + return 1; + } + } + else { + for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { + if (*p != *q) return 1; + } + } + } + break; + + default: + break; + } + } + break; + + default: + break; + } + + return 0; +} + +static Node* +get_head_value_node(Node* node, int exact, regex_t* reg) +{ + Node* n = NULL_NODE; + + switch (NTYPE(node)) { + case N_BACKREF: + case N_ALT: + case N_ANYCHAR: +#ifdef USE_SUBEXP_CALL + case N_CALL: +#endif + break; + + case N_CTYPE: + case N_CCLASS: + if (exact == 0) { + n = node; + } + break; + + case N_LIST: + n = get_head_value_node(NCONS(node).left, exact, reg); + break; + + case N_STRING: + { + StrNode* sn = &(NSTRING(node)); + + if (sn->end <= sn->s) + break; + + if (exact != 0 && + !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { + if (! 
IS_AMBIGUITY_CHAR(reg->enc, *(sn->s))) + n = node; + } + else { + n = node; + } + } + break; + + case N_QUALIFIER: + { + QualifierNode* qn = &(NQUALIFIER(node)); + if (qn->lower > 0) { + if (IS_NOT_NULL(qn->head_exact)) + n = qn->head_exact; + else + n = get_head_value_node(qn->target, exact, reg); + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + switch (en->type) { + case EFFECT_OPTION: + { + RegOptionType options = reg->options; + + reg->options = NEFFECT(node).option; + n = get_head_value_node(NEFFECT(node).target, exact, reg); + reg->options = options; + } + break; + + case EFFECT_MEMORY: + case EFFECT_STOP_BACKTRACK: + n = get_head_value_node(en->target, exact, reg); + break; + } + } + break; + + case N_ANCHOR: + if (NANCHOR(node).type == ANCHOR_PREC_READ) + n = get_head_value_node(NANCHOR(node).target, exact, reg); + break; + + default: + break; + } + + return n; +} +

/*
 * Check that a pattern sub-tree is built only from permitted node kinds.
 *
 *   node        - root of the sub-tree to inspect
 *   type_mask   - bit set of permitted NTYPE() values
 *   effect_mask - bit set of permitted EffectNode types
 *   anchor_mask - bit set of permitted AnchorNode types
 *
 * Returns 1 as soon as a node, effect or anchor outside its mask is met,
 * and 0 when everything that was visited is permitted.  List and
 * alternative chains are abandoned at the first non-zero result.
 */
static int
check_type_tree(Node* node, int type_mask, int effect_mask, int anchor_mask)
{
  int kind = NTYPE(node);
  int status = 0;

  if ((kind & type_mask) == 0)
    return 1;

  switch (kind) {
  case N_LIST:
  case N_ALT:
    /* Walk the cons chain element by element. */
    for (;;) {
      status = check_type_tree(NCONS(node).left, type_mask, effect_mask,
                               anchor_mask);
      if (status != 0)
        break;

      node = NCONS(node).right;
      if (IS_NULL(node))
        break;
    }
    break;

  case N_QUALIFIER:
    status = check_type_tree(NQUALIFIER(node).target, type_mask, effect_mask,
                             anchor_mask);
    break;

  case N_EFFECT:
    {
      EffectNode* effect = &(NEFFECT(node));

      if ((effect->type & effect_mask) == 0)
        return 1;

      status = check_type_tree(effect->target, type_mask, effect_mask,
                               anchor_mask);
    }
    break;

  case N_ANCHOR:
    {
      int anchor_kind = NANCHOR(node).type;

      if ((anchor_kind & anchor_mask) == 0)
        return 1;

      /* Only anchors that carry a sub-tree need a recursive check. */
      if (NANCHOR(node).target)
        status = check_type_tree(NANCHOR(node).target, type_mask, effect_mask,
                                 anchor_mask);
    }
    break;

  default:
    break;
  }

  return status;
}

+#ifdef USE_SUBEXP_CALL + +#define RECURSION_EXIST 1 +#define RECURSION_INFINITE 2 + +static int 
+subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) +{ + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + { + Node *x; + RegDistance min; + int ret; + + x = node; + do { + ret = subexp_inf_recursive_check(NCONS(x).left, env, head); + if (ret < 0 || ret == RECURSION_INFINITE) return ret; + r |= ret; + if (head) { + ret = get_min_match_length(NCONS(x).left, &min, env); + if (ret != 0) return ret; + if (min != 0) head = 0; + } + } while (IS_NOT_NULL(x = NCONS(x).right)); + } + break; + + case N_ALT: + { + int ret; + r = RECURSION_EXIST; + do { + ret = subexp_inf_recursive_check(NCONS(node).left, env, head); + if (ret < 0 || ret == RECURSION_INFINITE) return ret; + r &= ret; + } while (IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_QUALIFIER: + r = subexp_inf_recursive_check(NQUALIFIER(node).target, env, head); + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + switch (an->type) { + case ANCHOR_PREC_READ: + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: + case ANCHOR_LOOK_BEHIND_NOT: + r = subexp_inf_recursive_check(an->target, env, head); + break; + } + } + break; + + case N_CALL: + r = subexp_inf_recursive_check(NCALL(node).target, env, head); + break; + + case N_EFFECT: + if (IS_EFFECT_MARK2(&(NEFFECT(node)))) + return 0; + else if (IS_EFFECT_MARK1(&(NEFFECT(node)))) + return (head == 0 ? 
RECURSION_EXIST : RECURSION_INFINITE); + else { + SET_EFFECT_STATUS(node, NST_MARK2); + r = subexp_inf_recursive_check(NEFFECT(node).target, env, head); + CLEAR_EFFECT_STATUS(node, NST_MARK2); + } + break; + + default: + break; + } + + return r; +} +

/*
 * Traverse the whole tree and run subexp_inf_recursive_check() on every
 * effect node flagged as recursive.
 *
 *   node - root of the sub-tree to traverse
 *   env  - scan environment, handed through to the per-group check
 *
 * Returns 0 when no problem was found, REGERR_NEVER_ENDING_RECURSION when
 * the per-group check reports a positive result, or the negative error
 * code produced by the per-group check or by a nested traversal.
 */
static int
subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
{
  int r = 0;

  switch (NTYPE(node)) {
  case N_LIST:
  case N_ALT:
    /* Stop at the first element that reports a problem. */
    do {
      r = subexp_inf_recursive_check_trav(NCONS(node).left, env);
    } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
    break;

  case N_QUALIFIER:
    r = subexp_inf_recursive_check_trav(NQUALIFIER(node).target, env);
    break;

  case N_ANCHOR:
    {
      AnchorNode* an = &(NANCHOR(node));

      /* Only the look-ahead / look-behind anchors are descended into. */
      switch (an->type) {
      case ANCHOR_PREC_READ:
      case ANCHOR_PREC_READ_NOT:
      case ANCHOR_LOOK_BEHIND:
      case ANCHOR_LOOK_BEHIND_NOT:
        r = subexp_inf_recursive_check_trav(an->target, env);
        break;
      }
    }
    break;

  case N_EFFECT:
    {
      EffectNode* en = &(NEFFECT(node));

      if (IS_EFFECT_RECURSION(en)) {
        /* NST_MARK1 lets the check recognise a path back to this group. */
        SET_EFFECT_STATUS(node, NST_MARK1);
        r = subexp_inf_recursive_check(en->target, env, 1);
        /* A negative value is an error code; it used to be overwritten by
           the traversal below and was lost.  Propagate it the same way
           subexp_inf_recursive_check() does for its own sub-calls. */
        if (r < 0) return r;
        if (r > 0) return REGERR_NEVER_ENDING_RECURSION;
        CLEAR_EFFECT_STATUS(node, NST_MARK1);
      }
      r = subexp_inf_recursive_check_trav(en->target, env);
    }
    break;

  default:
    break;
  }

  return r;
}

+static int +subexp_recursive_check(Node* node) +{ + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + case N_ALT: + do { + r |= subexp_recursive_check(NCONS(node).left); + } while (IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_QUALIFIER: + r = subexp_recursive_check(NQUALIFIER(node).target); + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + switch (an->type) { + case ANCHOR_PREC_READ: + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: + case ANCHOR_LOOK_BEHIND_NOT: + r = subexp_recursive_check(an->target); + break; + } + } + break; + + case N_CALL: + r = 
subexp_recursive_check(NCALL(node).target); + if (r != 0) SET_CALL_RECURSION(node); + break; + + case N_EFFECT: + if (IS_EFFECT_MARK2(&(NEFFECT(node)))) + return 0; + else if (IS_EFFECT_MARK1(&(NEFFECT(node)))) + return 1; /* recursion */ + else { + SET_EFFECT_STATUS(node, NST_MARK2); + r = subexp_recursive_check(NEFFECT(node).target); + CLEAR_EFFECT_STATUS(node, NST_MARK2); + } + break; + + default: + break; + } + + return r; +} + + +static int +subexp_recursive_check_trav(Node* node, ScanEnv* env) +{ +#define FOUND_CALLED_NODE 1 + + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + case N_ALT: + { + int ret; + do { + ret = subexp_recursive_check_trav(NCONS(node).left, env); + if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE; + else if (ret < 0) return ret; + } while (IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_QUALIFIER: + r = subexp_recursive_check_trav(NQUALIFIER(node).target, env); + if (NQUALIFIER(node).upper == 0) { + if (r == FOUND_CALLED_NODE) + NQUALIFIER(node).is_refered = 1; + } + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + switch (an->type) { + case ANCHOR_PREC_READ: + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: + case ANCHOR_LOOK_BEHIND_NOT: + r = subexp_recursive_check_trav(an->target, env); + break; + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + + if (! 
IS_EFFECT_RECURSION(en)) { + if (IS_EFFECT_CALLED(en)) { + SET_EFFECT_STATUS(node, NST_MARK1); + r = subexp_recursive_check(en->target); + if (r != 0) SET_EFFECT_STATUS(node, NST_RECURSION); + CLEAR_EFFECT_STATUS(node, NST_MARK1); + } + } + r = subexp_recursive_check_trav(en->target, env); + if (IS_EFFECT_CALLED(en)) + r |= FOUND_CALLED_NODE; + } + break; + + default: + break; + } + + return r; +} + +static int +setup_subexp_call(Node* node, ScanEnv* env) +{ + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + do { + r = setup_subexp_call(NCONS(node).left, env); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + do { + r = setup_subexp_call(NCONS(node).left, env); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_QUALIFIER: + r = setup_subexp_call(NQUALIFIER(node).target, env); + break; + case N_EFFECT: + r = setup_subexp_call(NEFFECT(node).target, env); + break; + + case N_CALL: + { + int n, num, *refs; + UChar *p; + CallNode* cn = &(NCALL(node)); + Node** nodes = SCANENV_MEM_NODES(env); + +#ifdef USE_NAMED_SUBEXP + n = regex_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs); +#else + n = REGERR_UNDEFINED_GROUP_REFERENCE; +#endif + if (n <= 0) { + /* name not found, check group number. 
(?*ddd) */ + p = cn->name; + num = regex_scan_unsigned_number(&p, cn->name_end, env->enc); + if (num <= 0 || p != cn->name_end) { + regex_scan_env_set_error_string(env, + REGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); + return REGERR_UNDEFINED_NAME_REFERENCE; + } + if (num > env->num_mem) return REGERR_UNDEFINED_GROUP_REFERENCE; + cn->ref_num = num; + goto set_call_attr; + } + else if (n > 1) { + regex_scan_env_set_error_string(env, + REGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end); + return REGERR_MULTIPLEX_DEFINITION_NAME_CALL; + } + else { + cn->ref_num = refs[0]; + set_call_attr: + cn->target = nodes[cn->ref_num]; + if (IS_NULL(cn->target)) return REGERR_INVALID_SUBEXP_NAME; + SET_EFFECT_STATUS(cn->target, NST_CALLED); + BIT_STATUS_ON_AT(env->backtrack_mem, cn->ref_num); + cn->unset_addr_list = env->unset_addr_list; + } + } + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + + switch (an->type) { + case ANCHOR_PREC_READ: + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: + case ANCHOR_LOOK_BEHIND_NOT: + r = setup_subexp_call(an->target, env); + break; + } + } + break; + + default: + break; + } + + return r; +} +#endif + +/* divide different length alternatives in look-behind. 
+ (?<=A|B) ==> (?<=A)|(?<=B) + (?<!A|B) ==> (?<!A)(?<!B) +*/ +static int +divide_look_behind_alternatives(Node* node) +{ + Node tmp_node; + Node *head, *np, *insert_node; + AnchorNode* an = &(NANCHOR(node)); + int anc_type = an->type; + + head = an->target; + np = NCONS(head).left; + tmp_node = *node; *node = *head; *head = tmp_node; + NCONS(node).left = head; + NANCHOR(head).target = np; + + np = node; + while ((np = NCONS(np).right) != NULL_NODE) { + insert_node = regex_node_new_anchor(anc_type); + CHECK_NULL_RETURN_VAL(insert_node, REGERR_MEMORY); + NANCHOR(insert_node).target = NCONS(np).left; + NCONS(np).left = insert_node; + } + + if (anc_type == ANCHOR_LOOK_BEHIND_NOT) { + np = node; + do { + np->type = N_LIST; /* alt -> list */ + } while ((np = NCONS(np).right) != NULL_NODE); + } + return 0; +} + +static int +setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) +{ + int r, len; + AnchorNode* an = &(NANCHOR(node)); + + r = get_char_length_tree(an->target, reg, &len); + if (r == 0) + an->char_len = len; + else if (r == GET_CHAR_LEN_VARLEN) + r = REGERR_INVALID_LOOK_BEHIND_PATTERN; + else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) { + if (IS_SYNTAX_BV(env->syntax, REG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) + r = divide_look_behind_alternatives(node); + else + r = REGERR_INVALID_LOOK_BEHIND_PATTERN; + } + + return r; +} + +static int +next_setup(Node* node, Node* next_node, regex_t* reg) +{ + int type; + + retry: + type = NTYPE(node); + if (type == N_QUALIFIER) { + QualifierNode* qn = &(NQUALIFIER(node)); + if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { +#ifdef USE_QUALIFIER_PEEK_NEXT + qn->next_head_exact = get_head_value_node(next_node, 1, reg); +#endif + /* automatic posseivation a*b ==> (?>a*)b */ + if (qn->lower <= 1) { + int ttype = NTYPE(qn->target); + if (IS_NODE_TYPE_SIMPLE(ttype)) { + Node *x, *y; + x = get_head_value_node(qn->target, 0, reg); + if (IS_NOT_NULL(x)) { + y = get_head_value_node(next_node, 0, reg); + if (IS_NOT_NULL(y) && 
is_not_included(x, y, reg)) { + Node* en = regex_node_new_effect(EFFECT_STOP_BACKTRACK); + CHECK_NULL_RETURN_VAL(en, REGERR_MEMORY); + SET_EFFECT_STATUS(en, NST_SIMPLE_REPEAT); + swap_node(node, en); + NEFFECT(node).target = en; + } + } + } + } + } + } + else if (type == N_EFFECT) { + EffectNode* en = &(NEFFECT(node)); + if (en->type == EFFECT_MEMORY) { + node = en->target; + goto retry; + } + } + return 0; +} + +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) + +/* setup_tree does the following work. + 1. check empty loop. (set qn->target_may_empty) + 2. expand ignore-case in char class. + 3. set memory status bit flags. (reg->mem_stats) + 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. + 5. find invalid patterns in look-behind. + 6. expand repeated string. + */ +static int +setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) +{ + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + { + Node* prev = NULL_NODE; + do { + r = setup_tree(NCONS(node).left, reg, state, env); + if (IS_NOT_NULL(prev) && r == 0) { + r = next_setup(prev, NCONS(node).left, reg); + } + prev = NCONS(node).left; + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_ALT: + do { + r = setup_tree(NCONS(node).left, reg, (state | IN_ALT), env); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_CCLASS: + if (IS_IGNORECASE(reg->options)) { + int c, t; + BitSetRef bs = NCCLASS(node).bs; + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + t = TOLOWER(reg->enc, c); + if (t != c) { + if (BITSET_AT(bs, c)) BITSET_SET_BIT(bs, t); + if (BITSET_AT(bs, t)) BITSET_SET_BIT(bs, c); + } + } + } + break; + + case N_STRING: + if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { + StrNode* sn = &NSTRING(node); + UChar* p = sn->s; + + while (p < sn->end) { + if (IS_AMBIGUITY_CHAR(reg->enc, *p)) { + NSTRING_SET_CASE_AMBIG(node); + break; + } + p++; + } + } + break; + + 
case N_CTYPE: + case N_ANYCHAR: + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + break; +#endif + + case N_BACKREF: + { + int i; + int* p; + Node** nodes = SCANENV_MEM_NODES(env); + BackrefNode* br = &(NBACKREF(node)); + p = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + if (p[i] > env->num_mem) return REGERR_INVALID_BACKREF; + BIT_STATUS_ON_AT(env->backrefed_mem, p[i]); + BIT_STATUS_ON_AT(env->backtrack_mem, p[i]); + SET_EFFECT_STATUS(nodes[p[i]], NST_MEM_BACKREFED); + } + } + break; + + case N_QUALIFIER: + { + RegDistance d; + QualifierNode* qn = &(NQUALIFIER(node)); + Node* target = qn->target; + + if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { + r = get_min_match_length(target, &d, env); + if (r) break; + if (d == 0) { + qn->target_may_empty = 1; +#if 0 + r = get_max_match_length(target, &d, env); + if (r == 0 && d == 0) { + /* ()* ==> ()?, ()+ ==> () */ + qn->upper = 1; + if (qn->lower > 1) qn->lower = 1; + if (NTYPE(target) == N_STRING) { + qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */ + } + } +#endif + } + } + + if (qn->lower != qn->upper) + state |= IN_REPEAT; + r = setup_tree(target, reg, state, env); + if (r) break; + + /* expand string */ +#define EXPAND_STRING_MAX_LENGTH 100 + if (NTYPE(target) == N_STRING) { + if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && + qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { + int len = NSTRING_LEN(target); + StrNode* sn = &(NSTRING(target)); + + if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { + int i, n = qn->lower; + regex_node_conv_to_str_node(node, NSTRING(target).flag); + for (i = 0; i < n; i++) { + r = regex_node_str_cat(node, sn->s, sn->end); + if (r) break; + } + regex_node_free(target); + break; /* break case N_QUALIFIER: */ + } + } + } + +#ifdef USE_OP_PUSH_OR_JUMP_EXACT + if (qn->greedy && !qn->target_may_empty) { + if (NTYPE(target) == N_QUALIFIER) { + QualifierNode* tqn = &(NQUALIFIER(target)); + if (IS_NOT_NULL(tqn->head_exact)) { + qn->head_exact = 
tqn->head_exact; + tqn->head_exact = NULL; + } + } + else { + qn->head_exact = get_head_value_node(qn->target, 1, reg); + } + } +#endif + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + + switch (en->type) { + case EFFECT_OPTION: + { + RegOptionType options = reg->options; + reg->options = NEFFECT(node).option; + r = setup_tree(NEFFECT(node).target, reg, state, env); + reg->options = options; + } + break; + + case EFFECT_MEMORY: + if ((state & (IN_ALT | IN_NOT | IN_REPEAT)) != 0) { + BIT_STATUS_ON_AT(env->backtrack_mem, en->regnum); + /* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */ + } + /* fall */ + case EFFECT_STOP_BACKTRACK: + { + Node* target = en->target; + r = setup_tree(target, reg, state, env); + if (NTYPE(target) == N_QUALIFIER) { + QualifierNode* tqn = &(NQUALIFIER(target)); + if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && + tqn->greedy != 0) { /* (?>a*), a*+ etc... */ + int qtype = NTYPE(tqn->target); + if (IS_NODE_TYPE_SIMPLE(qtype)) + SET_EFFECT_STATUS(node, NST_SIMPLE_REPEAT); + } + } + } + break; + } + } + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + + switch (an->type) { + case ANCHOR_PREC_READ: + r = setup_tree(an->target, reg, state, env); + break; + case ANCHOR_PREC_READ_NOT: + r = setup_tree(an->target, reg, (state | IN_NOT), env); + break; + +/* allowed node types in look-behind */ +#define ALLOWED_TYPE_IN_LB \ + ( N_LIST | N_ALT | N_STRING | N_CCLASS | N_CTYPE | \ + N_ANYCHAR | N_ANCHOR | N_EFFECT | N_QUALIFIER | N_CALL ) + +#define ALLOWED_EFFECT_IN_LB ( EFFECT_MEMORY ) +#define ALLOWED_EFFECT_IN_LB_NOT 0 + +#define ALLOWED_ANCHOR_IN_LB \ +( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF ) +#define ALLOWED_ANCHOR_IN_LB_NOT \ +( ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF ) + /* can't allow all anchors, because \G in look-behind through Search(). + ex. /(?<=\G)zz/.match("azz") => success. 
*/ + + case ANCHOR_LOOK_BEHIND: + { + r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, + ALLOWED_EFFECT_IN_LB, ALLOWED_ANCHOR_IN_LB); + if (r < 0) return r; + if (r > 0) return REGERR_INVALID_LOOK_BEHIND_PATTERN; + r = setup_look_behind(node, reg, env); + if (r != 0) return r; + r = setup_tree(an->target, reg, state, env); + } + break; + + case ANCHOR_LOOK_BEHIND_NOT: + { + r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, + ALLOWED_EFFECT_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); + if (r < 0) return r; + if (r > 0) return REGERR_INVALID_LOOK_BEHIND_PATTERN; + r = setup_look_behind(node, reg, env); + if (r != 0) return r; + r = setup_tree(an->target, reg, (state | IN_NOT), env); + } + break; + } + } + break; + + default: + break; + } + + return r; +} +

/*
 * Build the skip map for Boyer-Moore search over the bytes [s, end).
 *
 *   s, end      - first byte and one-past-last byte of the search string
 *   enc         - encoding, used only for TOLOWER() when ignore_case is set
 *   ignore_case - non-zero: index the map by the lower-cased byte
 *   skip        - byte-wide map, filled when the string is shorter than
 *                 REG_CHAR_TABLE_SIZE
 *   int_skip    - int-wide map, used otherwise; allocated here with
 *                 xmalloc() if *int_skip is still null and never freed here
 *
 * Every entry starts as the string length.  Each byte except the last one
 * then gets the distance from its position to the last position; a later
 * occurrence overwrites an earlier one.
 *
 * Returns 0 on success or REGERR_MEMORY when the int map cannot be
 * allocated.
 */
static int
set_bm_skip(UChar* s, UChar* end, RegCharEncoding enc, int ignore_case,
            UChar skip[], int** int_skip)
{
  int n = end - s;
  int last = n - 1;
  int k, c;
  int* wide;

  if (n < REG_CHAR_TABLE_SIZE) {
    for (k = 0; k < REG_CHAR_TABLE_SIZE; k++)
      skip[k] = n;

    for (k = 0; k < last; k++) {
      c = ignore_case ? TOLOWER(enc, s[k]) : s[k];
      skip[c] = last - k;
    }
    return 0;
  }

  /* The distances no longer fit into a byte: use the int-wide map. */
  if (IS_NULL(*int_skip)) {
    *int_skip = (int* )xmalloc(sizeof(int) * REG_CHAR_TABLE_SIZE);
    if (IS_NULL(*int_skip)) return REGERR_MEMORY;
  }
  wide = *int_skip;

  for (k = 0; k < REG_CHAR_TABLE_SIZE; k++)
    wide[k] = n;

  for (k = 0; k < last; k++) {
    c = ignore_case ? TOLOWER(enc, s[k]) : s[k];
    wide[c] = last - k;
  }
  return 0;
}

/* Capacity, in bytes, of the literal buffer in OptExactInfo. */
#define OPT_EXACT_MAXLEN   24

/* A [min, max] pair of byte lengths. */
typedef struct {
  RegDistance min;  /* min byte length */
  RegDistance max;  /* max byte length */
} MinMaxLen;

/*
 * State handed down through optimize_node_left().  mmd is advanced by the
 * length of each list element already processed; enc is what
 * alt_merge_opt_exact_info() uses to find character boundaries.
 * NOTE(review): the remaining fields are not used in the visible part of
 * the file - confirm their roles against optimize_node_left().
 */
typedef struct {
  MinMaxLen         mmd;
  BitStatusType     backrefed_status;
  RegCharEncoding   enc;
  RegOptionType     options;
  RegTransTableType transtable;
  ScanEnv*          scan_env;
} OptEnv;

+typedef 
struct { + int left_anchor; + int right_anchor; +} OptAncInfo; + +/* OptExactInfo: a literal byte string collected as a search candidate; s[0..len-1] holds the bytes, at most OPT_EXACT_MAXLEN of them. */ typedef struct { + MinMaxLen mmd; /* info position */ + OptAncInfo anc; + + int reach_end; /* reset to 0 by concat_opt_exact_info() when an appended literal does not fit completely */ + int ignore_case; /* non-zero halves the weight of this candidate in select_opt_exact_info(); NOTE(review): presumably means the bytes match case-insensitively - confirm */ + int len; /* number of bytes used in s */ + UChar s[OPT_EXACT_MAXLEN]; +} OptExactInfo; + +/* OptMapInfo: a set of byte values collected as a search candidate; map[c] is 1 once byte c was added by add_char_opt_map_info(). */ typedef struct { + MinMaxLen mmd; /* info position */ + OptAncInfo anc; + + int value; /* weighted value */ + UChar map[REG_CHAR_TABLE_SIZE]; +} OptMapInfo; + +/* NodeOptInfo: everything optimize_node_left() reports for one sub-tree. */ typedef struct { + MinMaxLen len; /* accumulated with add_mml() when sub-trees are concatenated */ + + OptAncInfo anc; + OptExactInfo exb; /* boundary */ + OptExactInfo exm; /* middle */ + OptExactInfo expr; /* prec read (?=...) */ + + OptMapInfo map; /* boundary */ +} NodeOptInfo; + + +static int +map_position_value(int i) +{ + static int vals[] = { + 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 10, 10, 1, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 1, 6, 3, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, + 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 5, + 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10, + }; + + if (i < sizeof(vals)/sizeof(vals[0])) return vals[i]; + + return 7; /* Take it easy. 
*/ +} + +static int +distance_value(MinMaxLen* mm) +{ + /* 1000 / (min-max-dist + 1) */ + static int dist_vals[] = { + 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, + 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, + 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, + 32, 31, 30, 29, 29, 28, 27, 26, 26, 25, + 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, + 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, + 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, + 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 10, 10, 10 + }; + + int d; + + if (mm->max == INFINITE_DISTANCE) return 0; + + d = mm->max - mm->min; + if (d < sizeof(dist_vals)/sizeof(dist_vals[0])) + /* return dist_vals[d] * 16 / (mm->min + 12); */ + return dist_vals[d]; + else + return 1; +} + +static int +comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2) +{ + if (v2 <= 0) return -1; + if (v1 <= 0) return 1; + + v1 *= distance_value(d1); + v2 *= distance_value(d2); + + if (v2 > v1) return 1; + if (v2 < v1) return -1; + + if (d2->min < d1->min) return 1; + if (d2->min > d1->min) return -1; + return 0; +} + +static int +is_equal_mml(MinMaxLen* a, MinMaxLen* b) +{ + return (a->min == b->min && a->max == b->max) ? 
1 : 0; +} + + +static void +set_mml(MinMaxLen* mml, RegDistance min, RegDistance max) +{ + mml->min = min; + mml->max = max; +} + +static void +clear_mml(MinMaxLen* mml) +{ + mml->min = mml->max = 0; +} + +static void +copy_mml(MinMaxLen* to, MinMaxLen* from) +{ + to->min = from->min; + to->max = from->max; +} + +static void +add_mml(MinMaxLen* to, MinMaxLen* from) +{ + to->min = distance_add(to->min, from->min); + to->max = distance_add(to->max, from->max); +} + +static void +add_len_mml(MinMaxLen* to, RegDistance len) +{ + to->min = distance_add(to->min, len); + to->max = distance_add(to->max, len); +} + +static void +alt_merge_mml(MinMaxLen* to, MinMaxLen* from) +{ + if (to->min > from->min) to->min = from->min; + if (to->max < from->max) to->max = from->max; +} + +static void +copy_opt_env(OptEnv* to, OptEnv* from) +{ + *to = *from; +} + +static void +clear_opt_anc_info(OptAncInfo* anc) +{ + anc->left_anchor = 0; + anc->right_anchor = 0; +} + +static void +copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from) +{ + *to = *from; +} + +static void +concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right, + RegDistance left_len, RegDistance right_len) +{ + clear_opt_anc_info(to); + + to->left_anchor = left->left_anchor; + if (left_len == 0) { + to->left_anchor |= right->left_anchor; + } + + to->right_anchor = right->right_anchor; + if (right_len == 0) { + to->right_anchor |= left->right_anchor; + } +} + +static int +is_left_anchor(int anc) +{ + if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF || + anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ || + anc == ANCHOR_PREC_READ_NOT) + return 0; + + return 1; +} + +static int +is_set_opt_anc_info(OptAncInfo* to, int anc) +{ + if ((to->left_anchor & anc) != 0) return 1; + + return ((to->right_anchor & anc) != 0 ? 
1 : 0); +} + +static void +add_opt_anc_info(OptAncInfo* to, int anc) +{ + if (is_left_anchor(anc)) + to->left_anchor |= anc; + else + to->right_anchor |= anc; +} + +static void +remove_opt_anc_info(OptAncInfo* to, int anc) +{ + if (is_left_anchor(anc)) + to->left_anchor &= ~anc; + else + to->right_anchor &= ~anc; +} + +static void +alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add) +{ + to->left_anchor &= add->left_anchor; + to->right_anchor &= add->right_anchor; +} + +static int +is_full_opt_exact_info(OptExactInfo* ex) +{ + return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0); +} + +static void +clear_opt_exact_info(OptExactInfo* ex) +{ + clear_mml(&ex->mmd); + clear_opt_anc_info(&ex->anc); + ex->reach_end = 0; + ex->ignore_case = 0; + ex->len = 0; + ex->s[0] = '\0'; +} + +static void +copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from) +{ + *to = *from; +} + +static void +concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add) +{ + int i, n; + OptAncInfo tanc; + + if (! to->ignore_case && add->ignore_case) { + if (to->len >= add->len) return ; /* avoid */ + + to->ignore_case = 1; + } + + for (i = to->len, n = 0; n < add->len && i < OPT_EXACT_MAXLEN; i++, n++) + to->s[i] = add->s[n]; + + to->len = i; + to->reach_end = (n == add->len ? add->reach_end : 0); + + concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1); + if (! to->reach_end) tanc.right_anchor = 0; + copy_opt_anc_info(&to->anc, &tanc); +} + +static void +concat_opt_exact_info_str(OptExactInfo* to, + UChar* s, UChar* end, int raw, RegCharEncoding code) +{ + int i, j, len; + UChar *p; + + for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) { + if (raw) { + to->s[i++] = *p++; + } + else { + len = mblen(code, *p); + for (j = 0; j < len; j++) + to->s[i++] = *p++; + } + } + + to->len = i; +} + +static void +alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) +{ + int i, j, len; + + if (add->len == 0 || to->len == 0) { + clear_opt_exact_info(to); + return ; + } + + if (! 
is_equal_mml(&to->mmd, &add->mmd)) { + clear_opt_exact_info(to); + return ; + } + + for (i = 0; i < to->len && i < add->len; ) { + if (to->s[i] != add->s[i]) break; + len = mblen(env->enc, to->s[i]); + + for (j = 1; j < len; j++) { + if (to->s[i+j] != add->s[i+j]) break; + } + if (j < len) break; + i += len; + } + + if (! add->reach_end || i < add->len || i < to->len) { + to->reach_end = 0; + } + to->len = i; + to->ignore_case |= add->ignore_case; + + alt_merge_opt_anc_info(&to->anc, &add->anc); + if (! to->reach_end) to->anc.right_anchor = 0; +} + +static void +select_opt_exact_info(OptExactInfo* now, OptExactInfo* alt) +{ + int vlen1, vlen2; + + vlen1 = now->len * (now->ignore_case ? 1 : 2); + vlen2 = alt->len * (alt->ignore_case ? 1 : 2); + + if (comp_distance_value(&now->mmd, &alt->mmd, vlen1, vlen2) > 0) + copy_opt_exact_info(now, alt); +} + +static void +clear_opt_map_info(OptMapInfo* map) +{ + int i; + + clear_mml(&map->mmd); + clear_opt_anc_info(&map->anc); + map->value = 0; + for (i = 0; i < REG_CHAR_TABLE_SIZE; i++) + map->map[i] = 0; +} + +static void +copy_opt_map_info(OptMapInfo* to, OptMapInfo* from) +{ + *to = *from; +} + +static void +add_char_opt_map_info(OptMapInfo* map, int c) +{ + if (map->map[c] == 0) { + map->map[c] = 1; + map->value += map_position_value(c); + } +} + +static void +add_char_amb_opt_map_info(OptMapInfo* map, int c, RegCharEncoding enc) +{ + int i, t; + + add_char_opt_map_info(map, c); + t = TOLOWER(enc, c); + if (t != c) { + add_char_opt_map_info(map, t); + } + else { + for (i = 0; i < REG_CHAR_TABLE_SIZE; i++) { + t = TOLOWER(enc, i); + if (t == c) add_char_opt_map_info(map, i); + } + } +} + +static void +select_opt_map_info(OptMapInfo* now, OptMapInfo* alt) +{ + static int z = 1<<15; /* 32768: something big value */ + + int v1, v2; + + if (alt->value == 0) return ; + if (now->value == 0) { + copy_opt_map_info(now, alt); + return ; + } + + v1 = z / now->value; + v2 = z / alt->value; + if (comp_distance_value(&now->mmd, 
&alt->mmd, v1, v2) > 0) + copy_opt_map_info(now, alt); +} + +static int +comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m) +{ +#define COMP_EM_BASE 20 + int ve, vm; + + if (m->value <= 0) return -1; + + ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2); + vm = COMP_EM_BASE * 5 * 2 / m->value; + return comp_distance_value(&e->mmd, &m->mmd, ve, vm); +} + +static void +alt_merge_opt_map_info(OptMapInfo* to, OptMapInfo* add) +{ + int i, val; + + /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */ + if (to->value == 0) return ; + if (add->value == 0 || to->mmd.max < add->mmd.min) { + clear_opt_map_info(to); + return ; + } + + alt_merge_mml(&to->mmd, &add->mmd); + + val = 0; + for (i = 0; i < REG_CHAR_TABLE_SIZE; i++) { + if (add->map[i]) + to->map[i] = 1; + + if (to->map[i]) + val += map_position_value(i); + } + to->value = val; + + alt_merge_opt_anc_info(&to->anc, &add->anc); +} + +static void +set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd) +{ + copy_mml(&(opt->exb.mmd), mmd); + copy_mml(&(opt->expr.mmd), mmd); + copy_mml(&(opt->map.mmd), mmd); +} + +static void +clear_node_opt_info(NodeOptInfo* opt) +{ + clear_mml(&opt->len); + clear_opt_anc_info(&opt->anc); + clear_opt_exact_info(&opt->exb); + clear_opt_exact_info(&opt->exm); + clear_opt_exact_info(&opt->expr); + clear_opt_map_info(&opt->map); +} + +static void +copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from) +{ + *to = *from; +} + +static void +concat_left_node_opt_info(NodeOptInfo* to, NodeOptInfo* add) +{ + int exb_reach, exm_reach; + OptAncInfo tanc; + + concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max); + copy_opt_anc_info(&to->anc, &tanc); + + if (add->exb.len > 0 && to->len.max == 0) { + concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc, + to->len.max, add->len.max); + copy_opt_anc_info(&add->exb.anc, &tanc); + } + + if (add->map.value > 0 && to->len.max == 0) { + concat_opt_anc_info(&tanc, &to->anc, &add->map.anc, + to->len.max, add->len.max); + 
copy_opt_anc_info(&add->map.anc, &tanc); + } + + exb_reach = to->exb.reach_end; + exm_reach = to->exm.reach_end; + + if (add->len.max != 0) + to->exb.reach_end = to->exm.reach_end = 0; + + if (add->exb.len > 0) { + if (exb_reach) { + concat_opt_exact_info(&to->exb, &add->exb); + clear_opt_exact_info(&add->exb); + } + else if (exm_reach) { + concat_opt_exact_info(&to->exm, &add->exb); + clear_opt_exact_info(&add->exb); + } + } + select_opt_exact_info(&to->exm, &add->exb); + select_opt_exact_info(&to->exm, &add->exm); + + if (to->expr.len > 0) { + if (add->len.max > 0) { + if (to->expr.len > add->len.max) + to->expr.len = add->len.max; + + if (to->expr.mmd.max == 0) + select_opt_exact_info(&to->exb, &to->expr); + else + select_opt_exact_info(&to->exm, &to->expr); + } + } + else if (add->expr.len > 0) { + copy_opt_exact_info(&to->expr, &add->expr); + } + + select_opt_map_info(&to->map, &add->map); + + add_mml(&to->len, &add->len); +} + +static void +alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env) +{ + alt_merge_opt_anc_info (&to->anc, &add->anc); + alt_merge_opt_exact_info(&to->exb, &add->exb, env); + alt_merge_opt_exact_info(&to->exm, &add->exm, env); + alt_merge_opt_exact_info(&to->expr, &add->expr, env); + alt_merge_opt_map_info (&to->map, &add->map); + + alt_merge_mml(&to->len, &add->len); +} + + +#define MAX_NODE_OPT_INFO_REF_COUNT 5 + +static int +optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) +{ + int type; + int r = 0; + + clear_node_opt_info(opt); + set_bound_node_opt_info(opt, &env->mmd); + + type = NTYPE(node); + switch (type) { + case N_LIST: + { + OptEnv nenv; + NodeOptInfo nopt; + Node* nd = node; + + copy_opt_env(&nenv, env); + do { + r = optimize_node_left(NCONS(nd).left, &nopt, &nenv); + if (r == 0) { + add_mml(&nenv.mmd, &nopt.len); + concat_left_node_opt_info(opt, &nopt); + } + } while (r == 0 && IS_NOT_NULL(nd = NCONS(nd).right)); + } + break; + + case N_ALT: + { + NodeOptInfo nopt; + Node* nd = node; + + do { 
+ r = optimize_node_left(NCONS(nd).left, &nopt, env); + if (r == 0) { + if (nd == node) copy_node_opt_info(opt, &nopt); + else alt_merge_node_opt_info(opt, &nopt, env); + } + } while ((r == 0) && IS_NOT_NULL(nd = NCONS(nd).right)); + } + break; + + case N_STRING: + { + UChar *p; + int len, plen; + StrNode* sn = &(NSTRING(node)); + int slen = sn->end - sn->s; + int is_raw = NSTRING_IS_RAW(node); + + if ((! IS_IGNORECASE(env->options)) || is_raw) { + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + NSTRING_IS_RAW(node), env->enc); + if (slen > 0) { + add_char_opt_map_info(&opt->map, *(sn->s)); + } + } + else { + for (p = sn->s; p < sn->end; ) { + len = mblen(env->enc, *p); + if (len == 1 && IS_AMBIGUITY_CHAR(env->enc, *p)) { + break; + } + p += len; + } + + plen = p - sn->s; + if (plen > slen / 5) { + concat_opt_exact_info_str(&opt->exb, sn->s, p, is_raw, env->enc); + concat_opt_exact_info_str(&opt->exm, p, sn->end, is_raw, env->enc); + opt->exm.ignore_case = 1; + if (opt->exm.len == sn->end - p) + opt->exm.reach_end = 1; + + copy_mml(&(opt->exm.mmd), &(opt->exb.mmd)); + add_len_mml(&(opt->exm.mmd), plen); + } + else { + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + is_raw, env->enc); + opt->exb.ignore_case = 1; + } + + if (slen > 0) { + if (p == sn->s) + add_char_amb_opt_map_info(&opt->map, *(sn->s), env->transtable); + else + add_char_opt_map_info(&opt->map, *(sn->s)); + } + } + + if (opt->exb.len == slen) + opt->exb.reach_end = 1; + + set_mml(&opt->len, slen, slen); + } + break; + + case N_CCLASS: + { + int i, z, len, found; + CClassNode* cc = &(NCCLASS(node)); + + /* no need to check ignore case. 
(setted in setup_tree()) */ + found = 0; + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = BITSET_AT(cc->bs, i); + if ((z && !cc->not) || (!z && cc->not)) { + found = 1; + add_char_opt_map_info(&opt->map, i); + } + } + + if (IS_NOT_NULL(cc->mbuf)) { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (BITSET_AT((BitSetRef )(cc->mbuf->p), i)) { + found = 1; + add_char_opt_map_info(&opt->map, i); + } + } + } + + if (found) { + if (IS_NULL(cc->mbuf)) + len = bitset_mbmaxlen(cc->bs, cc->not, env->enc); + else + len = mbmaxlen_dist(env->enc); + + set_mml(&opt->len, 1, len); + } + } + break; + + case N_CTYPE: + { + int c; + int len, min, max; + + min = mbmaxlen_dist(env->enc); + max = 0; + + switch (NCTYPE(node).type) { + case CTYPE_WORD: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (IS_WORD_HEAD(env->enc, c)) { + add_char_opt_map_info(&opt->map, c); + len = mblen(env->enc, c); + if (len < min) min = len; + if (len > max) max = len; + } + } + break; + + case CTYPE_NOT_WORD: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! 
IS_WORD_HEAD(env->enc, c)) { + add_char_opt_map_info(&opt->map, c); + len = mblen(env->enc, c); + if (len < min) min = len; + if (len > max) max = len; + } + } + break; + +#ifdef USE_SBMB_CLASS + case CTYPE_WORD_SB: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (IS_SB_WORD(env->enc, c)) { + add_char_opt_map_info(&opt->map, c); + } + } + min = max = 1; + break; + + case CTYPE_WORD_MB: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (IS_MB_WORD(env->enc, c)) { + add_char_opt_map_info(&opt->map, c); + len = mblen(env->enc, c); + if (len < min) min = len; + if (len > max) max = len; + } + } + break; +#endif + } + + set_mml(&opt->len, min, max); + } + break; + + case N_ANYCHAR: + { + RegDistance len = mbmaxlen_dist(env->enc); + set_mml(&opt->len, 1, len); + } + break; + + case N_ANCHOR: + switch (NANCHOR(node).type) { + case ANCHOR_BEGIN_BUF: + case ANCHOR_BEGIN_POSITION: + case ANCHOR_BEGIN_LINE: + case ANCHOR_END_BUF: + case ANCHOR_SEMI_END_BUF: + case ANCHOR_END_LINE: + add_opt_anc_info(&opt->anc, NANCHOR(node).type); + break; + + case ANCHOR_PREC_READ: + { + NodeOptInfo nopt; + + r = optimize_node_left(NANCHOR(node).target, &nopt, env); + if (r == 0) { + if (nopt.exb.len > 0) + copy_opt_exact_info(&opt->expr, &nopt.exb); + else if (nopt.exm.len > 0) + copy_opt_exact_info(&opt->expr, &nopt.exm); + + opt->expr.reach_end = 0; + + if (nopt.map.value > 0) + copy_opt_map_info(&opt->map, &nopt.map); + } + } + break; + + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. 
*/ + case ANCHOR_LOOK_BEHIND_NOT: + break; + } + break; + + case N_BACKREF: + { + int i; + int* backs; + RegDistance min, max, tmin, tmax; + Node** nodes = SCANENV_MEM_NODES(env->scan_env); + BackrefNode* br = &(NBACKREF(node)); + + if (br->state & NST_RECURSION) { + set_mml(&opt->len, 0, INFINITE_DISTANCE); + break; + } + backs = BACKREFS_P(br); + r = get_min_match_length(nodes[backs[0]], &min, env->scan_env); + if (r != 0) break; + r = get_max_match_length(nodes[backs[0]], &max, env->scan_env); + if (r != 0) break; + for (i = 1; i < br->back_num; i++) { + r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env); + if (r != 0) break; + r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env); + if (r != 0) break; + if (min > tmin) min = tmin; + if (max < tmax) max = tmax; + } + if (r == 0) set_mml(&opt->len, min, max); + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (IS_CALL_RECURSION(&(NCALL(node)))) + set_mml(&opt->len, 0, INFINITE_DISTANCE); + else { + r = optimize_node_left(NCALL(node).target, opt, env); + } + break; +#endif + + case N_QUALIFIER: + { + int i; + RegDistance min, max; + NodeOptInfo nopt; + QualifierNode* qn = &(NQUALIFIER(node)); + + r = optimize_node_left(qn->target, &nopt, env); + if (r) break; + + if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) { + if (env->mmd.max == 0 && + NTYPE(qn->target) == N_ANYCHAR && qn->greedy) { + if (IS_POSIXLINE(env->options)) + add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_PL); + else + add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR); + } + } + else { + if (qn->lower > 0) { + copy_node_opt_info(opt, &nopt); + if (nopt.exb.len > 0) { + if (nopt.exb.reach_end) { + for (i = 2; i < qn->lower && + ! 
is_full_opt_exact_info(&opt->exb); i++) { + concat_opt_exact_info(&opt->exb, &nopt.exb); + } + if (i < qn->lower) { + opt->exb.reach_end = 0; + } + } + } + + if (qn->lower != qn->upper) { + opt->exb.reach_end = 0; + opt->exm.reach_end = 0; + } + if (qn->lower > 1) + opt->exm.reach_end = 0; + } + } + + min = distance_multiply(nopt.len.min, qn->lower); + if (IS_REPEAT_INFINITE(qn->upper)) + max = (nopt.len.max > 0 ? INFINITE_DISTANCE : 0); + else + max = distance_multiply(nopt.len.max, qn->upper); + + set_mml(&opt->len, min, max); + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + + switch (en->type) { + case EFFECT_OPTION: + { + RegOptionType save = env->options; + + env->options = en->option; + r = optimize_node_left(en->target, opt, env); + env->options = save; + } + break; + + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + en->opt_count++; + if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) { + RegDistance min, max; + + min = 0; + max = INFINITE_DISTANCE; + if (IS_EFFECT_MIN_FIXED(en)) min = en->min_len; + if (IS_EFFECT_MAX_FIXED(en)) max = en->max_len; + set_mml(&opt->len, min, max); + } + else +#endif + { + r = optimize_node_left(en->target, opt, env); + + if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) { + if (BIT_STATUS_AT(env->backrefed_status, en->regnum)) + remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK); + } + } + break; + + case EFFECT_STOP_BACKTRACK: + r = optimize_node_left(en->target, opt, env); + break; + } + } + break; + + default: +#ifdef REG_DEBUG + fprintf(stderr, "optimize_node_left: undefined node type %d\n", + NTYPE(node)); +#endif + r = REGERR_TYPE_BUG; + break; + } + + return r; +} + +static int +set_optimize_exact_info(regex_t* reg, OptExactInfo* e) +{ + int r; + + if (e->len == 0) return 0; + + reg->exact = regex_strdup(e->s, e->s + e->len); + if (IS_NULL(reg->exact)) return REGERR_MEMORY; + + reg->exact_end = reg->exact + e->len; + + if (e->ignore_case) { + UChar *p; + int len; + for (p = 
reg->exact; p < reg->exact_end; ) { + len = mblen(reg->enc, *p); + if (len == 1) { + *p = TOLOWER(reg->enc, *p); + } + p += len; + } + reg->optimize = REG_OPTIMIZE_EXACT_IC; + } + else { + int allow_reverse; + + if (e->anc.left_anchor & ANCHOR_BEGIN_LINE) + allow_reverse = 1; + else + allow_reverse = + regex_is_allow_reverse_match(reg->enc, reg->exact, reg->exact_end); + + if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { + r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, 0, + reg->map, &(reg->int_map)); + if (r) return r; + + reg->optimize = (allow_reverse != 0 + ? REG_OPTIMIZE_EXACT_BM : REG_OPTIMIZE_EXACT_BM_NOT_REV); + } + else { + reg->optimize = REG_OPTIMIZE_EXACT; + } + } + + reg->dmin = e->mmd.min; + reg->dmax = e->mmd.max; + + if (reg->dmin != INFINITE_DISTANCE) { + reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact); + } + + return 0; +} + +static void +set_optimize_map_info(regex_t* reg, OptMapInfo* m) +{ + int i; + + for (i = 0; i < REG_CHAR_TABLE_SIZE; i++) + reg->map[i] = m->map[i]; + + reg->optimize = REG_OPTIMIZE_MAP; + reg->dmin = m->mmd.min; + reg->dmax = m->mmd.max; + + if (reg->dmin != INFINITE_DISTANCE) { + reg->threshold_len = reg->dmin + 1; + } +} + +static void +set_sub_anchor(regex_t* reg, OptAncInfo* anc) +{ + reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE; + reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE; +} + +#ifdef REG_DEBUG +static void print_optimize_info(FILE* f, regex_t* reg); +#endif + +static int +set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) +{ + + int r; + NodeOptInfo opt; + OptEnv env; + + env.enc = reg->enc; + env.options = reg->options; + env.scan_env = scan_env; + clear_mml(&env.mmd); + + r = optimize_node_left(node, &opt, &env); + if (r) return r; + + reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF | + ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL); + + reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF); + 
+ if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) { + reg->anchor_dmin = opt.len.min; + reg->anchor_dmax = opt.len.max; + } + + if (opt.exb.len > 0 || opt.exm.len > 0) { + select_opt_exact_info(&opt.exb, &opt.exm); + if (opt.map.value > 0 && + comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) { + goto set_map; + } + else { + r = set_optimize_exact_info(reg, &opt.exb); + set_sub_anchor(reg, &opt.exb.anc); + } + } + else if (opt.map.value > 0) { + set_map: + set_optimize_map_info(reg, &opt.map); + set_sub_anchor(reg, &opt.map.anc); + } + else { + reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE; + if (opt.len.max == 0) + reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE; + } + +#if defined(REG_DEBUG_COMPILE) || defined(REG_DEBUG_MATCH) + print_optimize_info(stderr, reg); +#endif + return r; +} + +static void +clear_optimize_info(regex_t* reg) +{ + reg->optimize = REG_OPTIMIZE_NONE; + reg->anchor = 0; + reg->anchor_dmin = 0; + reg->anchor_dmax = 0; + reg->sub_anchor = 0; + reg->exact_end = (UChar* )NULL; + reg->threshold_len = 0; + if (IS_NOT_NULL(reg->exact)) { + xfree(reg->exact); + reg->exact = (UChar* )NULL; + } +} + +#ifdef REG_DEBUG + +static void +print_distance_range(FILE* f, RegDistance a, RegDistance b) +{ + if (a == INFINITE_DISTANCE) + fputs("inf", f); + else + fprintf(f, "(%u)", a); + + fputs("-", f); + + if (b == INFINITE_DISTANCE) + fputs("inf", f); + else + fprintf(f, "(%u)", b); +} + +static void +print_anchor(FILE* f, int anchor) +{ + int q = 0; + + fprintf(f, "["); + + if (anchor & ANCHOR_BEGIN_BUF) { + fprintf(f, "begin-buf"); + q = 1; + } + if (anchor & ANCHOR_BEGIN_LINE) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "begin-line"); + } + if (anchor & ANCHOR_BEGIN_POSITION) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "begin-pos"); + } + if (anchor & ANCHOR_END_BUF) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "end-buf"); + } + if (anchor & ANCHOR_SEMI_END_BUF) { + if (q) fprintf(f, ", "); + q = 1; + 
fprintf(f, "semi-end-buf"); + } + if (anchor & ANCHOR_END_LINE) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "end-line"); + } + if (anchor & ANCHOR_ANYCHAR_STAR) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "anychar-star"); + } + if (anchor & ANCHOR_ANYCHAR_STAR_PL) { + if (q) fprintf(f, ", "); + fprintf(f, "anychar-star-pl"); + } + + fprintf(f, "]"); +} + +static void +print_optimize_info(FILE* f, regex_t* reg) +{ + static char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", + "EXACT_IC", "MAP" }; + + fprintf(f, "optimize: %s\n", on[reg->optimize]); + fprintf(f, " anchor: "); print_anchor(f, reg->anchor); + if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0) + print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax); + fprintf(f, "\n"); + + if (reg->optimize) { + fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor); + fprintf(f, "\n"); + } + fprintf(f, "\n"); + + if (reg->exact) { + UChar *p; + fprintf(f, "exact: ["); + for (p = reg->exact; p < reg->exact_end; p++) { + fputc(*p, f); + } + fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact)); + } + else if (reg->optimize & REG_OPTIMIZE_MAP) { + int i, n = 0; + for (i = 0; i < REG_CHAR_TABLE_SIZE; i++) + if (reg->map[i]) n++; + + fprintf(f, "map: n=%d\n", n); + if (n > 0) { + fputc('[', f); + for (i = 0; i < REG_CHAR_TABLE_SIZE; i++) + if (reg->map[i] && mblen(reg->enc, i) == 1 && + IS_CODE_PRINT(reg->enc, i)) + fputc(i, f); + fprintf(f, "]\n"); + } + } +} +#endif /* REG_DEBUG */ + + +static void +regex_free_body(regex_t* reg) +{ + if (IS_NOT_NULL(reg->p)) xfree(reg->p); + if (IS_NOT_NULL(reg->exact)) xfree(reg->exact); + if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map); + if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward); + if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range); + if (IS_NOT_NULL(reg->chain)) regex_free(reg->chain); + +#ifdef USE_NAMED_SUBEXP + regex_names_free(reg); +#endif +} + +extern void +regex_free(regex_t* reg) +{ + if 
(IS_NOT_NULL(reg)) { + regex_free_body(reg); + xfree(reg); + } +} + +#define REGEX_TRANSFER(to,from) do {\ + (to)->state = REG_STATE_MODIFY;\ + regex_free_body(to);\ + xmemcpy(to, from, sizeof(regex_t));\ + xfree(from);\ +} while (0) + +static void +regex_transfer(regex_t* to, regex_t* from) +{ + THREAD_ATOMIC_START; + REGEX_TRANSFER(to, from); + THREAD_ATOMIC_END; +} + +#define REGEX_CHAIN_HEAD(reg) do {\ + while (IS_NOT_NULL((reg)->chain)) {\ + (reg) = (reg)->chain;\ + }\ +} while (0) + +static void +regex_chain_link_add(regex_t* to, regex_t* add) +{ + THREAD_ATOMIC_START; + REGEX_CHAIN_HEAD(to); + to->chain = add; + THREAD_ATOMIC_END; +} + +extern void +regex_chain_reduce(regex_t* reg) +{ + regex_t *head, *prev; + + THREAD_ATOMIC_START; + prev = reg; + head = prev->chain; + if (IS_NOT_NULL(head)) { + reg->state = REG_STATE_MODIFY; + while (IS_NOT_NULL(head->chain)) { + prev = head; + head = head->chain; + } + prev->chain = (regex_t* )NULL; + REGEX_TRANSFER(reg, head); + } + THREAD_ATOMIC_END; +} + +#if 0 +extern int +regex_clone(regex_t** to, regex_t* from) +{ + int r, size; + regex_t* reg; + + if (REG_STATE(from) == REG_STATE_NORMAL) { + from->state++; /* increment as search counter */ + if (IS_NOT_NULL(from->chain)) { + regex_chain_reduce(from); + from->state++; + } + } + else { + int n = 0; + while (REG_STATE(from) < REG_STATE_NORMAL) { + if (++n > THREAD_PASS_LIMIT_COUNT) + return REGERR_OVER_THREAD_PASS_LIMIT_COUNT; + THREAD_PASS; + } + from->state++; /* increment as search counter */ + } + + r = regex_alloc_init(®, REG_OPTION_NONE, RegDefaultCharEncoding, + REG_TRANSTABLE_USE_DEFAULT); + if (r != 0) { + from->state--; + return r; + } + + xmemcpy(reg, from, sizeof(regex_t)); + reg->state = REG_STATE_NORMAL; + reg->chain = (regex_t* )NULL; + + if (from->p) { + reg->p = (UChar* )xmalloc(reg->alloc); + if (IS_NULL(reg->p)) goto mem_error; + xmemcpy(reg->p, from->p, reg->alloc); + } + + if (from->exact) { + reg->exact = (UChar* )xmalloc(from->exact_end - 
from->exact); + if (IS_NULL(reg->exact)) goto mem_error; + reg->exact_end = reg->exact + (from->exact_end - from->exact); + xmemcpy(reg->exact, from->exact, reg->exact_end - reg->exact); + } + + if (from->int_map) { + size = sizeof(int) * REG_CHAR_TABLE_SIZE; + reg->int_map = (int* )xmalloc(size); + if (IS_NULL(reg->int_map)) goto mem_error; + xmemcpy(reg->int_map, from->int_map, size); + } + + if (from->int_map_backward) { + size = sizeof(int) * REG_CHAR_TABLE_SIZE; + reg->int_map_backward = (int* )xmalloc(size); + if (IS_NULL(reg->int_map_backward)) goto mem_error; + xmemcpy(reg->int_map_backward, from->int_map_backward, size); + } + +#ifdef USE_NAMED_SUBEXP + reg->name_table = names_clone(from); /* names_clone is not implemented */ +#endif + + from->state--; + *to = reg; + return 0; + + mem_error: + from->state--; + return REGERR_MEMORY; +} +#endif + +#ifdef REG_DEBUG +static void print_tree P_((FILE* f, Node* node)); +static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg)); +#endif + +extern int +regex_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, + RegErrorInfo* einfo) +{ +#define COMPILE_INIT_SIZE 20 + + int r, init_size; + Node* root; + ScanEnv scan_env; +#ifdef USE_SUBEXP_CALL + UnsetAddrList uslist; +#endif + + reg->state = REG_STATE_COMPILING; + + if (reg->alloc == 0) { + init_size = (pattern_end - pattern) * 2; + if (init_size <= 0) init_size = COMPILE_INIT_SIZE; + r = BBUF_INIT(reg, init_size); + if (r) goto end; + } + else + reg->used = 0; + + reg->num_mem = 0; + reg->num_repeat = 0; + reg->num_null_check = 0; + reg->repeat_range_alloc = 0; + reg->repeat_range = (RegRepeatRange* )NULL; + + r = regex_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env); + if (r) goto err; + +#ifdef REG_DEBUG_PARSE_TREE + print_tree(stderr, root); +#endif + +#ifdef USE_SUBEXP_CALL + if (scan_env.num_call > 0) { + r = unset_addr_list_init(&uslist, scan_env.num_call); + if (r) goto err; + scan_env.unset_addr_list = &uslist; + r = 
setup_subexp_call(root, &scan_env); + if (r) goto err_unset; + r = subexp_recursive_check_trav(root, &scan_env); + if (r < 0) goto err_unset; + r = subexp_inf_recursive_check_trav(root, &scan_env); + if (r) goto err_unset; + + reg->num_call = scan_env.num_call; + } + else + reg->num_call = 0; +#endif + + r = setup_tree(root, reg, 0, &scan_env); + if (r) goto err_unset; + + reg->backtrack_mem = scan_env.backtrack_mem; + + clear_optimize_info(reg); +#ifndef REG_DONT_OPTIMIZE + r = set_optimize_info_from_tree(root, reg, &scan_env); + if (r) goto err_unset; +#endif + + if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) { + xfree(scan_env.mem_nodes_dynamic); + scan_env.mem_nodes_dynamic = (Node** )NULL; + } + + r = compile_tree(root, reg); + if (r == 0) { + r = add_opcode(reg, OP_END); +#ifdef USE_SUBEXP_CALL + if (scan_env.num_call > 0) { + r = unset_addr_list_fix(&uslist, reg); + unset_addr_list_end(&uslist); + if (r) goto err; + } +#endif + + if ((reg->num_repeat != 0) || IS_FIND_CONDITION(reg->options)) + reg->stack_pop_level = STACK_POP_LEVEL_ALL; + else { + if (reg->backtrack_mem != 0) + reg->stack_pop_level = STACK_POP_LEVEL_MEM_START; + else + reg->stack_pop_level = STACK_POP_LEVEL_FREE; + } + } +#ifdef USE_SUBEXP_CALL + else if (scan_env.num_call > 0) { + unset_addr_list_end(&uslist); + } +#endif + regex_node_free(root); + +#ifdef REG_DEBUG_COMPILE +#ifdef USE_NAMED_SUBEXP + regex_print_names(stderr, reg); +#endif + print_compiled_byte_code_list(stderr, reg); +#endif + + end: + reg->state = REG_STATE_NORMAL; + return r; + + err_unset: +#ifdef USE_SUBEXP_CALL + if (scan_env.num_call > 0) { + unset_addr_list_end(&uslist); + } +#endif + err: + if (IS_NOT_NULL(scan_env.error)) { + if (IS_NOT_NULL(einfo)) { + einfo->par = scan_env.error; + einfo->par_end = scan_env.error_end; + } + } + + if (IS_NOT_NULL(root)) regex_node_free(root); + if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) + xfree(scan_env.mem_nodes_dynamic); + return r; +} + +extern int +regex_recompile(regex_t* 
reg, UChar* pattern, UChar* pattern_end, + RegOptionType option, RegCharEncoding code, RegSyntaxType* syntax, + RegErrorInfo* einfo) +{ + int r; + regex_t *new_reg; + + r = regex_new(&new_reg, pattern, pattern_end, option, code, syntax, einfo); + if (r) return r; + if (REG_STATE(reg) == REG_STATE_NORMAL) { + regex_transfer(reg, new_reg); + } + else { + regex_chain_link_add(reg, new_reg); + } + return 0; +} + +static int regex_inited = 0; + +extern int +regex_alloc_init(regex_t** reg, RegOptionType option, RegCharEncoding enc, + RegSyntaxType* syntax) +{ + if (! regex_inited) + regex_init(); + + *reg = (regex_t* )xmalloc(sizeof(regex_t)); + if (IS_NULL(*reg)) return REGERR_MEMORY; + + if ((option & REG_OPTION_NEGATE_SINGLELINE) != 0) { + option |= syntax->options; + option &= ~REG_OPTION_SINGLELINE; + } + else + option |= syntax->options; + + (*reg)->state = REG_STATE_NORMAL; + (*reg)->enc = enc; + (*reg)->options = option; + (*reg)->syntax = syntax; + (*reg)->optimize = 0; + (*reg)->exact = (UChar* )NULL; + (*reg)->int_map = (int* )NULL; + (*reg)->int_map_backward = (int* )NULL; + (*reg)->chain = (regex_t* )NULL; + + (*reg)->p = (UChar* )NULL; + (*reg)->alloc = 0; + (*reg)->used = 0; + (*reg)->name_table = (void* )NULL; + + return 0; +} + +extern int +regex_new(regex_t** reg, UChar* pattern, UChar* pattern_end, + RegOptionType option, RegCharEncoding code, RegSyntaxType* syntax, + RegErrorInfo* einfo) +{ + int r; + + if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; + + r = regex_alloc_init(reg, option, code, syntax); + if (r) return r; + + r = regex_compile(*reg, pattern, pattern_end, einfo); + if (r) { + regex_free(*reg); + *reg = NULL; + } + return r; +} + +extern void +regex_set_default_trans_table(UChar* table) +{ + int i; + + if (table && table != DefaultTransTable) { + DefaultTransTable = table; + + for (i = 0; i < REG_CHAR_TABLE_SIZE; i++) + AmbiguityTable[i] = 0; + + for (i = 0; i < REG_CHAR_TABLE_SIZE; i++) { + AmbiguityTable[table[i]]++; + if (table[i] 
!= i) + AmbiguityTable[i] += 2; + } + } +} + +extern int +regex_init() +{ + regex_inited = 1; + + THREAD_ATOMIC_START; +#ifdef DEFAULT_TRANSTABLE_EXIST + if (! DefaultTransTable) /* check re_set_casetable() called already. */ + regex_set_default_trans_table(DTT); +#endif + +#ifdef REG_DEBUG_STATISTICS + regex_statistics_init(); +#endif + THREAD_ATOMIC_END; + + return 0; +} + +extern int +regex_end() +{ +#ifdef REG_DEBUG_STATISTICS + regex_print_statistics(stderr); +#endif + +#ifdef USE_RECYCLE_NODE + regex_free_node_list(); +#endif + + regex_inited = 0; + return 0; +} + + +#ifdef REG_DEBUG + +RegOpInfoType RegOpInfo[] = { + { OP_FINISH, "finish", ARG_NON }, + { OP_END, "end", ARG_NON }, + { OP_EXACT1, "exact1", ARG_SPECIAL }, + { OP_EXACT2, "exact2", ARG_SPECIAL }, + { OP_EXACT3, "exact3", ARG_SPECIAL }, + { OP_EXACT4, "exact4", ARG_SPECIAL }, + { OP_EXACT5, "exact5", ARG_SPECIAL }, + { OP_EXACTN, "exactn", ARG_SPECIAL }, + { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL }, + { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL }, + { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL }, + { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL }, + { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL }, + { OP_EXACTMBN, "exactmbn", ARG_SPECIAL }, + { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL }, + { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL }, + { OP_CCLASS, "cclass", ARG_SPECIAL }, + { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL }, + { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL }, + { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL }, + { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL }, + { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL }, + { OP_ANYCHAR, "anychar", ARG_NON }, + { OP_ANYCHAR_STAR, "anychar*", ARG_NON }, + { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL }, + { OP_WORD, "word", ARG_NON }, + { OP_NOT_WORD, "not-word", ARG_NON }, + { OP_WORD_SB, "word-sb", ARG_NON }, + { OP_WORD_MB, "word-mb", ARG_NON }, + { OP_WORD_BOUND, "word-bound", ARG_NON }, + { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, 
+ { OP_WORD_BEGIN, "word-begin", ARG_NON }, + { OP_WORD_END, "word-end", ARG_NON }, + { OP_BEGIN_BUF, "begin-buf", ARG_NON }, + { OP_END_BUF, "end-buf", ARG_NON }, + { OP_BEGIN_LINE, "begin-line", ARG_NON }, + { OP_END_LINE, "end-line", ARG_NON }, + { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, + { OP_BEGIN_POSITION, "begin-position", ARG_NON }, + { OP_BACKREF1, "backref1", ARG_NON }, + { OP_BACKREF2, "backref2", ARG_NON }, + { OP_BACKREF3, "backref3", ARG_NON }, + { OP_BACKREFN, "backrefn", ARG_MEMNUM }, + { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, + { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, + { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM }, + { OP_MEMORY_END, "mem-end", ARG_MEMNUM }, + { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, + { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, + { OP_SET_OPTION, "set-option", ARG_OPTION }, + { OP_FAIL, "fail", ARG_NON }, + { OP_JUMP, "jump", ARG_RELADDR }, + { OP_PUSH, "push", ARG_RELADDR }, + { OP_POP, "pop", ARG_NON }, + { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL }, + { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL }, + { OP_REPEAT, "repeat", ARG_SPECIAL }, + { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, + { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, + { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, + { OP_NULL_CHECK_START, "null-check-start",ARG_MEMNUM }, + { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, + { OP_PUSH_POS, "push-pos", ARG_NON }, + { OP_POP_POS, "pop-pos", ARG_NON }, + { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR }, + { OP_FAIL_POS, "fail-pos", ARG_NON }, + { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON }, + { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON }, + { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, + { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL }, + { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON }, + { 
OP_CALL, "call", ARG_ABSADDR }, + { OP_RETURN, "return", ARG_NON }, + { -1, "", ARG_NON } +}; + +static char* +op2name(int opcode) +{ + int i; + + for (i = 0; RegOpInfo[i].opcode >= 0; i++) { + if (opcode == RegOpInfo[i].opcode) + return RegOpInfo[i].name; + } + return ""; +} + +static int +op2arg_type(int opcode) +{ + int i; + + for (i = 0; RegOpInfo[i].opcode >= 0; i++) { + if (opcode == RegOpInfo[i].opcode) + return RegOpInfo[i].arg_type; + } + return ARG_SPECIAL; +} + +static void +Indent(FILE* f, int indent) +{ + int i; + for (i = 0; i < indent; i++) putc(' ', f); +} + +static void +p_string(FILE* f, int len, UChar* s) +{ + fputs(":", f); + while (len-- > 0) { fputc(*s++, f); } +} + +static void +p_len_string(FILE* f, LengthType len, int mb_len, UChar* s) +{ + int x = len * mb_len; + + fprintf(f, ":%d:", len); + while (x-- > 0) { fputc(*s++, f); } +} + +extern void +regex_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) +{ + int i, n, arg_type; + RelAddrType addr; + LengthType len; + MemNumType mem; + WCINT wc; + UChar *q; + + fprintf(f, "[%s", op2name(*bp)); + arg_type = op2arg_type(*bp); + if (arg_type != ARG_SPECIAL) { + bp++; + switch (arg_type) { + case ARG_NON: + break; + case ARG_RELADDR: + addr = *((RelAddrType* )bp); + bp += SIZE_RELADDR; + fprintf(f, ":(%d)", addr); + break; + case ARG_ABSADDR: + GET_ABSADDR_INC(addr, bp); + fprintf(f, ":(%d)", addr); + break; + case ARG_LENGTH: + GET_LENGTH_INC(len, bp); + fprintf(f, ":%d", len); + break; + case ARG_MEMNUM: + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + fprintf(f, ":%d", mem); + break; + case ARG_OPTION: + { + RegOptionType option = *((RegOptionType* )bp); + bp += SIZE_OPTION; + fprintf(f, ":%d", option); + } + break; + } + } + else { + switch (*bp++) { + case OP_EXACT1: + case OP_ANYCHAR_STAR_PEEK_NEXT: + p_string(f, 1, bp++); break; + case OP_EXACT2: + p_string(f, 2, bp); bp += 2; break; + case OP_EXACT3: + p_string(f, 3, bp); bp += 3; break; + case OP_EXACT4: + p_string(f, 4, bp); 
bp += 4; break; + case OP_EXACT5: + p_string(f, 5, bp); bp += 5; break; + case OP_EXACTN: + GET_LENGTH_INC(len, bp); + p_len_string(f, len, 1, bp); + bp += len; + break; + + case OP_EXACTMB2N1: + p_string(f, 2, bp); bp += 2; break; + case OP_EXACTMB2N2: + p_string(f, 4, bp); bp += 4; break; + case OP_EXACTMB2N3: + p_string(f, 6, bp); bp += 6; break; + case OP_EXACTMB2N: + GET_LENGTH_INC(len, bp); + p_len_string(f, len, 2, bp); + bp += len * 2; + break; + case OP_EXACTMB3N: + GET_LENGTH_INC(len, bp); + p_len_string(f, len, 3, bp); + bp += len * 3; + break; + case OP_EXACTMBN: + { + int mb_len; + + GET_LENGTH_INC(mb_len, bp); + GET_LENGTH_INC(len, bp); + fprintf(f, ":%d:%d:", mb_len, len); + n = len * mb_len; + while (n-- > 0) { fputc(*bp++, f); } + } + break; + + case OP_EXACT1_IC: + p_string(f, 1, bp++); + break; + case OP_EXACTN_IC: + GET_LENGTH_INC(len, bp); + p_len_string(f, len, 1, bp); + bp += len; + break; + + case OP_CCLASS: + n = bitset_on_num((BitSetRef )bp); + bp += SIZE_BITSET; + fprintf(f, ":%d", n); + break; + + case OP_CCLASS_NOT: + n = bitset_on_num((BitSetRef )bp); + bp += SIZE_BITSET; + fprintf(f, ":%d", n); + break; + + case OP_CCLASS_MB: + case OP_CCLASS_MB_NOT: + GET_LENGTH_INC(len, bp); + q = bp; +#ifndef UNALIGNED_WORD_ACCESS + ALIGNMENT_RIGHT(q); +#endif + GET_WCINT(wc, q); + bp += len; + fprintf(f, ":%d:%d", (int )wc, len); + break; + + case OP_CCLASS_MIX: + case OP_CCLASS_MIX_NOT: + n = bitset_on_num((BitSetRef )bp); + bp += SIZE_BITSET; + GET_LENGTH_INC(len, bp); + q = bp; +#ifndef UNALIGNED_WORD_ACCESS + ALIGNMENT_RIGHT(q); +#endif + GET_WCINT(wc, q); + bp += len; + fprintf(f, ":%d:%d:%d", n, (int )wc, len); + break; + + case OP_BACKREF_MULTI: + fputs(" ", f); + GET_LENGTH_INC(len, bp); + for (i = 0; i < len; i++) { + GET_MEMNUM_INC(mem, bp); + if (i > 0) fputs(", ", f); + fprintf(f, "%d", mem); + } + break; + + case OP_REPEAT: + case OP_REPEAT_NG: + { + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + addr = *((RelAddrType* )bp); + bp 
+= SIZE_RELADDR; + fprintf(f, ":%d:%d", mem, addr); + } + break; + + case OP_PUSH_OR_JUMP_EXACT1: + case OP_PUSH_IF_PEEK_NEXT: + addr = *((RelAddrType* )bp); + bp += SIZE_RELADDR; + fprintf(f, ":(%d)", addr); + p_string(f, 1, bp); + bp += 1; + break; + + case OP_LOOK_BEHIND: + GET_LENGTH_INC(len, bp); + fprintf(f, ":%d", len); + break; + + case OP_PUSH_LOOK_BEHIND_NOT: + GET_RELADDR_INC(addr, bp); + GET_LENGTH_INC(len, bp); + fprintf(f, ":%d:(%d)", len, addr); + break; + + default: + fprintf(stderr, "regex_print_compiled_byte_code: undefined code %d\n", + *--bp); + } + } + fputs("]", f); + if (nextp) *nextp = bp; +} + +static void +print_compiled_byte_code_list(FILE* f, regex_t* reg) +{ + int ncode; + UChar* bp = reg->p; + UChar* end = reg->p + reg->used; + + fprintf(f, "code length: %d\n", reg->used); + + ncode = 0; + while (bp < end) { + ncode++; + if (bp > reg->p) { + if (ncode % 5 == 0) + fprintf(f, "\n"); + else + fputs(" ", f); + } + regex_print_compiled_byte_code(f, bp, &bp); + } + + fprintf(f, "\n"); +} + +static void +print_indent_tree(FILE* f, Node* node, int indent) +{ + int i, type; + int add = 3; + UChar* p; + + Indent(f, indent); + if (IS_NULL(node)) { + fprintf(f, "ERROR: null node!!!\n"); + exit (0); + } + + type = NTYPE(node); + switch (type) { + case N_LIST: + case N_ALT: + if (NTYPE(node) == N_LIST) + fprintf(f, "<list:%x>\n", (int )node); + else + fprintf(f, "<alt:%x>\n", (int )node); + + print_indent_tree(f, NCONS(node).left, indent + add); + while (IS_NOT_NULL(node = NCONS(node).right)) { + if (NTYPE(node) != type) { + fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node)); + exit(0); + } + print_indent_tree(f, NCONS(node).left, indent + add); + } + break; + + case N_STRING: + fprintf(f, "<string%s:%x>", + (NSTRING_IS_RAW(node) ? 
"-raw" : ""), (int )node); + for (p = NSTRING(node).s; p < NSTRING(node).end; p++) fputc(*p, f); + break; + + case N_CCLASS: + fprintf(f, "<cclass:%x>", (int )node); + if (NCCLASS(node).not) fputs(" not", f); + if (NCCLASS(node).mbuf) { + BBuf* bbuf = NCCLASS(node).mbuf; + for (i = 0; i < bbuf->used; i++) { + if (i > 0) fprintf(f, ","); + fprintf(f, "%0x", bbuf->p[i]); + } + } +#if 0 + fprintf(f, "\n"); + Indent(f, indent); + for (i = 0; i < SINGLE_BYTE_SIZE; i++) + fputc((BITSET_AT(NCCLASS(node).bs, i) ? '1' : '0'), f); +#endif + break; + + case N_CTYPE: + fprintf(f, "<ctype:%x> ", (int )node); + switch (NCTYPE(node).type) { + case CTYPE_WORD: fputs("word", f); break; + case CTYPE_NOT_WORD: fputs("not word", f); break; +#ifdef USE_SBMB_CLASS + case CTYPE_WORD_SB: fputs("word-sb", f); break; + case CTYPE_WORD_MB: fputs("word-mb", f); break; +#endif + default: + fprintf(f, "ERROR: undefined ctype.\n"); + exit(0); + } + break; + + case N_ANYCHAR: + fprintf(f, "<anychar:%x>", (int )node); + break; + + case N_ANCHOR: + fprintf(f, "<anchor:%x> ", (int )node); + switch (NANCHOR(node).type) { + case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break; + case ANCHOR_END_BUF: fputs("end buf", f); break; + case ANCHOR_BEGIN_LINE: fputs("begin line", f); break; + case ANCHOR_END_LINE: fputs("end line", f); break; + case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break; + case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break; + + case ANCHOR_WORD_BOUND: fputs("word bound", f); break; + case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break; +#ifdef USE_WORD_BEGIN_END + case ANCHOR_WORD_BEGIN: fputs("word begin", f); break; + case ANCHOR_WORD_END: fputs("word end", f); break; +#endif + case ANCHOR_PREC_READ: fputs("prec read", f); break; + case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break; + case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break; + case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break; + + default: + fprintf(f, "ERROR: 
undefined anchor type.\n"); + break; + } + break; + + case N_BACKREF: + { + int* p; + BackrefNode* br = &(NBACKREF(node)); + p = BACKREFS_P(br); + fprintf(f, "<backref:%x>", (int )node); + for (i = 0; i < br->back_num; i++) { + if (i > 0) fputs(", ", f); + fprintf(f, "%d", p[i]); + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + { + CallNode* cn = &(NCALL(node)); + fprintf(f, "<call:%x>", (int )node); + p_string(f, cn->name_end - cn->name, cn->name); + } + break; +#endif + + case N_QUALIFIER: + fprintf(f, "<qualifier:%x>{%d,%d}%s\n", (int )node, + NQUALIFIER(node).lower, NQUALIFIER(node).upper, + (NQUALIFIER(node).greedy ? "" : "?")); + print_indent_tree(f, NQUALIFIER(node).target, indent + add); + break; + + case N_EFFECT: + fprintf(f, "<effect:%x> ", (int )node); + switch (NEFFECT(node).type) { + case EFFECT_OPTION: + fprintf(f, "option:%d\n", NEFFECT(node).option); + print_indent_tree(f, NEFFECT(node).target, indent + add); + break; + case EFFECT_MEMORY: + fprintf(f, "memory:%d", NEFFECT(node).regnum); + break; + case EFFECT_STOP_BACKTRACK: + fprintf(f, "stop-bt"); + break; + + default: + break; + } + fprintf(f, "\n"); + print_indent_tree(f, NEFFECT(node).target, indent + add); + break; + + default: + fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node)); + break; + } + + if (type != N_LIST && type != N_ALT && type != N_QUALIFIER && + type != N_EFFECT) + fprintf(f, "\n"); + fflush(f); +} + +static void +print_tree(FILE* f, Node* node) +{ + print_indent_tree(f, node, 0); +} +#endif /* REG_DEBUG */ diff --git a/ext/mbstring/oniguruma/regerror.c b/ext/mbstring/oniguruma/regerror.c new file mode 100644 index 0000000000..a1e86c34f1 --- /dev/null +++ b/ext/mbstring/oniguruma/regerror.c @@ -0,0 +1,267 @@ +/********************************************************************** + + regerror.c - Oniguruma (regular expression library) + + Copyright (C) 2002-2003 K.Kosako (kosako@sofnec.co.jp) + 
+**********************************************************************/ +#include "regint.h" +#include <stdio.h> /* for vsnprintf() */ + +#ifdef HAVE_STDARG_PROTOTYPES +#include <stdarg.h> +#define va_init_list(a,b) va_start(a,b) +#else +#include <varargs.h> +#define va_init_list(a,b) va_start(a) +#endif + +extern char* +regex_error_code_to_format(int code) +{ + char *p; + + if (code >= 0) return (char* )0; + + switch (code) { + case REG_MISMATCH: + p = "mismatch"; break; + case REG_NO_SUPPORT_CONFIG: + p = "no support in this configuration"; break; + case REGERR_MEMORY: + p = "fail to memory allocation"; break; + case REGERR_MATCH_STACK_LIMIT_OVER: + p = "match-stack limit over"; break; + case REGERR_TYPE_BUG: + p = "undefined type (bug)"; break; + case REGERR_PARSER_BUG: + p = "internal parser error (bug)"; break; + case REGERR_STACK_BUG: + p = "stack error (bug)"; break; + case REGERR_UNDEFINED_BYTECODE: + p = "undefined bytecode (bug)"; break; + case REGERR_UNEXPECTED_BYTECODE: + p = "unexpected bytecode (bug)"; break; + case REGERR_DEFAULT_ENCODING_IS_NOT_SETTED: + p = "default multibyte-encoding is not setted"; break; + case REGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR: + p = "can't convert to wide-char on specified multibyte-encoding"; break; + case REGERR_END_PATTERN_AT_LEFT_BRACE: + p = "end pattern at left brace"; break; + case REGERR_END_PATTERN_AT_LEFT_BRACKET: + p = "end pattern at left bracket"; break; + case REGERR_EMPTY_CHAR_CLASS: + p = "empty char-class"; break; + case REGERR_PREMATURE_END_OF_CHAR_CLASS: + p = "premature end of char-class"; break; + case REGERR_END_PATTERN_AT_BACKSLASH: + p = "end pattern at backslash"; break; + case REGERR_END_PATTERN_AT_META: + p = "end pattern at meta"; break; + case REGERR_END_PATTERN_AT_CONTROL: + p = "end pattern at control"; break; + case REGERR_META_CODE_SYNTAX: + p = "illegal meta-code syntax"; break; + case REGERR_CONTROL_CODE_SYNTAX: + p = "illegal control-code syntax"; break; + case 
REGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE: + p = "char-class value at end of range"; break; + case REGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE: + p = "char-class value at start of range"; break; + case REGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS: + p = "unmatched range specifier in char-class"; break; + case REGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED: + p = "target of repeat operator is not specified"; break; + case REGERR_TARGET_OF_REPEAT_OPERATOR_INVALID: + p = "target of repeat operator is invalid"; break; + case REGERR_NESTED_REPEAT_OPERATOR: + p = "nested repeat operator"; break; + case REGERR_UNMATCHED_CLOSE_PARENTHESIS: + p = "unmatched close parenthesis"; break; + case REGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS: + p = "end pattern with unmatched parenthesis"; break; + case REGERR_END_PATTERN_IN_GROUP: + p = "end pattern in group"; break; + case REGERR_UNDEFINED_GROUP_OPTION: + p = "undefined group option"; break; + case REGERR_INVALID_POSIX_BRACKET_TYPE: + p = "invalid POSIX bracket type"; break; + case REGERR_INVALID_LOOK_BEHIND_PATTERN: + p = "invalid pattern in look-behind"; break; + case REGERR_INVALID_REPEAT_RANGE_PATTERN: + p = "invalid repeat range {lower,upper}"; break; + case REGERR_TOO_BIG_NUMBER: + p = "too big number"; break; + case REGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE: + p = "too big number for repeat range"; break; + case REGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE: + p = "upper is smaller than lower in repeat range"; break; + case REGERR_EMPTY_RANGE_IN_CHAR_CLASS: + p = "empty range in char class"; break; + case REGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE: + p = "mismatch multibyte code length in char-class range"; break; + case REGERR_TOO_MANY_MULTI_BYTE_RANGES: + p = "too many multibyte code ranges are specified"; break; + case REGERR_TOO_SHORT_MULTI_BYTE_STRING: + p = "too short multibyte code string"; break; + case REGERR_TOO_BIG_BACKREF_NUMBER: + p = "too big backref number"; break; + case REGERR_INVALID_BACKREF: +#ifdef 
USE_NAMED_SUBEXP + p = "invalid backref number/name"; break; +#else + p = "invalid backref number"; break; +#endif + case REGERR_TOO_BIG_WIDE_CHAR_VALUE: + p = "too big wide-char value"; break; + case REGERR_TOO_LONG_WIDE_CHAR_VALUE: + p = "too long wide-char value"; break; + case REGERR_INVALID_WIDE_CHAR_VALUE: + p = "invalid wide-char value"; break; + case REGERR_INVALID_SUBEXP_NAME: + p = "invalid subexp name"; break; + case REGERR_UNDEFINED_NAME_REFERENCE: + p = "undefined name <%n> reference"; break; + case REGERR_UNDEFINED_GROUP_REFERENCE: + p = "undefined group reference"; break; + case REGERR_MULTIPLEX_DEFINITION_NAME_CALL: + p = "multiplex definition name <%n> call"; break; + case REGERR_NEVER_ENDING_RECURSION: + p = "never ending recursion"; break; + case REGERR_OVER_THREAD_PASS_LIMIT_COUNT: + p = "over thread pass limit count"; break; + + default: + p = "undefined error code"; break; + } + + return p; +} + + +/* for REG_MAX_ERROR_MESSAGE_LEN */ +#define MAX_ERROR_PAR_LEN 30 + +extern int +#ifdef HAVE_STDARG_PROTOTYPES +regex_error_code_to_str(UChar* s, int code, ...) 
+#else +regex_error_code_to_str(UChar* s, code, va_alist) + int code; + va_dcl +#endif +{ + UChar *p, *q; + RegErrorInfo* einfo; + int len; + va_list vargs; + + va_init_list(vargs, code); + + switch (code) { + case REGERR_UNDEFINED_NAME_REFERENCE: + case REGERR_MULTIPLEX_DEFINITION_NAME_CALL: + einfo = va_arg(vargs, RegErrorInfo*); + len = einfo->par_end - einfo->par; + q = regex_error_code_to_format(code); + p = s; + while (*q != '\0') { + if (*q == '%') { + q++; + if (*q == 'n') { /* '%n': name */ + if (len > MAX_ERROR_PAR_LEN) { + xmemcpy(p, einfo->par, MAX_ERROR_PAR_LEN - 3); + p += (MAX_ERROR_PAR_LEN - 3); + xmemcpy(p, "...", 3); + p += 3; + } + else { + xmemcpy(p, einfo->par, len); + p += len; + } + q++; + } + else + goto normal_char; + } + else { + normal_char: + *p++ = *q++; + } + } + *p = '\0'; + len = p - s; + break; + + default: + q = regex_error_code_to_format(code); + len = strlen(q); + xmemcpy(s, q, len); + s[len] = '\0'; + break; + } + + va_end(vargs); + return len; +} + + +void +#ifdef HAVE_STDARG_PROTOTYPES +regex_snprintf_with_pattern(char buf[], int bufsize, RegCharEncoding enc, + char* pat, char* pat_end, char *fmt, ...) 
+#else +regex_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) + char buf[]; + int bufsize; + RegCharEncoding enc; + char* pat; + char* pat_end; + const char *fmt; + va_dcl +#endif +{ + int n, need, len; + char *p, *s; + va_list args; + + va_init_list(args, fmt); + n = vsnprintf(buf, bufsize, fmt, args); + va_end(args); + + need = (pat_end - pat) * 4 + 4; + + if (n + need < bufsize) { + strcat(buf, ": /"); + s = buf + strlen(buf); + + p = pat; + while (p < pat_end) { + if (*p == '\\') { + *s++ = *p++; + len = mblen(enc, *p); + while (len-- > 0) *s++ = *p++; + } + else if (*p == '/') { + *s++ = '\\'; + *s++ = *p++; + } + else if (ismb(enc, *p)) { + len = mblen(enc, *p); + while (len-- > 0) *s++ = *p++; + } + else if (!IS_PRINT(*p) && !IS_SPACE(*p)) { + char b[5]; + sprintf(b, "\\%03o", *p & 0377); + len = strlen(b); + while (len-- > 0) *s++ = *p++; + } + else { + *s++ = *p++; + } + } + + *s++ = '/'; + *s = '\0'; + } +} diff --git a/ext/mbstring/oniguruma/regex.c b/ext/mbstring/oniguruma/regex.c new file mode 100644 index 0000000000..0c4a43be9e --- /dev/null +++ b/ext/mbstring/oniguruma/regex.c @@ -0,0 +1,16 @@ +/********************************************************************** + + regex.c - Oniguruma (regular expression library) + + Copyright (C) 2002-2003 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +/* + * Source wrapper for Ruby. 
+ */ + +#include "regparse.c" +#include "regcomp.c" +#include "regexec.c" +#include "reggnu.c" +#include "regerror.c" diff --git a/ext/mbstring/oniguruma/regexec.c b/ext/mbstring/oniguruma/regexec.c new file mode 100644 index 0000000000..b7319ac4fb --- /dev/null +++ b/ext/mbstring/oniguruma/regexec.c @@ -0,0 +1,2935 @@ +/********************************************************************** + + regexec.c - Oniguruma (regular expression library) + + Copyright (C) 2002-2003 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regint.h" + +static UChar* +get_right_adjust_char_head_with_prev(RegCharEncoding code, + UChar* start, UChar* s, UChar** prev); +static UChar* +step_backward_char(RegCharEncoding code, UChar* start, UChar* s, int n); + + +extern void +regex_region_clear(RegRegion* region) +{ + int i; + + for (i = 0; i < region->num_regs; i++) { + region->beg[i] = region->end[i] = REG_REGION_NOTPOS; + } +} + +extern int +regex_region_resize(RegRegion* region, int n) +{ + int i; + + region->num_regs = n; + + if (n < REG_NREGION) + n = REG_NREGION; + + if (region->allocated == 0) { + region->beg = (int* )xmalloc(n * sizeof(int)); + region->end = (int* )xmalloc(n * sizeof(int)); + + if (region->beg == 0 || region->end == 0) + return REGERR_MEMORY; + + region->allocated = n; + } + else if (region->allocated < n) { + region->beg = (int* )xrealloc(region->beg, n * sizeof(int)); + region->end = (int* )xrealloc(region->end, n * sizeof(int)); + + if (region->beg == 0 || region->end == 0) + return REGERR_MEMORY; + + region->allocated = n; + } + + for (i = 0; i < region->num_regs; i++) { + region->beg[i] = region->end[i] = REG_REGION_NOTPOS; + } + return 0; +} + +static void +regex_region_init(RegRegion* region) +{ + region->num_regs = 0; + region->allocated = 0; + region->beg = (int* )0; + region->end = (int* )0; +} + +extern RegRegion* +regex_region_new() +{ + RegRegion* r; + + r = (RegRegion* 
)xmalloc(sizeof(RegRegion)); + regex_region_init(r); + return r; +} + +extern void +regex_region_free(RegRegion* r, int free_self) +{ + if (r) { + if (r->allocated > 0) { + if (r->beg) xfree(r->beg); + if (r->end) xfree(r->end); + r->allocated = 0; + } + if (free_self) xfree(r); + } +} + +extern void +regex_region_copy(RegRegion* to, RegRegion* from) +{ +#define RREGC_SIZE (sizeof(int) * from->num_regs) + int i; + + if (to == from) return; + + if (to->allocated == 0) { + if (from->num_regs > 0) { + to->beg = (int* )xmalloc(RREGC_SIZE); + to->end = (int* )xmalloc(RREGC_SIZE); + to->allocated = from->num_regs; + } + } + else if (to->allocated < from->num_regs) { + to->beg = (int* )xrealloc(to->beg, RREGC_SIZE); + to->end = (int* )xrealloc(to->end, RREGC_SIZE); + to->allocated = from->num_regs; + } + + for (i = 0; i < from->num_regs; i++) { + to->beg[i] = from->beg[i]; + to->end[i] = from->end[i]; + } + to->num_regs = from->num_regs; +} + + +/** stack **/ +#define INVALID_STACK_INDEX -1 +typedef int StackIndex; + +typedef struct _StackType { + unsigned int type; + union { + struct { + UChar *pcode; /* byte code position */ + UChar *pstr; /* string position */ + UChar *pstr_prev; /* previous char position of pstr */ + } state; + struct { + int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ + UChar *pcode; /* byte code position (head of repeated target) */ + int num; /* repeat id */ + } repeat; + struct { + StackIndex si; /* index of stack */ + } repeat_inc; + struct { + int num; /* memory num */ + UChar *pstr; /* start/end position */ + /* Following information is setted, if this stack type is MEM-START */ + StackIndex start; /* prev. info (for backtrack "(...)*" ) */ + StackIndex end; /* prev. 
info (for backtrack "(...)*" ) */ + } mem; + struct { + int num; /* null check id */ + UChar *pstr; /* start position */ + } null_check; +#ifdef USE_SUBEXP_CALL + struct { + UChar *ret_addr; /* byte code position */ + int num; /* null check id */ + UChar *pstr; /* string position */ + } call_frame; +#endif + } u; +} StackType; + +/* stack type */ +/* used by normal-POP */ +#define STK_ALT 0x0001 +#define STK_LOOK_BEHIND_NOT 0x0003 +#define STK_POS_NOT 0x0005 +/* avoided by normal-POP, but value should be small */ +#define STK_NULL_CHECK_START 0x0100 +/* handled by normal-POP */ +#define STK_MEM_START 0x0200 +#define STK_MEM_END 0x0300 +#define STK_REPEAT_INC 0x0400 +/* avoided by normal-POP */ +#define STK_POS 0x0500 /* used when POP-POS */ +#define STK_STOP_BT 0x0600 /* mark for "(?>...)" */ +#define STK_REPEAT 0x0700 +#define STK_CALL_FRAME 0x0800 +#define STK_RETURN 0x0900 +#define STK_MEM_END_MARK 0x0a00 +#define STK_VOID 0x0b00 /* for fill a blank */ + +/* stack type check mask */ +#define STK_MASK_POP_USED 0x00ff +#define IS_TO_VOID_TARGET(stk) \ + (((stk)->type & STK_MASK_POP_USED) || (stk)->type == STK_NULL_CHECK_START) + +typedef struct { + void* stack_p; + int stack_n; + RegOptionType options; + RegRegion* region; + UChar* start; /* search start position (for \G: BEGIN_POSITION) */ +} MatchArg; + +#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ + (msa).stack_p = (void* )0;\ + (msa).options = (arg_option);\ + (msa).region = (arg_region);\ + (msa).start = (arg_start);\ +} while (0) + +#define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) + + +#define STACK_INIT(alloc_addr, ptr_num, stack_num) do {\ + if (msa->stack_p) {\ + alloc_addr = (char* )xalloca(sizeof(char*) * (ptr_num));\ + stk_alloc = (StackType* )(msa->stack_p);\ + stk_base = stk_alloc;\ + stk = stk_base;\ + stk_end = stk_base + msa->stack_n;\ + }\ + else {\ + alloc_addr = (char* )xalloca(sizeof(char*) * (ptr_num)\ + + sizeof(StackType) * (stack_num));\ + 
stk_alloc = (StackType* )(alloc_addr + sizeof(char*) * (ptr_num));\ + stk_base = stk_alloc;\ + stk = stk_base;\ + stk_end = stk_base + (stack_num);\ + }\ +} while(0) + +#define STACK_SAVE do{\ + if (stk_base != stk_alloc) {\ + msa->stack_p = stk_base;\ + msa->stack_n = stk_end - stk_base;\ + };\ +} while(0) + +static int +stack_double(StackType** arg_stk_base, StackType** arg_stk_end, + StackType** arg_stk, StackType* stk_alloc, MatchArg* msa) +{ + int n; + StackType *x, *stk_base, *stk_end, *stk; + + stk_base = *arg_stk_base; + stk_end = *arg_stk_end; + stk = *arg_stk; + + n = stk_end - stk_base; + if (stk_base == stk_alloc && IS_NULL(msa->stack_p)) { + x = (StackType* )xmalloc(sizeof(StackType) * n * 2); + if (IS_NULL(x)) { + STACK_SAVE; + return REGERR_MEMORY; + } + xmemcpy(x, stk_base, n * sizeof(StackType)); + n *= 2; + } + else { + n *= 2; + if (n > MATCH_STACK_LIMIT_SIZE) return REGERR_MATCH_STACK_LIMIT_OVER; + x = (StackType* )xrealloc(stk_base, sizeof(StackType) * n); + if (IS_NULL(x)) { + STACK_SAVE; + return REGERR_MEMORY; + } + } + *arg_stk = x + (stk - stk_base); + *arg_stk_base = x; + *arg_stk_end = x + n; + return 0; +} + +#define STACK_ENSURE(n) do {\ + if (stk_end - stk < (n)) {\ + int r = stack_double(&stk_base, &stk_end, &stk, stk_alloc, msa);\ + if (r != 0) { STACK_SAVE; return r; } \ + }\ +} while(0) + +#define STACK_AT(index) (stk_base + (index)) +#define GET_STACK_INDEX(stk) ((stk) - stk_base) + +#define STACK_PUSH(stack_type,pat,s,sprev) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ + stk->u.state.pstr = (s);\ + stk->u.state.pstr_prev = (sprev);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_ENSURED(stack_type,pat) do {\ + stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_TYPE(stack_type) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) 
+#define STACK_PUSH_POS(s,sprev) STACK_PUSH(STK_POS,NULL_UCHARP,s,sprev) +#define STACK_PUSH_POS_NOT(pat,s,sprev) STACK_PUSH(STK_POS_NOT,pat,s,sprev) +#define STACK_PUSH_STOP_BT STACK_PUSH_TYPE(STK_STOP_BT) +#define STACK_PUSH_LOOK_BEHIND_NOT(pat,s,sprev) \ + STACK_PUSH(STK_LOOK_BEHIND_NOT,pat,s,sprev) + +#define STACK_PUSH_REPEAT(id, pat) do {\ + STACK_ENSURE(1);\ + stk->type = STK_REPEAT;\ + stk->u.repeat.num = (id);\ + stk->u.repeat.pcode = (pat);\ + stk->u.repeat.count = 0;\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_REPEAT_INC(sindex) do {\ + STACK_ENSURE(1);\ + stk->type = STK_REPEAT_INC;\ + stk->u.repeat_inc.si = (sindex);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_MEM_START(mnum, s) do {\ + STACK_ENSURE(1);\ + stk->type = STK_MEM_START;\ + stk->u.mem.num = (mnum);\ + stk->u.mem.pstr = (s);\ + stk->u.mem.start = mem_start_stk[mnum];\ + stk->u.mem.end = mem_end_stk[mnum];\ + mem_start_stk[mnum] = GET_STACK_INDEX(stk);\ + mem_end_stk[mnum] = INVALID_STACK_INDEX;\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_MEM_END(mnum, s) do {\ + STACK_ENSURE(1);\ + stk->type = STK_MEM_END;\ + stk->u.mem.num = (mnum);\ + stk->u.mem.pstr = (s);\ + stk->u.mem.start = mem_start_stk[mnum];\ + stk->u.mem.end = mem_end_stk[mnum];\ + mem_end_stk[mnum] = GET_STACK_INDEX(stk);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_MEM_END_MARK(mnum) do {\ + STACK_ENSURE(1);\ + stk->type = STK_MEM_END_MARK;\ + stk->u.mem.num = (mnum);\ + STACK_INC;\ +} while(0) + +#define STACK_GET_MEM_START(mnum, k) do {\ + int level = 0;\ + k = stk;\ + while (k > stk_base) {\ + k--;\ + if ((k->type == STK_MEM_END_MARK || k->type == STK_MEM_END) \ + && k->u.mem.num == (mnum)) {\ + level++;\ + }\ + else if (k->type == STK_MEM_START && k->u.mem.num == (mnum)) {\ + if (level == 0) break;\ + level--;\ + }\ + }\ +} while (0) + +#define STACK_PUSH_NULL_CHECK_START(cnum, s) do {\ + STACK_ENSURE(1);\ + stk->type = STK_NULL_CHECK_START;\ + stk->u.null_check.num = (cnum);\ + stk->u.null_check.pstr = 
(s);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_CALL_FRAME(pat) do {\ + STACK_ENSURE(1);\ + stk->type = STK_CALL_FRAME;\ + stk->u.call_frame.ret_addr = (pat);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_RETURN do {\ + STACK_ENSURE(1);\ + stk->type = STK_RETURN;\ + STACK_INC;\ +} while(0) + + +#ifdef REG_DEBUG +#define STACK_BASE_CHECK(p) \ + if ((p) < stk_base) goto stack_error; +#else +#define STACK_BASE_CHECK(p) +#endif + +#define STACK_POP_ONE do {\ + stk--;\ + STACK_BASE_CHECK(stk); \ +} while(0) + +#define STACK_POP do {\ + switch (pop_level) {\ + case STACK_POP_LEVEL_FREE:\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if ((stk->type & STK_MASK_POP_USED) != 0) break;\ + }\ + break;\ + case STACK_POP_LEVEL_MEM_START:\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if ((stk->type & STK_MASK_POP_USED) != 0) break;\ + else if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + }\ + break;\ + default:\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if ((stk->type & STK_MASK_POP_USED) != 0) break;\ + else if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + else if (stk->type == STK_REPEAT_INC) {\ + STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ + }\ + else if (stk->type == STK_MEM_END) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + }\ + break;\ + }\ +} while(0) + +#define STACK_POP_TIL_POS_NOT do {\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if (stk->type == STK_POS_NOT) break;\ + else if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + else if (stk->type == STK_REPEAT_INC) {\ + STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ + }\ + else if (stk->type == STK_MEM_END) {\ + 
mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + }\ +} while(0) + +#define STACK_POP_TIL_LOOK_BEHIND_NOT do {\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if (stk->type == STK_LOOK_BEHIND_NOT) break;\ + else if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + else if (stk->type == STK_REPEAT_INC) {\ + STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ + }\ + else if (stk->type == STK_MEM_END) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + }\ +} while(0) + +#define STACK_POS_END(k) do {\ + k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (IS_TO_VOID_TARGET(k)) {\ + k->type = STK_VOID;\ + }\ + else if (k->type == STK_POS) {\ + k->type = STK_VOID;\ + break;\ + }\ + }\ +} while(0) + +#define STACK_STOP_BT_END do {\ + StackType *k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (IS_TO_VOID_TARGET(k)) {\ + k->type = STK_VOID;\ + }\ + else if (k->type == STK_STOP_BT) {\ + k->type = STK_VOID;\ + break;\ + }\ + }\ +} while(0) + +#define STACK_NULL_CHECK(isnull,id,s) do {\ + StackType* k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (k->type == STK_NULL_CHECK_START) {\ + if (k->u.null_check.num == (id)) {\ + (isnull) = (k->u.null_check.pstr == (s));\ + break;\ + }\ + }\ + }\ +} while(0) + +#define STACK_GET_REPEAT(id, k) do {\ + int level = 0;\ + k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (k->type == STK_REPEAT) {\ + if (level == 0) {\ + if (k->u.repeat.num == (id)) {\ + break;\ + }\ + }\ + }\ + else if (k->type == STK_CALL_FRAME) level--;\ + else if (k->type == STK_RETURN) level++;\ + }\ +} while (0) + +#define STACK_RETURN(addr) do {\ + int level = 0;\ + StackType* k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (k->type == STK_CALL_FRAME) {\ + if (level == 0) {\ + (addr) = 
k->u.call_frame.ret_addr;\ + break;\ + }\ + else level--;\ + }\ + else if (k->type == STK_RETURN)\ + level++;\ + }\ +} while(0) + + +#define CASETABLE_TOLOWER(c) (casetable[c]) + +/* byte_code is already converted to lower-case at string compile time */ +#define SBTRANSCMP(byte_code,c) (byte_code == CASETABLE_TOLOWER(c)) + +#define STRING_CMP(s1,s2,len) do {\ + if (ignore_case) {\ + int slen; \ + while (len > 0) {\ + slen = mblen(encode, *s1); \ + if (slen == 1) {\ + if (CASETABLE_TOLOWER(*s1) != CASETABLE_TOLOWER(*s2)) \ + goto fail;\ + s1++; s2++; len--; \ + } \ + else {\ + len -= slen; \ + while (slen-- > 0) { \ + if (*s1++ != *s2++) goto fail;\ + } \ + }\ + }\ + }\ + else {\ + while (len-- > 0) {\ + if (*s1++ != *s2++) goto fail;\ + }\ + }\ +} while(0) + +#define STRING_CMP_VALUE(s1,s2,len,is_fail) do {\ + is_fail = 0;\ + if (ignore_case) {\ + int slen; \ + while (len > 0) {\ + slen = mblen(encode, *s1); \ + if (slen == 1) {\ + if (CASETABLE_TOLOWER(*s1) != CASETABLE_TOLOWER(*s2)) {\ + is_fail = 1; break;\ + }\ + s1++; s2++; len--; \ + } \ + else {\ + len -= slen; \ + while (slen-- > 0) { \ + if (*s1++ != *s2++) {\ + is_fail = 1; break;\ + }\ + } \ + if (is_fail != 0) break;\ + }\ + }\ + }\ + else {\ + while (len-- > 0) {\ + if (*s1++ != *s2++) {\ + is_fail = 1; break;\ + }\ + }\ + }\ +} while(0) + +#define ON_STR_BEGIN(s) ((s) == str) +#define ON_STR_END(s) ((s) == end) +#define IS_EMPTY_STR (str == end) + +#define DATA_ENSURE(n) \ + if (s + (n) > end) goto fail + +#define DATA_ENSURE_CHECK(n) (s + (n) <= end) + +#ifdef REG_DEBUG_STATISTICS + +#define USE_TIMEOFDAY + +#ifdef USE_TIMEOFDAY +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif +static struct timeval ts, te; +#define GETTIME(t) gettimeofday(&(t), (struct timezone* )0) +#define TIMEDIFF(te,ts) (((te).tv_usec - (ts).tv_usec) + \ + (((te).tv_sec - (ts).tv_sec)*1000000)) +#else +#ifdef HAVE_SYS_TIMES_H +#include <sys/times.h> +#endif +static struct 
tms ts, te; +#define GETTIME(t) times(&(t)) +#define TIMEDIFF(te,ts) ((te).tms_utime - (ts).tms_utime) +#endif + +static int OpCounter[256]; +static int OpPrevCounter[256]; +static unsigned long OpTime[256]; +static int OpCurr = OP_FINISH; +static int OpPrevTarget = OP_FAIL; +static int MaxStackDepth = 0; + +#define STAT_OP_IN(opcode) do {\ + if (opcode == OpPrevTarget) OpPrevCounter[OpCurr]++;\ + OpCurr = opcode;\ + OpCounter[opcode]++;\ + GETTIME(ts);\ +} while (0) + +#define STAT_OP_OUT do {\ + GETTIME(te);\ + OpTime[OpCurr] += TIMEDIFF(te, ts);\ +} while (0) + +extern void regex_statistics_init() +{ + int i; + for (i = 0; i < 256; i++) { + OpCounter[i] = OpPrevCounter[i] = 0; OpTime[i] = 0; + } + MaxStackDepth = 0; + +#ifdef RUBY_PLATFORM + rb_define_global_function("regex_stat_print", regex_stat_print, 0); +#endif +} + +#ifdef RUBY_PLATFORM +static VALUE regex_stat_print() +{ + regex_print_statistics(stderr); + return Qnil; +} +#endif + +extern void +regex_print_statistics(FILE* f) +{ + int i; + fprintf(f, " count prev time\n"); + for (i = 0; RegOpInfo[i].opcode >= 0; i++) { + fprintf(f, "%8d: %8d: %10ld: %s\n", + OpCounter[i], OpPrevCounter[i], OpTime[i], RegOpInfo[i].name); + } + fprintf(f, "\nmax stack depth: %d\n", MaxStackDepth); +} + +#define STACK_INC do {\ + stk++;\ + if (stk - stk_base > MaxStackDepth) \ + MaxStackDepth = stk - stk_base;\ +} while (0) + +#else +#define STACK_INC stk++ + +#define STAT_OP_IN(opcode) +#define STAT_OP_OUT +#endif + +extern int +regex_is_in_wc_range(UChar* p, WCINT wc) +{ + WCINT n, *data; + int low, high, x; + + GET_WCINT(n, p); + data = (WCINT* )p; + data++; + + for (low = 0, high = n; low < high; ) { + x = (low + high) >> 1; + if (wc > data[x * 2 + 1]) + low = x + 1; + else + high = x; + } + + return ((low < n && wc >= data[low * 2]) ? 
1 : 0); +} + + +/* matching region of POSIX API */ +typedef int regoff_t; + +typedef struct { + regoff_t rm_so; + regoff_t rm_eo; +} posix_regmatch_t; + +/* match data(str - end) from position (sstart). */ +/* if sstart == str then set sprev to NULL. */ +static int +match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, + UChar* sprev, MatchArg* msa) +{ + static UChar FinishCode[] = { OP_FINISH }; + + int i, n, num_mem, best_len, pop_level, find_cond; + LengthType tlen, tlen2; + MemNumType mem; + RelAddrType addr; + RegOptionType option = reg->options; + RegCharEncoding encode = reg->enc; + unsigned char* casetable = DefaultTransTable; + int ignore_case; + UChar *s, *q, *sbegin; + UChar *p = reg->p; + char *alloca_base; + StackType *stk_alloc, *stk_base, *stk, *stk_end; + StackType *stkp; /* used as any purpose. */ + StackIndex *repeat_stk; + StackIndex *mem_start_stk, *mem_end_stk; + n = reg->num_repeat + reg->num_mem * 2; + + STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); + ignore_case = IS_IGNORECASE(option); + find_cond = IS_FIND_CONDITION(option); + pop_level = reg->stack_pop_level; + num_mem = reg->num_mem; + repeat_stk = (StackIndex* )alloca_base; + + mem_start_stk = (StackIndex* )(repeat_stk + reg->num_repeat); + mem_end_stk = mem_start_stk + num_mem; + mem_start_stk--; /* for index start from 1, + mem_start_stk[1]..mem_start_stk[num_mem] */ + mem_end_stk--; /* for index start from 1, + mem_end_stk[1]..mem_end_stk[num_mem] */ + for (i = 1; i <= num_mem; i++) { + mem_start_stk[i] = mem_end_stk[i] = INVALID_STACK_INDEX; + } + +#ifdef REG_DEBUG_MATCH + fprintf(stderr, "match_at: str: %d, end: %d, start: %d, sprev: %d\n", + (int )str, (int )end, (int )sstart, (int )sprev); + fprintf(stderr, "size: %d, start offset: %d\n", + (int )(end - str), (int )(sstart - str)); +#endif + + STACK_PUSH_ENSURED(STK_ALT, FinishCode); /* bottom stack */ + best_len = REG_MISMATCH; + s = sstart; + while (1) { +#ifdef REG_DEBUG_MATCH + { + UChar *q, *bp, buf[50]; + int 
len; + fprintf(stderr, "%4d> \"", (int )(s - str)); + bp = buf; + for (i = 0, q = s; i < 7 && q < end; i++) { + len = mblen(encode, *q); + while (len-- > 0) *bp++ = *q++; + } + if (q < end) { xmemcpy(bp, "...\"", 4); bp += 4; } + else { xmemcpy(bp, "\"", 1); bp += 1; } + *bp = 0; + fputs(buf, stderr); + for (i = 0; i < 20 - (bp - buf); i++) fputc(' ', stderr); + regex_print_compiled_byte_code(stderr, p, NULL); + fprintf(stderr, "\n"); + } +#endif + + sbegin = s; + switch (*p++) { + case OP_END: STAT_OP_IN(OP_END); + n = s - sstart; + if (n > best_len) { + RegRegion* region = msa->region; + best_len = n; + if (region) { + if (IS_POSIX_REGION(msa->options)) { + posix_regmatch_t* rmt = (posix_regmatch_t* )region; + + rmt[0].rm_so = sstart - str; + rmt[0].rm_eo = s - str; + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i] != INVALID_STACK_INDEX) { + if (BIT_STATUS_AT(reg->backtrack_mem, i)) + rmt[i].rm_so = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; + else + rmt[i].rm_so = (UChar* )((void* )(mem_start_stk[i])) - str; + + rmt[i].rm_eo = (find_cond != 0 + ? STACK_AT(mem_end_stk[i])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[i])) - str; + } + else { + rmt[i].rm_so = rmt[i].rm_eo = REG_REGION_NOTPOS; + } + } + } + else { + region->beg[0] = sstart - str; + region->end[0] = s - str; + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i] != INVALID_STACK_INDEX) { + if (BIT_STATUS_AT(reg->backtrack_mem, i)) + region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; + else + region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str; + + region->end[i] = (find_cond != 0 + ? 
STACK_AT(mem_end_stk[i])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[i])) - str; + } + else { + region->beg[i] = region->end[i] = REG_REGION_NOTPOS; + } + } + } + } + } + STAT_OP_OUT; + + if (find_cond) { + if (IS_FIND_NOT_EMPTY(option) && s == sstart) { + best_len = REG_MISMATCH; + goto fail; /* for retry */ + } + if (IS_FIND_LONGEST(option) && s < end) { + goto fail; /* for retry */ + } + } + else { + /* default behavior: return first-matching result. */ + goto finish; + } + break; + + case OP_EXACT1: STAT_OP_IN(OP_EXACT1); +#if 0 + DATA_ENSURE(1); + if (*p != *s) goto fail; + p++; s++; +#endif + if (*p != *s++) goto fail; + DATA_ENSURE(0); + p++; + STAT_OP_OUT; + break; + + case OP_EXACT1_IC: STAT_OP_IN(OP_EXACT1_IC); + if (! SBTRANSCMP(*p, *s)) goto fail; + DATA_ENSURE(1); + p++; s++; + STAT_OP_OUT; + break; + + case OP_EXACT2: STAT_OP_IN(OP_EXACT2); + DATA_ENSURE(2); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + sprev = s; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACT3: STAT_OP_IN(OP_EXACT3); + DATA_ENSURE(3); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + sprev = s; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACT4: STAT_OP_IN(OP_EXACT4); + DATA_ENSURE(4); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + sprev = s; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACT5: STAT_OP_IN(OP_EXACT5); + DATA_ENSURE(5); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + sprev = s; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTN: STAT_OP_IN(OP_EXACTN); + GET_LENGTH_INC(tlen, p); + DATA_ENSURE(tlen); + while (tlen-- > 0) { + if (*p++ != *s++) goto fail; + } + sprev = s - 1; + STAT_OP_OUT; + 
continue; + break; + + case OP_EXACTN_IC: STAT_OP_IN(OP_EXACTN_IC); + GET_LENGTH_INC(tlen, p); + DATA_ENSURE(tlen); + while (tlen-- > 0) { + if (! SBTRANSCMP(*p, *s)) goto fail; + p++; s++; + } + sprev = s - 1; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMB2N1: STAT_OP_IN(OP_EXACTMB2N1); + DATA_ENSURE(2); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + STAT_OP_OUT; + break; + + case OP_EXACTMB2N2: STAT_OP_IN(OP_EXACTMB2N2); + DATA_ENSURE(4); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + sprev = s; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMB2N3: STAT_OP_IN(OP_EXACTMB2N3); + DATA_ENSURE(6); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + sprev = s; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMB2N: STAT_OP_IN(OP_EXACTMB2N); + GET_LENGTH_INC(tlen, p); + DATA_ENSURE(tlen * 2); + while (tlen-- > 0) { + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + } + sprev = s - 2; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMB3N: STAT_OP_IN(OP_EXACTMB3N); + GET_LENGTH_INC(tlen, p); + DATA_ENSURE(tlen * 3); + while (tlen-- > 0) { + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + } + sprev = s - 3; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMBN: STAT_OP_IN(OP_EXACTMBN); + GET_LENGTH_INC(tlen, p); /* mb-len */ + GET_LENGTH_INC(tlen2, p); /* string len */ + tlen2 *= tlen; + DATA_ENSURE(tlen2); + while (tlen2-- > 0) { + if (*p != *s) goto fail; + p++; s++; + } + sprev = s - tlen; + STAT_OP_OUT; + continue; + break; + + case OP_CCLASS: STAT_OP_IN(OP_CCLASS); + DATA_ENSURE(1); + if (BITSET_AT(((BitSetRef )p), *s) == 0) 
goto fail; + p += SIZE_BITSET; + s += mblen(encode, *s); /* OP_CCLASS can match mb-code. \D, \S */ + STAT_OP_OUT; + break; + + case OP_CCLASS_MB: STAT_OP_IN(OP_CCLASS_MB); + if (! ismb(encode, *s)) goto fail; + + cclass_mb: + GET_LENGTH_INC(tlen, p); + { + WCINT wc; + UChar *ss; + int mb_len = mblen(encode, *s); + + DATA_ENSURE(mb_len); + ss = s; + s += mb_len; + wc = MB2WC(ss, s, encode); + +#ifdef UNALIGNED_WORD_ACCESS + if (! regex_is_in_wc_range(p, wc)) goto fail; +#else + q = p; + ALIGNMENT_RIGHT(q); + if (! regex_is_in_wc_range(q, wc)) goto fail; +#endif + } + p += tlen; + STAT_OP_OUT; + break; + + case OP_CCLASS_MIX: STAT_OP_IN(OP_CCLASS_MIX); + DATA_ENSURE(1); + if (ismb(encode, *s)) { + p += SIZE_BITSET; + goto cclass_mb; + } + else { + if (BITSET_AT(((BitSetRef )p), *s) == 0) + goto fail; + + p += SIZE_BITSET; + GET_LENGTH_INC(tlen, p); + p += tlen; + s++; + } + STAT_OP_OUT; + break; + + case OP_CCLASS_NOT: STAT_OP_IN(OP_CCLASS_NOT); + DATA_ENSURE(1); + if (BITSET_AT(((BitSetRef )p), *s) != 0) goto fail; + p += SIZE_BITSET; + s += mblen(encode, *s); + STAT_OP_OUT; + break; + + case OP_CCLASS_MB_NOT: STAT_OP_IN(OP_CCLASS_MB_NOT); + if (! 
ismb(encode, *s)) { + DATA_ENSURE(1); + s++; + GET_LENGTH_INC(tlen, p); + p += tlen; + goto cc_mb_not_success; + } + + cclass_mb_not: + GET_LENGTH_INC(tlen, p); + { + WCINT wc; + UChar *ss; + int mb_len = mblen(encode, *s); + + if (s + mb_len > end) { + s = end; + p += tlen; + goto cc_mb_not_success; + } + + ss = s; + s += mb_len; + wc = MB2WC(ss, s, encode); + +#ifdef UNALIGNED_WORD_ACCESS + if (regex_is_in_wc_range(p, wc)) goto fail; +#else + q = p; + ALIGNMENT_RIGHT(q); + if (regex_is_in_wc_range(q, wc)) goto fail; +#endif + } + p += tlen; + + cc_mb_not_success: + STAT_OP_OUT; + break; + + case OP_CCLASS_MIX_NOT: STAT_OP_IN(OP_CCLASS_MIX_NOT); + DATA_ENSURE(1); + if (ismb(encode, *s)) { + p += SIZE_BITSET; + goto cclass_mb_not; + } + else { + if (BITSET_AT(((BitSetRef )p), *s) != 0) + goto fail; + + p += SIZE_BITSET; + GET_LENGTH_INC(tlen, p); + p += tlen; + s++; + } + STAT_OP_OUT; + break; + + case OP_ANYCHAR: STAT_OP_IN(OP_ANYCHAR); + DATA_ENSURE(1); + if (ismb(encode, *s)) { + n = mblen(encode, *s); + DATA_ENSURE(n); + s += n; + } + else { + if (! IS_MULTILINE(option)) { + if (IS_NEWLINE(*s)) goto fail; + } + s++; + } + STAT_OP_OUT; + break; + + case OP_ANYCHAR_STAR: STAT_OP_IN(OP_ANYCHAR_STAR); + if (! IS_MULTILINE(option)) { + while (s < end) { + STACK_PUSH_ALT(p, s, sprev); + if (ismb(encode, *s)) { + n = mblen(encode, *s); + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + if (IS_NEWLINE(*s)) goto fail; + sprev = s; + s++; + } + } + } + else { + while (s < end) { + STACK_PUSH_ALT(p, s, sprev); + if (ismb(encode, *s)) { + n = mblen(encode, *s); + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } + } + } + STAT_OP_OUT; + break; + + case OP_ANYCHAR_STAR_PEEK_NEXT: STAT_OP_IN(OP_ANYCHAR_STAR_PEEK_NEXT); + while (s < end) { + if (*p == *s) { + STACK_PUSH_ALT(p + 1, s, sprev); + } + if (ismb(encode, *s)) { + n = mblen(encode, *s); + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + if (! 
IS_MULTILINE(option)) { + if (IS_NEWLINE(*s)) goto fail; + } + sprev = s; + s++; + } + } + p++; + STAT_OP_OUT; + break; + + case OP_WORD: STAT_OP_IN(OP_WORD); + DATA_ENSURE(1); + if (! IS_WORD_STR_INC(encode, s, end)) + goto fail; + STAT_OP_OUT; + break; + + case OP_NOT_WORD: STAT_OP_IN(OP_NOT_WORD); + DATA_ENSURE(1); + if (IS_WORD_STR_INC(encode, s, end)) + goto fail; + STAT_OP_OUT; + break; + +#ifdef USE_SBMB_CLASS + case OP_WORD_SB: STAT_OP_IN(OP_WORD_SB); + DATA_ENSURE(1); + if (! IS_SB_WORD(encode, *s)) + goto fail; + s++; + STAT_OP_OUT; + break; + + case OP_WORD_MB: STAT_OP_IN(OP_WORD_MB); + DATA_ENSURE(1); + if (! IS_MB_WORD(encode, *s)) + goto fail; + + n = mblen(encode, *s); + DATA_ENSURE(n); + s += n; + STAT_OP_OUT; + break; +#endif + + case OP_WORD_BOUND: STAT_OP_IN(OP_WORD_BOUND); + if (ON_STR_BEGIN(s)) { + DATA_ENSURE(1); + if (! IS_WORD_STR(encode, s, end)) + goto fail; + } + else if (ON_STR_END(s)) { + if (! IS_WORD_STR(encode, sprev, end)) + goto fail; + } + else { + if (IS_WORD_STR(encode, s, end) == IS_WORD_STR(encode, sprev, end)) + goto fail; + } + STAT_OP_OUT; + continue; + break; + + case OP_NOT_WORD_BOUND: STAT_OP_IN(OP_NOT_WORD_BOUND); + if (ON_STR_BEGIN(s)) { + if (DATA_ENSURE_CHECK(1) && IS_WORD_STR(encode, s, end)) + goto fail; + } + else if (ON_STR_END(s)) { + if (IS_WORD_STR(encode, sprev, end)) + goto fail; + } + else { + if (IS_WORD_STR(encode, s, end) != IS_WORD_STR(encode, sprev, end)) + goto fail; + } + STAT_OP_OUT; + continue; + break; + +#ifdef USE_WORD_BEGIN_END + case OP_WORD_BEGIN: STAT_OP_IN(OP_WORD_BEGIN); + if (DATA_ENSURE_CHECK(1) && IS_WORD_STR(encode, s, end)) { + if (ON_STR_BEGIN(s) || !IS_WORD_STR(encode, sprev, end)) { + STAT_OP_OUT; + continue; + } + } + goto fail; + break; + + case OP_WORD_END: STAT_OP_IN(OP_WORD_END); + if (!ON_STR_BEGIN(s) && IS_WORD_STR(encode, sprev, end)) { + if (ON_STR_END(s) || !IS_WORD_STR(encode, s, end)) { + STAT_OP_OUT; + continue; + } + } + goto fail; + break; +#endif + + case 
OP_BEGIN_BUF: STAT_OP_IN(OP_BEGIN_BUF); + if (! ON_STR_BEGIN(s)) goto fail; + + STAT_OP_OUT; + continue; + break; + + case OP_END_BUF: STAT_OP_IN(OP_END_BUF); + if (! ON_STR_END(s)) goto fail; + + STAT_OP_OUT; + continue; + break; + + case OP_BEGIN_LINE: STAT_OP_IN(OP_BEGIN_LINE); + if (ON_STR_BEGIN(s)) { + if (IS_NOTBOL(msa->options)) goto fail; + STAT_OP_OUT; + continue; + } + else if (IS_NEWLINE(*sprev) && !ON_STR_END(s)) { + STAT_OP_OUT; + continue; + } + goto fail; + break; + + case OP_END_LINE: STAT_OP_IN(OP_END_LINE); + if (ON_STR_END(s)) { +#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + if (IS_EMPTY_STR || !IS_NEWLINE(*sprev)) { +#endif + if (IS_NOTEOL(msa->options)) goto fail; + STAT_OP_OUT; + continue; +#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + } +#endif + } + else if (IS_NEWLINE(*s)) { + STAT_OP_OUT; + continue; + } + goto fail; + break; + + case OP_SEMI_END_BUF: STAT_OP_IN(OP_SEMI_END_BUF); + if (ON_STR_END(s)) { +#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + if (IS_EMPTY_STR || !IS_NEWLINE(*sprev)) { +#endif + if (IS_NOTEOL(msa->options)) goto fail; /* Is it needed? 
*/ + STAT_OP_OUT; + continue; +#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + } +#endif + } + if (IS_NEWLINE(*s) && ON_STR_END(s+1)) { + STAT_OP_OUT; + continue; + } + goto fail; + break; + + case OP_BEGIN_POSITION: STAT_OP_IN(OP_BEGIN_POSITION); + if (s != msa->start) + goto fail; + + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_START_PUSH: STAT_OP_IN(OP_MEMORY_START_PUSH); + GET_MEMNUM_INC(mem, p); + STACK_PUSH_MEM_START(mem, s); + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_START: STAT_OP_IN(OP_MEMORY_START); + GET_MEMNUM_INC(mem, p); + mem_start_stk[mem] = (StackIndex )((void* )s); + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_END_PUSH: STAT_OP_IN(OP_MEMORY_END_PUSH); + GET_MEMNUM_INC(mem, p); + STACK_PUSH_MEM_END(mem, s); + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_END: STAT_OP_IN(OP_MEMORY_END); + GET_MEMNUM_INC(mem, p); + mem_end_stk[mem] = (StackIndex )((void* )s); + STAT_OP_OUT; + continue; + break; + +#ifdef USE_SUBEXP_CALL + case OP_MEMORY_END_PUSH_REC: STAT_OP_IN(OP_MEMORY_END_PUSH_REC); + GET_MEMNUM_INC(mem, p); + STACK_GET_MEM_START(mem, stkp); + mem_start_stk[mem] = GET_STACK_INDEX(stkp); + STACK_PUSH_MEM_END(mem, s); + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_END_REC: STAT_OP_IN(OP_MEMORY_END_REC); + GET_MEMNUM_INC(mem, p); + mem_end_stk[mem] = (StackIndex )((void* )s); + STACK_GET_MEM_START(mem, stkp); + mem_start_stk[mem] = GET_STACK_INDEX(stkp); + STACK_PUSH_MEM_END_MARK(mem); + STAT_OP_OUT; + continue; + break; +#endif + + case OP_BACKREF1: STAT_OP_IN(OP_BACKREF1); + mem = 1; + goto backref; + break; + + case OP_BACKREF2: STAT_OP_IN(OP_BACKREF2); + mem = 2; + goto backref; + break; + + case OP_BACKREF3: STAT_OP_IN(OP_BACKREF3); + mem = 3; + goto backref; + break; + + case OP_BACKREFN: STAT_OP_IN(OP_BACKREFN); + GET_MEMNUM_INC(mem, p); + backref: + { + int len; + UChar *pstart, *pend; + + /* if you want to remove following line, + you should check in parse and compile time. 
*/ + if (mem > num_mem) goto fail; + if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; + + if (BIT_STATUS_AT(reg->backtrack_mem, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (find_cond != 0 + ? STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + STRING_CMP(pstart, s, n); + while (sprev + (len = mblen(encode, *sprev)) < s) + sprev += len; + + STAT_OP_OUT; + continue; + } + break; + + case OP_BACKREF_MULTI: STAT_OP_IN(OP_BACKREF_MULTI); + { + int len, is_fail; + UChar *pstart, *pend, *swork; + + GET_LENGTH_INC(tlen, p); + for (i = 0; i < tlen; i++) { + GET_MEMNUM_INC(mem, p); + + if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; + + if (BIT_STATUS_AT(reg->backtrack_mem, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (find_cond != 0 + ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE(pstart, swork, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = mblen(encode, *sprev)) < s) + sprev += len; + + p += (SIZE_MEMNUM * (tlen - i - 1)); + break; /* success */ + } + if (i == tlen) goto fail; + STAT_OP_OUT; + continue; + } + break; + + case OP_SET_OPTION_PUSH: STAT_OP_IN(OP_SET_OPTION_PUSH); + GET_OPTION_INC(option, p); + ignore_case = IS_IGNORECASE(option); + STACK_PUSH_ALT(p, s, sprev); + p += SIZE_OP_SET_OPTION + SIZE_OP_FAIL; + STAT_OP_OUT; + continue; + break; + + case OP_SET_OPTION: STAT_OP_IN(OP_SET_OPTION); + GET_OPTION_INC(option, p); + ignore_case = IS_IGNORECASE(option); + STAT_OP_OUT; + continue; + break; + + case OP_NULL_CHECK_START: STAT_OP_IN(OP_NULL_CHECK_START); + GET_MEMNUM_INC(mem, p); /* mem: null check id */ + STACK_PUSH_NULL_CHECK_START(mem, s); + STAT_OP_OUT; + continue; + break; + + case OP_NULL_CHECK_END: STAT_OP_IN(OP_NULL_CHECK_END); + { + int isnull; + + GET_MEMNUM_INC(mem, p); /* mem: null check id */ + STACK_NULL_CHECK(isnull, mem, s); + if (isnull) { +#ifdef REG_DEBUG_MATCH + fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%d\n", + (int )mem, (int )s); +#endif + /* empty loop founded, skip next instruction */ + switch (*p++) { + case OP_JUMP: + case OP_PUSH: + p += SIZE_RELADDR; + break; + case OP_REPEAT_INC: + case OP_REPEAT_INC_NG: + p += SIZE_MEMNUM; + break; + default: + goto unexpected_bytecode_error; + break; + } + } + } + STAT_OP_OUT; + continue; + break; + + case OP_JUMP: STAT_OP_IN(OP_JUMP); + GET_RELADDR_INC(addr, p); + p += addr; + STAT_OP_OUT; + continue; + break; + + case OP_PUSH: STAT_OP_IN(OP_PUSH); + GET_RELADDR_INC(addr, p); + STACK_PUSH_ALT(p + addr, s, sprev); + STAT_OP_OUT; + continue; + break; + + case OP_POP: STAT_OP_IN(OP_POP); + STACK_POP_ONE; + STAT_OP_OUT; + continue; + break; + + case OP_PUSH_OR_JUMP_EXACT1: 
STAT_OP_IN(OP_PUSH_OR_JUMP_EXACT1); + GET_RELADDR_INC(addr, p); + if (*p == *s && DATA_ENSURE_CHECK(1)) { + p++; + STACK_PUSH_ALT(p + addr, s, sprev); + STAT_OP_OUT; + continue; + } + p += (addr + 1); + STAT_OP_OUT; + continue; + break; + + case OP_PUSH_IF_PEEK_NEXT: STAT_OP_IN(OP_PUSH_IF_PEEK_NEXT); + GET_RELADDR_INC(addr, p); + if (*p == *s) { + p++; + STACK_PUSH_ALT(p + addr, s, sprev); + STAT_OP_OUT; + continue; + } + p++; + STAT_OP_OUT; + continue; + break; + + case OP_REPEAT: STAT_OP_IN(OP_REPEAT); + { + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + GET_RELADDR_INC(addr, p); + + STACK_ENSURE(1); + repeat_stk[mem] = GET_STACK_INDEX(stk); + STACK_PUSH_REPEAT(mem, p); + + if (reg->repeat_range[mem].lower == 0) { + STACK_PUSH_ALT(p + addr, s, sprev); + } + } + STAT_OP_OUT; + continue; + break; + + case OP_REPEAT_NG: STAT_OP_IN(OP_REPEAT_NG); + { + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + GET_RELADDR_INC(addr, p); + + STACK_ENSURE(1); + repeat_stk[mem] = GET_STACK_INDEX(stk); + STACK_PUSH_REPEAT(mem, p); + + if (reg->repeat_range[mem].lower == 0) { + STACK_PUSH_ALT(p, s, sprev); + p += addr; + } + } + STAT_OP_OUT; + continue; + break; + + case OP_REPEAT_INC: STAT_OP_IN(OP_REPEAT_INC); + { + StackIndex si; + + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ +#ifdef USE_SUBEXP_CALL + if (reg->num_call > 0) { + STACK_GET_REPEAT(mem, stkp); + si = GET_STACK_INDEX(stkp); + } + else { + si = repeat_stk[mem]; + stkp = STACK_AT(si); + } +#else + si = repeat_stk[mem]; + stkp = STACK_AT(si); +#endif + stkp->u.repeat.count++; + if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + /* end of repeat. Nothing to do. 
*/ + } + else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + STACK_PUSH_ALT(p, s, sprev); + p = stkp->u.repeat.pcode; + } + else { + p = stkp->u.repeat.pcode; + } + STACK_PUSH_REPEAT_INC(si); + } + STAT_OP_OUT; + continue; + break; + + case OP_REPEAT_INC_NG: STAT_OP_IN(OP_REPEAT_INC_NG); + { + StackIndex si; + + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ +#ifdef USE_SUBEXP_CALL + if (reg->num_call > 0) { + STACK_GET_REPEAT(mem, stkp); + si = GET_STACK_INDEX(stkp); + } + else { + si = repeat_stk[mem]; + stkp = STACK_AT(si); + } +#else + si = repeat_stk[mem]; + stkp = STACK_AT(si); +#endif + stkp->u.repeat.count++; + if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + /* end of repeat. Nothing to do. */ + } + else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + STACK_PUSH_ALT(stkp->u.repeat.pcode, s, sprev); + } + else { + p = stkp->u.repeat.pcode; + } + STACK_PUSH_REPEAT_INC(si); + } + STAT_OP_OUT; + continue; + break; + + case OP_PUSH_POS: STAT_OP_IN(OP_PUSH_POS); + STACK_PUSH_POS(s, sprev); + STAT_OP_OUT; + continue; + break; + + case OP_POP_POS: STAT_OP_IN(OP_POP_POS); + { + STACK_POS_END(stkp); + s = stkp->u.state.pstr; + sprev = stkp->u.state.pstr_prev; + } + STAT_OP_OUT; + continue; + break; + + case OP_PUSH_POS_NOT: STAT_OP_IN(OP_PUSH_POS_NOT); + GET_RELADDR_INC(addr, p); + STACK_PUSH_POS_NOT(p + addr, s, sprev); + STAT_OP_OUT; + continue; + break; + + case OP_FAIL_POS: STAT_OP_IN(OP_FAIL_POS); + STACK_POP_TIL_POS_NOT; + goto fail; + break; + + case OP_PUSH_STOP_BT: STAT_OP_IN(OP_PUSH_STOP_BT); + STACK_PUSH_STOP_BT; + STAT_OP_OUT; + continue; + break; + + case OP_POP_STOP_BT: STAT_OP_IN(OP_POP_STOP_BT); + STACK_STOP_BT_END; + STAT_OP_OUT; + continue; + break; + + case OP_LOOK_BEHIND: STAT_OP_IN(OP_LOOK_BEHIND); + GET_LENGTH_INC(tlen, p); + s = MBBACK(encode, str, s, (int )tlen); + if (IS_NULL(s)) goto fail; + sprev = regex_get_prev_char_head(encode, str, s); + STAT_OP_OUT; + continue; + break; + + case 
OP_PUSH_LOOK_BEHIND_NOT: STAT_OP_IN(OP_PUSH_LOOK_BEHIND_NOT); + GET_RELADDR_INC(addr, p); + GET_LENGTH_INC(tlen, p); + q = MBBACK(encode, str, s, (int )tlen); + if (IS_NULL(q)) { + /* too short case -> success. ex. /(?<!XXX)a/.match("a") + If you want to change to fail, replace following line. */ + p += addr; + /* goto fail; */ + } + else { + STACK_PUSH_LOOK_BEHIND_NOT(p + addr, s, sprev); + s = q; + sprev = regex_get_prev_char_head(encode, str, s); + } + STAT_OP_OUT; + continue; + break; + + case OP_FAIL_LOOK_BEHIND_NOT: STAT_OP_IN(OP_FAIL_LOOK_BEHIND_NOT); + STACK_POP_TIL_LOOK_BEHIND_NOT; + goto fail; + break; + +#ifdef USE_SUBEXP_CALL + case OP_CALL: STAT_OP_IN(OP_CALL); + GET_ABSADDR_INC(addr, p); + STACK_PUSH_CALL_FRAME(p); + p = reg->p + addr; + STAT_OP_OUT; + continue; + break; + + case OP_RETURN: STAT_OP_IN(OP_RETURN); + STACK_RETURN(p); + STACK_PUSH_RETURN; + STAT_OP_OUT; + continue; + break; +#endif + + case OP_FINISH: + goto finish; + break; + + fail: + STAT_OP_OUT; + /* fall */ + case OP_FAIL: STAT_OP_IN(OP_FAIL); + STACK_POP; + p = stk->u.state.pcode; + s = stk->u.state.pstr; + sprev = stk->u.state.pstr_prev; + STAT_OP_OUT; + continue; + break; + + default: + goto bytecode_error; + + } /* end of switch */ + sprev = sbegin; + } /* end of while(1) */ + + finish: + STACK_SAVE; + return best_len; + +#ifdef REG_DEBUG + stack_error: + STACK_SAVE; + return REGERR_STACK_BUG; +#endif + + bytecode_error: + STACK_SAVE; + return REGERR_UNDEFINED_BYTECODE; + + unexpected_bytecode_error: + STACK_SAVE; + return REGERR_UNEXPECTED_BYTECODE; +} + + +UChar* DefaultTransTable = (UChar* )0; + +#ifndef REG_RUBY_M17N +static const char SJIS_FOLLOW_TABLE[SINGLE_BYTE_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 +}; + +#define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) +#define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) +#define sjis_ismbfirst(c) ismb(REGCODE_SJIS, (c)) +#define sjis_ismbtrail(c) SJIS_FOLLOW_TABLE[(c)] + +extern WCINT +regex_mb2wc(UChar* p, UChar* end, RegCharEncoding code) +{ + int c, i, len; + WCINT n; + + if (code == REGCODE_UTF8) { + c = *p++; + len = mblen(code,c); + if (len > 1) { + len--; + n = c & ((1 << (6 - len)) - 1); + while (len--) { + c = *p++; + n = (n << 6) | (c & ((1 << 6) - 1)); + } + } + else + n = c; + } + else { + c = *p++; + len = mblen(code,c); + n = c; + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = *p++; + n <<= 8; n += c; + } + } + return n; +} +#endif /* REG_RUBY_M17N */ + +extern UChar* +regex_get_left_adjust_char_head(RegCharEncoding code, UChar* start, UChar* s) +{ + UChar *p; + int len; + + if (s <= start) return s; + p = s; + +#ifdef REG_RUBY_M17N + while (!m17n_islead(code, *p) && p > start) p--; + while (p + (len = mblen(code, *p)) < s) { + p += len; + } + if (p + len == s) return s; + return p; +#else + + if (code == REGCODE_ASCII) { + return p; + } + else if (code == REGCODE_EUCJP) { + while (!eucjp_islead(*p) && p > start) p--; + len = mblen(code, *p); + if (p + len > s) return p; + p += len; + return p + ((s - p) & ~1); + } + else if (code == REGCODE_SJIS) { + if (sjis_ismbtrail(*p)) { + while (p > start) { + if (! 
sjis_ismbfirst(*--p)) { + p++; + break; + } + } + } + len = mblen(code, *p); + if (p + len > s) return p; + p += len; + return p + ((s - p) & ~1); + } + else { /* REGCODE_UTF8 */ + while (!utf8_islead(*p) && p > start) p--; + return p; + } +#endif /* REG_RUBY_M17N */ +} + +extern UChar* +regex_get_right_adjust_char_head(RegCharEncoding code, UChar* start, UChar* s) +{ + UChar* p = regex_get_left_adjust_char_head(code, start, s); + + if (p < s) { + p += mblen(code, *p); + } + return p; +} + +static UChar* +get_right_adjust_char_head_with_prev(RegCharEncoding code, + UChar* start, UChar* s, UChar** prev) +{ + UChar* p = regex_get_left_adjust_char_head(code, start, s); + + if (p < s) { + if (prev) *prev = p; + p += mblen(code, *p); + } + else { + if (prev) *prev = (UChar* )NULL; /* Sorry */ + } + return p; +} + +extern UChar* +regex_get_prev_char_head(RegCharEncoding code, UChar* start, UChar* s) +{ + if (s <= start) + return (UChar* )NULL; + + return regex_get_left_adjust_char_head(code, start, s - 1); +} + +static UChar* +step_backward_char(RegCharEncoding code, UChar* start, UChar* s, int n) +{ + while (IS_NOT_NULL(s) && n-- > 0) { + if (s <= start) + return (UChar* )NULL; + + s = regex_get_left_adjust_char_head(code, start, s - 1); + } + return s; +} + +static UChar* +slow_search(RegCharEncoding code, UChar* target, UChar* target_end, + UChar* text, UChar* text_end, UChar* text_range) +{ + UChar *t, *p, *s, *end; + + end = text_end - (target_end - target) + 1; + if (end > text_range) + end = text_range; + + s = text; + + while (s < end) { + if (*s == *target) { + p = s + 1; + t = target + 1; + while (t < target_end) { + if (*t != *p++) + break; + t++; + } + if (t == target_end) + return s; + } + s += mblen(code, *s); + } + + return (UChar* )NULL; +} + +static int +str_trans_match_after_head_byte(RegCharEncoding code, + int len, UChar* t, UChar* tend, UChar* p) +{ + while (--len > 0) { + if (*t != *p) break; + t++; p++; + } + + if (len == 0) { + while (t < tend) { 
+ len = mblen(code, *p); + if (len == 1) { + if (*t != TOLOWER(code, *p)) + break; + p++; + t++; + } + else { + if (*t != *p++) break; + t++; + while (--len > 0) { + if (*t != *p) break; + t++; p++; + } + if (len > 0) break; + } + } + if (t == tend) + return 1; + } + + return 0; +} + +static UChar* +slow_search_ic(RegCharEncoding code, + UChar* target, UChar* target_end, + UChar* text, UChar* text_end, UChar* text_range) +{ + int len; + UChar *t, *p, *s, *end; + + end = text_end - (target_end - target) + 1; + if (end > text_range) + end = text_range; + + s = text; + + while (s < end) { + len = mblen(code, *s); + if (*s == *target || (len == 1 && TOLOWER(code, *s) == *target)) { + p = s + 1; + t = target + 1; + if (str_trans_match_after_head_byte(code, len, t, target_end, p)) + return s; + } + s += len; + } + + return (UChar* )NULL; +} + +static UChar* +slow_search_backward(RegCharEncoding code, UChar* target, UChar* target_end, + UChar* text, UChar* adjust_text, UChar* text_end, UChar* text_start) +{ + UChar *t, *p, *s; + + s = text_end - (target_end - target); + if (s > text_start) + s = text_start; + else + s = regex_get_left_adjust_char_head(code, adjust_text, s); + + while (s >= text) { + if (*s == *target) { + p = s + 1; + t = target + 1; + while (t < target_end) { + if (*t != *p++) + break; + t++; + } + if (t == target_end) + return s; + } + s = regex_get_prev_char_head(code, adjust_text, s); + } + + return (UChar* )NULL; +} + +static UChar* +slow_search_backward_ic(RegCharEncoding code, + UChar* target,UChar* target_end, + UChar* text, UChar* adjust_text, + UChar* text_end, UChar* text_start) +{ + int len; + UChar *t, *p, *s; + + s = text_end - (target_end - target); + if (s > text_start) + s = text_start; + else + s = regex_get_left_adjust_char_head(code, adjust_text, s); + + while (s >= text) { + len = mblen(code, *s); + if (*s == *target || (len == 1 && TOLOWER(code, *s) == *target)) { + p = s + 1; + t = target + 1; + if 
(str_trans_match_after_head_byte(code, len, t, target_end, p)) + return s; + } + s = regex_get_prev_char_head(code, adjust_text, s); + } + + return (UChar* )NULL; +} + +static UChar* +bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, + UChar* text, UChar* text_end, UChar* text_range) +{ + UChar *s, *t, *p, *end; + UChar *tail; + int skip; + + end = text_range + (target_end - target) - 1; + if (end > text_end) + end = text_end; + + tail = target_end - 1; + s = text; + while ((s - text) < target_end - target) { + s += mblen(reg->enc, *s); + } + s--; /* set to text check tail position. */ + + if (IS_NULL(reg->int_map)) { + while (s < end) { + p = s; + t = tail; + while (t >= target && *p == *t) { + p--; t--; + } + if (t < target) return p + 1; + + skip = reg->map[*s]; + p++; + t = p; + while ((p - t) < skip) { + p += mblen(reg->enc, *p); + } + s += (p - t); + } + } + else { + while (s < end) { + p = s; + t = tail; + while (t >= target && *p == *t) { + p--; t--; + } + if (t < target) return p + 1; + + skip = reg->int_map[*s]; + p++; + t = p; + while ((p - t) < skip) { + p += mblen(reg->enc, *p); + } + s += (p - t); + } + } + return (UChar* )NULL; +} + +static UChar* +bm_search(regex_t* reg, UChar* target, UChar* target_end, + UChar* text, UChar* text_end, UChar* text_range) +{ + UChar *s, *t, *p, *end; + UChar *tail; + + end = text_range + (target_end - target) - 1; + if (end > text_end) + end = text_end; + + tail = target_end - 1; + s = text + (target_end - target) - 1; + if (IS_NULL(reg->int_map)) { + while (s < end) { + p = s; + t = tail; + while (t >= target && *p == *t) { + p--; t--; + } + if (t < target) return p + 1; + s += reg->map[*s]; + } + } + else { /* see int_map[] */ + while (s < end) { + p = s; + t = tail; + while (t >= target && *p == *t) { + p--; t--; + } + if (t < target) return p + 1; + s += reg->int_map[*s]; + } + } + return (UChar* )NULL; +} + +static int +set_bm_backward_skip(UChar* s, UChar* end, RegCharEncoding enc, + int 
ignore_case, int** skip) +{ + int i, len; + + if (IS_NULL(*skip)) { + *skip = (int* )xmalloc(sizeof(int) * REG_CHAR_TABLE_SIZE); + if (IS_NULL(*skip)) return REGERR_MEMORY; + } + + len = end - s; + for (i = 0; i < REG_CHAR_TABLE_SIZE; i++) + (*skip)[i] = len; + + if (ignore_case) { + for (i = len - 1; i > 0; i--) + (*skip)[TOLOWER(enc, s[i])] = i; + } + else { + for (i = len - 1; i > 0; i--) + (*skip)[s[i]] = i; + } + return 0; +} + +static UChar* +bm_search_backward(regex_t* reg, UChar* target, UChar* target_end, UChar* text, + UChar* adjust_text, UChar* text_end, UChar* text_start) +{ + UChar *s, *t, *p; + + s = text_end - (target_end - target); + if (text_start < s) + s = text_start; + else + s = regex_get_left_adjust_char_head(reg->enc, adjust_text, s); + + while (s >= text) { + p = s; + t = target; + while (t < target_end && *p == *t) { + p++; t++; + } + if (t == target_end) + return s; + + s -= reg->int_map_backward[*s]; + s = regex_get_left_adjust_char_head(reg->enc, adjust_text, s); + } + + return (UChar* )NULL; +} + +static UChar* +map_search(RegCharEncoding code, UChar map[], UChar* text, UChar* text_range) +{ + UChar *s = text; + + while (s < text_range) { + if (map[*s]) return s; + + s += mblen(code, *s); + } + return (UChar* )NULL; +} + +static UChar* +map_search_backward(RegCharEncoding code, UChar map[], + UChar* text, UChar* adjust_text, UChar* text_start) +{ + UChar *s = text_start; + + while (s >= text) { + if (map[*s]) return s; + + s = regex_get_prev_char_head(code, adjust_text, s); + } + return (UChar* )NULL; +} + +extern int +regex_match(regex_t* reg, UChar* str, UChar* end, UChar* at, RegRegion* region, + RegOptionType option) +{ + int r; + UChar *prev; + MatchArg msa; + + MATCH_ARG_INIT(msa, option, region, at); + + if (region && !IS_POSIX_REGION(option)) + r = regex_region_resize(region, reg->num_mem + 1); + else + r = 0; + + if (r == 0) { + prev = regex_get_prev_char_head(reg->enc, str, at); + r = match_at(reg, str, end, at, prev, &msa); + 
} + MATCH_ARG_FREE(msa); + return r; +} + +static int +forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, + UChar* range, UChar** low, UChar** high, UChar** low_prev) +{ + UChar *p, *pprev = (UChar* )NULL; + +#ifdef REG_DEBUG_SEARCH + fprintf(stderr, "forward_search_range: str: %d, end: %d, s: %d, range: %d\n", + (int )str, (int )end, (int )s, (int )range); +#endif + + p = s; + if (reg->dmin > 0) { + if (IS_SINGLEBYTE_CODE(reg->enc)) { + p += reg->dmin; + } + else { + UChar *q = p + reg->dmin; + while (p < q) p += mblen(reg->enc, *p); + } + } + + retry: + switch (reg->optimize) { + case REG_OPTIMIZE_EXACT: + p = slow_search(reg->enc, reg->exact, reg->exact_end, p, end, range); + break; + case REG_OPTIMIZE_EXACT_IC: + p = slow_search_ic(reg->enc, reg->exact, reg->exact_end, p, end, range); + break; + + case REG_OPTIMIZE_EXACT_BM: + p = bm_search(reg, reg->exact, reg->exact_end, p, end, range); + break; + + case REG_OPTIMIZE_EXACT_BM_NOT_REV: + p = bm_search_notrev(reg, reg->exact, reg->exact_end, p, end, range); + break; + + case REG_OPTIMIZE_MAP: + p = map_search(reg->enc, reg->map, p, range); + break; + } + + if (p && p < range) { + if (p - reg->dmin < s) { + retry_gate: + pprev = p; + p += mblen(reg->enc, *p); + goto retry; + } + + if (reg->sub_anchor) { + UChar* prev; + + switch (reg->sub_anchor) { + case ANCHOR_BEGIN_LINE: + if (!ON_STR_BEGIN(p)) { + prev = regex_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); + if (!IS_NEWLINE(*prev)) + goto retry_gate; + } + break; + + case ANCHOR_END_LINE: + if (ON_STR_END(p)) { + prev = regex_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); + if (prev && IS_NEWLINE(*prev)) + goto retry_gate; + } + else if (!IS_NEWLINE(*p)) + goto retry_gate; + break; + } + } + + if (reg->dmax == 0) { + *low = p; + if (low_prev) { + if (*low > s) + *low_prev = regex_get_prev_char_head(reg->enc, s, p); + else + *low_prev = regex_get_prev_char_head(reg->enc, + (pprev ? 
pprev : str), p); + } + } + else { + if (reg->dmax != INFINITE_DISTANCE) { + *low = p - reg->dmax; + if (*low > s) { + *low = get_right_adjust_char_head_with_prev(reg->enc, s, + *low, low_prev); + if (low_prev && IS_NULL(*low_prev)) + *low_prev = regex_get_prev_char_head(reg->enc, + (pprev ? pprev : s), *low); + } + else { + if (low_prev) + *low_prev = regex_get_prev_char_head(reg->enc, + (pprev ? pprev : str), *low); + } + } + } + /* no needs to adjust *high, *high is used as range check only */ + *high = p - reg->dmin; + +#ifdef REG_DEBUG_SEARCH + fprintf(stderr, + "forward_search_range success: low: %d, high: %d, dmin: %d, dmax: %d\n", + (int )(*low - str), (int )(*high - str), reg->dmin, reg->dmax); +#endif + return 1; /* success */ + } + + return 0; /* fail */ +} + +static int set_bm_backward_skip P_((UChar* s, UChar* end, RegCharEncoding enc, + int ignore_case, int** skip)); + +#define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100 + +static int +backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, + UChar* range, UChar* adjrange, UChar** low, UChar** high) +{ + int r; + UChar *p; + + range += reg->dmin; + p = s; + + retry: + switch (reg->optimize) { + case REG_OPTIMIZE_EXACT: + exact_method: + p = slow_search_backward(reg->enc, reg->exact, reg->exact_end, + range, adjrange, end, p); + break; + + case REG_OPTIMIZE_EXACT_IC: + p = slow_search_backward_ic(reg->enc, reg->exact, + reg->exact_end, range, adjrange, end, p); + break; + + case REG_OPTIMIZE_EXACT_BM: + case REG_OPTIMIZE_EXACT_BM_NOT_REV: + if (IS_NULL(reg->int_map_backward)) { + if (s - range < BM_BACKWARD_SEARCH_LENGTH_THRESHOLD) + goto exact_method; + + r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, 0, + &(reg->int_map_backward)); + if (r) return r; + } + p = bm_search_backward(reg, reg->exact, reg->exact_end, range, adjrange, + end, p); + break; + + case REG_OPTIMIZE_MAP: + p = map_search_backward(reg->enc, reg->map, range, adjrange, p); + break; + } + + if (p) { + if 
(reg->sub_anchor) { + UChar* prev; + + switch (reg->sub_anchor) { + case ANCHOR_BEGIN_LINE: + if (!ON_STR_BEGIN(p)) { + prev = regex_get_prev_char_head(reg->enc, adjrange, p); + if (!IS_NEWLINE(*prev)) { + p = prev; + goto retry; + } + } + break; + + case ANCHOR_END_LINE: + if (ON_STR_END(p)) { + prev = regex_get_prev_char_head(reg->enc, adjrange, p); + if (IS_NULL(prev)) goto fail; + if (IS_NEWLINE(*prev)) { + p = prev; + goto retry; + } + } + else if (!IS_NEWLINE(*p)) { + p = regex_get_prev_char_head(reg->enc, adjrange, p); + if (IS_NULL(p)) goto fail; + goto retry; + } + break; + } + } + + /* no needs to adjust *high, *high is used as range check only */ + if (reg->dmax != INFINITE_DISTANCE) { + *low = p - reg->dmax; + *high = p - reg->dmin; + *high = regex_get_right_adjust_char_head(reg->enc, adjrange, *high); + } + +#ifdef REG_DEBUG_SEARCH + fprintf(stderr, "backward_search_range: low: %d, high: %d\n", + (int )(*low - str), (int )(*high - str)); +#endif + return 1; /* success */ + } + + fail: +#ifdef REG_DEBUG_SEARCH + fprintf(stderr, "backward_search_range: fail.\n"); +#endif + return 0; /* fail */ +} + + +extern int +regex_search(regex_t* reg, UChar* str, UChar* end, + UChar* start, UChar* range, RegRegion* region, RegOptionType option) +{ + int r; + UChar *s, *prev; + MatchArg msa; + + if (REG_STATE(reg) == REG_STATE_NORMAL) { + reg->state++; /* increment as search counter */ + if (IS_NOT_NULL(reg->chain)) { + regex_chain_reduce(reg); + reg->state++; + } + } + else { + int n = 0; + while (REG_STATE(reg) < REG_STATE_NORMAL) { + if (++n > THREAD_PASS_LIMIT_COUNT) + return REGERR_OVER_THREAD_PASS_LIMIT_COUNT; + THREAD_PASS; + } + reg->state++; /* increment as search counter */ + } + +#ifdef REG_DEBUG_SEARCH + fprintf(stderr, "regex_search (entry point): str: %d, end: %d, start: %d, range: %d\n", + (int )str, (int )(end - str), (int )(start - str), (int )(range - str)); +#endif + + if (region && !IS_POSIX_REGION(option)) { + r = regex_region_resize(region, 
reg->num_mem + 1); + if (r) goto finish_no_msa; + } + + if (start > end || start < str) goto mismatch_no_msa; + +#define MATCH_AND_RETURN_CHECK \ + r = match_at(reg, str, end, s, prev, &msa);\ + if (r != REG_MISMATCH) {\ + if (r >= 0) goto match;\ + goto finish; /* error */ \ + } + + /* anchor optimize: resume search range */ + if (reg->anchor != 0 && str < end) { + UChar* semi_end; + + if (reg->anchor & ANCHOR_BEGIN_POSITION) { + /* search start-position only */ + begin_position: + if (range > start) + range = start + 1; + else + range = start; + } + else if (reg->anchor & ANCHOR_BEGIN_BUF) { + /* search str-position only */ + if (range > start) { + if (start != str) goto mismatch_no_msa; + range = str + 1; + } + else { + if (range <= str) { + start = str; + range = str; + } + else + goto mismatch_no_msa; + } + } + else if (reg->anchor & ANCHOR_END_BUF) { + semi_end = end; + + end_buf: + if (semi_end - str < reg->anchor_dmin) + goto mismatch_no_msa; + + if (range > start) { + if (semi_end - start > reg->anchor_dmax) { + start = semi_end - reg->anchor_dmax; + if (start < end) + start = regex_get_right_adjust_char_head(reg->enc, str, start); + else { /* match with empty at end */ + start = regex_get_prev_char_head(reg->enc, str, end); + } + } + if (semi_end - (range - 1) < reg->anchor_dmin) { + range = semi_end - reg->anchor_dmin + 1; + } + + if (start >= range) goto mismatch_no_msa; + } + else { + if (semi_end - range > reg->anchor_dmax) { + range = semi_end - reg->anchor_dmax; + } + if (semi_end - start < reg->anchor_dmin) { + start = semi_end - reg->anchor_dmin; + start = regex_get_left_adjust_char_head(reg->enc, str, start); + if (range > start) goto mismatch_no_msa; + } + } + } + else if (reg->anchor & ANCHOR_SEMI_END_BUF) { + if (IS_NEWLINE(end[-1])) { + semi_end = end - 1; + if (semi_end > str && start <= semi_end) { + goto end_buf; + } + } + else { + semi_end = end; + goto end_buf; + } + } + else if ((reg->anchor & ANCHOR_ANYCHAR_STAR_PL)) { + goto 
begin_position; + } + } + else if (str == end) { /* empty string */ + static UChar* address_for_empty_string = ""; + +#ifdef REG_DEBUG_SEARCH + fprintf(stderr, "regex_search: empty string.\n"); +#endif + + if (reg->threshold_len == 0) { + s = start = end = str = address_for_empty_string; + prev = (UChar* )NULL; + + MATCH_ARG_INIT(msa, option, region, start); + MATCH_AND_RETURN_CHECK; + goto mismatch; + } + goto mismatch_no_msa; + } + +#ifdef REG_DEBUG_SEARCH + fprintf(stderr, "regex_search(apply anchor): end: %d, start: %d, range: %d\n", + (int )(end - str), (int )(start - str), (int )(range - str)); +#endif + + MATCH_ARG_INIT(msa, option, region, start); + + s = start; + if (range > start) { /* forward search */ + if (s > str) + prev = regex_get_prev_char_head(reg->enc, str, s); + else + prev = (UChar* )NULL; + + if (reg->optimize != REG_OPTIMIZE_NONE) { + UChar *sch_range, *low, *high, *low_prev; + + sch_range = range; + if (reg->dmax != 0) { + if (reg->dmax == INFINITE_DISTANCE) + sch_range = end; + else { + sch_range += reg->dmax; + if (sch_range > end) sch_range = end; + } + } + if (reg->dmax != INFINITE_DISTANCE && + (end - start) >= reg->threshold_len) { + do { + if (! forward_search_range(reg, str, end, s, sch_range, + &low, &high, &low_prev)) goto mismatch; + if (s < low) { + s = low; + prev = low_prev; + } + while (s <= high) { + MATCH_AND_RETURN_CHECK; + prev = s; + s += mblen(reg->enc, *s); + } + if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { + if (IS_NOT_NULL(prev)) { + while (!IS_NEWLINE(*prev) && s < range) { + prev = s; + s += mblen(reg->enc, *s); + } + } + } + } while (s < range); + goto mismatch; + } + else { /* check only. */ + if ((end - start) < reg->threshold_len || + ! forward_search_range(reg, str, end, s, sch_range, + &low, &high, (UChar** )NULL)) goto mismatch; + } + } + + do { + MATCH_AND_RETURN_CHECK; + prev = s; + s += mblen(reg->enc, *s); + } while (s <= range); /* exec s == range, because empty match with /$/. 
*/ + } + else { /* backward search */ + if (reg->optimize != REG_OPTIMIZE_NONE) { + UChar *low, *high, *adjrange, *sch_start; + + adjrange = regex_get_left_adjust_char_head(reg->enc, str, range); + if (reg->dmax != INFINITE_DISTANCE && + (end - range) >= reg->threshold_len) { + do { + sch_start = s + reg->dmax; + if (sch_start > end) sch_start = end; + if (backward_search_range(reg, str, end, sch_start, range, adjrange, + &low, &high) <= 0) + goto mismatch; + + if (s > high) + s = high; + + while (s >= low) { + prev = regex_get_prev_char_head(reg->enc, str, s); + MATCH_AND_RETURN_CHECK; + s = prev; + } + } while (s >= range); + goto mismatch; + } + else { /* check only. */ + if ((end - range) < reg->threshold_len) goto mismatch; + + sch_start = s; + if (reg->dmax != 0) { + if (reg->dmax == INFINITE_DISTANCE) + sch_start = end; + else { + sch_start += reg->dmax; + if (sch_start > end) sch_start = end; + else + sch_start = regex_get_left_adjust_char_head(reg->enc, start, + sch_start); + } + } + if (backward_search_range(reg, str, end, sch_start, range, adjrange, + &low, &high) <= 0) goto mismatch; + } + } + + do { + prev = regex_get_prev_char_head(reg->enc, str, s); + MATCH_AND_RETURN_CHECK; + s = prev; + } while (s >= range); + } + + mismatch: + r = REG_MISMATCH; + + finish: + MATCH_ARG_FREE(msa); + reg->state--; /* decrement as search counter */ + + /* If result is mismatch and no FIND_NOT_EMPTY option, + then the region is not setted in match_at(). 
*/ + if (IS_FIND_NOT_EMPTY(reg->options) && region && !IS_POSIX_REGION(option)) + regex_region_clear(region); + +#ifdef REG_DEBUG + if (r != REG_MISMATCH) + fprintf(stderr, "regex_search: error %d\n", r); +#endif + return r; + + mismatch_no_msa: + r = REG_MISMATCH; + finish_no_msa: + reg->state--; /* decrement as search counter */ +#ifdef REG_DEBUG + if (r != REG_MISMATCH) + fprintf(stderr, "regex_search: error %d\n", r); +#endif + return r; + + match: + reg->state--; /* decrement as search counter */ + MATCH_ARG_FREE(msa); + return s - str; +} + +extern const char* +regex_version(void) +{ +#define MSTR(a) # a + + return (MSTR(ONIGURUMA_VERSION_MAJOR) "." + MSTR(ONIGURUMA_VERSION_MINOR) "." + MSTR(ONIGURUMA_VERSION_TEENY)); +} diff --git a/ext/mbstring/oniguruma/reggnu.c b/ext/mbstring/oniguruma/reggnu.c new file mode 100644 index 0000000000..7b95e26f76 --- /dev/null +++ b/ext/mbstring/oniguruma/reggnu.c @@ -0,0 +1,231 @@ +/********************************************************************** + + reggnu.c - Oniguruma (regular expression library) + + Copyright (C) 2002-2003 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regint.h" + +#if defined(RUBY_PLATFORM) || defined(RUBY) +#ifndef REG_RUBY_M17N +#define USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY +#endif +#endif + +#ifndef NULL +#define NULL ((void* )0) +#endif + +extern void +re_free_registers(RegRegion* r) +{ + /* 0: don't free self */ + regex_region_free(r, 0); +} + +extern int +re_adjust_startpos(regex_t* reg, const char* string, int size, + int startpos, int range) +{ + if (startpos > 0 && mbmaxlen(reg->enc) != 1 && startpos < size) { + UChar *p; + UChar *s = (UChar* )string + startpos; + + if (range > 0) { + p = regex_get_right_adjust_char_head(reg->enc, (UChar* )string, s); + } + else { + p = regex_get_left_adjust_char_head(reg->enc, (UChar* )string, s); + } + return p - (UChar* )string; + } + + return startpos; +} + +extern int 
+re_match(regex_t* reg, const char* str, int size, int pos, + struct re_registers* regs) +{ + return regex_match(reg, (UChar* )str, (UChar* )(str + size), + (UChar* )(str + pos), regs, REG_OPTION_NONE); +} + +extern int +re_search(regex_t* bufp, const char* string, int size, int startpos, int range, + struct re_registers* regs) +{ + return regex_search(bufp, (UChar* )string, (UChar* )(string + size), + (UChar* )(string + startpos), + (UChar* )(string + startpos + range), regs, REG_OPTION_NONE); +} + +extern int +re_compile_pattern(const char* pattern, int size, regex_t* reg, char* ebuf) +{ + int r; + RegErrorInfo einfo; + + r = regex_compile(reg, (UChar* )pattern, (UChar* )(pattern + size), &einfo); + if (r != 0) { + if (IS_NOT_NULL(ebuf)) + (void )regex_error_code_to_str((UChar* )ebuf, r, &einfo); + } + + return r; +} + +extern int +re_recompile_pattern(const char* pattern, int size, regex_t* reg, char* ebuf) +{ + int r; + RegErrorInfo einfo; + RegCharEncoding enc; + + /* I think encoding and options should be arguments of this function. + But this is adapted to present re.c. 
(2002/11/29) + */ + enc = RegDefaultCharEncoding; + + r = regex_recompile(reg, (UChar* )pattern, (UChar* )(pattern + size), + reg->options, enc, RegDefaultSyntax, &einfo); + if (r != 0) { + if (IS_NOT_NULL(ebuf)) + (void )regex_error_code_to_str((UChar* )ebuf, r, &einfo); + } + return r; +} + +extern void +re_free_pattern(regex_t* reg) +{ + regex_free(reg); +} + +extern int +re_alloc_pattern(regex_t** reg) +{ + if (RegDefaultCharEncoding == REGCODE_UNDEF) + return REGERR_DEFAULT_ENCODING_IS_NOT_SETTED; + + return regex_alloc_init(reg, REG_OPTION_DEFAULT, RegDefaultCharEncoding, + RegDefaultSyntax); +} + +extern void +re_set_casetable(const char* table) +{ + regex_set_default_trans_table((UChar* )table); +} + +#ifdef USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY +static const unsigned char mbctab_ascii[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const unsigned char mbctab_euc[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, +}; + +static const unsigned char mbctab_sjis[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 +}; + +static const unsigned char mbctab_utf8[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0, +}; + +const unsigned char *re_mbctab = mbctab_ascii; +#endif + +extern void +#ifdef REG_RUBY_M17N +re_mbcinit(RegCharEncoding enc) +#else +re_mbcinit(int mb_code) +#endif +{ +#ifdef REG_RUBY_M17N + RegDefaultCharEncoding = enc; +#else + RegDefaultCharEncoding = REG_MBLEN_TABLE[mb_code]; +#endif + +#ifdef USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY + switch (mb_code) { + case MBCTYPE_ASCII: + re_mbctab = mbctab_ascii; + break; + case MBCTYPE_EUC: + re_mbctab = mbctab_euc; + break; + case MBCTYPE_SJIS: + re_mbctab = mbctab_sjis; + break; + case MBCTYPE_UTF8: + re_mbctab = mbctab_utf8; + break; + } +#endif +} diff --git a/ext/mbstring/oniguruma/regint.h b/ext/mbstring/oniguruma/regint.h new file mode 100644 index 0000000000..d646dd11f0 --- /dev/null +++ b/ext/mbstring/oniguruma/regint.h @@ -0,0 +1,790 @@ +/********************************************************************** + + regint.h - Oniguruma (regular expression library) + + Copyright (C) 2002-2003 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#ifndef REGINT_H +#define REGINT_H + +/* for debug */ +/* #define REG_DEBUG_PARSE_TREE */ +/* #define REG_DEBUG_COMPILE */ +/* #define REG_DEBUG_SEARCH */ +/* #define REG_DEBUG_MATCH */ +/* #define REG_DONT_OPTIMIZE */ + +/* for byte-code statistical data. 
*/ +/* #define REG_DEBUG_STATISTICS */ + +#if defined(REG_DEBUG_PARSE_TREE) || defined(REG_DEBUG_MATCH) || \ + defined(REG_DEBUG_COMPILE) || defined(REG_DEBUG_STATISTICS) +#ifndef REG_DEBUG +#define REG_DEBUG +#endif +#endif + +#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + (defined(__ppc__) && defined(__APPLE__)) || \ + defined(__mc68020__) +#define UNALIGNED_WORD_ACCESS +#endif + +/* config */ +#define USE_NAMED_SUBEXP +#define USE_SUBEXP_CALL +#define USE_OP_PUSH_OR_JUMP_EXACT +#define USE_QUALIFIER_PEEK_NEXT +#define USE_RECYCLE_NODE +#define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ +/* #define USE_SBMB_CLASS */ + +#define INIT_MATCH_STACK_SIZE 160 +#define MATCH_STACK_LIMIT_SIZE 200000 + +/* interface to external system */ +#ifdef NOT_RUBY /* gived from Makefile */ +#include "config.h" +#define USE_VARIABLE_SYNTAX +#define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */ +#define DEFAULT_TRANSTABLE_EXIST 1 +#define THREAD_ATOMIC_START /* depend on thread system */ +#define THREAD_ATOMIC_END /* depend on thread system */ +#define THREAD_PASS /* depend on thread system */ +#define xmalloc malloc +#define xrealloc realloc +#define xfree free +#else +#include "ruby.h" +#include "version.h" +#include "rubysig.h" /* for DEFER_INTS, ENABLE_INTS */ +#define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR +#define THREAD_ATOMIC_START DEFER_INTS +#define THREAD_ATOMIC_END ENABLE_INTS +#define THREAD_PASS /* I want to use rb_thread_pass() */ +#define WARNING rb_warn +#define VERB_WARNING rb_warning + +#if defined(RUBY_VERSION_MAJOR) +#if RUBY_VERSION_MAJOR > 1 || \ +(RUBY_VERSION_MAJOR == 1 && \ + defined(RUBY_VERSION_MINOR) && RUBY_VERSION_MINOR >= 8) +#define USE_ST_HASH_TABLE +#endif +#endif + +#endif /* else NOT_RUBY */ + +#define THREAD_PASS_LIMIT_COUNT 10 +#define xmemset memset +#define xmemcpy memcpy +#define xmemmove memmove +#if defined(_WIN32) && !defined(__CYGWIN__) +#define xalloca _alloca +#define vsnprintf 
_vsnprintf +#else +#define xalloca alloca +#endif + +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + +#if defined(HAVE_ALLOCA_H) && !defined(__GNUC__) +#include <alloca.h> +#endif + +#ifdef HAVE_STRING_H +# include <string.h> +#else +# include <strings.h> +#endif + +#include <ctype.h> +#include <sys/types.h> + +#ifdef REG_DEBUG +# include <stdio.h> +#endif + +#ifdef NOT_RUBY +# include "oniguruma.h" +#else +# include "regex.h" +#endif + +#ifdef MIN +#undef MIN +#endif +#ifdef MAX +#undef MAX +#endif +#define MIN(a,b) (((a)>(b))?(b):(a)) +#define MAX(a,b) (((a)<(b))?(b):(a)) + +#ifndef UNALIGNED_WORD_ACCESS +#define WORD_ALIGNMENT_SIZE SIZEOF_INT + +#define GET_ALIGNMENT_PAD_SIZE(addr,pad_size) do {\ + (pad_size) = WORD_ALIGNMENT_SIZE - ((int )(addr) % WORD_ALIGNMENT_SIZE);\ + if ((pad_size) == WORD_ALIGNMENT_SIZE) (pad_size) = 0;\ +} while (0) + +#define ALIGNMENT_RIGHT(addr) do {\ + (addr) += (WORD_ALIGNMENT_SIZE - 1);\ + (addr) -= ((int )(addr) % WORD_ALIGNMENT_SIZE);\ +} while (0) + + +#define B_SHIFT 8 +#define B_MASK 0xff + +#define SERIALIZE_2BYTE_INT(i,p) do {\ + *(p) = ((i) >> B_SHIFT) & B_MASK;\ + *((p)+1) = (i) & B_MASK;\ +} while (0) + +#define SERIALIZE_4BYTE_INT(i,p) do {\ + *(p) = ((i) >> B_SHIFT*3) & B_MASK;\ + *((p)+1) = ((i) >> B_SHIFT*2) & B_MASK;\ + *((p)+2) = ((i) >> B_SHIFT ) & B_MASK;\ + *((p)+3) = (i) & B_MASK;\ +} while (0) + +#define SERIALIZE_8BYTE_INT(i,p) do {\ + *(p) = ((i) >> B_SHIFT*7) & B_MASK;\ + *((p)+1) = ((i) >> B_SHIFT*6) & B_MASK;\ + *((p)+2) = ((i) >> B_SHIFT*5) & B_MASK;\ + *((p)+3) = ((i) >> B_SHIFT*4) & B_MASK;\ + *((p)+4) = ((i) >> B_SHIFT*3) & B_MASK;\ + *((p)+5) = ((i) >> B_SHIFT*2) & B_MASK;\ + *((p)+6) = ((i) >> B_SHIFT ) & B_MASK;\ + *((p)+7) = (i) & B_MASK;\ +} while (0) + +#define GET_2BYTE_INT_INC(type,i,p) do {\ + (i) = (type )(((unsigned int )(*(p)) << B_SHIFT) | (unsigned int )((p)[1]));\ + (p) += 2;\ +} while (0) + +#define GET_4BYTE_INT_INC(type,i,p) do {\ + (i) = (type )(((unsigned int )((p)[0]) << 
B_SHIFT*3) | \ + ((unsigned int )((p)[1]) << B_SHIFT*2) | \ + ((unsigned int )((p)[2]) << B_SHIFT ) | \ + ((unsigned int )((p)[3]) )); \ + (p) += 4;\ +} while (0) + +#define GET_8BYTE_INT_INC(type,i,p) do {\ + (i) = (type )(((unsigned long )((p)[0]) << B_SHIFT*7) | \ + ((unsigned long )((p)[1]) << B_SHIFT*6) | \ + ((unsigned long )((p)[2]) << B_SHIFT*5) | \ + ((unsigned long )((p)[3]) << B_SHIFT*4) | \ + ((unsigned long )((p)[4]) << B_SHIFT*3) | \ + ((unsigned long )((p)[5]) << B_SHIFT*2) | \ + ((unsigned long )((p)[6]) << B_SHIFT ) | \ + ((unsigned long )((p)[7]) )); \ + (p) += 8;\ +} while (0) + +#if SIZEOF_SHORT == 2 +#define GET_SHORT_INC(i,p) GET_2BYTE_INT_INC(short,i,p) +#define SERIALIZE_SHORT(i,p) SERIALIZE_2BYTE_INT(i,p) +#elif SIZEOF_SHORT == 4 +#define GET_SHORT_INC(i,p) GET_4BYTE_INT_INC(short,i,p) +#define SERIALIZE_SHORT(i,p) SERIALIZE_4BYTE_INT(i,p) +#elif SIZEOF_SHORT == 8 +#define GET_SHORT_INC(i,p) GET_8BYTE_INT_INC(short,i,p) +#define SERIALIZE_SHORT(i,p) SERIALIZE_8BYTE_INT(i,p) +#endif + +#if SIZEOF_INT == 2 +#define GET_INT_INC(i,p) GET_2BYTE_INT_INC(int,i,p) +#define GET_UINT_INC(i,p) GET_2BYTE_INT_INC(unsigned,i,p) +#define SERIALIZE_INT(i,p) SERIALIZE_2BYTE_INT(i,p) +#define SERIALIZE_UINT(i,p) SERIALIZE_2BYTE_INT(i,p) +#elif SIZEOF_INT == 4 +#define GET_INT_INC(i,p) GET_4BYTE_INT_INC(int,i,p) +#define GET_UINT_INC(i,p) GET_4BYTE_INT_INC(unsigned,i,p) +#define SERIALIZE_INT(i,p) SERIALIZE_4BYTE_INT(i,p) +#define SERIALIZE_UINT(i,p) SERIALIZE_4BYTE_INT(i,p) +#elif SIZEOF_INT == 8 +#define GET_INT_INC(i,p) GET_8BYTE_INT_INC(int,i,p) +#define GET_UINT_INC(i,p) GET_8BYTE_INT_INC(unsigned,i,p) +#define SERIALIZE_INT(i,p) SERIALIZE_8BYTE_INT(i,p) +#define SERIALIZE_UINT(i,p) SERIALIZE_8BYTE_INT(i,p) +#endif + +#endif /* UNALIGNED_WORD_ACCESS */ + +/* stack pop level */ +#define STACK_POP_LEVEL_FREE 0 +#define STACK_POP_LEVEL_MEM_START 1 +#define STACK_POP_LEVEL_ALL 2 + +/* optimize flags */ +#define REG_OPTIMIZE_NONE 0 +#define REG_OPTIMIZE_EXACT 
1 /* Slow Search */ +#define REG_OPTIMIZE_EXACT_BM 2 /* Boyer Moore Search */ +#define REG_OPTIMIZE_EXACT_BM_NOT_REV 3 /* BM (but not simple match) */ +#define REG_OPTIMIZE_EXACT_IC 4 /* Slow Search (ignore case) */ +#define REG_OPTIMIZE_MAP 5 /* char map */ + +/* bit status */ +typedef unsigned int BitStatusType; + +#define BIT_STATUS_BITS_NUM (sizeof(BitStatusType) * 8) +#define BIT_STATUS_CLEAR(stats) (stats) = 0 +#define BIT_STATUS_ON_ALL(stats) (stats) = ~((BitStatusType )0) +#define BIT_STATUS_AT(stats,n) \ + ((n) < BIT_STATUS_BITS_NUM ? ((stats) & (1 << n)) : ((stats) & 1)) + +#define BIT_STATUS_ON_AT(stats,n) do {\ + if ((n) < BIT_STATUS_BITS_NUM)\ + (stats) |= (1 << (n));\ + else\ + (stats) |= 1;\ +} while (0) + +#define BIT_STATUS_ON_AT_SIMPLE(stats,n) do {\ + if ((n) < BIT_STATUS_BITS_NUM)\ + (stats) |= (1 << (n));\ +} while (0) + + +#define INT_MAX_LIMIT ((1UL << (SIZEOF_INT * 8 - 1)) - 1) + +typedef unsigned int WCINT; + +#define SIZE_WCINT sizeof(WCINT) +#define GET_WCINT(wc,p) (wc) = *((WCINT* )(p)) + +#define INFINITE_DISTANCE ~((RegDistance )0) + +#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) +# define IS_ASCII(c) 1 +#else +# define IS_ASCII(c) isascii(c) +#endif + +#ifdef isblank +# define IS_BLANK(c) (IS_ASCII(c) && isblank(c)) +#else +# define IS_BLANK(c) ((c) == ' ' || (c) == '\t') +#endif +#ifdef isgraph +# define IS_GRAPH(c) (IS_ASCII(c) && isgraph(c)) +#else +# define IS_GRAPH(c) (IS_ASCII(c) && isprint(c) && !isspace(c)) +#endif + +#define IS_PRINT(c) (isprint(c) && IS_ASCII(c)) +#define IS_ALNUM(c) (isalnum(c) && IS_ASCII(c)) +#define IS_ALPHA(c) (isalpha(c) && IS_ASCII(c)) +#define IS_LOWER(c) (islower(c) && IS_ASCII(c)) +#define IS_UPPER(c) (isupper(c) && IS_ASCII(c)) +#define IS_CNTRL(c) (iscntrl(c) && IS_ASCII(c)) +#define IS_PUNCT(c) (ispunct(c) && IS_ASCII(c)) +#define IS_SPACE(c) (isspace(c) && IS_ASCII(c)) +#define IS_DIGIT(c) (isdigit(c) && IS_ASCII(c)) +#define IS_XDIGIT(c) (isxdigit(c) && IS_ASCII(c)) 
+#define IS_ODIGIT(c) (IS_DIGIT(c) && (c) < '8') + +#define DIGITVAL(c) ((c) - '0') +#define ODIGITVAL(c) DIGITVAL(c) +#define XDIGITVAL(c) \ + (IS_DIGIT(c) ? DIGITVAL(c) : (IS_UPPER(c) ? (c) - 'A' + 10 : (c) - 'a' + 10)) + +#define IS_SINGLELINE(option) ((option) & REG_OPTION_SINGLELINE) +#define IS_MULTILINE(option) ((option) & REG_OPTION_MULTILINE) +#define IS_IGNORECASE(option) ((option) & REG_OPTION_IGNORECASE) +#define IS_EXTEND(option) ((option) & REG_OPTION_EXTEND) +#define IS_FIND_LONGEST(option) ((option) & REG_OPTION_FIND_LONGEST) +#define IS_FIND_NOT_EMPTY(option) ((option) & REG_OPTION_FIND_NOT_EMPTY) +#define IS_POSIXLINE(option) (IS_SINGLELINE(option) && IS_MULTILINE(option)) +#define IS_FIND_CONDITION(option) ((option) & \ + (REG_OPTION_FIND_LONGEST | REG_OPTION_FIND_NOT_EMPTY)) +#define IS_NOTBOL(option) ((option) & REG_OPTION_NOTBOL) +#define IS_NOTEOL(option) ((option) & REG_OPTION_NOTEOL) +#define IS_POSIX_REGION(option) ((option) & REG_OPTION_POSIX_REGION) + +#ifdef NEWLINE +#undef NEWLINE +#endif +#define NEWLINE '\n' +#define IS_NULL(p) (((void*)(p)) == (void*)0) +#define IS_NOT_NULL(p) (((void*)(p)) != (void*)0) +#define IS_NEWLINE(c) ((c) == NEWLINE) +#define CHECK_NULL_RETURN(p) if (IS_NULL(p)) return NULL +#define CHECK_NULL_RETURN_VAL(p,val) if (IS_NULL(p)) return (val) + +#define NULL_UCHARP ((UChar* )0) + +/* bitset */ +#define BITS_PER_BYTE 8 +#define SINGLE_BYTE_SIZE (1 << BITS_PER_BYTE) +#define BITS_IN_ROOM (sizeof(Bits) * BITS_PER_BYTE) +#define BITSET_SIZE (SINGLE_BYTE_SIZE / BITS_IN_ROOM) + +#ifdef UNALIGNED_WORD_ACCESS +typedef unsigned int Bits; +#else +typedef unsigned char Bits; +#endif +typedef Bits BitSet[BITSET_SIZE]; +typedef Bits* BitSetRef; + +#define SIZE_BITSET sizeof(BitSet) + +#define BITSET_CLEAR(bs) do {\ + int i;\ + for (i = 0; i < BITSET_SIZE; i++) { (bs)[i] = 0; }\ +} while (0) + +#define BS_ROOM(bs,pos) (bs)[pos / BITS_IN_ROOM] +#define BS_BIT(pos) (1 << (pos % BITS_IN_ROOM)) + +#define BITSET_AT(bs, pos) 
(BS_ROOM(bs,pos) & BS_BIT(pos)) +#define BITSET_SET_BIT(bs, pos) BS_ROOM(bs,pos) |= BS_BIT(pos) +#define BITSET_CLEAR_BIT(bs, pos) BS_ROOM(bs,pos) &= ~(BS_BIT(pos)) +#define BITSET_INVERT_BIT(bs, pos) BS_ROOM(bs,pos) ^= BS_BIT(pos) + +/* bytes buffer */ +typedef struct _BBuf { + UChar* p; + unsigned int used; + unsigned int alloc; +} BBuf; + +#define BBUF_INIT(buf,size) regex_bbuf_init((BBuf* )(buf), (size)) + +#define BBUF_SIZE_INC(buf,inc) do{\ + (buf)->alloc += (inc);\ + (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ + if (IS_NULL((buf)->p)) return(REGERR_MEMORY);\ +} while (0) + +#define BBUF_EXPAND(buf,low) do{\ + do { (buf)->alloc *= 2; } while ((buf)->alloc < low);\ + (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ + if (IS_NULL((buf)->p)) return(REGERR_MEMORY);\ +} while (0) + +#define BBUF_ENSURE_SIZE(buf,size) do{\ + int new_alloc = (buf)->alloc;\ + while (new_alloc < (size)) { new_alloc *= 2; }\ + if ((buf)->alloc != new_alloc) {\ + (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\ + if (IS_NULL((buf)->p)) return(REGERR_MEMORY);\ + (buf)->alloc = new_alloc;\ + }\ +} while (0) + +#define BBUF_WRITE(buf,pos,bytes,n) do{\ + int used = (pos) + (n);\ + if ((buf)->alloc < used) BBUF_EXPAND((buf),used);\ + xmemcpy((buf)->p + (pos), (bytes), (n));\ + if ((buf)->used < used) (buf)->used = used;\ +} while (0) + +#define BBUF_WRITE1(buf,pos,byte) do{\ + int used = (pos) + 1;\ + if ((buf)->alloc < used) BBUF_EXPAND((buf),used);\ + (buf)->p[(pos)] = (byte);\ + if ((buf)->used < used) (buf)->used = used;\ +} while (0) + +#define BBUF_ADD(buf,bytes,n) BBUF_WRITE((buf),(buf)->used,(bytes),(n)) +#define BBUF_ADD1(buf,byte) BBUF_WRITE1((buf),(buf)->used,(byte)) +#define BBUF_GET_ADD_ADDRESS(buf) ((buf)->p + (buf)->used) +#define BBUF_GET_OFFSET_POS(buf) ((buf)->used) + +/* from < to */ +#define BBUF_MOVE_RIGHT(buf,from,to,n) do {\ + if ((to) + (n) > (buf)->alloc) BBUF_EXPAND((buf),(to) + (n));\ + xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ + if 
((to) + (n) > (buf)->used) (buf)->used = (to) + (n);\ +} while (0) + +/* from > to */ +#define BBUF_MOVE_LEFT(buf,from,to,n) do {\ + xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ +} while (0) + +/* from > to */ +#define BBUF_MOVE_LEFT_REDUCE(buf,from,to) do {\ + xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\ + (buf)->used -= (from - to);\ +} while (0) + +#define BBUF_INSERT(buf,pos,bytes,n) do {\ + if (pos >= (buf)->used) {\ + BBUF_WRITE(buf,pos,bytes,n);\ + }\ + else {\ + BBUF_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\ + xmemcpy((buf)->p + (pos), (bytes), (n));\ + }\ +} while (0) + +#define BBUF_GET_BYTE(buf, pos) (buf)->p[(pos)] + +extern UChar* DefaultTransTable; +#define TOLOWER(enc,c) (DefaultTransTable[c]) + +/* methods for support multi-byte code, */ +#define ismb(code,c) (mblen((code),(c)) != 1) +#define MB2WC(p,end,code) mb2wc((p),(end),(code)) +#define MBBACK(code,start,s,n) step_backward_char((code),(start),(s),(n)) + +#ifdef REG_RUBY_M17N + +#define MB2WC_AVAILABLE(enc) 1 +#define WC2MB_FIRST(enc, wc) m17n_firstbyte((enc),(wc)) + +#define mbmaxlen(enc) m17n_mbmaxlen(enc) +#define mblen(enc,c) m17n_mbclen(enc,c) +#define mbmaxlen_dist(enc) \ + (mbmaxlen(enc) > 0 ? 
mbmaxlen(enc) : INFINITE_DISTANCE) + +#define IS_SINGLEBYTE_CODE(enc) (m17n_mbmaxlen(enc) == 1) +/* #define IS_INDEPENDENT_TRAIL(enc) m17n_independent_trail(enc) */ +#define IS_INDEPENDENT_TRAIL(enc) IS_SINGLEBYTE_CODE(enc) + +#define IS_CODE_ASCII(enc,c) IS_ASCII(c) +#define IS_CODE_GRAPH(enc,c) IS_GRAPH(c) +#define IS_CODE_PRINT(enc,c) m17n_isprint(enc,c) +#define IS_CODE_ALNUM(enc,c) m17n_isalnum(enc,c) +#define IS_CODE_ALPHA(enc,c) m17n_isalpha(enc,c) +#define IS_CODE_LOWER(enc,c) m17n_islower(enc,c) +#define IS_CODE_UPPER(enc,c) m17n_isupper(enc,c) +#define IS_CODE_CNTRL(enc,c) m17n_iscntrl(enc,c) +#define IS_CODE_PUNCT(enc,c) m17n_ispunct(enc,c) +#define IS_CODE_SPACE(enc,c) m17n_isspace(enc,c) +#define IS_CODE_BLANK(enc,c) IS_BLANK(c) +#define IS_CODE_DIGIT(enc,c) m17n_isdigit(enc,c) +#define IS_CODE_XDIGIT(enc,c) m17n_isxdigit(enc,c) + +#define IS_CODE_WORD(enc,c) m17n_iswchar(enc,c) +#define ISNOT_CODE_WORD(enc,c) (!m17n_iswchar(enc,c)) + +#define IS_WORD_STR(code,s,end) \ + (ismb((code),*(s)) ? (s + mblen((code),*(s)) <= (end)) : \ + m17n_iswchar(code,*(s))) +#define IS_WORD_STR_INC(code,s,end) \ + (ismb((code),*(s)) ? ((s) += mblen((code),*(s)), (s) <= (end)) : \ + (s++, m17n_iswchar(code,s[-1]))) + +#define IS_WORD_HEAD(enc,c) (ismb(enc,c) ? 
1 : IS_CODE_WORD(enc,c)) + +#define IS_SB_WORD(code,c) (mblen(code,c) == 1 && IS_CODE_WORD(code,c)) +#define IS_MB_WORD(code,c) ismb(code,c) + +#define mb2wc(p,e,enc) m17n_codepoint((enc),(p),(e)) + +#else /* REG_RUBY_M17N */ + +#define mb2wc(p,e,code) regex_mb2wc((p),(e),(code)) + +#define MB2WC_AVAILABLE(code) 1 +#define WC2MB_FIRST(code, wc) regex_wc2mb_first(code, wc) + +#define mbmaxlen_dist(code) mbmaxlen(code) +#define mbmaxlen(code) regex_mb_max_length(code) +#define mblen(code,c) (code)[(int )(c)] + +#define IS_SINGLEBYTE_CODE(code) ((code) == REGCODE_ASCII) +#define IS_INDEPENDENT_TRAIL(code) \ + ((code) == REGCODE_ASCII || (code) == REGCODE_UTF8) + +#define IS_CODE_ASCII(code,c) IS_ASCII(c) +#define IS_CODE_GRAPH(code,c) IS_GRAPH(c) +#define IS_CODE_PRINT(code,c) IS_PRINT(c) +#define IS_CODE_ALNUM(code,c) IS_ALNUM(c) +#define IS_CODE_ALPHA(code,c) IS_ALPHA(c) +#define IS_CODE_LOWER(code,c) IS_LOWER(c) +#define IS_CODE_UPPER(code,c) IS_UPPER(c) +#define IS_CODE_CNTRL(code,c) IS_CNTRL(c) +#define IS_CODE_PUNCT(code,c) IS_PUNCT(c) +#define IS_CODE_SPACE(code,c) IS_SPACE(c) +#define IS_CODE_BLANK(code,c) IS_BLANK(c) +#define IS_CODE_DIGIT(code,c) IS_DIGIT(c) +#define IS_CODE_ODIGIT(code,c) IS_ODIGIT(c) +#define IS_CODE_XDIGIT(code,c) IS_XDIGIT(c) + +#define IS_SB_WORD(code,c) (IS_CODE_ALNUM(code,c) || (c) == '_') +#define IS_MB_WORD(code,c) ismb(code,c) + +#define IS_CODE_WORD(code,c) \ + (IS_SB_WORD(code,c) && ((c) < 0x80 || (code) == REGCODE_ASCII)) +#define ISNOT_CODE_WORD(code,c) \ + ((!IS_SB_WORD(code,c)) && !ismb(code,c)) + +#define IS_WORD_STR(code,s,end) \ + (ismb((code),*(s)) ? (s + mblen((code),*(s)) <= (end)) : \ + IS_SB_WORD(code,*(s))) +#define IS_WORD_STR_INC(code,s,end) \ + (ismb((code),*(s)) ? ((s) += mblen((code),*(s)), (s) <= (end)) : \ + (s++, IS_SB_WORD(code,s[-1]))) + +#define IS_WORD_HEAD(code,c) (ismb(code,c) ? 
1 : IS_SB_WORD(code,c)) + +extern int regex_mb_max_length P_((RegCharEncoding code)); +extern WCINT regex_mb2wc P_((UChar* p, UChar* end, RegCharEncoding code)); +extern int regex_wc2mb_first P_((RegCharEncoding code, WCINT wc)); + +#endif /* not REG_RUBY_M17N */ + + +#define ANCHOR_BEGIN_BUF (1<<0) +#define ANCHOR_BEGIN_LINE (1<<1) +#define ANCHOR_BEGIN_POSITION (1<<2) +#define ANCHOR_END_BUF (1<<3) +#define ANCHOR_SEMI_END_BUF (1<<4) +#define ANCHOR_END_LINE (1<<5) + +#define ANCHOR_WORD_BOUND (1<<6) +#define ANCHOR_NOT_WORD_BOUND (1<<7) +#define ANCHOR_WORD_BEGIN (1<<8) +#define ANCHOR_WORD_END (1<<9) +#define ANCHOR_PREC_READ (1<<10) +#define ANCHOR_PREC_READ_NOT (1<<11) +#define ANCHOR_LOOK_BEHIND (1<<12) +#define ANCHOR_LOOK_BEHIND_NOT (1<<13) + +#define ANCHOR_ANYCHAR_STAR (1<<14) /* ".*" optimize info */ +#define ANCHOR_ANYCHAR_STAR_PL (1<<15) /* ".*" optimize info (posix-line) */ + +/* operation code */ +enum OpCode { + OP_FINISH = 0, /* matching process terminator (no more alternative) */ + OP_END = 1, /* pattern code terminator (success end) */ + + OP_EXACT1 = 2, /* single byte, N = 1 */ + OP_EXACT2, /* single byte, N = 2 */ + OP_EXACT3, /* single byte, N = 3 */ + OP_EXACT4, /* single byte, N = 4 */ + OP_EXACT5, /* single byte, N = 5 */ + OP_EXACTN, /* single byte */ + OP_EXACTMB2N1, /* mb-length = 2 N = 1 */ + OP_EXACTMB2N2, /* mb-length = 2 N = 2 */ + OP_EXACTMB2N3, /* mb-length = 2 N = 3 */ + OP_EXACTMB2N, /* mb-length = 2 */ + OP_EXACTMB3N, /* mb length = 3 */ + OP_EXACTMBN, /* other length */ + + OP_EXACT1_IC, /* single byte, N = 1, ignore case */ + OP_EXACTN_IC, /* single byte, ignore case */ + + OP_CCLASS, + OP_CCLASS_MB, + OP_CCLASS_MIX, + OP_CCLASS_NOT, + OP_CCLASS_MB_NOT, + OP_CCLASS_MIX_NOT, + + OP_ANYCHAR, /* "." 
*/ + OP_ANYCHAR_STAR, /* ".*" */ + OP_ANYCHAR_STAR_PEEK_NEXT, + + OP_WORD, + OP_NOT_WORD, + OP_WORD_SB, + OP_WORD_MB, + OP_WORD_BOUND, + OP_NOT_WORD_BOUND, + OP_WORD_BEGIN, + OP_WORD_END, + + OP_BEGIN_BUF, + OP_END_BUF, + OP_BEGIN_LINE, + OP_END_LINE, + OP_SEMI_END_BUF, + OP_BEGIN_POSITION, + + OP_BACKREF1, + OP_BACKREF2, + OP_BACKREF3, + OP_BACKREFN, + OP_BACKREF_MULTI, + + OP_MEMORY_START, + OP_MEMORY_START_PUSH, /* push back-tracker to stack */ + OP_MEMORY_END_PUSH, /* push back-tracker to stack */ + OP_MEMORY_END_PUSH_REC, /* push back-tracker to stack */ + OP_MEMORY_END, + OP_MEMORY_END_REC, /* push marker to stack */ + + OP_SET_OPTION_PUSH, /* set option and push recover option */ + OP_SET_OPTION, /* set option */ + + OP_FAIL, /* pop stack and move */ + OP_JUMP, + OP_PUSH, + OP_POP, + OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ + OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ + OP_REPEAT, /* {n,m} */ + OP_REPEAT_NG, /* {n,m}? (non greedy) */ + OP_REPEAT_INC, + OP_REPEAT_INC_NG, /* non greedy */ + OP_NULL_CHECK_START, /* null loop checker start */ + OP_NULL_CHECK_END, /* null loop checker end */ + + OP_PUSH_POS, /* (?=...) start */ + OP_POP_POS, /* (?=...) end */ + OP_PUSH_POS_NOT, /* (?!...) start */ + OP_FAIL_POS, /* (?!...) end */ + OP_PUSH_STOP_BT, /* (?>...) start */ + OP_POP_STOP_BT, /* (?>...) end */ + OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ + OP_PUSH_LOOK_BEHIND_NOT, /* (?<!...) start */ + OP_FAIL_LOOK_BEHIND_NOT, /* (?<!...) 
end */ + + OP_CALL, /* \g<name> */ + OP_RETURN +}; + +/* arguments type */ +#define ARG_SPECIAL -1 +#define ARG_NON 0 +#define ARG_RELADDR 1 +#define ARG_ABSADDR 2 +#define ARG_LENGTH 3 +#define ARG_MEMNUM 4 +#define ARG_OPTION 5 + +typedef short int RelAddrType; +typedef short int AbsAddrType; +typedef short int LengthType; +typedef short int MemNumType; +typedef int RepeatNumType; + +#define SIZE_OPCODE 1 +#define SIZE_RELADDR sizeof(RelAddrType) +#define SIZE_ABSADDR sizeof(AbsAddrType) +#define SIZE_LENGTH sizeof(LengthType) +#define SIZE_MEMNUM sizeof(MemNumType) +#define SIZE_REPEATNUM sizeof(RepeatNumType) +#define SIZE_OPTION sizeof(RegOptionType) + +#ifdef UNALIGNED_WORD_ACCESS +#define GET_RELADDR_INC(addr,p) do{\ + addr = *((RelAddrType* )(p));\ + (p) += SIZE_RELADDR;\ +} while(0) + +#define GET_ABSADDR_INC(addr,p) do{\ + addr = *((AbsAddrType* )(p));\ + (p) += SIZE_ABSADDR;\ +} while(0) + +#define GET_LENGTH_INC(len,p) do{\ + len = *((LengthType* )(p));\ + (p) += SIZE_LENGTH;\ +} while(0) + +#define GET_MEMNUM_INC(num,p) do{\ + num = *((MemNumType* )(p));\ + (p) += SIZE_MEMNUM;\ +} while(0) + +#define GET_REPEATNUM_INC(num,p) do{\ + num = *((RepeatNumType* )(p));\ + (p) += SIZE_REPEATNUM;\ +} while(0) + +#define GET_OPTION_INC(option,p) do{\ + option = *((RegOptionType* )(p));\ + (p) += SIZE_OPTION;\ +} while(0) +#else + +#define GET_RELADDR_INC(addr,p) GET_SHORT_INC(addr,p) +#define GET_ABSADDR_INC(addr,p) GET_SHORT_INC(addr,p) +#define GET_LENGTH_INC(len,p) GET_SHORT_INC(len,p) +#define GET_MEMNUM_INC(num,p) GET_SHORT_INC(num,p) +#define GET_REPEATNUM_INC(num,p) GET_INT_INC(num,p) +#define GET_OPTION_INC(option,p) GET_UINT_INC(option,p) + +#define SERIALIZE_RELADDR(addr,p) SERIALIZE_SHORT(addr,p) +#define SERIALIZE_ABSADDR(addr,p) SERIALIZE_SHORT(addr,p) +#define SERIALIZE_LENGTH(len,p) SERIALIZE_SHORT(len,p) +#define SERIALIZE_MEMNUM(num,p) SERIALIZE_SHORT(num,p) +#define SERIALIZE_REPEATNUM(num,p) SERIALIZE_INT(num,p) +#define 
SERIALIZE_OPTION(option,p) SERIALIZE_UINT(option,p) + +#define SERIALIZE_BUFSIZE SIZEOF_INT + +#endif /* UNALIGNED_WORD_ACCESS */ + +#define GET_BYTE_INC(byte,p) do{\ + byte = *(p);\ + (p)++;\ +} while(0) + + +/* op-code + arg size */ +#define SIZE_OP_ANYCHAR_STAR SIZE_OPCODE +#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT (SIZE_OPCODE + 1) +#define SIZE_OP_JUMP (SIZE_OPCODE + SIZE_RELADDR) +#define SIZE_OP_PUSH (SIZE_OPCODE + SIZE_RELADDR) +#define SIZE_OP_POP SIZE_OPCODE +#define SIZE_OP_PUSH_OR_JUMP_EXACT1 (SIZE_OPCODE + SIZE_RELADDR + 1) +#define SIZE_OP_PUSH_IF_PEEK_NEXT (SIZE_OPCODE + SIZE_RELADDR + 1) +#define SIZE_OP_REPEAT_INC (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_REPEAT_INC_NG (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_PUSH_POS SIZE_OPCODE +#define SIZE_OP_PUSH_POS_NOT (SIZE_OPCODE + SIZE_RELADDR) +#define SIZE_OP_POP_POS SIZE_OPCODE +#define SIZE_OP_FAIL_POS SIZE_OPCODE +#define SIZE_OP_SET_OPTION (SIZE_OPCODE + SIZE_OPTION) +#define SIZE_OP_SET_OPTION_PUSH (SIZE_OPCODE + SIZE_OPTION) +#define SIZE_OP_FAIL SIZE_OPCODE +#define SIZE_OP_MEMORY_START (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_START_PUSH (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_END_PUSH (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_END_PUSH_REC (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_END (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_END_REC (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_PUSH_STOP_BT SIZE_OPCODE +#define SIZE_OP_POP_STOP_BT SIZE_OPCODE +#define SIZE_OP_NULL_CHECK_START (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_NULL_CHECK_END (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_LOOK_BEHIND (SIZE_OPCODE + SIZE_LENGTH) +#define SIZE_OP_PUSH_LOOK_BEHIND_NOT (SIZE_OPCODE + SIZE_RELADDR + SIZE_LENGTH) +#define SIZE_OP_FAIL_LOOK_BEHIND_NOT SIZE_OPCODE +#define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR) +#define SIZE_OP_RETURN SIZE_OPCODE + + +#ifdef REG_DEBUG + +typedef struct { + short int opcode; + char* name; + short int arg_type; +} 
RegOpInfoType; + +extern RegOpInfoType RegOpInfo[]; + +extern void regex_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp)); + +#ifdef REG_DEBUG_STATISTICS +extern void regex_statistics_init P_((void)); +extern void regex_print_statistics P_((FILE* f)); +#endif +#endif + +extern char* regex_error_code_to_format P_((int code)); +extern void regex_snprintf_with_pattern PV_((char buf[], int bufsize, RegCharEncoding enc, char* pat, char* pat_end, char *fmt, ...)); +extern UChar* regex_strdup P_((UChar* s, UChar* end)); +extern int regex_bbuf_init P_((BBuf* buf, int size)); +extern int regex_alloc_init P_((regex_t** reg, RegOptionType option, RegCharEncoding code, RegSyntaxType* syntax)); +extern int regex_compile P_((regex_t* reg, UChar* pattern, UChar* pattern_end, RegErrorInfo* einfo)); +extern void regex_chain_reduce P_((regex_t* reg)); +extern int regex_is_in_wc_range P_((UChar* p, WCINT wc)); + +#endif /* REGINT_H */ diff --git a/ext/mbstring/oniguruma/regparse.c b/ext/mbstring/oniguruma/regparse.c new file mode 100644 index 0000000000..95a55b2a06 --- /dev/null +++ b/ext/mbstring/oniguruma/regparse.c @@ -0,0 +1,4215 @@ +/********************************************************************** + + regparse.c - Oniguruma (regular expression library) + + Copyright (C) 2003 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regparse.h" + +#define WARN_BUFSIZE 256 + +#define SYN_POSIX_COMMON_OP \ + ( REG_SYN_OP_ANYCHAR | REG_SYN_OP_POSIX_BRACKET | REG_SYN_OP_BACK_REF | \ + REG_SYN_OP_CC | REG_SYN_OP_0INF | REG_SYN_OP_LINE_ANCHOR | \ + REG_SYN_OP_ESC_CONTROL_CHAR ) + +#define SYN_GNU_REGEX_OP \ + ( REG_SYN_OP_ANYCHAR | REG_SYN_OP_CC | \ + REG_SYN_OP_POSIX_BRACKET | REG_SYN_OP_BACK_REF | \ + REG_SYN_OP_INTERVAL | REG_SYN_OP_SUBEXP | REG_SYN_OP_ALT | \ + REG_SYN_OP_0INF | REG_SYN_OP_1INF | REG_SYN_OP_01 | \ + REG_SYN_OP_ESC_BUF_ANCHOR | REG_SYN_OP_ESC_WORD | \ + REG_SYN_OP_ESC_WORD_BOUND | 
REG_SYN_OP_ESC_WORD_BEGIN_END | \ + REG_SYN_OP_ESC_WHITE_SPACE | REG_SYN_OP_ESC_DIGIT | \ + REG_SYN_OP_LINE_ANCHOR ) + +#define SYN_GNU_REGEX_BV \ + ( REG_SYN_CONTEXT_INDEP_ANCHORS | REG_SYN_CONTEXT_INDEP_OPS | \ + REG_SYN_CONTEXT_INVALID_OPS | REG_SYN_ALLOW_INVALID_INTERVAL | \ + REG_SYN_ESCAPE_IN_CC | REG_SYN_ALLOW_RANGE_OP_IN_CC ) + +#ifdef USE_VARIABLE_SYNTAX +RegSyntaxType RegSyntaxPosixBasic = { + ( SYN_POSIX_COMMON_OP | REG_SYN_OP_ESC_SUBEXP | REG_SYN_OP_ESC_INTERVAL ) + , 0 + , 0 + , ( REG_OPTION_SINGLELINE | REG_OPTION_MULTILINE ) +}; + +RegSyntaxType RegSyntaxPosixExtended = { + ( SYN_POSIX_COMMON_OP | REG_SYN_OP_SUBEXP | REG_SYN_OP_INTERVAL | + REG_SYN_OP_1INF | REG_SYN_OP_01 | REG_SYN_OP_ALT ) + , 0 + , ( REG_SYN_CONTEXT_INDEP_ANCHORS | + REG_SYN_CONTEXT_INDEP_OPS | REG_SYN_CONTEXT_INVALID_OPS | + REG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP | REG_SYN_ALLOW_RANGE_OP_IN_CC ) + , ( REG_OPTION_SINGLELINE | REG_OPTION_MULTILINE ) +}; + +RegSyntaxType RegSyntaxEmacs = { + ( REG_SYN_OP_ANYCHAR | REG_SYN_OP_CC | REG_SYN_OP_ESC_INTERVAL | + REG_SYN_OP_ESC_SUBEXP | REG_SYN_OP_ESC_ALT | + REG_SYN_OP_0INF | REG_SYN_OP_1INF | REG_SYN_OP_01 | + REG_SYN_OP_BACK_REF | REG_SYN_OP_LINE_ANCHOR | + REG_SYN_OP_ESC_GNU_BUF_ANCHOR | REG_SYN_OP_ESC_CONTROL_CHAR ) + , 0 + , REG_SYN_ALLOW_EMPTY_RANGE_IN_CC + , REG_OPTION_NONE +}; + +RegSyntaxType RegSyntaxGrep = { + ( REG_SYN_OP_ANYCHAR | REG_SYN_OP_CC | REG_SYN_OP_POSIX_BRACKET | + REG_SYN_OP_INTERVAL | REG_SYN_OP_ESC_SUBEXP | REG_SYN_OP_ESC_ALT | + REG_SYN_OP_0INF | REG_SYN_OP_ESC_1INF | REG_SYN_OP_ESC_01 | + REG_SYN_OP_LINE_ANCHOR ) + , 0 + , ( REG_SYN_ALLOW_EMPTY_RANGE_IN_CC | REG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC ) + , REG_OPTION_NONE +}; + +RegSyntaxType RegSyntaxGnuRegex = { + SYN_GNU_REGEX_OP + , 0 + , SYN_GNU_REGEX_BV + , REG_OPTION_NONE +}; + +RegSyntaxType RegSyntaxJava = { + (( SYN_GNU_REGEX_OP | REG_SYN_OP_NON_GREEDY | REG_SYN_OP_SUBEXP_EFFECT | + REG_SYN_OP_ESC_CONTROL_CHAR | REG_SYN_OP_ESC_C_CONTROL | + REG_SYN_OP_QUOTE | 
REG_SYN_OP_ESC_OCTAL3 | REG_SYN_OP_ESC_X_HEX2 ) + & ~REG_SYN_OP_ESC_WORD_BEGIN_END ) + , ( REG_SYN_OP2_OPTION_PERL | + REG_SYN_OP2_POSSESSIVE_REPEAT | REG_SYN_OP2_POSSESSIVE_INTERVAL | + REG_SYN_OP2_CCLASS_SET | REG_SYN_OP2_ESC_V_VTAB | + REG_SYN_OP2_ESC_U_HEX4 ) + , ( SYN_GNU_REGEX_BV | REG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND ) + , REG_OPTION_SINGLELINE +}; + +RegSyntaxType RegSyntaxPerl = { + (( SYN_GNU_REGEX_OP | REG_SYN_OP_NON_GREEDY | REG_SYN_OP_SUBEXP_EFFECT | + REG_SYN_OP_ESC_OCTAL3 | REG_SYN_OP_ESC_X_HEX2 | + REG_SYN_OP_ESC_X_BRACE_HEX8 | REG_SYN_OP_ESC_CONTROL_CHAR | + REG_SYN_OP_ESC_C_CONTROL | REG_SYN_OP_QUOTE ) + & ~REG_SYN_OP_ESC_WORD_BEGIN_END ) + , REG_SYN_OP2_OPTION_PERL + , SYN_GNU_REGEX_BV + , REG_OPTION_SINGLELINE +}; +#endif /* USE_VARIABLE_SYNTAX */ + +RegSyntaxType RegSyntaxRuby = { + (( SYN_GNU_REGEX_OP | REG_SYN_OP_NON_GREEDY | REG_SYN_OP_SUBEXP_EFFECT | + REG_SYN_OP_ESC_OCTAL3 | REG_SYN_OP_ESC_X_HEX2 | + REG_SYN_OP_ESC_X_BRACE_HEX8 | REG_SYN_OP_ESC_CONTROL_CHAR | + REG_SYN_OP_ESC_C_CONTROL ) + & ~REG_SYN_OP_ESC_WORD_BEGIN_END ) + , ( REG_SYN_OP2_OPTION_RUBY | + REG_SYN_OP2_NAMED_SUBEXP | REG_SYN_OP2_SUBEXP_CALL | + REG_SYN_OP2_POSSESSIVE_REPEAT | REG_SYN_OP2_CCLASS_SET | + REG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | + REG_SYN_OP2_ESC_M_BAR_META | REG_SYN_OP2_ESC_V_VTAB ) + , ( SYN_GNU_REGEX_BV | REG_SYN_WARN_FOR_CC_OP_NOT_ESCAPED | + REG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND ) + , REG_OPTION_NONE +}; + +RegSyntaxType* RegDefaultSyntax = REG_SYNTAX_RUBY; + +#ifdef USE_VARIABLE_SYNTAX +extern int +regex_set_default_syntax(RegSyntaxType* syntax) +{ + if (IS_NULL(syntax)) + syntax = REG_SYNTAX_RUBY; + + RegDefaultSyntax = syntax; + return 0; +} +#endif + +static void +bbuf_free(BBuf* bbuf) +{ + if (IS_NOT_NULL(bbuf)) { + if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p); + xfree(bbuf); + } +} + +static int +bbuf_clone(BBuf** rto, BBuf* from) +{ + int r; + BBuf *to; + + *rto = to = (BBuf* )xmalloc(sizeof(BBuf)); + CHECK_NULL_RETURN_VAL(to, REGERR_MEMORY); + r = 
BBUF_INIT(to, from->alloc); + if (r != 0) return r; + to->used = from->used; + xmemcpy(to->p, from->p, from->used); + return 0; +} + +#define WC2MB_MAX_BUFLEN 7 +#define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) + +#define SET_ALL_MULTI_BYTE_RANGE(pbuf) \ + add_wc_range_to_buf(pbuf, (WCINT )0x80, ~((WCINT )0),\ + (UChar )0x80, (UChar )0xff); + +#define ADD_ALL_MULTI_BYTE_RANGE(code, mbuf) do {\ + if (! IS_SINGLEBYTE_CODE(code)) {\ + r = SET_ALL_MULTI_BYTE_RANGE(&(mbuf));\ + if (r) return r;\ + }\ +} while (0) + + +#define BITSET_IS_EMPTY(bs,empty) do {\ + int i;\ + empty = 1;\ + for (i = 0; i < BITSET_SIZE; i++) {\ + if ((bs)[i] != 0) {\ + empty = 0; break;\ + }\ + }\ +} while (0) + +static void +bitset_set_range(BitSetRef bs, int from, int to) +{ + int i; + for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) { + BITSET_SET_BIT(bs, i); + } +} + +static void +bitset_set_all(BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + bs[i] = ~((Bits )0); + } +} + +static void +bitset_invert(BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + bs[i] = ~(bs[i]); + } +} + +static void +bitset_invert_to(BitSetRef from, BitSetRef to) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + to[i] = ~(from[i]); + } +} + +static void +bitset_and(BitSetRef dest, BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + dest[i] &= bs[i]; + } +} + +static void +bitset_or(BitSetRef dest, BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + dest[i] |= bs[i]; + } +} + +static void +bitset_copy(BitSetRef dest, BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + dest[i] = bs[i]; + } +} + +static int +k_strncmp(UChar* s1, UChar* s2, int n) +{ + int x; + + while (n-- > 0) { + x = *s2++ - *s1++; + if (x) return x; + } + return 0; +} + +static void +k_strcpy(UChar* dest, UChar* src, UChar* end) +{ + int len = end - src; + if (len > 0) { + xmemcpy(dest, src, len); + dest[len] = (UChar )0; + } +} + +extern UChar* 
+regex_strdup(UChar* s, UChar* end) +{ + int len = end - s; + + if (len > 0) { + UChar* r = (UChar* )xmalloc(len + 1); + CHECK_NULL_RETURN(r); + xmemcpy(r, s, len); + r[len] = (UChar )0; + return r; + } + else return NULL; +} + +/* scan pattern methods */ +#define PEND_VALUE -1 + +#define PFETCH(c) do { (c) = *p++; } while (0) +#define PUNFETCH p-- +#define PINC p++ +#define PPEEK (p < end ? *p : PEND_VALUE) +#define PEND (p < end ? 0 : 1) + +#ifdef REG_RUBY_M17N + +static int +wc2mb_buf(WCINT wc, UChar **bufs, UChar **bufe, RegCharEncoding enc) +{ + int c, len; + + c = m17n_firstbyte(enc, wc); + len = mblen(enc, c); + if (len > (*bufe - *bufs)) { + *bufs = xmalloc(len); + CHECK_NULL_RETURN_VAL(*bufs, REGERR_MEMORY); + } + m17n_mbcput(enc, wc, *bufs); + *bufe = *bufs + len; + return 0; +} + +#else /* REG_RUBY_M17N */ + +extern int +regex_wc2mb_first(RegCharEncoding code, WCINT wc) +{ + if (code == REGCODE_ASCII) { + return (wc & 0xff); + } + else if (code == REGCODE_UTF8) { + if ((wc & 0xffffff80) == 0) + return wc; + else { + if ((wc & 0xfffff800) == 0) + return ((wc>>6)& 0x1f) | 0xc0; + else if ((wc & 0xffff0000) == 0) + return ((wc>>12) & 0x0f) | 0xe0; + else if ((wc & 0xffe00000) == 0) + return ((wc>>18) & 0x07) | 0xf0; + else if ((wc & 0xfc000000) == 0) + return ((wc>>24) & 0x03) | 0xf8; + else if ((wc & 0x80000000) == 0) + return ((wc>>30) & 0x01) | 0xfc; + else { + return REGERR_TOO_BIG_WIDE_CHAR_VALUE; + } + } + } + else { + int first; + + if ((wc & 0xff0000) != 0) { + first = (wc >> 16) & 0xff; + if (mblen(code, first) != 3) + return REGERR_INVALID_WIDE_CHAR_VALUE; + } + else if ((wc & 0xff00) != 0) { + first = (wc >> 8) & 0xff; + if (mblen(code, first) != 2) + return REGERR_INVALID_WIDE_CHAR_VALUE; + } + else { + if (mblen(code, wc) != 1) + return REGERR_INVALID_WIDE_CHAR_VALUE; + return wc; + } + return first; + } +} + +static int +wc2mb(WCINT wc, UChar buf[], RegCharEncoding code) +{ +#define UTF8_TRAILS(wc, shift) ((((wc) >> (shift)) & 0x3f) | 0x80) 
+#define UTF8_TRAIL0(wc) (((wc) & 0x3f) | 0x80) + + UChar *p = buf; + + if (code == REGCODE_UTF8) { + if ((wc & 0xffffff80) == 0) + *p++ = wc; + else { + if ((wc & 0xfffff800) == 0) { + *p++ = ((wc>>6)& 0x1f) | 0xc0; + } + else if ((wc & 0xffff0000) == 0) { + *p++ = ((wc>>12) & 0x0f) | 0xe0; + *p++ = UTF8_TRAILS(wc, 6); + } + else if ((wc & 0xffe00000) == 0) { + *p++ = ((wc>>18) & 0x07) | 0xf0; + *p++ = UTF8_TRAILS(wc, 12); + *p++ = UTF8_TRAILS(wc, 6); + } + else if ((wc & 0xfc000000) == 0) { + *p++ = ((wc>>24) & 0x03) | 0xf8; + *p++ = UTF8_TRAILS(wc, 18); + *p++ = UTF8_TRAILS(wc, 12); + *p++ = UTF8_TRAILS(wc, 6); + } + else if ((wc & 0x80000000) == 0) { + *p++ = ((wc>>30) & 0x01) | 0xfc; + *p++ = UTF8_TRAILS(wc, 24); + *p++ = UTF8_TRAILS(wc, 18); + *p++ = UTF8_TRAILS(wc, 12); + *p++ = UTF8_TRAILS(wc, 6); + } + else { + return REGERR_TOO_BIG_WIDE_CHAR_VALUE; + } + *p++ = UTF8_TRAIL0(wc); + } + } + else { + if ((wc & 0xff0000) != 0) *p++ = ((wc >> 16) & 0xff); + if ((wc & 0xff00) != 0) *p++ = ((wc >> 8) & 0xff); + *p++ = (wc & 0xff); + + if (mblen(code, buf[0]) != (p - buf)) + return REGERR_INVALID_WIDE_CHAR_VALUE; + } + + return p - buf; +} + +static int +wc2mb_buf(WCINT wc, UChar **bufs, UChar **bufe, RegCharEncoding code) +{ + int r; + r = wc2mb(wc, *bufs, code); + if (r < 0) return r; + + *bufe = (*bufs) + r; + return 0; +} +#endif /* not REG_RUBY_M17N */ + +/* used as function pointer value */ +static int +is_code_ascii(RegCharEncoding code, UChar c) +{ + return (c < 128 ? 
1 : 0); +} + +static int +is_code_graph(RegCharEncoding code, UChar c) { return IS_CODE_GRAPH(code, c); } +static int +is_code_print(RegCharEncoding code, UChar c) { return IS_CODE_PRINT(code, c); } +static int +is_code_alnum(RegCharEncoding code, UChar c) { return IS_CODE_ALNUM(code, c); } +static int +is_code_alpha(RegCharEncoding code, UChar c) { return IS_CODE_ALPHA(code, c); } +static int +is_code_lower(RegCharEncoding code, UChar c) { return IS_CODE_LOWER(code, c); } +static int +is_code_upper(RegCharEncoding code, UChar c) { return IS_CODE_UPPER(code, c); } +static int +is_code_cntrl(RegCharEncoding code, UChar c) { return IS_CODE_CNTRL(code, c); } +static int +is_code_punct(RegCharEncoding code, UChar c) { return IS_CODE_PUNCT(code, c); } +static int +is_code_space(RegCharEncoding code, UChar c) { return IS_CODE_SPACE(code, c); } +static int +is_code_blank(RegCharEncoding code, UChar c) { return IS_CODE_BLANK(code, c); } +static int +is_code_digit(RegCharEncoding code, UChar c) { return IS_CODE_DIGIT(code, c); } +static int +is_code_xdigit(RegCharEncoding code, UChar c) { return IS_CODE_XDIGIT(code, c); } + +static UChar* +k_strcat_capa(UChar* dest, UChar* dest_end, UChar* src, UChar* src_end, int capa) +{ + UChar* r; + + if (dest) + r = (UChar* )xrealloc(dest, capa + 1); + else + r = (UChar* )xmalloc(capa + 1); + + CHECK_NULL_RETURN(r); + k_strcpy(r + (dest_end - dest), src, src_end); + return r; +} + +/* dest on static area */ +static UChar* +strcat_capa_from_static(UChar* dest, UChar* dest_end, + UChar* src, UChar* src_end, int capa) +{ + UChar* r; + + r = (UChar* )xmalloc(capa + 1); + CHECK_NULL_RETURN(r); + k_strcpy(r, dest, dest_end); + k_strcpy(r + (dest_end - dest), src, src_end); + return r; +} + +#ifdef USE_NAMED_SUBEXP + +#define INIT_NAME_BACKREFS_ALLOC_NUM 8 + +typedef struct { + UChar* name; + int name_len; /* byte length */ + int back_num; /* number of backrefs */ + int back_alloc; + int back_ref1; + int* back_refs; +} NameEntry; + +#ifdef 
USE_ST_HASH_TABLE + +#include <st.h> + +typedef st_table NameTable; +typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ + +#define NAMEBUF_SIZE 24 +#define NAMEBUF_SIZE_1 25 + +#ifdef REG_DEBUG +static int +i_print_name_entry(UChar* key, NameEntry* e, void* arg) +{ + int i; + FILE* fp = (FILE* )arg; + + fprintf(fp, "%s: ", e->name); + if (e->back_num == 0) + fputs("-", fp); + else if (e->back_num == 1) + fprintf(fp, "%d", e->back_ref1); + else { + for (i = 0; i < e->back_num; i++) { + if (i > 0) fprintf(fp, ", "); + fprintf(fp, "%d", e->back_refs[i]); + } + } + fputs("\n", fp); + return ST_CONTINUE; +} + +extern int +regex_print_names(FILE* fp, regex_t* reg) +{ + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + fprintf(fp, "name table\n"); + st_foreach(t, i_print_name_entry, (HashDataType )fp); + fputs("\n", fp); + } + return 0; +} +#endif + +static int +i_free_name_entry(UChar* key, NameEntry* e, void* arg) +{ + xfree(e->name); /* == key */ + if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); + return ST_DELETE; +} + +static int +names_clear(regex_t* reg) +{ + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + st_foreach(t, i_free_name_entry, 0); + } + return 0; +} + +extern int +regex_names_free(regex_t* reg) +{ + int r; + NameTable* t; + + r = names_clear(reg); + if (r) return r; + + t = (NameTable* )reg->name_table; + if (IS_NOT_NULL(t)) st_free_table(t); + reg->name_table = (void* )NULL; + return 0; +} + +static NameEntry* +name_find(regex_t* reg, UChar* name, UChar* name_end) +{ + int len; + UChar namebuf[NAMEBUF_SIZE_1]; + UChar *key; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + e = (NameEntry* )NULL; + if (IS_NOT_NULL(t)) { + if (*name_end == '\0') { + key = name; + } + else { + /* dirty, but st.c API claims NULL terminated key. 
*/ + len = name_end - name; + if (len <= NAMEBUF_SIZE) { + xmemcpy(namebuf, name, len); + namebuf[len] = '\0'; + key = namebuf; + } + else { + key = regex_strdup(name, name_end); + if (IS_NULL(key)) return (NameEntry* )NULL; + } + } + + st_lookup(t, (HashDataType )key, (HashDataType * )&e); + if (key != name && key != namebuf) xfree(key); + } + return e; +} + +typedef struct { + int (*func)(UChar*,int,int*,void*); + void* arg; + int ret; +} INamesArg; + +static int +i_names(UChar* key, NameEntry* e, INamesArg* arg) +{ + int r = (*(arg->func))(e->name, e->back_num, + (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), arg->arg); + if (r != 0) { + arg->ret = r; + return ST_STOP; + } + return ST_CONTINUE; +} + +extern int +regex_foreach_name(regex_t* reg, int (*func)(UChar*,int,int*,void*), void* arg) +{ + INamesArg narg; + NameTable* t = (NameTable* )reg->name_table; + + narg.ret = 0; + if (IS_NOT_NULL(t)) { + narg.func = func; + narg.arg = arg; + st_foreach(t, i_names, (HashDataType )&narg); + } + return narg.ret; +} + +#else /* USE_ST_HASH_TABLE */ + +#define INIT_NAMES_ALLOC_NUM 8 + +typedef struct { + NameEntry* e; + int num; + int alloc; +} NameTable; + + +#ifdef REG_DEBUG +extern int +regex_print_names(FILE* fp, regex_t* reg) +{ + int i, j; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t) && t->num > 0) { + fprintf(fp, "name table\n"); + for (i = 0; i < t->num; i++) { + e = &(t->e[i]); + fprintf(fp, "%s: ", e->name); + if (e->back_num == 0) { + fputs("-", fp); + } + else if (e->back_num == 1) { + fprintf(fp, "%d", e->back_ref1); + } + else { + for (j = 0; j < e->back_num; j++) { + if (j > 0) fprintf(fp, ", "); + fprintf(fp, "%d", e->back_refs[j]); + } + } + fputs("\n", fp); + } + fputs("\n", fp); + } + return 0; +} +#endif + +static int +names_clear(regex_t* reg) +{ + int i; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + for (i = 0; i < t->num; i++) { + e = &(t->e[i]); + if 
(IS_NOT_NULL(e->name)) { + xfree(e->name); + e->name = NULL; + e->name_len = 0; + e->back_num = 0; + e->back_alloc = 0; + if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); + e->back_refs = (int* )NULL; + } + } + t->num = 0; + } + return 0; +} + +extern int +regex_names_free(regex_t* reg) +{ + int r; + NameTable* t; + + r = names_clear(reg); + if (r) return r; + + t = (NameTable* )reg->name_table; + if (IS_NOT_NULL(t)) xfree(t); + reg->name_table = NULL; + return 0; +} + +static NameEntry* +name_find(regex_t* reg, UChar* name, UChar* name_end) +{ + int i, len; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + len = name_end - name; + for (i = 0; i < t->num; i++) { + e = &(t->e[i]); + if (len == e->name_len && k_strncmp(name, e->name, len) == 0) + return e; + } + } + return (NameEntry* )NULL; +} + +extern int +regex_foreach_name(regex_t* reg, int (*func)(UChar*,int,int*,void*), void* arg) +{ + int i, r; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + for (i = 0; i < t->num; i++) { + e = &(t->e[i]); + r = (*func)(e->name, e->back_num, + (e->back_num > 1 ? 
e->back_refs : &(e->back_ref1)), arg); + if (r != 0) return r; + } + } + return 0; +} + +#endif /* else USE_ST_HASH_TABLE */ + +static int +name_add(regex_t* reg, UChar* name, UChar* name_end, int backref) +{ + int alloc; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (name_end - name <= 0) + return REGERR_INVALID_SUBEXP_NAME; + + e = name_find(reg, name, name_end); + if (IS_NULL(e)) { +#ifdef USE_ST_HASH_TABLE + if (IS_NULL(t)) { + reg->name_table = t = st_init_strtable(); + } + e = (NameEntry* )xmalloc(sizeof(NameEntry)); + CHECK_NULL_RETURN_VAL(e, REGERR_MEMORY); + + e->name = regex_strdup(name, name_end); + if (IS_NULL(e->name)) return REGERR_MEMORY; + st_insert(t, (HashDataType )e->name, (HashDataType )e); + + e->name_len = name_end - name; + e->back_num = 0; + e->back_alloc = 0; + e->back_refs = (int* )NULL; + +#else + + if (IS_NULL(t)) { + alloc = INIT_NAMES_ALLOC_NUM; + t = (NameTable* )xmalloc(sizeof(NameTable)); + CHECK_NULL_RETURN_VAL(t, REGERR_MEMORY); + t->e = NULL; + t->alloc = 0; + t->num = 0; + + t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc); + if (IS_NULL(t->e)) { + xfree(t); + return REGERR_MEMORY; + } + t->alloc = alloc; + reg->name_table = t; + goto clear; + } + else if (t->num == t->alloc) { + int i; + + alloc = t->alloc * 2; + t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); + CHECK_NULL_RETURN_VAL(t->e, REGERR_MEMORY); + t->alloc = alloc; + + clear: + for (i = t->num; i < t->alloc; i++) { + t->e[i].name = NULL; + t->e[i].name_len = 0; + t->e[i].back_num = 0; + t->e[i].back_alloc = 0; + t->e[i].back_refs = (int* )NULL; + } + } + e = &(t->e[t->num]); + t->num++; + e->name = regex_strdup(name, name_end); + e->name_len = name_end - name; +#endif + } + + e->back_num++; + if (e->back_num == 1) { + e->back_ref1 = backref; + } + else if (e->back_num == 2) { + alloc = INIT_NAME_BACKREFS_ALLOC_NUM; + e->back_refs = (int* )xmalloc(sizeof(int) * alloc); + CHECK_NULL_RETURN_VAL(e->back_refs, REGERR_MEMORY); + 
e->back_alloc = alloc; + e->back_refs[0] = e->back_ref1; + e->back_refs[1] = backref; + } + else { + if (e->back_num > e->back_alloc) { + alloc = e->back_alloc * 2; + e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); + CHECK_NULL_RETURN_VAL(e->back_refs, REGERR_MEMORY); + e->back_alloc = alloc; + } + e->back_refs[e->back_num - 1] = backref; + } + + return 0; +} + +extern int +regex_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, + int** nums) +{ + NameEntry* e; + + e = name_find(reg, name, name_end); + if (IS_NULL(e)) return REGERR_UNDEFINED_NAME_REFERENCE; + + switch (e->back_num) { + case 0: + break; + case 1: + *nums = &(e->back_ref1); + break; + default: + *nums = e->back_refs; + break; + } + return e->back_num; +} + +#else + +extern int +regex_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, + int** nums) +{ + return REG_NO_SUPPORT_CONFIG; +} + +extern int +regex_foreach_name(regex_t* reg, int (*func)(UChar*,int,int*,void*), void* arg) +{ + return REG_NO_SUPPORT_CONFIG; +} +#endif + + +#define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16 + +static void +scan_env_clear(ScanEnv* env) +{ + int i; + + BIT_STATUS_CLEAR(env->backtrack_mem); + BIT_STATUS_CLEAR(env->backrefed_mem); + env->error = (UChar* )NULL; + env->error_end = (UChar* )NULL; + env->num_call = 0; + env->num_mem = 0; + env->mem_alloc = 0; + env->mem_nodes_dynamic = (Node** )NULL; + + for (i = 0; i < SCANENV_MEMNODES_SIZE; i++) + env->mem_nodes_static[i] = NULL_NODE; +} + +static int +scan_env_add_mem_entry(ScanEnv* env) +{ + int i, need, alloc; + Node** p; + + need = env->num_mem + 1; + if (need >= SCANENV_MEMNODES_SIZE) { + if (env->mem_alloc <= need) { + if (IS_NULL(env->mem_nodes_dynamic)) { + alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE; + p = (Node** )xmalloc(sizeof(Node*) * alloc); + xmemcpy(p, env->mem_nodes_static, + sizeof(Node*) * SCANENV_MEMNODES_SIZE); + } + else { + alloc = env->mem_alloc * 2; + p = (Node** )xrealloc(env->mem_nodes_dynamic, 
sizeof(Node*) * alloc); + } + CHECK_NULL_RETURN_VAL(p, REGERR_MEMORY); + + for (i = env->num_mem + 1; i < alloc; i++) + p[i] = NULL_NODE; + + env->mem_nodes_dynamic = p; + env->mem_alloc = alloc; + } + } + + env->num_mem++; + return env->num_mem; +} + +static int +scan_env_set_mem_node(ScanEnv* env, int num, Node* node) +{ + if (env->num_mem >= num) + SCANENV_MEM_NODES(env)[num] = node; + else + return REGERR_INVALID_BACKREF; + return 0; +} + + +#ifdef USE_RECYCLE_NODE +typedef struct _FreeNode { + struct _FreeNode* next; +} FreeNode; + +static FreeNode* FreeNodeList = (FreeNode* )NULL; +#endif + +extern void +regex_node_free(Node* node) +{ + if (IS_NULL(node)) return ; + + switch (NTYPE(node)) { + case N_STRING: + if (IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) { + xfree(NSTRING(node).s); + } + break; + + case N_LIST: + case N_ALT: + regex_node_free(NCONS(node).left); + regex_node_free(NCONS(node).right); + break; + + case N_CCLASS: + if (NCCLASS(node).mbuf) + bbuf_free(NCCLASS(node).mbuf); + break; + + case N_QUALIFIER: + if (NQUALIFIER(node).target) + regex_node_free(NQUALIFIER(node).target); + break; + + case N_EFFECT: + if (NEFFECT(node).target) + regex_node_free(NEFFECT(node).target); + break; + + case N_BACKREF: + if (IS_NOT_NULL(NBACKREF(node).back_dynamic)) + xfree(NBACKREF(node).back_dynamic); + break; + + case N_ANCHOR: + if (NANCHOR(node).target) + regex_node_free(NANCHOR(node).target); + break; + } + +#ifdef USE_RECYCLE_NODE + { + FreeNode* n; + + n = (FreeNode* )node; + n->next = FreeNodeList; + FreeNodeList = n; + } +#else + xfree(node); +#endif +} + +#ifdef USE_RECYCLE_NODE +extern int +regex_free_node_list() +{ + FreeNode* n; + + THREAD_ATOMIC_START; + while (FreeNodeList) { + n = FreeNodeList; + FreeNodeList = FreeNodeList->next; + xfree(n); + } + THREAD_ATOMIC_END; + return 0; +} +#endif + +static Node* +node_new() +{ + Node* node; + +#ifdef USE_RECYCLE_NODE + if (IS_NOT_NULL(FreeNodeList)) { + node = (Node* 
)FreeNodeList; + FreeNodeList = FreeNodeList->next; + return node; + } +#endif + + node = (Node* )xmalloc(sizeof(Node)); + return node; +} + + +static void +initialize_cclass(CClassNode* cc) +{ + BITSET_CLEAR(cc->bs); + cc->not = 0; + cc->mbuf = NULL; +} + +static Node* +node_new_cclass() +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_CCLASS; + + initialize_cclass(&(NCCLASS(node))); + return node; +} + +static Node* +node_new_ctype(int type) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_CTYPE; + NCTYPE(node).type = type; + return node; +} + +static Node* +node_new_anychar() +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_ANYCHAR; + return node; +} + +static Node* +node_new_list(Node* left, Node* right) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_LIST; + NCONS(node).left = left; + NCONS(node).right = right; + return node; +} + +static Node* +node_new_alt(Node* left, Node* right) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_ALT; + NCONS(node).left = left; + NCONS(node).right = right; + return node; +} + +extern Node* +regex_node_new_anchor(int type) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_ANCHOR; + NANCHOR(node).type = type; + NANCHOR(node).target = NULL; + NANCHOR(node).char_len = -1; + return node; +} + +static Node* +node_new_backref(int back_num, int* backrefs, ScanEnv* env) +{ + int i; + Node* node = node_new(); + + CHECK_NULL_RETURN(node); + node->type = N_BACKREF; + NBACKREF(node).state = 0; + NBACKREF(node).back_num = back_num; + NBACKREF(node).back_dynamic = (int* )NULL; + + for (i = 0; i < back_num; i++) { + if (backrefs[i] <= env->num_mem && + IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) { + NBACKREF(node).state |= NST_RECURSION; /* /...(\1).../ */ + break; + } + } + + if (back_num <= NODE_BACKREFS_SIZE) { + for (i = 0; i < back_num; i++) + NBACKREF(node).back_static[i] = backrefs[i]; 
+ } + else { + int* p = (int* )xmalloc(sizeof(int) * back_num); + if (IS_NULL(p)) { + regex_node_free(node); + return NULL; + } + NBACKREF(node).back_dynamic = p; + for (i = 0; i < back_num; i++) + p[i] = backrefs[i]; + } + return node; +} + +#ifdef USE_SUBEXP_CALL +static Node* +node_new_call(UChar* name, UChar* name_end) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + + node->type = N_CALL; + NCALL(node).state = 0; + NCALL(node).ref_num = CALLNODE_REFNUM_UNDEF; + NCALL(node).target = NULL_NODE; + NCALL(node).name = name; + NCALL(node).name_end = name_end; + return node; +} +#endif + +static Node* +node_new_qualifier(int lower, int upper, int by_number) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_QUALIFIER; + NQUALIFIER(node).target = NULL; + NQUALIFIER(node).lower = lower; + NQUALIFIER(node).upper = upper; + NQUALIFIER(node).greedy = 1; + NQUALIFIER(node).by_number = by_number; + NQUALIFIER(node).target_may_empty = 0; + NQUALIFIER(node).head_exact = NULL_NODE; + NQUALIFIER(node).next_head_exact = NULL_NODE; + NQUALIFIER(node).is_refered = 0; + return node; +} + +static Node* +node_new_effect(int type) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_EFFECT; + NEFFECT(node).type = type; + NEFFECT(node).state = 0; + NEFFECT(node).regnum = 0; + NEFFECT(node).option = 0; + NEFFECT(node).target = NULL; + NEFFECT(node).call_addr = -1; + NEFFECT(node).opt_count = 0; + return node; +} + +extern Node* +regex_node_new_effect(int type) +{ + return node_new_effect(type); +} + +static Node* +node_new_option(RegOptionType option) +{ + Node* node = node_new_effect(EFFECT_OPTION); + CHECK_NULL_RETURN(node); + NEFFECT(node).option = option; + return node; +} + +extern int +regex_node_str_cat(Node* node, UChar* s, UChar* end) +{ + int addlen = end - s; + + if (addlen > 0) { + int len = NSTRING(node).end - NSTRING(node).s; + + if (NSTRING(node).capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) { + UChar* p; + int 
capa = len + addlen + NODE_STR_MARGIN; + + if (capa <= NSTRING(node).capa) { + k_strcpy(NSTRING(node).s + len, s, end); + } + else { + if (NSTRING(node).s == NSTRING(node).buf) + p = strcat_capa_from_static(NSTRING(node).s, NSTRING(node).end, + s, end, capa); + else + p = k_strcat_capa(NSTRING(node).s, NSTRING(node).end, s, end, capa); + + CHECK_NULL_RETURN_VAL(p, REGERR_MEMORY); + NSTRING(node).s = p; + NSTRING(node).capa = capa; + } + } + else { + k_strcpy(NSTRING(node).s + len, s, end); + } + NSTRING(node).end = NSTRING(node).s + len + addlen; + } + + return 0; +} + +static int +node_str_cat_char(Node* node, UChar c) +{ + UChar s[1]; + + s[0] = c; + return regex_node_str_cat(node, s, s + 1); +} + +extern void +regex_node_conv_to_str_node(Node* node, int flag) +{ + node->type = N_STRING; + + NSTRING(node).flag = flag; + NSTRING(node).capa = 0; + NSTRING(node).s = NSTRING(node).buf; + NSTRING(node).end = NSTRING(node).buf; +} + +static Node* +node_new_str(UChar* s, UChar* end) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + + node->type = N_STRING; + NSTRING(node).capa = 0; + NSTRING(node).flag = 0; + NSTRING(node).s = NSTRING(node).buf; + NSTRING(node).end = NSTRING(node).buf; + if (regex_node_str_cat(node, s, end)) { + regex_node_free(node); + return NULL; + } + return node; +} + +static Node* +node_new_str_raw(UChar* s, UChar* end) +{ + Node* node = node_new_str(s, end); + NSTRING_SET_RAW(node); + return node; +} + +static Node* +node_new_empty() +{ + return node_new_str(NULL, NULL); +} + +static Node* +node_new_str_char(UChar c) +{ + UChar p[1]; + + p[0] = c; + return node_new_str(p, p + 1); +} + +static Node* +node_new_str_raw_char(UChar c) +{ + UChar p[1]; + + p[0] = c; + return node_new_str_raw(p, p + 1); +} + +static Node* +str_node_split_last_char(StrNode* sn, RegCharEncoding enc) +{ + UChar *p; + Node* n = NULL_NODE; + + if (sn->end > sn->s) { + p = regex_get_prev_char_head(enc, sn->s, sn->end); + if (p && p > sn->s) { /* can be splitted. 
*/ + n = node_new_str(p, sn->end); + if ((sn->flag & NSTR_RAW) != 0) + NSTRING_SET_RAW(n); + sn->end = p; + } + } + return n; +} + +static int +str_node_can_be_split(StrNode* sn, RegCharEncoding enc) +{ + if (sn->end > sn->s) { + return ((mblen(enc, *(sn->s)) < sn->end - sn->s) ? 1 : 0); + } + return 0; +} + +extern int +regex_scan_unsigned_number(UChar** src, UChar* end, RegCharEncoding enc) +{ + unsigned int num, val; + int c; + UChar* p = *src; + + num = 0; + while (!PEND) { + PFETCH(c); + if (IS_CODE_DIGIT(enc, c)) { + val = (unsigned int )DIGITVAL(c); + if ((INT_MAX_LIMIT - val) / 10UL < num) + return -1; /* overflow */ + + num = num * 10 + val; + } + else { + PUNFETCH; + break; + } + } + *src = p; + return num; +} + +static int +scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, + RegCharEncoding enc) +{ + int c; + unsigned int num, val; + UChar* p = *src; + + num = 0; + while (!PEND && maxlen-- != 0) { + PFETCH(c); + if (IS_CODE_XDIGIT(enc, c)) { + val = (unsigned int )XDIGITVAL(c); + if ((INT_MAX_LIMIT - val) / 16UL < num) + return -1; /* overflow */ + + num = (num << 4) + XDIGITVAL(c); + } + else { + PUNFETCH; + break; + } + } + *src = p; + return num; +} + +static int +scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, + RegCharEncoding enc) +{ + int c; + unsigned int num, val; + UChar* p = *src; + + num = 0; + while (!PEND && maxlen-- != 0) { + PFETCH(c); + if (IS_CODE_ODIGIT(enc, c)) { + val = ODIGITVAL(c); + if ((INT_MAX_LIMIT - val) / 8UL < num) + return -1; /* overflow */ + + num = (num << 3) + val; + } + else { + PUNFETCH; + break; + } + } + *src = p; + return num; +} + + +#define BBUF_WRITE_WCINT(bbuf,pos,wc) \ + BBUF_WRITE(bbuf, pos, &(wc), SIZE_WCINT) + +/* data format: + [multi-byte-head-BitSet][n][from-1][to-1][from-2][to-2] ... 
[from-n][to-n] + (all data size is WCINT) + */ +static int +new_wc_range(BBuf** pbuf) +{ +#define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_WCINT * 5) + int r; + WCINT n; + BBuf* bbuf; + + bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf)); + CHECK_NULL_RETURN_VAL(*pbuf, REGERR_MEMORY); + r = BBUF_INIT(*pbuf, SIZE_BITSET + INIT_MULTI_BYTE_RANGE_SIZE); + if (r) return r; + + n = 0; + BBUF_WRITE_WCINT(bbuf, SIZE_BITSET, n); + BITSET_CLEAR((BitSetRef )bbuf->p); + return 0; +} + +static int +add_wc_range_to_buf(BBuf** pbuf, WCINT from, WCINT to, UChar cfrom, UChar cto) +{ + int r, inc_n, pos; + int low, high, bound, x; + WCINT n, *data; + BBuf* bbuf; + + if (from > to) { + n = from; from = to; to = n; + } + + if (IS_NULL(*pbuf)) { + r = new_wc_range(pbuf); + if (r) return r; + bbuf = *pbuf; + n = 0; + } + else { + bbuf = *pbuf; + GET_WCINT(n, bbuf->p + SIZE_BITSET); + } + data = (WCINT* )(bbuf->p + SIZE_BITSET); + data++; + + for (low = 0, bound = n; low < bound; ) { + x = (low + bound) >> 1; + if (from > data[x*2 + 1]) + low = x + 1; + else + bound = x; + } + + for (high = low, bound = n; high < bound; ) { + x = (high + bound) >> 1; + if (to >= data[x*2] - 1) + high = x + 1; + else + bound = x; + } + + inc_n = low + 1 - high; + if (n + inc_n > REG_MAX_MULTI_BYTE_RANGES_NUM) + return REGERR_TOO_MANY_MULTI_BYTE_RANGES; + + if (inc_n != 1) { + if (from > data[low*2]) + from = data[low*2]; + if (to < data[(high - 1)*2 + 1]) + to = data[(high - 1)*2 + 1]; + } + + if (inc_n != 0 && high < n) { + int from_pos = SIZE_BITSET + SIZE_WCINT * (1 + high * 2); + int to_pos = SIZE_BITSET + SIZE_WCINT * (1 + (low + 1) * 2); + int size = (n - high) * 2 * SIZE_WCINT; + + if (inc_n > 0) { + BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size); + } + else { + BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos); + } + } + + pos = SIZE_BITSET + SIZE_WCINT * (1 + low * 2); + BBUF_ENSURE_SIZE(bbuf, pos + SIZE_WCINT * 2); + BBUF_WRITE_WCINT(bbuf, pos, from); + BBUF_WRITE_WCINT(bbuf, pos + SIZE_WCINT, to); + n += inc_n; 
+ BBUF_WRITE_WCINT(bbuf, SIZE_BITSET, n); + + if (inc_n > 0) { + int i; + UChar tmp; + + if (cfrom > cto) { + tmp = cfrom; cfrom = cto; cto = tmp; + } + + for (i = cfrom; i <= cto; i++) { + BITSET_SET_BIT((BitSetRef)bbuf->p, i); + } + } + return 0; +} + +static int +add_wc_range(BBuf** pbuf, ScanEnv* env, WCINT from, WCINT to) +{ + int cfrom, cto; + + if (from > to) { + if (IS_SYNTAX_BV(env->syntax, REG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + return 0; + else + return REGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + + cfrom = WC2MB_FIRST(env->enc, from); + if (cfrom < 0) return cfrom; + cto = WC2MB_FIRST(env->enc, to); + if (cto < 0) return cto; + return add_wc_range_to_buf(pbuf, from, to, (UChar )cfrom, (UChar )cto); +} + +static int +not_wc_range_buf(BBuf* bbuf, BBuf** pbuf) +{ + int r, i, n; + WCINT pre, from, to, *data; + + *pbuf = (BBuf* )NULL; + if (IS_NULL(bbuf)) { + set_all: + return SET_ALL_MULTI_BYTE_RANGE(pbuf); + } + + data = (WCINT* )(bbuf->p + SIZE_BITSET); + GET_WCINT(n, data); + data++; + if (n <= 0) goto set_all; + + r = 0; + pre = 0x80; + for (i = 0; i < n; i++) { + from = data[i*2]; + to = data[i*2+1]; + if (pre <= from - 1) { + r = add_wc_range_to_buf(pbuf, pre, from - 1, (UChar )0, (UChar )0); + if (r != 0) return r; + } + if (to == ~((WCINT )0)) break; + pre = to + 1; + } + if (to < ~((WCINT )0)) { + r = add_wc_range_to_buf(pbuf, to + 1, ~((WCINT )0), (UChar )0, (UChar )0); + } + return r; +} + +#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\ + BBuf *tbuf; \ + int tnot; \ + tnot = not1; not1 = not2; not2 = tnot; \ + tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \ +} while (0) + +static int +or_wc_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) +{ + int i, r; + WCINT n1, *data1; + WCINT from, to; + + *pbuf = (BBuf* )NULL; + if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { + if (not1 != 0 || not2 != 0) + return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return 0; + } + + r = 0; + if (IS_NULL(bbuf2)) + SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); + + if 
(IS_NULL(bbuf1)) { + if (not1 != 0) { + return SET_ALL_MULTI_BYTE_RANGE(pbuf); + } + else { + if (not2 == 0) { + return bbuf_clone(pbuf, bbuf2); + } + else { + return not_wc_range_buf(bbuf2, pbuf); + } + } + } + + if (not1 != 0) + SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); + + data1 = (WCINT* )(bbuf1->p + SIZE_BITSET); + GET_WCINT(n1, data1); + data1++; + + if (not2 == 0 && not1 == 0) { /* 1 OR 2 */ + r = bbuf_clone(pbuf, bbuf2); + } + else if (not1 == 0) { /* 1 OR (not 2) */ + r = not_wc_range_buf(bbuf2, pbuf); + } + if (r != 0) return r; + + for (i = 0; i < n1; i++) { + from = data1[i*2]; + to = data1[i*2+1]; + r = add_wc_range_to_buf(pbuf, from, to, (UChar )0, (UChar )0); + if (r != 0) return r; + } + return 0; +} + +static int +and_wc_range1(BBuf** pbuf, WCINT from1, WCINT to1, WCINT* data, int n) +{ + int i, r; + WCINT from2, to2; + + for (i = 0; i < n; i++) { + from2 = data[i*2]; + to2 = data[i*2+1]; + if (from2 < from1) { + if (to2 < from1) continue; + else { + from1 = to2 + 1; + } + } + else if (from2 <= to1) { + if (to2 < to1) { + if (from1 <= from2 - 1) { + r = add_wc_range_to_buf(pbuf, from1, from2-1, (UChar )0, (UChar )0); + if (r != 0) return r; + } + from1 = to2 + 1; + } + else { + to1 = from2 - 1; + } + } + else { + from1 = from2; + } + if (from1 > to1) break; + } + if (from1 <= to1) { + r = add_wc_range_to_buf(pbuf, from1, to1, (UChar )0, (UChar )0); + if (r != 0) return r; + } + return 0; +} + +static int +and_wc_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) +{ + int i, j, r; + WCINT n1, n2, *data1, *data2; + WCINT from, to, from1, to1, from2, to2; + + *pbuf = (BBuf* )NULL; + if (IS_NULL(bbuf1)) { + if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */ + return bbuf_clone(pbuf, bbuf2); + return 0; + } + else if (IS_NULL(bbuf2)) { + if (not2 != 0) + return bbuf_clone(pbuf, bbuf1); + return 0; + } + + if (not1 != 0) + SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); + + data1 = (WCINT* )(bbuf1->p + SIZE_BITSET); + data2 = 
(WCINT* )(bbuf2->p + SIZE_BITSET); + GET_WCINT(n1, data1); + GET_WCINT(n2, data2); + data1++; + data2++; + + if (not2 == 0 && not1 == 0) { /* 1 AND 2 */ + for (i = 0; i < n1; i++) { + from1 = data1[i*2]; + to1 = data1[i*2+1]; + for (j = 0; j < n2; j++) { + from2 = data2[j*2]; + to2 = data2[j*2+1]; + if (from2 > to1) break; + if (to2 < from1) continue; + from = MAX(from1, from2); + to = MIN(to1, to2); + r = add_wc_range_to_buf(pbuf, from, to, (UChar )0, (UChar )0); + if (r != 0) return r; + } + } + } + else if (not1 == 0) { /* 1 AND (not 2) */ + for (i = 0; i < n1; i++) { + from1 = data1[i*2]; + to1 = data1[i*2+1]; + r = and_wc_range1(pbuf, from1, to1, data2, n2); + if (r != 0) return r; + } + } + + return 0; +} + +static int +and_cclass(CClassNode* dest, CClassNode* cc) +{ + int r, not1, not2; + BBuf *buf1, *buf2, *pbuf; + BitSetRef bsr1, bsr2; + BitSet bs1, bs2; + + not1 = dest->not; + bsr1 = dest->bs; + buf1 = dest->mbuf; + not2 = cc->not; + bsr2 = cc->bs; + buf2 = cc->mbuf; + + if (not1 != 0) { + bitset_invert_to(bsr1, bs1); + bsr1 = bs1; + } + if (not2 != 0) { + bitset_invert_to(bsr2, bs2); + bsr2 = bs2; + } + bitset_and(bsr1, bsr2); + if (bsr1 != dest->bs) { + bitset_copy(dest->bs, bsr1); + bsr1 = dest->bs; + } + if (not1 != 0) { + bitset_invert(dest->bs); + } + + if (not1 != 0 && not2 != 0) { + r = or_wc_range_buf(buf1, 0, buf2, 0, &pbuf); + } + else { + r = and_wc_range_buf(buf1, not1, buf2, not2, &pbuf); + if (r == 0 && not1 != 0) { + BBuf *tbuf; + r = not_wc_range_buf(pbuf, &tbuf); + if (r != 0) { + bbuf_free(pbuf); + return r; + } + bbuf_free(pbuf); + pbuf = tbuf; + } + } + if (r != 0) return r; + + dest->mbuf = pbuf; + bbuf_free(buf1); + if (IS_NOT_NULL(pbuf)) { + bitset_set_all((BitSetRef )pbuf->p); /* Sorry, but I'm tired. 
*/ + } + return r; +} + +static int +or_cclass(CClassNode* dest, CClassNode* cc) +{ + int r, not1, not2; + BBuf *buf1, *buf2, *pbuf; + BitSetRef bsr1, bsr2; + BitSet bs1, bs2; + + not1 = dest->not; + bsr1 = dest->bs; + buf1 = dest->mbuf; + not2 = cc->not; + bsr2 = cc->bs; + buf2 = cc->mbuf; + + if (not1 != 0) { + bitset_invert_to(bsr1, bs1); + bsr1 = bs1; + } + if (not2 != 0) { + bitset_invert_to(bsr2, bs2); + bsr2 = bs2; + } + bitset_or(bsr1, bsr2); + if (bsr1 != dest->bs) { + bitset_copy(dest->bs, bsr1); + bsr1 = dest->bs; + } + if (not1 != 0) { + bitset_invert(dest->bs); + } + + if (not1 != 0 && not2 != 0) { + r = and_wc_range_buf(buf1, 0, buf2, 0, &pbuf); + } + else { + r = or_wc_range_buf(buf1, not1, buf2, not2, &pbuf); + if (r == 0 && not1 != 0) { + BBuf *tbuf; + r = not_wc_range_buf(pbuf, &tbuf); + if (r != 0) { + bbuf_free(pbuf); + return r; + } + bbuf_free(pbuf); + pbuf = tbuf; + } + } + if (r != 0) return r; + + dest->mbuf = pbuf; + bbuf_free(buf1); + if (IS_NOT_NULL(pbuf)) { + bitset_set_all((BitSetRef )pbuf->p); /* Sorry, but I'm tired. */ + } + return r; +} + +static int +conv_backslash_value(int c, ScanEnv* env) +{ + if (IS_SYNTAX_OP(env->syntax, REG_SYN_OP_ESC_CONTROL_CHAR)) { + switch (c) { + case 'n': return '\n'; + case 't': return '\t'; + case 'r': return '\r'; + case 'f': return '\f'; + case 'a': return '\007'; + case 'b': return '\010'; + case 'e': return '\033'; + case 'v': + if (IS_SYNTAX_OP2(env->syntax, REG_SYN_OP2_ESC_V_VTAB)) + return '\v'; + break; + + default: + break; + } + } + return c; +} + +static int +is_invalid_qualifier_target(Node* node) +{ + switch (NTYPE(node)) { + case N_ANCHOR: + return 1; + break; + + case N_EFFECT: + if (NEFFECT(node).type == EFFECT_OPTION) + return is_invalid_qualifier_target(NEFFECT(node).target); + break; + + case N_LIST: /* ex. (?:\G\A)* */ + do { + if (! 
is_invalid_qualifier_target(NCONS(node).left)) return 0; + } while (IS_NOT_NULL(node = NCONS(node).right)); + return 0; + break; + + case N_ALT: /* ex. (?:abc|\A)* */ + do { + if (is_invalid_qualifier_target(NCONS(node).left)) return 1; + } while (IS_NOT_NULL(node = NCONS(node).right)); + break; + + default: + break; + } + return 0; +} + +/* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ +static int +popular_qualifier_num(QualifierNode* qf) +{ + if (qf->greedy) { + if (qf->lower == 0) { + if (qf->upper == 1) return 0; + else if (IS_REPEAT_INFINITE(qf->upper)) return 1; + } + else if (qf->lower == 1) { + if (IS_REPEAT_INFINITE(qf->upper)) return 2; + } + } + else { + if (qf->lower == 0) { + if (qf->upper == 1) return 3; + else if (IS_REPEAT_INFINITE(qf->upper)) return 4; + } + else if (qf->lower == 1) { + if (IS_REPEAT_INFINITE(qf->upper)) return 5; + } + } + return -1; +} + +static void +reduce_nested_qualifier(Node* pnode, Node* cnode) +{ +#define NQ_ASIS 0 /* as is */ +#define NQ_DEL 1 /* delete parent */ +#define NQ_A 2 /* to '*' */ +#define NQ_AQ 3 /* to '*?' */ +#define NQ_QQ 4 /* to '??' */ +#define NQ_P_QQ 5 /* to '+)??' */ +#define NQ_PQ_Q 6 /* to '+?)?' */ + + static char reduces[][6] = { + {NQ_DEL, NQ_A, NQ_A, NQ_QQ, NQ_AQ, NQ_ASIS}, /* '?' */ + {NQ_DEL, NQ_DEL, NQ_DEL, NQ_P_QQ, NQ_P_QQ, NQ_DEL}, /* '*' */ + {NQ_A, NQ_A, NQ_DEL, NQ_ASIS, NQ_P_QQ, NQ_DEL}, /* '+' */ + {NQ_DEL, NQ_AQ, NQ_AQ, NQ_DEL, NQ_AQ, NQ_AQ}, /* '??' */ + {NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL}, /* '*?' */ + {NQ_ASIS, NQ_PQ_Q, NQ_DEL, NQ_AQ, NQ_AQ, NQ_DEL} /* '+?' 
*/ + }; + + int pnum, cnum; + QualifierNode *p, *c; + + p = &(NQUALIFIER(pnode)); + c = &(NQUALIFIER(cnode)); + pnum = popular_qualifier_num(p); + cnum = popular_qualifier_num(c); + + switch(reduces[cnum][pnum]) { + case NQ_DEL: + *p = *c; + break; + case NQ_A: + p->target = c->target; + p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; + break; + case NQ_AQ: + p->target = c->target; + p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; + break; + case NQ_QQ: + p->target = c->target; + p->lower = 0; p->upper = 1; p->greedy = 0; + break; + case NQ_P_QQ: + p->target = cnode; + p->lower = 0; p->upper = 1; p->greedy = 0; + c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; + return ; + break; + case NQ_PQ_Q: + p->target = cnode; + p->lower = 0; p->upper = 1; p->greedy = 1; + c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; + return ; + break; + case NQ_ASIS: + p->target = cnode; + return ; + break; + } + + c->target = NULL_NODE; + regex_node_free(cnode); +} + + +enum TokenSyms { + TK_EOT = 0, /* end of token */ + TK_BYTE = 1, + TK_RAW_BYTE = 2, + TK_WC, + TK_ANYCHAR, + TK_CHAR_TYPE, + TK_BACKREF, + TK_CALL, + TK_ANCHOR, + TK_OP_REPEAT, + TK_INTERVAL, + TK_ALT, + TK_SUBEXP_OPEN, + TK_SUBEXP_CLOSE, + TK_CC_OPEN, + TK_QUOTE_OPEN, + /* in cc */ + TK_CC_CLOSE, + TK_CC_RANGE, + TK_POSIX_BRACKET_OPEN, + TK_CC_AND, /* && */ + TK_CC_CC_OPEN /* [ */ +}; + +typedef struct { + enum TokenSyms type; + int escaped; + int base; /* is number: 8, 16 (used in [....]) */ + UChar* backp; + union { + int c; + WCINT wc; + int anchor; + int subtype; + struct { + int lower; + int upper; + int greedy; + int possessive; + } repeat; + struct { + int num; + int ref1; + int* refs; + } backref; + struct { + UChar* name; + UChar* name_end; + } call; + } u; +} RegToken; + + +static int +fetch_range_qualifier(UChar** src, UChar* end, RegToken* tok, ScanEnv* env) +{ + int low, up, syn_allow; + int c; + UChar* p = *src; + + syn_allow = IS_SYNTAX_BV(env->syntax, 
REG_SYN_ALLOW_INVALID_INTERVAL); + + if (PEND) { + if (syn_allow) + return 1; /* "....{" : OK! */ + else + return REGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */ + } + + if (! syn_allow) { + c = PPEEK; + if (c == ')' || c == '(' || c == '|') { + return REGERR_END_PATTERN_AT_LEFT_BRACE; + } + } + + low = regex_scan_unsigned_number(&p, end, env->enc); + if (low < 0) return REGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + if (low > REG_MAX_REPEAT_NUM) + return REGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + + if (p == *src) goto invalid; /* can't read low */ + + if (PEND) goto invalid; + PFETCH(c); + if (c == ',') { + UChar* prev = p; + up = regex_scan_unsigned_number(&p, end, env->enc); + if (up < 0) return REGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + if (up > REG_MAX_REPEAT_NUM) + return REGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + + if (p == prev) up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ + } + else { + PUNFETCH; + up = low; /* {n} : exact n times */ + } + + if (PEND) goto invalid; + PFETCH(c); + if (IS_SYNTAX_OP(env->syntax, REG_SYN_OP_ESC_INTERVAL)) { + if (c != '\\') goto invalid; + PFETCH(c); + } + if (c != '}') goto invalid; + + if (!IS_REPEAT_INFINITE(up) && low > up) { + return REGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; + } + + tok->type = TK_INTERVAL; + tok->u.repeat.lower = low; + tok->u.repeat.upper = up; + *src = p; + return 0; + + invalid: + if (syn_allow) + return 1; /* OK */ + else + return REGERR_INVALID_REPEAT_RANGE_PATTERN; +} + +/* \M-, \C-, \c, or \... 
*/ +static int +fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) +{ + int c; + UChar* p = *src; + + if (PEND) return REGERR_END_PATTERN_AT_BACKSLASH; + + PFETCH(c); + switch (c) { + case 'M': + if (IS_SYNTAX_OP2(env->syntax, REG_SYN_OP2_ESC_M_BAR_META)) { + if (PEND) return REGERR_END_PATTERN_AT_META; + PFETCH(c); + if (c != '-') return REGERR_META_CODE_SYNTAX; + if (PEND) return REGERR_END_PATTERN_AT_META; + PFETCH(c); + if (c == '\\') { + c = fetch_escaped_value(&p, end, env); + if (c < 0) return c; + } + c = ((c & 0xff) | 0x80); + } + else + goto backslash; + break; + + case 'C': + if (IS_SYNTAX_OP2(env->syntax, REG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) { + if (PEND) return REGERR_END_PATTERN_AT_CONTROL; + PFETCH(c); + if (c != '-') return REGERR_CONTROL_CODE_SYNTAX; + goto control; + } + else + goto backslash; + + case 'c': + if (IS_SYNTAX_OP(env->syntax, REG_SYN_OP_ESC_C_CONTROL)) { + control: + if (PEND) return REGERR_END_PATTERN_AT_CONTROL; + PFETCH(c); + if (c == '\\') { + c = fetch_escaped_value(&p, end, env); + if (c < 0) return c; + } + else if (c == '?') + c = 0177; + else + c &= 0x9f; + break; + } + /* fall through */ + + default: + { + backslash: + c = conv_backslash_value(c, env); + } + break; + } + + *src = p; + return c; +} + +static int fetch_token(RegToken* tok, UChar** src, UChar* end, ScanEnv* env); + +#ifdef USE_NAMED_SUBEXP +static int +fetch_name(UChar** src, UChar* end, UChar** name_end, ScanEnv* env) +{ + int len; + int c = 0; + UChar *p = *src; + + while (!PEND) { + *name_end = p; + PFETCH(c); + if (c == '>') break; + else if (c == ')' || c == '\\' || c == '\0') + return REGERR_INVALID_SUBEXP_NAME; + + len = mblen(env->enc, c); + while (!PEND && len-- > 1) { + PFETCH(c); + } + } + if (c != '>') return REGERR_INVALID_SUBEXP_NAME; + *src = p; + return 0; +} +#endif + +static void +CC_ESC_WARN(ScanEnv* env, UChar *c) +{ +#ifdef WARNING + if (IS_SYNTAX_BV(env->syntax, REG_SYN_WARN_FOR_CC_OP_NOT_ESCAPED) && + IS_SYNTAX_BV(env->syntax, 
REG_SYN_ESCAPE_IN_CC)) { + char buf[WARN_BUFSIZE]; + regex_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, + env->pattern, env->pattern_end, + "character class has '%s' without escape", c); + WARNING(buf); + } +#endif +} + +static void +CCEND_ESC_WARN(ScanEnv* env, UChar* c) +{ +#ifdef WARNING + if (IS_SYNTAX_BV((env)->syntax, REG_SYN_WARN_FOR_CC_OP_NOT_ESCAPED)) { + char buf[WARN_BUFSIZE]; + regex_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc, + (env)->pattern, (env)->pattern_end, + "regular expression has '%s' without escape", c); + WARNING(buf); + } +#endif +} + +static UChar* +find_str_position(WCINT s[], int n, UChar* from, UChar* to, UChar **next, + RegCharEncoding enc) +{ + int i; + WCINT x; + UChar *q; + UChar *p = from; + + while (p < to) { + x = mb2wc(p, to, enc); + q = p + mblen(enc, *p); + if (x == s[0]) { + for (i = 1; i < n && q < to; i++) { + x = mb2wc(q, to, enc); + if (x != s[i]) break; + q += mblen(enc, *q); + } + if (i >= n) { + if (IS_NOT_NULL(next)) + *next = q; + return p; + } + } + p = q; + } + return NULL_UCHARP; +} + +static int +str_exist_check_with_esc(WCINT s[], int n, UChar* from, UChar* to, + WCINT bad, RegCharEncoding enc) +{ + int i, in_esc; + WCINT x; + UChar *q; + UChar *p = from; + + in_esc = 0; + while (p < to) { + if (in_esc) { + in_esc = 0; + p += mblen(enc, *p); + } + else { + x = mb2wc(p, to, enc); + q = p + mblen(enc, *p); + if (x == s[0]) { + for (i = 1; i < n && q < to; i++) { + x = mb2wc(q, to, enc); + if (x != s[i]) break; + q += mblen(enc, *q); + } + if (i >= n) return 1; + p += mblen(enc, *p); + } + else { + x = mb2wc(p, to, enc); + if (x == bad) return 0; + else if (x == '\\') in_esc = 1; + p = q; + } + } + } + return 0; +} + +static int +fetch_token_in_cc(RegToken* tok, UChar** src, UChar* end, ScanEnv* env) +{ + int c, num; + RegSyntaxType* syn = env->syntax; + UChar* prev; + UChar* p = *src; + + if (PEND) { + tok->type = TK_EOT; + return tok->type; + } + + PFETCH(c); + tok->type = TK_BYTE; + tok->base = 0; 
+ tok->u.c = c; + if (c == ']') { + tok->type = TK_CC_CLOSE; + } + else if (c == '-') { + tok->type = TK_CC_RANGE; + } + else if (c == '\\') { + if (! IS_SYNTAX_BV(syn, REG_SYN_ESCAPE_IN_CC)) + goto end; + + if (PEND) return REGERR_END_PATTERN_AT_BACKSLASH; + + PFETCH(c); + tok->escaped = 1; + tok->u.c = c; + switch (c) { + case 'w': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_WORD; + break; + case 'W': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_WORD; + break; + case 'd': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_DIGIT; + break; + case 'D': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_DIGIT; + break; + case 's': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_WHITE_SPACE; + break; + case 'S': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_WHITE_SPACE; + break; + + case 'x': + if (PEND) break; + + prev = p; + if (PPEEK == '{' && IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_X_BRACE_HEX8)) { + PINC; + num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + if (num < 0) return REGERR_TOO_BIG_WIDE_CHAR_VALUE; + if (!PEND && IS_XDIGIT(*p) && p - prev >= 9) + return REGERR_TOO_LONG_WIDE_CHAR_VALUE; + + if (p > prev + 1 && !PEND && PPEEK == '}') { + PINC; + tok->type = TK_WC; + tok->base = 16; + tok->u.wc = (WCINT )num; + } + else { + /* can't read nothing or invalid format */ + p = prev; + } + } + else if (IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_X_HEX2)) { + num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + if (num < 0) return REGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 16; + tok->u.c = num; + } + break; + + case 'u': + if (PEND) break; + + prev = p; + if (IS_SYNTAX_OP2(syn, REG_SYN_OP2_ESC_U_HEX4)) { + num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + if (num < 0) return REGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. 
*/ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 16; + tok->u.c = num; + } + break; + + case '0': + case '1': case '2': case '3': case '4': case '5': case '6': case '7': + if (IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_OCTAL3)) { + PUNFETCH; + prev = p; + num = scan_unsigned_octal_number(&p, end, 3, env->enc); + if (num < 0) return REGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 8; + tok->u.c = num; + } + break; + + default: + PUNFETCH; + num = fetch_escaped_value(&p, end, env); + if (num < 0) return num; + if (tok->u.c != num) { + tok->u.c = num; + tok->type = TK_RAW_BYTE; + } + break; + } + } + else if (c == '[') { + if (IS_SYNTAX_OP(syn, REG_SYN_OP_POSIX_BRACKET) && PPEEK == ':') { + WCINT send[] = { (WCINT )':', (WCINT )']' }; + tok->backp = p; /* point at '[' is readed */ + PINC; + if (str_exist_check_with_esc(send, 2, p, end, (WCINT )']', env->enc)) { + tok->type = TK_POSIX_BRACKET_OPEN; + } + else { + PUNFETCH; + goto cc_in_cc; + } + } + else { + cc_in_cc: + if (IS_SYNTAX_OP2(syn, REG_SYN_OP2_CCLASS_SET)) { + tok->type = TK_CC_CC_OPEN; + } + else { + CC_ESC_WARN(env, "["); + } + } + } + else if (c == '&') { + if (IS_SYNTAX_OP2(syn, REG_SYN_OP2_CCLASS_SET) && !PEND && PPEEK == '&') { + PINC; + tok->type = TK_CC_AND; + } + } + + end: + *src = p; + return tok->type; +} + +static int +fetch_token(RegToken* tok, UChar** src, UChar* end, ScanEnv* env) +{ + int r, c, num; + RegSyntaxType* syn = env->syntax; + UChar* prev; + UChar* p = *src; + + start: + if (PEND) { + tok->type = TK_EOT; + return tok->type; + } + + tok->type = TK_BYTE; + tok->base = 0; + PFETCH(c); + if (c == '\\') { + if (PEND) return REGERR_END_PATTERN_AT_BACKSLASH; + + PFETCH(c); + tok->u.c = c; + tok->escaped = 1; + switch (c) { + case '*': + if (! 
IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_0INF)) break; + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 0; + tok->u.repeat.upper = REPEAT_INFINITE; + goto greedy_check; + break; + + case '+': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_1INF)) break; + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 1; + tok->u.repeat.upper = REPEAT_INFINITE; + goto greedy_check; + break; + + case '?': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_01)) break; + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 0; + tok->u.repeat.upper = 1; + greedy_check: + if (!PEND && PPEEK == '?' && IS_SYNTAX_OP(syn, REG_SYN_OP_NON_GREEDY)) { + PFETCH(c); + tok->u.repeat.greedy = 0; + tok->u.repeat.possessive = 0; + } + else if (!PEND && PPEEK == '+' && + ((IS_SYNTAX_OP2(syn, REG_SYN_OP2_POSSESSIVE_REPEAT) && + tok->type != TK_INTERVAL) || + (IS_SYNTAX_OP2(syn, REG_SYN_OP2_POSSESSIVE_INTERVAL) && + tok->type == TK_INTERVAL))) { + PFETCH(c); + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 1; + } + else { + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 0; + } + break; + + case '{': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_INTERVAL)) break; + tok->backp = p; + r = fetch_range_qualifier(&p, end, tok, env); + if (r < 0) return r; /* error */ + if (r > 0) { + /* normal char */ + } + else + goto greedy_check; + break; + + case '|': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_ALT)) break; + tok->type = TK_ALT; + break; + + case '(': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_SUBEXP)) break; + tok->type = TK_SUBEXP_OPEN; + break; + + case ')': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_SUBEXP)) break; + tok->type = TK_SUBEXP_CLOSE; + break; + + case 'w': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_WORD)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_WORD; + break; + + case 'W': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_WORD)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_WORD; + break; + + case 'b': + if (! 
IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_WORD_BOUND)) break; + tok->type = TK_ANCHOR; + tok->u.anchor = ANCHOR_WORD_BOUND; + break; + + case 'B': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_WORD_BOUND)) break; + tok->type = TK_ANCHOR; + tok->u.anchor = ANCHOR_NOT_WORD_BOUND; + break; + +#ifdef USE_WORD_BEGIN_END + case '<': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_WORD_BEGIN_END)) break; + tok->type = TK_ANCHOR; + tok->u.anchor = ANCHOR_WORD_BEGIN; + break; + + case '>': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_WORD_BEGIN_END)) break; + tok->type = TK_ANCHOR; + tok->u.anchor = ANCHOR_WORD_END; + break; +#endif + + case 's': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_WHITE_SPACE)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_WHITE_SPACE; + break; + + case 'S': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_WHITE_SPACE)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_WHITE_SPACE; + break; + + case 'd': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_DIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_DIGIT; + break; + + case 'D': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_DIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_DIGIT; + break; + + case 'A': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_BUF_ANCHOR)) break; + begin_buf: + tok->type = TK_ANCHOR; + tok->u.subtype = ANCHOR_BEGIN_BUF; + break; + + case 'Z': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_BUF_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = ANCHOR_SEMI_END_BUF; + break; + + case 'z': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_BUF_ANCHOR)) break; + end_buf: + tok->type = TK_ANCHOR; + tok->u.subtype = ANCHOR_END_BUF; + break; + + case 'G': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_BUF_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = ANCHOR_BEGIN_POSITION; + break; + + case '`': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_GNU_BUF_ANCHOR)) break; + goto begin_buf; + break; + + case '\'': + if (! 
IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_GNU_BUF_ANCHOR)) break; + goto end_buf; + break; + + case 'x': + if (PEND) break; + + prev = p; + if (PPEEK == '{' && IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_X_BRACE_HEX8)) { + PINC; + num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + if (num < 0) return REGERR_TOO_BIG_WIDE_CHAR_VALUE; + if (!PEND && IS_XDIGIT(*p) && p - prev >= 9) + return REGERR_TOO_LONG_WIDE_CHAR_VALUE; + + if (p > prev + 1 && !PEND && PPEEK == '}') { + PINC; + tok->type = TK_WC; + tok->u.wc = (WCINT )num; + } + else { + /* can't read nothing or invalid format */ + p = prev; + } + } + else if (IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_X_HEX2)) { + num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + if (num < 0) return REGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 16; + tok->u.c = num; + } + break; + + case 'u': + if (PEND) break; + + prev = p; + if (IS_SYNTAX_OP2(syn, REG_SYN_OP2_ESC_U_HEX4)) { + num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + if (num < 0) return REGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 16; + tok->u.c = num; + } + break; + + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + PUNFETCH; + prev = p; + num = regex_scan_unsigned_number(&p, end, env->enc); + if (num < 0) return REGERR_TOO_BIG_NUMBER; + if (num > REG_MAX_BACKREF_NUM) return REGERR_TOO_BIG_BACKREF_NUMBER; + + if (IS_SYNTAX_OP(syn, REG_SYN_OP_BACK_REF) && + (num <= env->num_mem || num <= 9)) { /* This spec. 
from GNU regex */ + if (IS_SYNTAX_BV(syn, REG_SYN_STRICT_CHECK_BACKREF)) { + if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num])) + return REGERR_INVALID_BACKREF; + } + + tok->type = TK_BACKREF; + tok->u.backref.num = 1; + tok->u.backref.ref1 = num; + break; + } + else if (c == '8' || c == '9') { + /* normal char */ + p = prev; PINC; + break; + } + + p = prev; + /* fall through */ + case '0': + if (IS_SYNTAX_OP(syn, REG_SYN_OP_ESC_OCTAL3)) { + prev = p; + num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), env->enc); + if (num < 0) return REGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 8; + tok->u.c = num; + } + else if (c != '0') { + PINC; + } + break; + +#ifdef USE_NAMED_SUBEXP + case 'k': + if (IS_SYNTAX_OP2(syn, REG_SYN_OP2_NAMED_SUBEXP)) { + PFETCH(c); + if (c == '<') { + UChar* name_end; + int* backs; + + prev = p; + r = fetch_name(&p, end, &name_end, env); + if (r < 0) return r; + num = regex_name_to_group_numbers(env->reg, prev, name_end, &backs); + if (num <= 0) { + regex_scan_env_set_error_string(env, + REGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); + return REGERR_UNDEFINED_NAME_REFERENCE; + } + if (IS_SYNTAX_BV(syn, REG_SYN_STRICT_CHECK_BACKREF)) { + int i; + for (i = 0; i < num; i++) { + if (backs[i] > env->num_mem || + IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) + return REGERR_INVALID_BACKREF; + } + } + + tok->type = TK_BACKREF; + if (num == 1) { + tok->u.backref.num = 1; + tok->u.backref.ref1 = backs[0]; + } + else { + tok->u.backref.num = num; + tok->u.backref.refs = backs; + } + } + else + PUNFETCH; + } + break; +#endif + +#ifdef USE_SUBEXP_CALL + case 'g': + if (IS_SYNTAX_OP2(syn, REG_SYN_OP2_SUBEXP_CALL)) { + PFETCH(c); + if (c == '<') { + UChar* name_end; + + prev = p; + r = fetch_name(&p, end, &name_end, env); + if (r < 0) return r; + + tok->type = TK_CALL; + tok->u.call.name = prev; + tok->u.call.name_end = name_end; + } 
+ else + PUNFETCH; + } + break; +#endif + + case 'Q': + if (IS_SYNTAX_OP(syn, REG_SYN_OP_QUOTE)) { + tok->type = TK_QUOTE_OPEN; + } + break; + + default: + PUNFETCH; + num = fetch_escaped_value(&p, end, env); + if (num < 0) return num; + /* set_raw: */ + if (tok->u.c != num) { + tok->type = TK_RAW_BYTE; + tok->u.c = num; + } + break; + } + } + else { + tok->u.c = c; + tok->escaped = 0; + + switch (c) { + case '.': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ANYCHAR)) break; + tok->type = TK_ANYCHAR; + break; + + case '*': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_0INF)) break; + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 0; + tok->u.repeat.upper = REPEAT_INFINITE; + goto greedy_check; + break; + + case '+': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_1INF)) break; + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 1; + tok->u.repeat.upper = REPEAT_INFINITE; + goto greedy_check; + break; + + case '?': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_01)) break; + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 0; + tok->u.repeat.upper = 1; + goto greedy_check; + break; + + case '{': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_INTERVAL)) break; + tok->backp = p; + r = fetch_range_qualifier(&p, end, tok, env); + if (r < 0) return r; /* error */ + if (r > 0) { + /* normal char */ + } + else + goto greedy_check; + break; + + case '|': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_ALT)) break; + tok->type = TK_ALT; + break; + + case '(': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_SUBEXP)) break; + tok->type = TK_SUBEXP_OPEN; + break; + + case ')': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_SUBEXP)) break; + tok->type = TK_SUBEXP_CLOSE; + break; + + case '^': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_LINE_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = (IS_SINGLELINE(env->option) + ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE); + break; + + case '$': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_LINE_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = (IS_SINGLELINE(env->option) + ? 
ANCHOR_END_BUF : ANCHOR_END_LINE); + break; + + case '[': + if (! IS_SYNTAX_OP(syn, REG_SYN_OP_CC)) break; + tok->type = TK_CC_OPEN; + break; + + case ']': + if (*src > env->pattern) /* /].../ is allowed. */ + CCEND_ESC_WARN(env, "]"); + break; + + case '#': + if (IS_EXTEND(env->option)) { + while (!PEND) { + PFETCH(c); + if (IS_NEWLINE(c)) + break; + } + goto start; + break; + } + break; + + case ' ': case '\t': case '\n': case '\r': case '\f': + if (IS_EXTEND(env->option)) + goto start; + break; + + default: + break; + } + } + + *src = p; + return tok->type; +} + +static void +bitset_by_pred_func(BitSetRef bs, int (*pf)(RegCharEncoding, UChar), + RegCharEncoding code, int not) +{ + int c; + + if (not) { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! pf(code, (UChar )c)) BITSET_SET_BIT(bs, c); + } + } + else { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (pf(code, (UChar )c)) BITSET_SET_BIT(bs, c); + } + } +} + +typedef struct { + UChar *name; + int (*pf)(RegCharEncoding, UChar); + short int len; +} PosixBracketEntryType; + +static int +parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) +{ +#define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 +#define POSIX_BRACKET_NAME_MAX_LEN 6 + + static PosixBracketEntryType PBS[] = { + { "alnum", is_code_alnum, 5 }, + { "alpha", is_code_alpha, 5 }, + { "blank", is_code_blank, 5 }, + { "cntrl", is_code_cntrl, 5 }, + { "digit", is_code_digit, 5 }, + { "graph", is_code_graph, 5 }, + { "lower", is_code_lower, 5 }, + { "print", is_code_print, 5 }, + { "punct", is_code_punct, 5 }, + { "space", is_code_space, 5 }, + { "upper", is_code_upper, 5 }, + { "xdigit", is_code_xdigit, 6 }, + { "ascii", is_code_ascii, 5 }, /* I don't know origin. Perl? 
*/ + { (UChar* )NULL, is_code_alnum, 0 } + }; + + PosixBracketEntryType *pb; + int not, i, c; + UChar *p = *src; + + if (PPEEK == '^') { + PINC; + not = 1; + } + else + not = 0; + + if (end - p < POSIX_BRACKET_NAME_MAX_LEN + 1) + goto not_posix_bracket; + + for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { + if (k_strncmp(p, pb->name, pb->len) == 0) { + p += pb->len; + if (end - p < 2 || *p != ':' || *(p+1) != ']') + return REGERR_INVALID_POSIX_BRACKET_TYPE; + + bitset_by_pred_func(cc->bs, pb->pf, env->enc, not); + PINC; PINC; + *src = p; + return 0; + } + } + + not_posix_bracket: + c = 0; + i = 0; + while (!PEND && ((c = PPEEK) != ':') && c != ']') { + PINC; + if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; + } + if (c == ':' && !PEND) { + PINC; + if (!PEND) { + PFETCH(c); + if (c == ']') + return REGERR_INVALID_POSIX_BRACKET_TYPE; + } + } + + return 1; /* 1: is not POSIX bracket, but no error. */ +} + + +enum CCSTATE { + CCS_VALUE, + CCS_RANGE, + CCS_COMPLETE, + CCS_START +}; + +enum CCVALTYPE { + CCV_SB, + CCV_WC, + CCV_CLASS +}; + +static int +next_state_class(CClassNode* cc, RegToken* tok, WCINT* vs, + enum CCVALTYPE* type, enum CCSTATE* state, ScanEnv* env) +{ + int r, c; + + if (*state == CCS_RANGE) + return REGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; + + if (*state == CCS_VALUE && *type != CCV_CLASS) { + if (*type == CCV_SB) + BITSET_SET_BIT(cc->bs, (int )(*vs)); + else if (*type == CCV_WC) { + r = add_wc_range(&(cc->mbuf), env, *vs, *vs); + if (r < 0) return r; + } + } + + if (tok->type == TK_CHAR_TYPE) { + switch (tok->u.subtype) { + case CTYPE_WORD: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (IS_CODE_WORD(env->enc, c)) BITSET_SET_BIT(cc->bs, c); + } + ADD_ALL_MULTI_BYTE_RANGE(env->enc, cc->mbuf); + break; + case CTYPE_NOT_WORD: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! 
IS_CODE_WORD(env->enc, c)) BITSET_SET_BIT(cc->bs, c); + } + break; + case CTYPE_WHITE_SPACE: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (IS_CODE_SPACE(env->enc, c)) BITSET_SET_BIT(cc->bs, c); + } + break; + case CTYPE_NOT_WHITE_SPACE: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! IS_CODE_SPACE(env->enc, c)) BITSET_SET_BIT(cc->bs, c); + } + ADD_ALL_MULTI_BYTE_RANGE(env->enc, cc->mbuf); + break; + case CTYPE_DIGIT: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (IS_CODE_DIGIT(env->enc, c)) BITSET_SET_BIT(cc->bs, c); + } + break; + case CTYPE_NOT_DIGIT: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! IS_CODE_DIGIT(env->enc, c)) BITSET_SET_BIT(cc->bs, c); + } + ADD_ALL_MULTI_BYTE_RANGE(env->enc, cc->mbuf); + break; + default: + return REGERR_PARSER_BUG; + break; + } + } + else { /* TK_POSIX_BRACKET_OPEN */ + /* nothing */ + } + + *state = CCS_VALUE; + *type = CCV_CLASS; + return 0; +} + +static int +next_state_val(CClassNode* cc, WCINT *vs, WCINT v, int* vs_israw, int v_israw, + enum CCVALTYPE intype, enum CCVALTYPE* type, + enum CCSTATE* state, ScanEnv* env) +{ + int r; + + switch (*state) { + case CCS_VALUE: + if (*type == CCV_SB) + BITSET_SET_BIT(cc->bs, (int )(*vs)); + else if (*type == CCV_WC) { + r = add_wc_range(&(cc->mbuf), env, *vs, *vs); + if (r < 0) return r; + } + break; + + case CCS_RANGE: + if (intype == *type) { + if (intype == CCV_SB) { + if (IS_IGNORECASE(env->option) && (*vs_israw == 0 && v_israw == 0)) { + int low, high; + + low = TOLOWER(env->enc, *vs); + high = TOLOWER(env->enc, v); + if (low > high) { + if (IS_SYNTAX_BV(env->syntax, REG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return REGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + + if (low < 'A' && high >= 'a' && high <= 'z') { + bitset_set_range(cc->bs, low, (int )'A' - 1); + bitset_set_range(cc->bs, (int )'a', high); + } + else if (high > 'z' && low >= 'a' && low <= 'z') { + bitset_set_range(cc->bs, low, (int )'z'); + bitset_set_range(cc->bs, (int )'z' + 1, high); + } 
+ else { + bitset_set_range(cc->bs, low, high); + } + } + else { + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, REG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return REGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )v); + } + } + else { + r = add_wc_range(&(cc->mbuf), env, *vs, v); + if (r < 0) return r; + } + } + else { +#ifndef REG_RUBY_M17N + if (env->enc == REGCODE_UTF8 && intype == CCV_WC && *type == CCV_SB) { + bitset_set_range(cc->bs, (int )*vs, 0x7f); + r = add_wc_range(&(cc->mbuf), env, (WCINT )0x80, v); + if (r < 0) return r; + } + else +#endif + return REGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE; + } + ccs_range_end: + *state = CCS_COMPLETE; + break; + + case CCS_COMPLETE: + case CCS_START: + *state = CCS_VALUE; + break; + + default: + break; + } + + *vs_israw = v_israw; + *vs = v; + *type = intype; + return 0; +} + +static int +char_exist_check(UChar c, UChar* from, UChar* to, int ignore_escaped, + RegCharEncoding enc) +{ + int in_esc; + UChar* p = from; + + in_esc = 0; + while (p < to) { + if (ignore_escaped && in_esc) { + in_esc = 0; + } + else { + if (*p == c) return 1; + if (*p == '\\') in_esc = 1; + } + p += mblen(enc, *p); + } + return 0; +} + +static int +parse_char_class(Node** np, RegToken* tok, UChar** src, UChar* end, + ScanEnv* env) +{ + int r, neg, len, fetched, and_start; + WCINT v, vs; + UChar *p; + Node* node; + CClassNode *cc, *prev_cc; + CClassNode work_cc; + + enum CCSTATE state; + enum CCVALTYPE val_type, in_type; + int val_israw, in_israw; + + *np = NULL_NODE; + r = fetch_token_in_cc(tok, src, end, env); + if (r == TK_BYTE && tok->u.c == '^') { + neg = 1; + r = fetch_token_in_cc(tok, src, end, env); + } + else { + neg = 0; + } + + if (r < 0) return r; + if (r == TK_CC_CLOSE) { + if (! char_exist_check(']', *src, env->pattern_end, 1, env->enc)) + return REGERR_EMPTY_CHAR_CLASS; + + CC_ESC_WARN(env, "]"); + r = tok->type = TK_BYTE; /* allow []...] 
*/ + } + + *np = node = node_new_cclass(); + CHECK_NULL_RETURN_VAL(node, REGERR_MEMORY); + cc = &(NCCLASS(node)); + prev_cc = (CClassNode* )NULL; + + and_start = 0; + state = CCS_START; + p = *src; + while (r != TK_CC_CLOSE) { + fetched = 0; + switch (r) { + case TK_BYTE: + len = mblen(env->enc, tok->u.c); + if (len > 1) { + PUNFETCH; + v = MB2WC(p, end, env->enc); + p += len; + } + else { + sb_char: + v = (WCINT )tok->u.c; + } + in_israw = 0; + goto val_entry; + break; + + case TK_RAW_BYTE: + len = mblen(env->enc, tok->u.c); + if (len > 1 && tok->base != 0) { /* tok->base != 0 : octal or hexadec. */ + UChar buf[WC2MB_MAX_BUFLEN]; + UChar* bufp = buf; + UChar* bufe = buf + WC2MB_MAX_BUFLEN; + int i, base = tok->base; + + if (len > WC2MB_MAX_BUFLEN) { + bufp = (UChar* )xmalloc(len); + if (IS_NULL(bufp)) { + r = REGERR_MEMORY; + goto err; + } + bufe = bufp + len; + } + bufp[0] = tok->u.c; + for (i = 1; i < len; i++) { + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto raw_byte_err; + if (r != TK_RAW_BYTE || tok->base != base) break; + bufp[i] = tok->u.c; + } + if (i < len) { + r = REGERR_TOO_SHORT_MULTI_BYTE_STRING; + raw_byte_err: + if (bufp != buf) xfree(bufp); + goto err; + } + v = MB2WC(bufp, bufe, env->enc); + fetched = 1; + if (bufp != buf) xfree(bufp); + } + else { + v = (WCINT )tok->u.c; + } + in_israw = 1; + goto val_entry; + break; + + case TK_WC: + v = tok->u.wc; + in_israw = 1; + val_entry: + in_type = (v < SINGLE_BYTE_SIZE ? 
CCV_SB : CCV_WC); + r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type, + &state, env); + if (r != 0) goto err; + break; + + case TK_POSIX_BRACKET_OPEN: + r = parse_posix_bracket(cc, &p, end, env); + if (r < 0) goto err; + if (r == 1) { /* is not POSIX bracket */ + CC_ESC_WARN(env, "["); + p = tok->backp; + v = (WCINT )tok->u.c; + in_israw = 0; + goto val_entry; + } + /* POSIX bracket fall */ + case TK_CHAR_TYPE: + r = next_state_class(cc, tok, &vs, &val_type, &state, env); + if (r != 0) goto err; + break; + + case TK_CC_RANGE: + if (state == CCS_VALUE) { + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + fetched = 1; + if (r == TK_CC_CLOSE) { /* allow [x-] */ + range_end_val: + v = (WCINT )'-'; + in_israw = 0; + goto val_entry; + } + else if (r == TK_CC_AND) { + CC_ESC_WARN(env, "-"); + goto range_end_val; + } + state = CCS_RANGE; + } + else if (state == CCS_START) { + /* [-xa] is allowed */ + v = (WCINT )tok->u.c; + in_israw = 0; + + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + fetched = 1; + /* [--x] or [a&&-x] is warned. 
*/ + if (r == TK_CC_RANGE || and_start != 0) + CC_ESC_WARN(env, "-"); + + goto val_entry; + } + else if (state == CCS_RANGE) { + CC_ESC_WARN(env, "-"); + goto sb_char; /* [!--x] is allowed */ + } + else { /* CCS_COMPLETE */ + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + fetched = 1; + if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ + else if (r == TK_CC_AND) { + CC_ESC_WARN(env, "-"); + goto range_end_val; + } + + if (IS_SYNTAX_BV(env->syntax, REG_SYN_ALLOW_RANGE_OP_IN_CC)) { + CC_ESC_WARN(env, "-"); + goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */ + } + r = REGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; + goto err; + } + break; + + case TK_CC_CC_OPEN: /* [ */ + { + Node *anode; + CClassNode* acc; + + r = parse_char_class(&anode, tok, &p, end, env); + if (r != 0) goto cc_open_err; + acc = &(NCCLASS(anode)); + r = or_cclass(cc, acc); + + cc_open_err: + regex_node_free(anode); + if (r != 0) goto err; + } + break; + + case TK_CC_AND: /* && */ + { + if (state == CCS_VALUE) { + r = next_state_val(cc, &vs, 0, &val_israw, 0, CCV_SB, + &val_type, &state, env); + if (r != 0) goto err; + } + /* initialize local variables */ + and_start = 1; + state = CCS_START; + + if (IS_NOT_NULL(prev_cc)) { + r = and_cclass(prev_cc, cc); + if (r != 0) goto err; + } + else { + prev_cc = cc; + cc = &work_cc; + } + initialize_cclass(cc); + } + break; + + case TK_EOT: + r = REGERR_PREMATURE_END_OF_CHAR_CLASS; + goto err; + break; + default: + r = REGERR_PARSER_BUG; + goto err; + break; + } + + if (fetched) + r = tok->type; + else { + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + } + } + + if (state == CCS_VALUE) { + r = next_state_val(cc, &vs, 0, &val_israw, 0, CCV_SB, + &val_type, &state, env); + if (r != 0) goto err; + } + + if (IS_NOT_NULL(prev_cc)) { + r = and_cclass(prev_cc, cc); + if (r != 0) goto err; + cc = prev_cc; + } + + cc->not = neg; + if (cc->not != 0 && + IS_SYNTAX_BV(env->syntax, REG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { 
+ int is_empty; + + is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); + if (is_empty != 0) + BITSET_IS_EMPTY(cc->bs, is_empty); + if (is_empty == 0) + BITSET_SET_BIT(cc->bs, NEWLINE); + } + *src = p; + return 0; + + err: + regex_node_free(*np); + return r; +} + +static int parse_subexp(Node** top, RegToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env); + +static int +parse_effect(Node** np, RegToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env) +{ + Node *target; + RegOptionType option; + int r, c, num; + UChar* p = *src; + + *np = NULL; + if (PEND) return REGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; + + option = env->option; + if (PPEEK == '?' && IS_SYNTAX_OP(env->syntax, REG_SYN_OP_SUBEXP_EFFECT)) { + PINC; + if (PEND) return REGERR_END_PATTERN_IN_GROUP; + + PFETCH(c); + switch (c) { + case '#': /* (?#...) comment */ + while (1) { + if (PEND) return REGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + if (c == ')') break; + } + *src = p; + return 3; /* 3: comment */ + break; + + case ':': /* (?:...) grouping only */ + goto group; + break; + + case '=': + *np = regex_node_new_anchor(ANCHOR_PREC_READ); + break; + case '!': /* preceding read */ + *np = regex_node_new_anchor(ANCHOR_PREC_READ_NOT); + break; + case '>': /* (?>...) stop backtrack */ + *np = node_new_effect(EFFECT_STOP_BACKTRACK); + break; + + case '<': /* look behind (?<=...), (?<!...) 
*/ + PFETCH(c); + if (c == '=') + *np = regex_node_new_anchor(ANCHOR_LOOK_BEHIND); + else if (c == '!') + *np = regex_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT); +#ifdef USE_NAMED_SUBEXP + else if (IS_SYNTAX_OP2(env->syntax, REG_SYN_OP2_NAMED_SUBEXP)) { + UChar *name; + UChar *name_end; + PUNFETCH; + name = p; + r = fetch_name(&p, end, &name_end, env); + if (r < 0) return r; + + *np = node_new_effect(EFFECT_MEMORY); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + num = scan_env_add_mem_entry(env); + if (num < 0) return num; + NEFFECT(*np).regnum = num; + r = name_add(env->reg, name, name_end, num); + if (r != 0) return r; + } +#endif + else + return REGERR_UNDEFINED_GROUP_OPTION; + break; + +#ifdef USE_POSIXLINE_OPTION + case 'p': +#endif + case '-': case 'i': case 'm': case 's': case 'x': + { + int neg = 0; + + while (1) { + switch (c) { + case ':': + case ')': + break; + + case '-': neg = 1; break; + case 'x': ONOFF(option, REG_OPTION_EXTEND, neg); break; + case 'i': ONOFF(option, REG_OPTION_IGNORECASE, neg); break; + case 's': + if (IS_SYNTAX_OP2(env->syntax, REG_SYN_OP2_OPTION_PERL)) { + ONOFF(option, REG_OPTION_MULTILINE, neg); + } + else + return REGERR_UNDEFINED_GROUP_OPTION; + break; + + case 'm': + if (IS_SYNTAX_OP2(env->syntax, REG_SYN_OP2_OPTION_PERL)) { + ONOFF(option, REG_OPTION_SINGLELINE, (neg == 0 ? 
1 : 0)); + } + else if (IS_SYNTAX_OP2(env->syntax, REG_SYN_OP2_OPTION_RUBY)) { + ONOFF(option, REG_OPTION_MULTILINE, neg); + } + else + return REGERR_UNDEFINED_GROUP_OPTION; + break; +#ifdef USE_POSIXLINE_OPTION + case 'p': + ONOFF(option, REG_OPTION_MULTILINE|REG_OPTION_SINGLELINE, neg); + break; +#endif + default: + return REGERR_UNDEFINED_GROUP_OPTION; + } + + if (c == ')') { /* option only */ + if (option == env->option) { + *np = node_new_empty(); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + *src = p; + return 0; + } + else { + *np = node_new_option(option); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + *src = p; + return 2; /* option only */ + } + } + else if (c == ':') { + if (env->option == option) { + group: + r = fetch_token(tok, &p, end, env); + if (r < 0) return r; + r = parse_subexp(np, tok, term, &p, end, env); + if (r < 0) return r; + *src = p; + return 1; /* group */ + } + else { + RegOptionType prev = env->option; + + env->option = option; + r = fetch_token(tok, &p, end, env); + if (r < 0) return r; + r = parse_subexp(&target, tok, term, &p, end, env); + env->option = prev; + if (r < 0) return r; + *np = node_new_option(option); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + NEFFECT(*np).target = target; + *src = p; + return 0; + } + } + + if (PEND) return REGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + } + } + break; + + default: + return REGERR_UNDEFINED_GROUP_OPTION; + } + } + else { +#ifdef USE_NAMED_SUBEXP + if (IS_REG_OPTION_ON(env->option, REG_OPTION_CAPTURE_ONLY_NAMED_GROUP)) { + goto group; + } +#endif + *np = node_new_effect(EFFECT_MEMORY); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + num = scan_env_add_mem_entry(env); + if (num < 0) return num; + NEFFECT(*np).regnum = num; + } + + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + r = fetch_token(tok, &p, end, env); + if (r < 0) return r; + r = parse_subexp(&target, tok, term, &p, end, env); + if (r < 0) return r; + + if (NTYPE(*np) == N_ANCHOR) + NANCHOR(*np).target = target; + else + 
NEFFECT(*np).target = target; + + *src = p; + return 0; +} + +static int +set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) +{ + QualifierNode* qn; + + qn = &(NQUALIFIER(qnode)); + if (qn->lower == 1 && qn->upper == 1) { + return 1; + } + + switch (NTYPE(target)) { + case N_STRING: + if (! group) { + StrNode* sn = &(NSTRING(target)); + if (str_node_can_be_split(sn, env->enc)) { + Node* n = str_node_split_last_char(sn, env->enc); + if (IS_NOT_NULL(n)) { + qn->target = n; + return 2; + } + } + } + break; + + case N_QUALIFIER: + { /* check redundant double repeat. */ + /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */ + QualifierNode* qnt = &(NQUALIFIER(target)); + +#ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR + if (qn->by_number == 0 && qnt->by_number == 0) { + if (IS_REPEAT_INFINITE(qn->upper)) { + if (qn->lower == 0) { /* '*' */ + redundant: + { + char buf[WARN_BUFSIZE]; + regex_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, + env->pattern, env->pattern_end, + "redundant nested repeat operator"); + VERB_WARNING(buf); + goto warn_exit; + } + } + else if (qn->lower == 1) { /* '+' */ + /* (?:a?)+? only allowed. */ + if (qn->greedy || !(qnt->upper == 1 && qnt->greedy)) + goto redundant; + } + } + else if (qn->upper == 1 && qn->lower == 0) { + if (qn->greedy) { /* '?' */ + if (!(qnt->lower == 1 && qnt->greedy == 0)) /* not '+?' */ + goto redundant; + } + else { /* '??' */ + /* '(?:a+)?? only allowd. (?:a*)?? can be replaced to (?:a+)?? 
*/ + if (!(qnt->greedy && qnt->lower == 1 && + IS_REPEAT_INFINITE(qnt->upper))) + goto redundant; + } + } + } +#endif + +#ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR + warn_exit: +#endif + if (popular_qualifier_num(qnt) >= 0 && popular_qualifier_num(qn) >= 0) { + reduce_nested_qualifier(qnode, target); + goto q_exit; + } + } + break; + + default: + break; + } + + qn->target = target; + q_exit: + return 0; +} + +static int +parse_exp(Node** np, RegToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env) +{ + int r, len, c, group = 0; + Node* qn; + + start: + *np = NULL; + if (tok->type == term) + goto end_of_token; + + switch (tok->type) { + case TK_ALT: + case TK_EOT: + end_of_token: + *np = node_new_empty(); + return tok->type; + break; + + case TK_SUBEXP_OPEN: + r = parse_effect(np, tok, TK_SUBEXP_CLOSE, src, end, env); + if (r < 0) return r; + if (r == 1) group = 1; + else if (r == 2) { /* option only */ + Node* target; + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + r = parse_subexp(&target, tok, term, src, end, env); + if (r < 0) return r; + NEFFECT(*np).target = target; + return tok->type; + } + else if (r == 3) { /* comment */ + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + goto start; + } + else { + if (NTYPE(*np) == N_EFFECT && NEFFECT(*np).type == EFFECT_MEMORY) { + r = scan_env_set_mem_node(env, NEFFECT(*np).regnum, *np); + if (r != 0) return r; + } + } + break; + + case TK_SUBEXP_CLOSE: + if (! 
IS_SYNTAX_BV(env->syntax, REG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) + return REGERR_UNMATCHED_CLOSE_PARENTHESIS; + + if (tok->escaped) goto tk_raw_byte; + else goto tk_byte; + break; + + case TK_BYTE: + tk_byte: + *np = node_new_str_char((UChar )tok->u.c); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + len = mblen(env->enc, tok->u.c); + if (len > 1) { + regex_node_str_cat(*np, *src, *src + len - 1); + *src += (len - 1); + } + while (1) { + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + if (r != TK_BYTE) goto repeat; + + r = node_str_cat_char(*np, (UChar )tok->u.c); + if (r < 0) return r; + len = mblen(env->enc, tok->u.c); + if (len > 1) { + regex_node_str_cat(*np, *src, *src + len - 1); + *src += (len - 1); + } + } + break; + + case TK_RAW_BYTE: + tk_raw_byte: + *np = node_new_str_raw_char((UChar )tok->u.c); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + while (1) { + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + if (r != TK_RAW_BYTE) goto repeat; + + r = node_str_cat_char(*np, (UChar )tok->u.c); + if (r < 0) return r; + } + break; + + case TK_WC: + { + UChar buf[WC2MB_MAX_BUFLEN]; + UChar* bufs = buf; + UChar* bufe = bufs + WC2MB_MAX_BUFLEN; + int num = wc2mb_buf(tok->u.wc, &bufs, &bufe, env->enc); + if (num < 0) return num; + *np = node_new_str_raw(bufs, bufe); + if (bufs != buf) xfree(bufs); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + } + break; + + case TK_QUOTE_OPEN: + { + WCINT end_op[] = { (WCINT )'\\', (WCINT )'E' }; + UChar *qstart, *qend, *nextp; + + qstart = *src; + qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); + if (IS_NULL(qend)) { + nextp = qend = end; + } + *np = node_new_str(qstart, qend); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + *src = nextp; + } + break; + + case TK_CHAR_TYPE: + switch (tok->u.subtype) { + case CTYPE_WORD: + case CTYPE_NOT_WORD: + *np = node_new_ctype(tok->u.subtype); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + break; + + case CTYPE_WHITE_SPACE: + *np = 
node_new_cclass(); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (IS_CODE_SPACE(env->enc, c)) BITSET_SET_BIT(NCCLASS(*np).bs, c); + } + break; + + case CTYPE_NOT_WHITE_SPACE: + *np = node_new_cclass(); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! IS_CODE_SPACE(env->enc, c)) BITSET_SET_BIT(NCCLASS(*np).bs, c); + } + break; + + case CTYPE_DIGIT: + *np = node_new_cclass(); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (IS_CODE_DIGIT(env->enc, c)) BITSET_SET_BIT(NCCLASS(*np).bs, c); + } + break; + + case CTYPE_NOT_DIGIT: + *np = node_new_cclass(); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! IS_CODE_DIGIT(env->enc, c)) BITSET_SET_BIT(NCCLASS(*np).bs, c); + } + break; + + default: + return REGERR_PARSER_BUG; + break; + } + break; + + case TK_CC_OPEN: + r = parse_char_class(np, tok, src, end, env); + if (r != 0) return r; + break; + + case TK_ANYCHAR: + *np = node_new_anychar(); + break; + + case TK_BACKREF: + len = tok->u.backref.num; + *np = node_new_backref(len, + (len > 1 ? 
tok->u.backref.refs : &(tok->u.backref.ref1)), env); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + break; + +#ifdef USE_SUBEXP_CALL + case TK_CALL: + *np = node_new_call(tok->u.call.name, tok->u.call.name_end); + CHECK_NULL_RETURN_VAL(*np, REGERR_MEMORY); + env->num_call++; + break; +#endif + + case TK_ANCHOR: + *np = regex_node_new_anchor(tok->u.anchor); + break; + + case TK_OP_REPEAT: + case TK_INTERVAL: + if (IS_SYNTAX_BV(env->syntax, REG_SYN_CONTEXT_INDEP_OPS)) { + if (IS_SYNTAX_BV(env->syntax, REG_SYN_CONTEXT_INVALID_OPS)) + return REGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED; + else + *np = node_new_empty(); + } + else { + *src = tok->backp; + goto tk_byte; + } + break; + + default: + return REGERR_PARSER_BUG; + break; + } + + re_entry: + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + + repeat: + if (r == TK_OP_REPEAT || r == TK_INTERVAL) { + if (is_invalid_qualifier_target(*np)) + return REGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; + + qn = node_new_qualifier(tok->u.repeat.lower, tok->u.repeat.upper, + (r == TK_INTERVAL ? 
1 : 0)); + CHECK_NULL_RETURN_VAL(qn, REGERR_MEMORY); + NQUALIFIER(qn).greedy = tok->u.repeat.greedy; + r = set_qualifier(qn, *np, group, env); + if (r < 0) return r; + + if (tok->u.repeat.possessive != 0) { + Node* en; + en = node_new_effect(EFFECT_STOP_BACKTRACK); + CHECK_NULL_RETURN_VAL(en, REGERR_MEMORY); + NEFFECT(en).target = qn; + qn = en; + } + + if (r == 0) { + *np = qn; + } + else if (r == 2) { /* split case: /abc+/ */ + Node* target = *np; + *np = node_new_list(target, NULL); + NCONS(*np).right = node_new_list(qn, NULL); + } + goto re_entry; + } + + return r; +} + +static int +parse_branch(Node** top, RegToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env) +{ + int r; + Node *node, **headp; + + *top = NULL; + r = parse_exp(&node, tok, term, src, end, env); + if (r < 0) return r; + + if (r == TK_EOT || r == term || r == TK_ALT) { + *top = node; + } + else { + *top = node_new_list(node, NULL); + headp = &(NCONS(*top).right); + while (r != TK_EOT && r != term && r != TK_ALT) { + r = parse_exp(&node, tok, term, src, end, env); + if (r < 0) return r; + + if (NTYPE(node) == N_LIST) { + *headp = node; + while (IS_NOT_NULL(NCONS(node).right)) node = NCONS(node).right; + headp = &(NCONS(node).right); + } + else { + *headp = node_new_list(node, NULL); + headp = &(NCONS(*headp).right); + } + } + } + + return r; +} + +/* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ +static int +parse_subexp(Node** top, RegToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env) +{ + int r; + Node *node, **headp; + + *top = NULL; + r = parse_branch(&node, tok, term, src, end, env); + if (r < 0) return r; + + if (r == term) { + *top = node; + } + else if (r == TK_ALT) { + *top = node_new_alt(node, NULL); + headp = &(NCONS(*top).right); + while (r == TK_ALT) { + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + r = parse_branch(&node, tok, term, src, end, env); + if (r < 0) return r; + + *headp = node_new_alt(node, NULL); + headp = &(NCONS(*headp).right); + } + + 
if (tok->type != term) + goto err; + } + else { + err: + if (term == TK_SUBEXP_CLOSE) + return REGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; + else + return REGERR_PARSER_BUG; + } + + return r; +} + +static int +parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) +{ + int r; + RegToken tok; + + r = fetch_token(&tok, src, end, env); + if (r < 0) return r; + r = parse_subexp(top, &tok, TK_EOT, src, end, env); + if (r < 0) return r; + return 0; +} + +extern int +regex_parse_make_tree(Node** root, UChar* pattern, UChar* end, regex_t* reg, + ScanEnv* env) +{ + int r; + UChar* p; + +#ifdef USE_NAMED_SUBEXP + names_clear(reg); +#endif + + scan_env_clear(env); + env->option = reg->options; + env->enc = reg->enc; + env->syntax = reg->syntax; + env->pattern = pattern; + env->pattern_end = end; + env->reg = reg; + + *root = NULL; + p = pattern; + r = parse_regexp(root, &p, end, env); + reg->num_mem = env->num_mem; + return r; +} + +extern void +regex_scan_env_set_error_string(ScanEnv* env, int ecode, + UChar* arg, UChar* arg_end) +{ + env->error = arg; + env->error_end = arg_end; +} diff --git a/ext/mbstring/oniguruma/regparse.h b/ext/mbstring/oniguruma/regparse.h new file mode 100644 index 0000000000..5a073623c9 --- /dev/null +++ b/ext/mbstring/oniguruma/regparse.h @@ -0,0 +1,255 @@ +/********************************************************************** + + regparse.h - Oniguruma (regular expression library) + + Copyright (C) 2003 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#ifndef REGPARSE_H +#define REGPARSE_H + +#include "regint.h" + +/* node type */ +#define N_STRING (1<< 0) +#define N_CCLASS (1<< 1) +#define N_CTYPE (1<< 2) +#define N_ANYCHAR (1<< 3) +#define N_BACKREF (1<< 4) +#define N_QUALIFIER (1<< 5) +#define N_EFFECT (1<< 6) +#define N_ANCHOR (1<< 7) +#define N_LIST (1<< 8) +#define N_ALT (1<< 9) +#define N_CALL (1<<10) + +#define IS_NODE_TYPE_SIMPLE(type) \ + (((type) & (N_STRING | 
N_CCLASS | N_CTYPE | N_ANYCHAR | N_BACKREF)) != 0) + +#define NTYPE(node) ((node)->type) +#define NCONS(node) ((node)->u.cons) +#define NSTRING(node) ((node)->u.str) +#define NCCLASS(node) ((node)->u.cclass) +#define NCTYPE(node) ((node)->u.ctype) +#define NQUALIFIER(node) ((node)->u.qualifier) +#define NANCHOR(node) ((node)->u.anchor) +#define NBACKREF(node) ((node)->u.backref) +#define NEFFECT(node) ((node)->u.effect) +#define NCALL(node) ((node)->u.call) + +#define CTYPE_WORD (1<<0) +#define CTYPE_NOT_WORD (1<<1) +#define CTYPE_WHITE_SPACE (1<<2) +#define CTYPE_NOT_WHITE_SPACE (1<<3) +#define CTYPE_DIGIT (1<<4) +#define CTYPE_NOT_DIGIT (1<<5) + + +#define ANCHOR_ANYCHAR_STAR_MASK (ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL) +#define ANCHOR_END_BUF_MASK (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF) + +#define EFFECT_MEMORY (1<<0) +#define EFFECT_OPTION (1<<1) +#define EFFECT_STOP_BACKTRACK (1<<2) + +#define REPEAT_INFINITE -1 +#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) + +#define NODE_STR_MARGIN 16 +#define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ +#define NODE_BACKREFS_SIZE 7 + +#define NSTR_RAW (1<<0) /* by backslashed number */ +#define NSTR_CASE_AMBIG (1<<1) + +#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s) +#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW +#define NSTRING_SET_CASE_AMBIG(node) (node)->u.str.flag |= NSTR_CASE_AMBIG +#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0) +#define NSTRING_IS_CASE_AMBIG(node) \ + (((node)->u.str.flag & NSTR_CASE_AMBIG) != 0) + +#define BACKREFS_P(br) \ + (IS_NOT_NULL((br)->back_dynamic) ? 
(br)->back_dynamic : (br)->back_static); + +typedef struct { + UChar* s; + UChar* end; + unsigned int flag; + int capa; /* (allocated size - 1) or 0: use buf[] */ + UChar buf[NODE_STR_BUF_SIZE]; +} StrNode; + +typedef struct { + int not; + BitSet bs; + BBuf* mbuf; /* multi-byte info or NULL */ +} CClassNode; + +typedef struct { + struct _Node* target; + int lower; + int upper; + int greedy; + int by_number; /* {n,m} */ + int target_may_empty; /* target can match with empty data */ + struct _Node* head_exact; + struct _Node* next_head_exact; + int is_refered; /* include called node. don't eliminate even if {0} */ +} QualifierNode; + +/* status bits */ +#define NST_RECURSION (1<<0) +#define NST_CALLED (1<<1) +#define NST_ADDR_FIXED (1<<2) +#define NST_MIN_FIXED (1<<3) +#define NST_MAX_FIXED (1<<4) +#define NST_CLEN_FIXED (1<<5) +#define NST_MARK1 (1<<6) +#define NST_MARK2 (1<<7) +#define NST_MEM_BACKREFED (1<<8) +#define NST_SIMPLE_REPEAT (1<<9) /* for stop backtrack optimization */ + +#define SET_EFFECT_STATUS(node,f) (node)->u.effect.state |= (f) +#define CLEAR_EFFECT_STATUS(node,f) (node)->u.effect.state &= ~(f) + +#define IS_EFFECT_CALLED(en) (((en)->state & NST_CALLED) != 0) +#define IS_EFFECT_ADDR_FIXED(en) (((en)->state & NST_ADDR_FIXED) != 0) +#define IS_EFFECT_RECURSION(en) (((en)->state & NST_RECURSION) != 0) +#define IS_EFFECT_MARK1(en) (((en)->state & NST_MARK1) != 0) +#define IS_EFFECT_MARK2(en) (((en)->state & NST_MARK2) != 0) +#define IS_EFFECT_MIN_FIXED(en) (((en)->state & NST_MIN_FIXED) != 0) +#define IS_EFFECT_MAX_FIXED(en) (((en)->state & NST_MAX_FIXED) != 0) +#define IS_EFFECT_CLEN_FIXED(en) (((en)->state & NST_CLEN_FIXED) != 0) +#define IS_EFFECT_SIMPLE_REPEAT(en) (((en)->state & NST_SIMPLE_REPEAT) != 0) + +#define SET_CALL_RECURSION(node) (node)->u.call.state |= NST_RECURSION +#define IS_CALL_RECURSION(cn) (((cn)->state & NST_RECURSION) != 0) + +typedef struct { + int state; + int type; + int regnum; + RegOptionType option; + struct _Node* 
target; + AbsAddrType call_addr; + /* for multiple call reference */ + RegDistance min_len; /* min length (byte) */ + RegDistance max_len; /* max length (byte) */ + int char_len; /* character length */ + int opt_count; /* referenced count in optimize_node_left() */ +} EffectNode; + +#define CALLNODE_REFNUM_UNDEF -1 + +#ifdef USE_SUBEXP_CALL + +typedef struct { + int offset; + struct _Node* target; +} UnsetAddr; + +typedef struct { + int num; + int alloc; + UnsetAddr* us; +} UnsetAddrList; + +typedef struct { + int state; + int ref_num; + UChar* name; + UChar* name_end; + struct _Node* target; /* EffectNode : EFFECT_MEMORY */ + UnsetAddrList* unset_addr_list; +} CallNode; + +#endif + +typedef struct { + int state; + int back_num; + int back_static[NODE_BACKREFS_SIZE]; + int* back_dynamic; +} BackrefNode; + +typedef struct { + int type; + struct _Node* target; + int char_len; +} AnchorNode; + +typedef struct _Node { + int type; + union { + StrNode str; + CClassNode cclass; + QualifierNode qualifier; + EffectNode effect; +#ifdef USE_SUBEXP_CALL + CallNode call; +#endif + BackrefNode backref; + AnchorNode anchor; + struct { + struct _Node* left; + struct _Node* right; + } cons; + struct { + int type; + } ctype; + } u; +} Node; + +#define NULL_NODE ((Node* )0) + +#define SCANENV_MEMNODES_SIZE 8 +#define SCANENV_MEM_NODES(senv) \ + (IS_NOT_NULL((senv)->mem_nodes_dynamic) ? 
\ + (senv)->mem_nodes_dynamic : (senv)->mem_nodes_static) + +typedef struct { + RegOptionType option; + RegCharEncoding enc; + RegSyntaxType* syntax; + BitStatusType backtrack_mem; + BitStatusType backrefed_mem; + UChar* pattern; + UChar* pattern_end; + UChar* error; + UChar* error_end; + regex_t* reg; /* for reg->names only */ + int num_call; +#ifdef USE_SUBEXP_CALL + UnsetAddrList* unset_addr_list; +#endif + int num_mem; + int mem_alloc; + Node* mem_nodes_static[SCANENV_MEMNODES_SIZE]; + Node** mem_nodes_dynamic; +} ScanEnv; + + +#define IS_SYNTAX_OP(syn, opm) (((syn)->op & (opm)) != 0) +#define IS_SYNTAX_OP2(syn, opm) (((syn)->op2 & (opm)) != 0) +#define IS_SYNTAX_BV(syn, bvm) (((syn)->behavior & (bvm)) != 0) + + +extern void regex_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); +extern int regex_scan_unsigned_number P_((UChar** src, UChar* end, RegCharEncoding enc)); +extern void regex_node_conv_to_str_node P_((Node* node, int raw)); +extern int regex_node_str_cat P_((Node* node, UChar* s, UChar* end)); +extern void regex_node_free P_((Node* node)); +extern Node* regex_node_new_effect P_((int type)); +extern Node* regex_node_new_anchor P_((int type)); +extern int regex_free_node_list(); +extern int regex_names_free P_((regex_t* reg)); +extern int regex_parse_make_tree P_((Node** root, UChar* pattern, UChar* end, regex_t* reg, ScanEnv* env)); + +#ifdef REG_DEBUG +#ifdef USE_NAMED_SUBEXP +extern int regex_print_names(FILE*, regex_t*); +#endif +#endif + +#endif /* REGPARSE_H */ diff --git a/ext/mbstring/oniguruma/regposerr.c b/ext/mbstring/oniguruma/regposerr.c new file mode 100644 index 0000000000..007e7b65c0 --- /dev/null +++ b/ext/mbstring/oniguruma/regposerr.c @@ -0,0 +1,68 @@ +/********************************************************************** + + regposerr.c - Oniguruma (regular expression library) + + Copyright (C) 2003 K.Kosako (kosako@sofnec.co.jp) + 
+**********************************************************************/ +#include "config.h" +#include "onigposix.h" + +#ifdef HAVE_STRING_H +# include <string.h> +#else +# include <strings.h> +#endif + +static char* ESTRING[] = { + NULL, + "failed to match", /* REG_NOMATCH */ + "Invalid regular expression", /* REG_BADPAT */ + "invalid collating element referenced", /* REG_ECOLLATE */ + "invalid character class type referenced", /* REG_ECTYPE */ + "bad backslash-escape sequence", /* REG_EESCAPE */ + "invalid back reference number", /* REG_ESUBREG */ + "imbalanced [ and ]", /* REG_EBRACK */ + "imbalanced ( and )", /* REG_EPAREN */ + "imbalanced { and }", /* REG_EBRACE */ + "invalid repeat range {n,m}", /* REG_BADBR */ + "invalid range", /* REG_ERANGE */ + "Out of memory", /* REG_ESPACE */ + "? * + not preceded by valid regular expression", /* REG_BADRPT */ + + /* Extended errors */ + "internal error", /* REG_EONIG_INTERNAL */ + "invalid wide char value", /* REG_EONIG_BADWC */ + "invalid argument", /* REG_EONIG_BADARG */ + "multi-thread error" /* REG_EONIG_THREAD */ +}; + +#include <stdio.h> + + +extern size_t +regerror(int posix_ecode, const regex_t* reg, char* buf, size_t size) +{ + char* s; + char tbuf[35]; + size_t len; + + if (posix_ecode > 0 && posix_ecode < sizeof(ESTRING) / sizeof(ESTRING[0])) { + s = ESTRING[posix_ecode]; + } + else if (posix_ecode == 0) { + s = ""; + } + else { + sprintf(tbuf, "undefined error code (%d)", posix_ecode); + s = tbuf; + } + + len = strlen(s) + 1; + + if (buf != NULL && size > 0) { + strncpy(buf, s, size - 1); + buf[size - 1] = '\0'; + } + return len; +} diff --git a/ext/mbstring/oniguruma/regposix.c b/ext/mbstring/oniguruma/regposix.c new file mode 100644 index 0000000000..ad22338132 --- /dev/null +++ b/ext/mbstring/oniguruma/regposix.c @@ -0,0 +1,195 @@ +/********************************************************************** + + regposix.c - Oniguruma (regular expression library) + + Copyright (C) 2003 K.Kosako 
(kosako@sofnec.co.jp) + +**********************************************************************/ + +#define regex_t onig_regex_t +#include "regint.h" +#undef regex_t +#include "onigposix.h" + +#define ONIG_C(reg) ((onig_regex_t* )((reg)->onig)) +#define PONIG_C(reg) ((onig_regex_t** )(&(reg)->onig)) + +#if 1 +#define ENC_STRING_LEN(enc,s,len) do { \ + UChar* tmps = (UChar* )(s); \ + /* while (*tmps != 0) tmps += mblen(enc,*tmps); */ \ + while (*tmps != 0) tmps++; /* OK for UTF-8, EUC-JP, Shift_JIS */ \ + len = tmps - (UChar* )(s); \ +} while(0) +#else +#define ENC_STRING_LEN(enc,s,len) len = strlen(s) +#endif + +typedef struct { + int onig_err; + int posix_err; +} O2PERR; + +static int +onig2posix_error_code(int code) +{ + static O2PERR o2p[] = { + { REG_MISMATCH, REG_NOMATCH }, + { REG_NO_SUPPORT_CONFIG, REG_EONIG_INTERNAL }, + { REGERR_MEMORY, REG_ESPACE }, + { REGERR_MATCH_STACK_LIMIT_OVER, REG_EONIG_INTERNAL }, + { REGERR_TYPE_BUG, REG_EONIG_INTERNAL }, + { REGERR_PARSER_BUG, REG_EONIG_INTERNAL }, + { REGERR_STACK_BUG, REG_EONIG_INTERNAL }, + { REGERR_UNDEFINED_BYTECODE, REG_EONIG_INTERNAL }, + { REGERR_UNEXPECTED_BYTECODE, REG_EONIG_INTERNAL }, + { REGERR_DEFAULT_ENCODING_IS_NOT_SETTED, REG_EONIG_BADARG }, + { REGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR, REG_EONIG_BADARG }, + { REGERR_END_PATTERN_AT_LEFT_BRACE, REG_EBRACE }, + { REGERR_END_PATTERN_AT_LEFT_BRACKET, REG_EBRACK }, + { REGERR_EMPTY_CHAR_CLASS, REG_ECTYPE }, + { REGERR_PREMATURE_END_OF_CHAR_CLASS, REG_ECTYPE }, + { REGERR_END_PATTERN_AT_BACKSLASH, REG_EESCAPE }, + { REGERR_END_PATTERN_AT_META, REG_EESCAPE }, + { REGERR_END_PATTERN_AT_CONTROL, REG_EESCAPE }, + { REGERR_META_CODE_SYNTAX, REG_BADPAT }, + { REGERR_CONTROL_CODE_SYNTAX, REG_BADPAT }, + { REGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE, REG_ECTYPE }, + { REGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE, REG_ECTYPE }, + { REGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS, REG_ECTYPE }, + { REGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED, REG_BADRPT 
}, + { REGERR_TARGET_OF_REPEAT_OPERATOR_INVALID, REG_BADRPT }, + { REGERR_NESTED_REPEAT_OPERATOR, REG_BADRPT }, + { REGERR_UNMATCHED_CLOSE_PARENTHESIS, REG_EPAREN }, + { REGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS, REG_EPAREN }, + { REGERR_END_PATTERN_IN_GROUP, REG_BADPAT }, + { REGERR_UNDEFINED_GROUP_OPTION, REG_BADPAT }, + { REGERR_INVALID_POSIX_BRACKET_TYPE, REG_BADPAT }, + { REGERR_INVALID_LOOK_BEHIND_PATTERN, REG_BADPAT }, + { REGERR_INVALID_REPEAT_RANGE_PATTERN, REG_BADPAT }, + { REGERR_TOO_BIG_NUMBER, REG_BADPAT }, + { REGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE, REG_BADBR }, + { REGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE, REG_BADBR }, + { REGERR_EMPTY_RANGE_IN_CHAR_CLASS, REG_ECTYPE }, + { REGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE, REG_ECTYPE }, + { REGERR_TOO_MANY_MULTI_BYTE_RANGES, REG_ECTYPE }, + { REGERR_TOO_SHORT_MULTI_BYTE_STRING, REG_BADPAT }, + { REGERR_TOO_BIG_BACKREF_NUMBER, REG_ESUBREG }, + { REGERR_INVALID_BACKREF, REG_ESUBREG }, + { REGERR_TOO_BIG_WIDE_CHAR_VALUE, REG_EONIG_BADWC }, + { REGERR_TOO_LONG_WIDE_CHAR_VALUE, REG_EONIG_BADWC }, + { REGERR_INVALID_WIDE_CHAR_VALUE, REG_EONIG_BADWC }, + { REGERR_INVALID_SUBEXP_NAME, REG_BADPAT }, + { REGERR_UNDEFINED_NAME_REFERENCE, REG_BADPAT }, + { REGERR_UNDEFINED_GROUP_REFERENCE, REG_BADPAT }, + { REGERR_MULTIPLEX_DEFINITION_NAME_CALL, REG_BADPAT }, + { REGERR_NEVER_ENDING_RECURSION, REG_BADPAT }, + { REGERR_OVER_THREAD_PASS_LIMIT_COUNT, REG_EONIG_THREAD } + }; + + int i; + + if (code >= 0) return 0; + + for (i = 0; i < sizeof(o2p) / sizeof(o2p[0]); i++) { + if (code == o2p[i].onig_err) + return o2p[i].posix_err; + } + + return REG_EONIG_INTERNAL; /* but, unknown error code */ +} + +extern int +regcomp(regex_t* reg, const char* pattern, int posix_options) +{ + int r, len; + RegSyntaxType* syntax = RegDefaultSyntax; + RegOptionType options; + + if ((posix_options & REG_EXTENDED) == 0) + syntax = REG_SYNTAX_POSIX_BASIC; + + options = syntax->options; + if ((posix_options & REG_ICASE) != 0) + 
REG_OPTION_ON(options, REG_OPTION_IGNORECASE); + if ((posix_options & REG_NEWLINE) != 0) { + REG_OPTION_ON( options, REG_OPTION_NEGATE_SINGLELINE); + REG_OPTION_OFF(options, REG_OPTION_SINGLELINE); + } + + reg->comp_options = posix_options; + + ENC_STRING_LEN(RegDefaultCharEncoding, pattern, len); + r = regex_new(PONIG_C(reg), (UChar* )pattern, (UChar* )(pattern + len), + options, RegDefaultCharEncoding, syntax, (RegErrorInfo* )NULL); + if (r != REG_NORMAL) { + return onig2posix_error_code(r); + } + + reg->re_nsub = ONIG_C(reg)->num_mem; + return 0; +} + +extern int +regexec(regex_t* reg, const char* str, size_t nmatch, + regmatch_t pmatch[], int posix_options) +{ + int r, i, len; + UChar* end; + RegOptionType options; + + options = REG_OPTION_POSIX_REGION; + if ((posix_options & REG_NOTBOL) != 0) options |= REG_OPTION_NOTBOL; + if ((posix_options & REG_NOTEOL) != 0) options |= REG_OPTION_NOTEOL; + + if ((reg->comp_options & REG_NOSUB) != 0) { + pmatch = (regmatch_t* )NULL; + nmatch = 0; + } + + ENC_STRING_LEN(ONIG_C(reg)->code,str,len); + end = (UChar* )(str + len); + r = regex_search(ONIG_C(reg), (UChar* )str, end, (UChar* )str, end, + (RegRegion* )pmatch, options); + + if (r >= 0) { + r = 0; /* Match */ + } + else if (r == REG_MISMATCH) { + r = REG_NOMATCH; + for (i = 0; i < nmatch; i++) + pmatch[i].rm_so = pmatch[i].rm_eo = REG_REGION_NOTPOS; + } + else { + r = onig2posix_error_code(r); + } + + return r; +} + +extern void +regfree(regex_t* reg) +{ + regex_free(ONIG_C(reg)); +} + + +extern void +reg_set_encoding(int mb_code) +{ + RegDefaultCharEncoding = REG_MBLEN_TABLE[mb_code]; +} + +extern int +reg_name_to_group_numbers(regex_t* reg, + unsigned char* name, unsigned char* name_end, int** nums) +{ + return regex_name_to_group_numbers(ONIG_C(reg), name, name_end, nums); +} + +extern int +reg_foreach_name(regex_t* reg, int (*func)(unsigned char*,int,int*,void*), + void* arg) +{ + return regex_foreach_name(ONIG_C(reg), func, arg); +} diff --git 
a/ext/mbstring/oniguruma/sample/names.c b/ext/mbstring/oniguruma/sample/names.c new file mode 100644 index 0000000000..1ebc4e856c --- /dev/null +++ b/ext/mbstring/oniguruma/sample/names.c @@ -0,0 +1,64 @@ +/* + * names.c -- example of group name callback. + */ +#include<stdio.h> +#include "oniguruma.h" + +static int +name_callback(UChar* name, int ngroup_num, int* group_nums, void* arg) +{ + int i, gn; + RegRegion *region = (RegRegion* )arg; + + for (i = 0; i < ngroup_num; i++) { + gn = group_nums[i]; + fprintf(stderr, "%s (%d): ", name, gn); + fprintf(stderr, "(%d-%d)\n", region->beg[gn], region->end[gn]); + } + return 0; /* 0: continue */ +} + +extern int main(int argc, char* argv[]) +{ + int r; + unsigned char *start, *range, *end; + regex_t* reg; + RegErrorInfo einfo; + RegRegion *region; + + static unsigned char* pattern = "(?<foo>a*)(?<bar>b*)(?<foo>c*)"; + static unsigned char* str = "aaabbbbcc"; + + r = regex_new(®, pattern, pattern + strlen(pattern), + REG_OPTION_DEFAULT, REGCODE_ASCII, REG_SYNTAX_DEFAULT, &einfo); + if (r != REG_NORMAL) { + char s[REG_MAX_ERROR_MESSAGE_LEN]; + regex_error_code_to_str(s, r, &einfo); + fprintf(stderr, "ERROR: %s\n", s); + exit(-1); + } + + region = regex_region_new(); + + end = str + strlen(str); + start = str; + range = end; + r = regex_search(reg, str, end, start, range, region, REG_OPTION_NONE); + if (r >= 0) { + fprintf(stderr, "match at %d\n\n", r); + r = regex_foreach_name(reg, name_callback, (void* )region); + } + else if (r == REG_MISMATCH) { + fprintf(stderr, "search fail\n"); + } + else { /* error */ + char s[REG_MAX_ERROR_MESSAGE_LEN]; + regex_error_code_to_str(s, r); + exit(-1); + } + + regex_region_free(region, 1 /* 1:free self, 0:free contents only */); + regex_free(reg); + regex_end(); + return 0; +} diff --git a/ext/mbstring/oniguruma/sample/posix.c b/ext/mbstring/oniguruma/sample/posix.c new file mode 100644 index 0000000000..ff20292cb0 --- /dev/null +++ b/ext/mbstring/oniguruma/sample/posix.c @@ -0,0 +1,92 
@@ +/* + * posix.c + */ +#include<stdio.h> +#include "onigposix.h" + +static int x(regex_t* reg, unsigned char* pattern, unsigned char* str) +{ + int r, i; + char buf[200]; + regmatch_t pmatch[20]; + + r = regexec(reg, str, reg->re_nsub + 1, pmatch, 0); + if (r != 0 && r != REG_NOMATCH) { + regerror(r, reg, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\n", buf); + exit(-1); + } + + if (r == REG_NOMATCH) { + fprintf(stderr, "FAIL: /%s/ '%s'\n", pattern, str); + } + else { + fprintf(stderr, "OK: /%s/ '%s'\n", pattern, str); + for (i = 0; i <= reg->re_nsub; i++) { + fprintf(stderr, "%d: %d-%d\n", i, pmatch[i].rm_so, pmatch[i].rm_eo); + } + } + return 0; +} + +extern int main(int argc, char* argv[]) +{ + int r; + char buf[200]; + regex_t reg; + unsigned char* pattern; + + /* default syntax (REG_SYNTAX_RUBY) */ + pattern = "^a+b{2,7}[c-f]?$|uuu"; + r = regcomp(®, pattern, REG_EXTENDED); + if (r) { + regerror(r, ®, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\n", buf); + exit(-1); + } + x(®, pattern, "aaabbbbd"); + + /* POSIX Basic RE (REG_EXTENDED is not specified.) */ + pattern = "^a+b{2,7}[c-f]?|uuu"; + r = regcomp(®, pattern, 0); + if (r) { + regerror(r, ®, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\n", buf); + exit(-1); + } + x(®, pattern, "a+b{2,7}d?|uuu"); + + /* POSIX Basic RE (REG_EXTENDED is not specified.) 
*/ + pattern = "^a*b\\{2,7\\}\\([c-f]\\)$"; + r = regcomp(®, pattern, 0); + if (r) { + regerror(r, ®, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\n", buf); + exit(-1); + } + x(®, pattern, "aaaabbbbbbd"); + + /* POSIX Extended RE */ + regex_set_default_syntax(REG_SYNTAX_POSIX_EXTENDED); + pattern = "^a+b{2,7}[c-f]?)$|uuu"; + r = regcomp(®, pattern, REG_EXTENDED); + if (r) { + regerror(r, ®, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\n", buf); + exit(-1); + } + x(®, pattern, "aaabbbbd)"); + + pattern = "^b."; + r = regcomp(®, pattern, REG_EXTENDED | REG_NEWLINE); + if (r) { + regerror(r, ®, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\n", buf); + exit(-1); + } + x(®, pattern, "a\nb\n"); + + regfree(®); + regex_end(); + return 0; +} diff --git a/ext/mbstring/oniguruma/sample/simple.c b/ext/mbstring/oniguruma/sample/simple.c new file mode 100644 index 0000000000..89498bac11 --- /dev/null +++ b/ext/mbstring/oniguruma/sample/simple.c @@ -0,0 +1,54 @@ +/* + * simple.c + */ +#include<stdio.h> +#include "oniguruma.h" + +extern int main(int argc, char* argv[]) +{ + int r; + unsigned char *start, *range, *end; + regex_t* reg; + RegErrorInfo einfo; + RegRegion *region; + + static unsigned char* pattern = "a(.*)b|[e-f]+"; + static unsigned char* str = "zzzzaffffffffb"; + + r = regex_new(®, pattern, pattern + strlen(pattern), + REG_OPTION_DEFAULT, REGCODE_ASCII, REG_SYNTAX_DEFAULT, &einfo); + if (r != REG_NORMAL) { + char s[REG_MAX_ERROR_MESSAGE_LEN]; + regex_error_code_to_str(s, r, &einfo); + fprintf(stderr, "ERROR: %s\n", s); + exit(-1); + } + + region = regex_region_new(); + + end = str + strlen(str); + start = str; + range = end; + r = regex_search(reg, str, end, start, range, region, REG_OPTION_NONE); + if (r >= 0) { + int i; + + fprintf(stderr, "match at %d\n", r); + for (i = 0; i < region->num_regs; i++) { + fprintf(stderr, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == REG_MISMATCH) { + fprintf(stderr, "search fail\n"); + } + 
else { /* error */ + char s[REG_MAX_ERROR_MESSAGE_LEN]; + regex_error_code_to_str(s, r); + exit(-1); + } + + regex_region_free(region, 1 /* 1:free self, 0:free contents only */); + regex_free(reg); + regex_end(); + return 0; +} diff --git a/ext/mbstring/oniguruma/test.rb b/ext/mbstring/oniguruma/test.rb new file mode 100644 index 0000000000..2c69344407 --- /dev/null +++ b/ext/mbstring/oniguruma/test.rb @@ -0,0 +1,971 @@ +# test.rb +# Copyright (C) 2003 K.Kosako (kosako@sofnec.co.jp) + +def pr(result, reg, str, n = 0, *range) + printf("%s /%s/:'%s'", result, reg.source, str) + if (n.class == Fixnum) + printf(":%d", n) if n != 0 + if (range.size > 0) + if (range[3].nil?) + printf(" (%d-%d : X-X)", range[0], range[1]) + else + printf(" (%d-%d : %d-%d)", range[0], range[1], range[2], range[3]) + end + end + else + printf(" %s", n) + end + printf("\n") +end + +def rok(result_opt, reg, str, n = 0, *range) + result = "OK" + result_opt + result += " " * (7 - result.length) + pr(result, reg, str, n, *range) + $rok += 1 +end + +def rfail(result_opt, reg, str, n = 0, *range) + result = "FAIL" + result_opt + result += " " * (7 - result.length) + pr(result, reg, str, n, *range) + $rfail += 1 +end + +def x(reg, str, s, e, n = 0) + m = reg.match(str) + if m + if (m.size() <= n) + rfail("(%d)" % (m.size()-1), reg, str, n) + else + if (m.begin(n) == s && m.end(n) == e) + rok("", reg, str, n) + else + rfail("", reg, str, n, s, e, m.begin(n), m.end(n)) + end + end + else + rfail("", reg, str, n) + end +end + +def n(reg, str) + m = reg.match(str) + if m + rfail("(N)", reg, str, 0) + else + rok("(N)", reg, str, 0) + end +end + +def r(reg, str, index, pos = nil) + if (pos) + res = str.rindex(reg, pos) + else + res = str.rindex(reg) + end + if res + if (res == index) + rok("(r)", reg, str) + else + rfail("(r)", reg, str, [res, '-', index]) + end + else + rfail("(r)", reg, str) + end +end + +def i(reg, str, s = 0, e = 0, n = 0) + # ignore +end + +### main ### +$rok = $rfail = 0 + + +def 
test_sb(enc) +$KCODE = enc + + +x(//, '', 0, 0) +x(/^/, '', 0, 0) +x(/$/, '', 0, 0) +x(/\G/, '', 0, 0) +x(/\A/, '', 0, 0) +x(/\Z/, '', 0, 0) +x(/\z/, '', 0, 0) +x(/^$/, '', 0, 0) +x(/\ca/, "\001", 0, 1) +x(/\C-b/, "\002", 0, 1) +x(/\M-Z/, "\xDA", 0, 1) +x(//, 'a', 0, 0) +x(/a/, 'a', 0, 1) +x(/aa/, 'aa', 0, 2) +x(/aaa/, 'aaa', 0, 3) +x(/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 0, 35) +x(/ab/, 'ab', 0, 2) +x(/b/, 'ab', 1, 2) +x(/bc/, 'abc', 1, 3) +x(/\17/, "\017", 0, 1) +x(/\x1f/, "\x1f", 0, 1) +x(/\xFE/, "\xfe", 0, 1) +x(/a(?#....\\JJJJ)b/, 'ab', 0, 2) +x(/./, 'a', 0, 1) +n(/./, '') +x(/../, 'ab', 0, 2) +x(/\w/, 'e', 0, 1) +n(/\W/, 'e') +x(/\s/, ' ', 0, 1) +x(/\S/, 'b', 0, 1) +x(/\d/, '4', 0, 1) +n(/\D/, '4') +x(/\b/, 'z ', 0, 0) +x(/\b/, ' z', 1, 1) +x(/\B/, 'zz ', 1, 1) +x(/\B/, 'z ', 2, 2) +x(/\B/, ' z', 0, 0) +x(/[ab]/, 'b', 0, 1) +n(/[ab]/, 'c') +x(/[a-z]/, 't', 0, 1) +n(/[^a]/, 'a') +x(/[^a]/, "\n", 0, 1) +x(/[]]/, ']', 0, 1) +n(/[^]]/, ']') +x(/[b-]/, 'b', 0, 1) +x(/[b-]/, '-', 0, 1) +x(/[\w]/, 'z', 0, 1) +n(/[\w]/, ' ') +x(/[\d]/, '5', 0, 1) +n(/[\d]/, 'e') +x(/[\D]/, 't', 0, 1) +n(/[\D]/, '3') +x(/[\s]/, ' ', 0, 1) +n(/[\s]/, 'a') +x(/[\S]/, 'b', 0, 1) +n(/[\S]/, ' ') +x(/[\w\d]/, '2', 0, 1) +n(/[\w\d]/, ' ') +x(/[[:upper:]]/, 'B', 0, 1) +x(/[*[:xdigit:]+]/, '+', 0, 1) +x(/[*[:xdigit:]+]/, 'GHIKK-9+*', 6, 7) +x(/[*[:xdigit:]+]/, '-@^+', 3, 4) +n(/[[:upper]]/, 'A') +x(/[[:upper]]/, ':', 0, 1) +x(/[\044-\047]/, "\046", 0, 1) +x(/[\x5a-\x5c]/, "\x5b", 0, 1) +x(/[\x6A-\x6D]/, "\x6c", 0, 1) +n(/[\x6A-\x6D]/, "\x6E") +n(/^[0-9A-F]+ 0+ UNDEF /, '75F 00000000 SECT14A notype () External | _rb_apply') +x(/[\[]/, '[', 0, 1) +x(/[\]]/, ']', 0, 1) +x(/[&]/, '&', 0, 1) +x(/[[ab]]/, 'b', 0, 1) +x(/[[ab]c]/, 'c', 0, 1) +n(/[[^a]]/, 'a') +n(/[^[a]]/, 'a') +x(/[[ab]&&bc]/, 'b', 0, 1) +n(/[[ab]&&bc]/, 'a') +n(/[[ab]&&bc]/, 'c') +x(/[a-z&&b-y&&c-x]/, 'w', 0, 1) +n(/[^a-z&&b-y&&c-x]/, 'w') +x(/[[^a&&a]&&a-z]/, 'b', 0, 1) +n(/[[^a&&a]&&a-z]/, 
'a') +x(/[[^a-z&&bcdef]&&[^c-g]]/, 'h', 0, 1) +n(/[[^a-z&&bcdef]&&[^c-g]]/, 'c') +x(/[^[^abc]&&[^cde]]/, 'c', 0, 1) +x(/[^[^abc]&&[^cde]]/, 'e', 0, 1) +n(/[^[^abc]&&[^cde]]/, 'f') +x(/[a-&&-a]/, '-', 0, 1) +n(/[a-&&-a]/, '&') +n(/\wabc/, ' abc') +x(/a\Wbc/, 'a bc', 0, 4) +x(/a.b.c/, 'aabbc', 0, 5) +x(/.\wb\W..c/, 'abb bcc', 0, 7) +x(/\s\wzzz/, ' zzzz', 0, 5) +x(/aa.b/, 'aabb', 0, 4) +n(/.a/, 'ab') +x(/.a/, 'aa', 0, 2) +x(/^a/, 'a', 0, 1) +x(/^a$/, 'a', 0, 1) +x(/^\w$/, 'a', 0, 1) +n(/^\w$/, ' ') +x(/^\wab$/, 'zab', 0, 3) +x(/^\wabcdef$/, 'zabcdef', 0, 7) +x(/^\w...def$/, 'zabcdef', 0, 7) +x(/\w\w\s\Waaa\d/, 'aa aaa4', 0, 8) +x(/\A\Z/, '', 0, 0) +x(/\Axyz/, 'xyz', 0, 3) +x(/xyz\Z/, 'xyz', 0, 3) +x(/xyz\z/, 'xyz', 0, 3) +x(/\Gaz/, 'az', 0, 2) +n(/\Gz/, 'bza') +n(/az\G/, 'az') +n(/az\A/, 'az') +n(/a\Az/, 'az') +x(/\^\$/, '^$', 0, 2) +x(/\w/, '_', 0, 1) +n(/\W/, '_') +x(/(?=z)z/, 'z', 0, 1) +n(/(?=z)./, 'a') +x(/(?!z)a/, 'a', 0, 1) +n(/(?!z)a/, 'z') +x(/(?i:a)/, 'a', 0, 1) +x(/(?i:a)/, 'A', 0, 1) +x(/(?i:A)/, 'a', 0, 1) +n(/(?i:A)/, 'b') +x(/(?i:[A-Z])/, 'a', 0, 1) +x(/(?i:[f-m])/, 'H', 0, 1) +x(/(?i:[f-m])/, 'h', 0, 1) +n(/(?i:[f-m])/, 'e') +n(/(?i:[A-c])/, 'D') # changed spec. 2003/02/07 +n(/(?i:[a-C])/, 'D') # changed spec. 
2003/02/07 +n(/(?i:[b-C])/, 'A') +x(/(?i:[a-C])/, 'B', 0, 1) +n(/(?i:[c-X])/, '[') +n(/(?i:[!-k])/, 'Z') +x(/(?i:[!-k])/, '7', 0, 1) +n(/(?i:[T-}])/, 'b') +x(/(?i:[T-}])/, '{', 0, 1) +x(/(?i:\?a)/, '?A', 0, 2) +x(/(?i:\*A)/, '*a', 0, 2) +n(/./, "\n") +x(/(?m:.)/, "\n", 0, 1) +x(/(?m:a.)/, "a\n", 0, 2) +x(/(?m:.b)/, "a\nb", 1, 3) +x(/a?/, '', 0, 0) +x(/a?/, 'b', 0, 0) +x(/a?/, 'a', 0, 1) +x(/a*/, '', 0, 0) +x(/a*/, 'a', 0, 1) +x(/a*/, 'aaa', 0, 3) +x(/a*/, 'baaaa', 0, 0) +n(/a+/, '') +x(/a+/, 'a', 0, 1) +x(/a+/, 'aaaa', 0, 4) +x(/a+/, 'aabbb', 0, 2) +x(/a+/, 'baaaa', 1, 5) +x(/.?/, '', 0, 0) +x(/.?/, 'f', 0, 1) +x(/.?/, "\n", 0, 0) +x(/.*/, '', 0, 0) +x(/.*/, 'abcde', 0, 5) +x(/.+/, 'z', 0, 1) +x(/.+/, "zdswer\n", 0, 6) +x(/a|b/, 'a', 0, 1) +x(/a|b/, 'b', 0, 1) +x(/|a/, 'a', 0, 0) +x(/(|a)/, 'a', 0, 0) +x(/ab|bc/, 'ab', 0, 2) +x(/ab|bc/, 'bc', 0, 2) +x(/z(?:ab|bc)/, 'zbc', 0, 3) +x(/a(?:ab|bc)c/, 'aabc', 0, 4) +x(/ab|(?:ac|az)/, 'az', 0, 2) +x(/a|b|c/, 'dc', 1, 2) +x(/a|b|cd|efg|h|ijk|lmn|o|pq|rstuvwx|yz/, 'pqr', 0, 2) +n(/a|b|cd|efg|h|ijk|lmn|o|pq|rstuvwx|yz/, 'mn') +x(/a|^z/, 'ba', 1, 2) +x(/a|^z/, 'za', 0, 1) +x(/a|\Gz/, 'bza', 2, 3) +x(/a|\Gz/, 'za', 0, 1) +x(/a|\Az/, 'bza', 2, 3) +x(/a|\Az/, 'za', 0, 1) +x(/a|b\Z/, 'ba', 1, 2) +x(/a|b\Z/, 'b', 0, 1) +x(/a|b\z/, 'ba', 1, 2) +x(/a|b\z/, 'b', 0, 1) +x(/\w|\s/, ' ', 0, 1) +n(/\w|\w/, ' ') +x(/\w|%/, '%', 0, 1) +x(/\w|[&$]/, '&', 0, 1) +x(/[b-d]|[^e-z]/, 'a', 0, 1) +x(/(?:a|[c-f])|bz/, 'dz', 0, 1) +x(/(?:a|[c-f])|bz/, 'bz', 0, 2) +x(/abc|(?=zz)..f/, 'zzf', 0, 3) +x(/abc|(?!zz)..f/, 'abf', 0, 3) +x(/(?=za)..a|(?=zz)..a/, 'zza', 0, 3) +n(/(?>a|abd)c/, 'abdc') +x(/(?>abd|a)c/, 'abdc', 0, 4) +x(/a?|b/, 'a', 0, 1) +x(/a?|b/, 'b', 0, 0) +x(/a?|b/, '', 0, 0) +x(/a*|b/, 'aa', 0, 2) +x(/a*|b*/, 'ba', 0, 0) +x(/a*|b*/, 'ab', 0, 1) +x(/a+|b*/, '', 0, 0) +x(/a+|b*/, 'bbb', 0, 3) +x(/a+|b*/, 'abbb', 0, 1) +n(/a+|b+/, '') +x(/(a|b)?/, 'b', 0, 1) +x(/(a|b)*/, 'ba', 0, 2) +x(/(a|b)+/, 'bab', 0, 3) +x(/(ab|ca)+/, 'caabbc', 0, 4) 
+x(/(ab|ca)+/, 'aabca', 1, 5) +x(/(ab|ca)+/, 'abzca', 0, 2) +x(/(a|bab)+/, 'ababa', 0, 5) +x(/(a|bab)+/, 'ba', 1, 2) +x(/(a|bab)+/, 'baaaba', 1, 4) +x(/(?:a|b)(?:a|b)/, 'ab', 0, 2) +x(/(?:a*|b*)(?:a*|b*)/, 'aaabbb', 0, 3) +x(/(?:a*|b*)(?:a+|b+)/, 'aaabbb', 0, 6) +x(/(?:a+|b+){2}/, 'aaabbb', 0, 6) +x(/h{0,}/, 'hhhh', 0, 4) +x(/(?:a+|b+){1,2}/, 'aaabbb', 0, 6) +x(/(?:a+|\Ab*)cc/, 'cc', 0, 2) +n(/(?:a+|\Ab*)cc/, 'abcc') +x(/(?:^a+|b+)*c/, 'aabbbabc', 6, 8) +x(/(?:^a+|b+)*c/, 'aabbbbc', 0, 7) +x(/a|(?i)c/, 'C', 0, 1) +x(/(?i)c|a/, 'C', 0, 1) +i(/(?i)c|a/, 'A', 0, 1) # different spec. +x(/(?i:c)|a/, 'C', 0, 1) +n(/(?i:c)|a/, 'A') +x(/[abc]?/, 'abc', 0, 1) +x(/[abc]*/, 'abc', 0, 3) +x(/[^abc]*/, 'abc', 0, 0) +n(/[^abc]+/, 'abc') +x(/a??/, 'aaa', 0, 0) +x(/ba??b/, 'bab', 0, 3) +x(/a*?/, 'aaa', 0, 0) +x(/ba*?/, 'baa', 0, 1) +x(/ba*?b/, 'baab', 0, 4) +x(/a+?/, 'aaa', 0, 1) +x(/ba+?/, 'baa', 0, 2) +x(/ba+?b/, 'baab', 0, 4) +x(/(?:a?)??/, 'a', 0, 0) +x(/(?:a??)?/, 'a', 0, 0) +x(/(?:a?)+?/, 'aaa', 0, 1) +x(/(?:a+)??/, 'aaa', 0, 0) +x(/(?:a+)??b/, 'aaab', 0, 4) +i(/(?:ab)?{2}/, '', 0, 0) # GNU regex bug +x(/(?:ab)?{2}/, 'ababa', 0, 4) +x(/(?:ab)*{0}/, 'ababa', 0, 0) +x(/(?:ab){3,}/, 'abababab', 0, 8) +n(/(?:ab){3,}/, 'abab') +x(/(?:ab){2,4}/, 'ababab', 0, 6) +x(/(?:ab){2,4}/, 'ababababab', 0, 8) +x(/(?:ab){2,4}?/, 'ababababab', 0, 4) +x(/(?:ab){,}/, 'ab{,}', 0, 5) +x(/(?:abc)+?{2}/, 'abcabcabc', 0, 6) +x(/(?:X*)(?i:xa)/, 'XXXa', 0, 4) +x(/(d+)([^abc]z)/, 'dddz', 0, 4) +x(/([^abc]*)([^abc]z)/, 'dddz', 0, 4) +x(/(\w+)(\wz)/, 'dddz', 0, 4) +x(/(a)/, 'a', 0, 1, 1) +x(/(ab)/, 'ab', 0, 2, 1) +x(/((ab))/, 'ab', 0, 2) +x(/((ab))/, 'ab', 0, 2, 1) +x(/((ab))/, 'ab', 0, 2, 2) +x(/((((((((((((((((((((ab))))))))))))))))))))/, 'ab', 0, 2, 20) +x(/(ab)(cd)/, 'abcd', 0, 2, 1) +x(/(ab)(cd)/, 'abcd', 2, 4, 2) +x(/()(a)bc(def)ghijk/, 'abcdefghijk', 3, 6, 3) +x(/(()(a)bc(def)ghijk)/, 'abcdefghijk', 3, 6, 4) +x(/(^a)/, 'a', 0, 1) +x(/(a)|(a)/, 'ba', 1, 2, 1) +x(/(^a)|(a)/, 'ba', 1, 2, 2) +x(/(a?)/, 
'aaa', 0, 1, 1) +x(/(a*)/, 'aaa', 0, 3, 1) +x(/(a*)/, '', 0, 0, 1) +x(/(a+)/, 'aaaaaaa', 0, 7, 1) +x(/(a+|b*)/, 'bbbaa', 0, 3, 1) +x(/(a+|b?)/, 'bbbaa', 0, 1, 1) +x(/(abc)?/, 'abc', 0, 3, 1) +x(/(abc)*/, 'abc', 0, 3, 1) +x(/(abc)+/, 'abc', 0, 3, 1) +x(/(xyz|abc)+/, 'abc', 0, 3, 1) +x(/([xyz][abc]|abc)+/, 'abc', 0, 3, 1) +x(/((?i:abc))/, 'AbC', 0, 3, 1) +x(/(abc)(?i:\1)/, 'abcABC', 0, 6) +x(/((?m:a.c))/, "a\nc", 0, 3, 1) +x(/((?=az)a)/, 'azb', 0, 1, 1) +x(/abc|(.abd)/, 'zabd', 0, 4, 1) +x(/(?:abc)|(ABC)/, 'abc', 0, 3) +x(/(?i:(abc))|(zzz)/, 'ABC', 0, 3, 1) +x(/a*(.)/, 'aaaaz', 4, 5, 1) +x(/a*?(.)/, 'aaaaz', 0, 1, 1) +x(/a*?(c)/, 'aaaac', 4, 5, 1) +x(/[bcd]a*(.)/, 'caaaaz', 5, 6, 1) +x(/(\Abb)cc/, 'bbcc', 0, 2, 1) +n(/(\Abb)cc/, 'zbbcc') +x(/(^bb)cc/, 'bbcc', 0, 2, 1) +n(/(^bb)cc/, 'zbbcc') +x(/cc(bb$)/, 'ccbb', 2, 4, 1) +n(/cc(bb$)/, 'ccbbb') +#n(/\1/, 'a') # compile error on Oniguruma +n(/(\1)/, '') +n(/\1(a)/, 'aa') +n(/(a(b)\1)\2+/, 'ababb') +n(/(?:(?:\1|z)(a))+$/, 'zaa') +x(/(?:(?:\1|z)(a))+$/, 'zaaa', 0, 4) +x(/(a)(?=\1)/, 'aa', 0, 1) +n(/(a)$|\1/, 'az') +x(/(a)\1/, 'aa', 0, 2) +n(/(a)\1/, 'ab') +x(/(a?)\1/, 'aa', 0, 2) +x(/(a??)\1/, 'aa', 0, 0) +x(/(a*)\1/, 'aaaaa', 0, 4) +x(/(a*)\1/, 'aaaaa', 0, 2, 1) +x(/a(b*)\1/, 'abbbb', 0, 5) +x(/a(b*)\1/, 'ab', 0, 1) +x(/(a*)(b*)\1\2/, 'aaabbaaabb', 0, 10) +x(/(a*)(b*)\2/, 'aaabbbb', 0, 7) +x(/(((((((a*)b))))))c\7/, 'aaabcaaa', 0, 8) +x(/(((((((a*)b))))))c\7/, 'aaabcaaa', 0, 3, 7) +x(/(a)(b)(c)\2\1\3/, 'abcbac', 0, 6) +x(/([a-d])\1/, 'cc', 0, 2) +x(/(\w\d\s)\1/, 'f5 f5 ', 0, 6) +n(/(\w\d\s)\1/, 'f5 f5') +x(/(who|[a-c]{3})\1/, 'whowho', 0, 6) +x(/...(who|[a-c]{3})\1/, 'abcwhowho', 0, 9) +x(/(who|[a-c]{3})\1/, 'cbccbc', 0, 6) +x(/(^a)\1/, 'aa', 0, 2) +n(/(^a)\1/, 'baa') +n(/(a$)\1/, 'aa') +n(/(ab\Z)\1/, 'ab') +x(/(a*\Z)\1/, 'a', 1, 1) +x(/.(a*\Z)\1/, 'ba', 1, 2) +x(/(.(abc)\2)/, 'zabcabc', 0, 7, 1) +x(/(.(..\d.)\2)/, 'z12341234', 0, 9, 1) +x(/((?i:az))\1/, 'AzAz', 0, 4) +n(/((?i:az))\1/, 'Azaz') +x(/(?<=a)b/, 'ab', 1, 2) 
+n(/(?<=a)b/, 'bb') +x(/(?<=a|b)b/, 'bb', 1, 2) +x(/(?<=a|bc)b/, 'bcb', 2, 3) +x(/(?<=a|bc)b/, 'ab', 1, 2) +x(/(?<=a|bc||defghij|klmnopq|r)z/, 'rz', 1, 2) +x(/(?<!a)b/, 'cb', 1, 2) +n(/(?<!a)b/, 'ab') +x(/(?<!a|bc)b/, 'bbb', 0, 1) +n(/(?<!a|bc)z/, 'bcz') +x(/(?<name1>a)/, 'a', 0, 1) +x(/(?<name-2>ab)\1/, 'abab', 0, 4) +x(/(?<name-3>.zv.)\k<name-3>/, 'azvbazvb', 0, 8) +x(/(?<=\g<ab>)|-\zEND (?<ab>XyZ)/, 'XyZ', 3, 3) +x(/(?<n>|a\g<n>)+/, '', 0, 0) +x(/(?<n>|\(\g<n>\))+$/, '()(())', 0, 6) +x(/\g<n>(?<n>.){0}/, 'X', 0, 1, 1) +x(/\g<n>(abc|df(?<n>.YZ){2,8}){0}/, 'XYZ', 0, 3) +x(/\A(?<n>(a\g<n>)|)\z/, 'aaaa', 0, 4) +x(/(?<n>|\g<m>\g<n>)\z|\zEND (?<m>a|(b)\g<m>)/, 'bbbbabba', 0, 8) +x(/(?<@:name[1240]>\w+\sx)a+\k<@:name[1240]>/, ' fg xaaaaaaaafg x', 2, 18) +x(/(z)()()(?<9>a)\4/, 'zaa', 1, 2, 4) +x(/(.)(((?<*>a)))\k<*>/, 'zaa', 0, 3) +x(/((?<name1>\d)|(?<name2>\w))(\k<name1>|\k<name2>)/, 'ff', 0, 2) +x(/(?:(?<x>)|(?<x>efg))\k<x>/, '', 0, 0) +x(/(?:(?<@x>abc)|(?<@x>efg))\k<@x>/, 'abcefgefg', 3, 9) +n(/(?:(?<@x>abc)|(?<@x>efg))\k<@x>/, 'abcefg') +x(/(?:(?<n1>.)|(?<n1>..)|(?<n1>...)|(?<n1>....)|(?<n1>.....)|(?<n1>......)|(?<n1>.......)|(?<n1>........)|(?<n1>.........)|(?<n1>..........)|(?<n1>...........)|(?<n1>............)|(?<n1>.............)|(?<n1>..............))\k<n1>$/, 'a-pyumpyum', 2, 10) +x(/(?:(?<n1>.)|(?<n1>..)|(?<n1>...)|(?<n1>....)|(?<n1>.....)|(?<n1>......)|(?<n1>.......)|(?<n1>........)|(?<n1>.........)|(?<n1>..........)|(?<n1>...........)|(?<n1>............)|(?<n1>.............)|(?<n1>..............))\k<n1>$/, 'xxxxabcdefghijklmnabcdefghijklmn', 4, 18, 14) +x(/(?<name1>)(?<name2>)(?<name3>)(?<name4>)(?<name5>)(?<name6>)(?<name7>)(?<name8>)(?<name9>)(?<name10>)(?<name11>)(?<name12>)(?<name13>)(?<name14>)(?<name15>)(?<name16>aaa)(?<name17>)$/, 'aaa', 0, 3, 16) +x(/(?<foo>a|\(\g<foo>\))/, 'a', 0, 1) +x(/(?<foo>a|\(\g<foo>\))/, '((((((a))))))', 0, 13) +x(/(?<foo>a|\(\g<foo>\))/, '((((((((a))))))))', 0, 17, 1) +x(/\g<bar>|\zEND(?<bar>.*abc$)/, 'abcxxxabc', 0, 9) 
+x(/\g<1>|\zEND(.a.)/, 'bac', 0, 3) +x(/\g<2>\g<1>|\zEND(.a.)(?<?>.b.)/, 'xbxyay', 3, 6, 1) +x(/\A(?:\g<pon>|\g<pan>|\zEND (?<pan>a|c\g<pon>c)(?<pon>b|d\g<pan>d))$/, 'cdcbcdc', 0, 7) +x(/\A(?<n>|a\g<m>)\z|\zEND (?<m>\g<n>)/, 'aaaa', 0, 4) +x(/(?<n>(a|b\g<n>c){3,5})/, 'baaaaca', 1, 5) +x(/(?<n>(a|b\g<n>c){3,5})/, 'baaaacaaaaa', 0, 10) + +r(//, '', 0) +r(/a/, 'a', 0) +r(/a/, 'a', 0, 1) +r(/b/, 'abc', 1) +r(/b/, 'abc', 1, 2) +r(/./, 'a', 0) +r(/.*/, 'abcde fgh', 9) +r(/a*/, 'aaabbc', 6) +r(/a+/, 'aaabbc', 2) +r(/a?/, 'bac', 3) +r(/a??/, 'bac', 3) +r(/abcde/, 'abcdeavcd', 0) +r(/\w\d\s/, ' a2 aa $3 ', 2) +r(/[c-f]aa[x-z]/, '3caaycaaa', 1) +r(/(?i:fG)g/, 'fGgFggFgG', 3) +r(/a|b/, 'b', 0) +r(/ab|bc|cd/, 'bcc', 0) +r(/(ffy)\1/, 'ffyffyffy', 3) +r(/|z/, 'z', 1) +r(/^az/, 'azaz', 0) +r(/az$/, 'azaz', 2) +r(/(((.a)))\3/, 'zazaaa', 0) +r(/(ac*?z)\1/, 'aacczacczacz', 1) +r(/aaz{3,4}/, 'bbaabbaazzzaazz', 6) +r(/\000a/, "b\000a", 1) +r(/ff\xfe/, "fff\xfe", 1) +r(/...abcdefghijklmnopqrstuvwxyz/, 'zzzzzabcdefghijklmnopqrstuvwxyz', 2) +end + +def test_euc(enc) +$KCODE = enc + +x(//, '、「', 0, 0) +x(/、「/, '、「', 0, 2) +n(/、、/, '、「') +x(/、ヲ、ヲ/, '、ヲ、ヲ', 0, 4) +x(/、「、、、ヲ/, '、「、、、ヲ', 0, 6) +x(/、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ/, '、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ、ウ', 0, 70) +x(/、「/, '、、、「', 2, 4) +x(/、、、ヲ/, '、「、、、ヲ', 2, 6) +x(/\xca\xb8/, "\xca\xb8", 0, 2) +x(/./, '、「', 0, 2) +x(/../, '、ォ、ュ', 0, 4) +x(/\w/, '、ェ', 0, 2) +n(/\W/, '、「') +x(/\S/, '、ス', 0, 2) +x(/\S/, 'エチ', 0, 2) +x(/\b/, 'オ、 ', 0, 0) +x(/\b/, ' 、ロ', 1, 1) +x(/\B/, '、サ、ス ', 2, 2) +x(/\B/, '、ヲ ', 3, 3) +x(/\B/, ' 、、', 0, 0) +x(/[、ソ、チ]/, '、チ', 0, 2) +n(/[、ハ、ヒ]/, '、フ') +x(/[、ヲ-、ェ]/, '、ィ', 0, 2) +n(/[^、ア]/, '、ア') +x(/[\w]/, '、ヘ', 0, 2) +n(/[\d]/, '、ユ') +x(/[\D]/, '、マ', 0, 2) +n(/[\s]/, '、ッ') +x(/[\S]/, '、リ', 0, 2) +x(/[\w\d]/, '、', 0, 2) +x(/[\w\d]/, ' 、', 3, 5) +#x(/[\xa4\xcf-\xa4\xd3]/, "\xa4\xd0", 0, 2) # diff spec with GNU regex. 
+#n(/[\xb6\xe7-\xb6\xef]/, "\xb6\xe5") # diff spec with GNU regex. +n(/\wオエシヨ/, ' オエシヨ') +x(/オエ\Wシヨ/, 'オエ シヨ', 0, 5) +x(/、「.、、.、ヲ/, '、「、「、、、、、ヲ', 0, 10) +x(/.\w、ヲ\W..、セ/, '、ィ、ヲ、ヲ 、ヲ、セ、セ', 0, 13) +x(/\s\w、ウ、ウ、ウ/, ' 、ウ、ウ、ウ、ウ', 0, 9) +x(/、「、「.、ア/, '、「、「、ア、ア', 0, 8) +n(/.、、/, '、、、ィ') +x(/.、ェ/, '、ェ、ェ', 0, 4) +x(/^、「/, '、「', 0, 2) +x(/^、$/, '、', 0, 2) +x(/^\w$/, '、ヒ', 0, 2) +x(/^\w、ォ、ュ、ッ、ア、ウ$/, 'z、ォ、ュ、ッ、ア、ウ', 0, 11) +x(/^\w...、ヲ、ィ、ェ$/, 'z、「、、、ヲ、ヲ、ィ、ェ', 0, 13) +x(/\w\w\s\W、ェ、ェ、ェ\d/, 'a、ェ 、ェ、ェ、ェ4', 0, 12) +x(/\A、ソ、チ、ト/, '、ソ、チ、ト', 0, 6) +x(/、爨皃秉Z/, '、爨皃', 0, 6) +x(/、ォ、ュ、ッ\z/, '、ォ、ュ、ッ', 0, 6) +x(/、ォ、ュ、ッ\Z/, "、ォ、ュ、ッ\n", 0, 6) +x(/\G、ン、ヤ/, '、ン、ヤ', 0, 4) +n(/\G、ィ/, '、ヲ、ィ、ェ') +n(/、ネ、ニ\G/, '、ネ、ニ') +n(/、゙、゚\A/, '、゙、゚') +n(/、゙\A、゚/, '、゙、゚') +x(/(?=、サ)、サ/, '、サ', 0, 2) +n(/(?=、ヲ)./, '、、') +x(/(?!、ヲ)、ォ/, '、ォ', 0, 2) +n(/(?!、ネ)、「/, '、ネ') +x(/(?i:、「)/, '、「', 0, 2) +x(/(?i:、ヨ、ル)/, '、ヨ、ル', 0, 4) +n(/(?i:、、)/, '、ヲ') +x(/(?m:、.)/, "、鐔n", 0, 3) +x(/(?m:.、)/, "、゙\n、", 2, 5) +x(/、「?/, '', 0, 0) +x(/ハム?/, 'イス', 0, 0) +x(/ハム?/, 'ハム', 0, 2) +x(/ホフ*/, '', 0, 0) +x(/ホフ*/, 'ホフ', 0, 2) +x(/サメ*/, 'サメサメサメ', 0, 6) +x(/ヌマ*/, 'シッヌマヌマヌマヌマ', 0, 0) +n(/サウ+/, '') +x(/イマ+/, 'イマ', 0, 2) +x(/サ+/, 'ササササ', 0, 8) +x(/、ィ+/, '、ィ、ィ、ヲ、ヲ、ヲ', 0, 4) +x(/、ヲ+/, '、ェ、ヲ、ヲ、ヲ、ヲ', 2, 10) +x(/.?/, '、ソ', 0, 2) +x(/.*/, '、ム、ヤ、ラ、レ', 0, 8) +x(/.+/, '、', 0, 2) +x(/.+/, "、、、ヲ、ィ、ォ\n", 0, 8) +x(/、「|、、/, '、「', 0, 2) +x(/、「|、、/, '、、', 0, 2) +x(/、「、、|、、、ヲ/, '、「、、', 0, 4) +x(/、「、、|、、、ヲ/, '、、、ヲ', 0, 4) +x(/、(?:、ォ、ュ|、ュ、ッ)/, '、、ォ、ュ', 0, 6) +x(/、(?:、ォ、ュ|、ュ、ッ)、ア/, '、、ュ、ッ、ア', 0, 8) +x(/、「、、|(?:、「、ヲ|、「、)/, '、「、', 0, 4) +x(/、「|、、|、ヲ/, '、ィ、ヲ', 2, 4) +x(/、「|、、|、ヲ、ィ|、ェ、ォ、ュ|、ッ|、ア、ウ、オ|、キ、ケ、サ|、ス|、ソ、チ|、ト、ニ、ネ、ハ、ヒ|、フ、ヘ/, '、キ、ケ、サ', 0, 6) +n(/、「|、、|、ヲ、ィ|、ェ、ォ、ュ|、ッ|、ア、ウ、オ|、キ、ケ、サ|、ス|、ソ、チ|、ト、ニ、ネ、ハ、ヒ|、フ、ヘ/, '、ケ、サ') +x(/、「|^、/, '、ヨ、「', 2, 4) +x(/、「|^、/, '、、「', 0, 2) +x(/オエ|\Gシヨ/, '、アシヨオエ', 4, 6) +x(/オエ|\Gシヨ/, 'シヨオエ', 0, 2) +x(/オエ|\Aシヨ/, 'bシヨオエ', 3, 5) +x(/オエ|\Aシヨ/, 'シヨ', 0, 2) +x(/オエ|シヨ\Z/, 'シヨオエ', 2, 4) +x(/オエ|シヨ\Z/, 'シヨ', 0, 2) +x(/オエ|シヨ\Z/, "シヨ\n", 0, 2) 
+x(/オエ|シヨ\z/, 'シヨオエ', 2, 4) +x(/オエ|シヨ\z/, 'シヨ', 0, 2) +x(/\w|\s/, '、ェ', 0, 2) +x(/\w|%/, '%、ェ', 0, 1) +x(/\w|[&$]/, '、ヲ&', 0, 2) +x(/[、、-、ア]/, '、ヲ', 0, 2) +x(/[、、-、ア]|[^、ォ-、ウ]/, '、「', 0, 2) +x(/[、、-、ア]|[^、ォ-、ウ]/, '、ォ', 0, 2) +x(/(?:、「|[、ヲ-、ュ])|、、、/, '、ヲ、', 0, 2) +x(/(?:、「|[、ヲ-、ュ])|、、、/, '、、、', 0, 4) +x(/、「、、、ヲ|(?=、ア、ア)..、ロ/, '、ア、ア、ロ', 0, 6) +x(/、「、、、ヲ|(?!、ア、ア)..、ロ/, '、「、、、ロ', 0, 6) +x(/(?=、、「)..、「|(?=、、)..、「/, '、、、「', 0, 6) +x(/(?<=、「|、、、ヲ)、、/, '、、、ヲ、、', 4, 6) +n(/(?>、「|、「、、、ィ)、ヲ/, '、「、、、ィ、ヲ') +x(/(?>、「、、、ィ|、「)、ヲ/, '、「、、、ィ、ヲ', 0, 8) +x(/、「?|、、/, '、「', 0, 2) +x(/、「?|、、/, '、、', 0, 0) +x(/、「?|、、/, '', 0, 0) +x(/、「*|、、/, '、「、「', 0, 4) +x(/、「*|、、*/, '、、、「', 0, 0) +x(/、「*|、、*/, '、「、、', 0, 2) +x(/[a、「]*|、、*/, 'a、「、、、、、、', 0, 3) +x(/、「+|、、*/, '', 0, 0) +x(/、「+|、、*/, '、、、、、、', 0, 6) +x(/、「+|、、*/, '、「、、、、、、', 0, 2) +x(/、「+|、、*/, 'a、「、、、、、、', 0, 0) +n(/、「+|、、+/, '') +x(/(、「|、、)?/, '、、', 0, 2) +x(/(、「|、、)*/, '、、、「', 0, 4) +x(/(、「|、、)+/, '、、、「、、', 0, 6) +x(/(、「、、|、ヲ、「)+/, '、ヲ、「、「、、、ヲ、ィ', 0, 8) +x(/(、「、、|、ヲ、ィ)+/, '、ヲ、「、「、、、ヲ、ィ', 4, 12) +x(/(、「、、|、ヲ、「)+/, '、「、「、、、ヲ、「', 2, 10) +x(/(、「、、|、ヲ、「)+/, '、「、、、、ヲ、「', 0, 4) +x(/(、「、、|、ヲ、「)+/, '$$zzzz、「、、、、ヲ、「', 6, 10) +x(/(、「|、、、「、、)+/, '、「、、、「、、、「', 0, 10) +x(/(、「|、、、「、、)+/, '、、、「', 2, 4) +x(/(、「|、、、「、、)+/, '、、、「、「、「、、、「', 2, 8) +x(/(?:、「|、、)(?:、「|、、)/, '、「、、', 0, 4) +x(/(?:、「*|、、*)(?:、「*|、、*)/, '、「、「、「、、、、、、', 0, 6) +x(/(?:、「*|、、*)(?:、「+|、、+)/, '、「、「、「、、、、、、', 0, 12) +x(/(?:、「+|、、+){2}/, '、「、「、「、、、、、、', 0, 12) +x(/(?:、「+|、、+){1,2}/, '、「、「、「、、、、、、', 0, 12) +x(/(?:、「+|\A、、*)、ヲ、ヲ/, '、ヲ、ヲ', 0, 4) +n(/(?:、「+|\A、、*)、ヲ、ヲ/, '、「、、、ヲ、ヲ') +x(/(?:^、「+|、、+)*、ヲ/, '、「、「、、、、、、、「、、、ヲ', 12, 16) +x(/(?:^、「+|、、+)*、ヲ/, '、「、「、、、、、、、、、ヲ', 0, 14) +x(/、ヲ{0,}/, '、ヲ、ヲ、ヲ、ヲ', 0, 8) +x(/、「|(?i)c/, 'C', 0, 1) +x(/(?i)c|、「/, 'C', 0, 1) +x(/(?i:、「)|a/, 'a', 0, 1) +n(/(?i:、「)|a/, 'A') +x(/[、「、、、ヲ]?/, '、「、、、ヲ', 0, 2) +x(/[、「、、、ヲ]*/, '、「、、、ヲ', 0, 6) +x(/[^、「、、、ヲ]*/, '、「、、、ヲ', 0, 0) +n(/[^、「、、、ヲ]+/, '、「、、、ヲ') +x(/、「??/, '、「、「、「', 0, 0) +x(/、、、「??、、/, '、、、「、、', 0, 6) 
+x(/、「*?/, '、「、「、「', 0, 0) +x(/、、、「*?/, '、、、「、「', 0, 2) +x(/、、、「*?、、/, '、、、「、「、、', 0, 8) +x(/、「+?/, '、「、「、「', 0, 2) +x(/、、、「+?/, '、、、「、「', 0, 4) +x(/、、、「+?、、/, '、、、「、「、、', 0, 8) +x(/(?:ナキ?)??/, 'ナキ', 0, 0) +x(/(?:ナキ??)?/, 'ナキ', 0, 0) +x(/(?:フエ?)+?/, 'フエフエフエ', 0, 2) +x(/(?:ノ+)??/, 'ノノノ', 0, 0) +x(/(?:タ+)??チ/, 'タ翅翅翆', 0, 8) +i(/(?:、「、、)?{2}/, '', 0, 0) # GNU regex bug +x(/(?:オエシヨ)?{2}/, 'オエシヨオエシヨオエ', 0, 8) +x(/(?:オエシヨ)*{0}/, 'オエシヨオエシヨオエ', 0, 0) +x(/(?:オエシヨ){3,}/, 'オエシヨオエシヨオエシヨオエシヨ', 0, 16) +n(/(?:オエシヨ){3,}/, 'オエシヨオエシヨ') +x(/(?:オエシヨ){2,4}/, 'オエシヨオエシヨオエシヨ', 0, 12) +x(/(?:オエシヨ){2,4}/, 'オエシヨオエシヨオエシヨオエシヨオエシヨ', 0, 16) +x(/(?:オエシヨ){2,4}?/, 'オエシヨオエシヨオエシヨオエシヨオエシヨ', 0, 8) +x(/(?:オエシヨ){,}/, 'オエシヨ{,}', 0, 7) +x(/(?:、ォ、ュ、ッ)+?{2}/, '、ォ、ュ、ッ、ォ、ュ、ッ、ォ、ュ、ッ', 0, 12) +x(/(イミ)/, 'イミ', 0, 2, 1) +x(/(イミソ)/, 'イミソ', 0, 4, 1) +x(/((サエヨ))/, 'サエヨ', 0, 4) +x(/((ノソ))/, 'ノソ', 0, 4, 1) +x(/((コニ))/, 'コニ', 0, 4, 2) +x(/((((((((((((((((((((ホフサメ))))))))))))))))))))/, 'ホフサメ', 0, 4, 20) +x(/(、「、、)(、ヲ、ィ)/, '、「、、、ヲ、ィ', 0, 4, 1) +x(/(、「、、)(、ヲ、ィ)/, '、「、、、ヲ、ィ', 4, 8, 2) +x(/()(、「)、、、ヲ(、ィ、ェ、ォ)、ュ、ッ、ア、ウ/, '、「、、、ヲ、ィ、ェ、ォ、ュ、ッ、ア、ウ', 6, 12, 3) +x(/(()(、「)、、、ヲ(、ィ、ェ、ォ)、ュ、ッ、ア、ウ)/, '、「、、、ヲ、ィ、ェ、ォ、ュ、ッ、ア、ウ', 6, 12, 4) +x(/.*(・ユ・ゥ)・。ヲ・゙(・()・キ・螂ソ)・、・/, '・ユ・ゥ・。ヲ・゙・・キ・螂ソ・、・', 10, 18, 2) +x(/(^、「)/, '、「', 0, 2) +x(/(、「)|(、「)/, '、、、「', 2, 4, 1) +x(/(^、「)|(、「)/, '、、、「', 2, 4, 2) +x(/(、「?)/, '、「、「、「', 0, 2, 1) +x(/(、゙*)/, '、゙、゙、゙', 0, 6, 1) +x(/(、ネ*)/, '', 0, 0, 1) +x(/(、+)/, '、、、、、、、', 0, 14, 1) +x(/(、ユ+|、リ*)/, '、ユ、ユ、ユ、リ、リ', 0, 6, 1) +x(/(、「+|、、?)/, '、、、、、、、「、「', 0, 2, 1) +x(/(、「、、、ヲ)?/, '、「、、、ヲ', 0, 6, 1) +x(/(、「、、、ヲ)*/, '、「、、、ヲ', 0, 6, 1) +x(/(、「、、、ヲ)+/, '、「、、、ヲ', 0, 6, 1) +x(/(、オ、キ、ケ|、「、、、ヲ)+/, '、「、、、ヲ', 0, 6, 1) +x(/([、ハ、ヒ、フ][、ォ、ュ、ッ]|、ォ、ュ、ッ)+/, '、ォ、ュ、ッ', 0, 6, 1) +x(/((?i:、「、、、ヲ))/, '、「、、、ヲ', 0, 6, 1) +x(/((?m:、「.、ヲ))/, "、「\n、ヲ", 0, 5, 1) +x(/((?=、「、)、「)/, '、「、、、', 0, 2, 1) +x(/、「、、、ヲ|(.、「、、、ィ)/, '、、「、、、ィ', 0, 8, 1) +x(/、「*(.)/, '、「、「、「、「、', 8, 10, 1) +x(/、「*?(.)/, '、「、「、「、「、', 0, 2, 1) +x(/、「*?(、)/, '、「、「、「、「、', 8, 10, 1) 
+x(/[、、、ヲ、ィ]、「*(.)/, '、ィ、「、「、「、「、', 10, 12, 1) +x(/(\A、、、、)、ヲ、ヲ/, '、、、、、ヲ、ヲ', 0, 4, 1) +n(/(\A、、、、)、ヲ、ヲ/, '、、、、、、ヲ、ヲ') +x(/(^、、、、)、ヲ、ヲ/, '、、、、、ヲ、ヲ', 0, 4, 1) +n(/(^、、、、)、ヲ、ヲ/, '、、、、、、ヲ、ヲ') +x(/、、(、、$)/, '、、、、', 4, 8, 1) +n(/、、(、、$)/, '、、、、、') +x(/(フオ)\1/, 'フオフオ', 0, 4) +n(/(フオ)\1/, 'フオノ') +x(/(カ?)\1/, 'カカ', 0, 4) +x(/(カ??)\1/, 'カカ', 0, 0) +x(/(カ*)\1/, 'カカカカカ', 0, 8) +x(/(カ*)\1/, 'カカカカカ', 0, 4, 1) +x(/、「(、、*)\1/, '、「、、、、、、、、', 0, 10) +x(/、「(、、*)\1/, '、「、、', 0, 2) +x(/(、「*)(、、*)\1\2/, '、「、「、「、、、、、「、「、「、、、、', 0, 20) +x(/(、「*)(、、*)\2/, '、「、「、「、、、、、、、、', 0, 14) +x(/(、「*)(、、*)\2/, '、「、「、「、、、、、、、、', 6, 10, 2) +x(/(((((((、ン*)、レ))))))、ヤ\7/, '、ン、ン、ン、レ、ヤ、ン、ン、ン', 0, 16) +x(/(((((((、ン*)、レ))))))、ヤ\7/, '、ン、ン、ン、レ、ヤ、ン、ン、ン', 0, 6, 7) +x(/(、マ)(、メ)(、ユ)\2\1\3/, '、マ、メ、ユ、メ、マ、ユ', 0, 12) +x(/([、ュ-、ア])\1/, '、ッ、ッ', 0, 4) +x(/(\w\d\s)\1/, '、「5 、「5 ', 0, 8) +n(/(\w\d\s)\1/, '、「5 、「5') +x(/(テッ。ゥ|[、「-、ヲ]{3})\1/, 'テッ。ゥテッ。ゥ', 0, 8) +x(/...(テッ。ゥ|[、「-、ヲ]{3})\1/, '、「a、「テッ。ゥテッ。ゥ', 0, 13) +x(/(テッ。ゥ|[、「-、ヲ]{3})\1/, '、ヲ、、、ヲ、ヲ、、、ヲ', 0, 12) +x(/(^、ウ)\1/, '、ウ、ウ', 0, 4) +n(/(^、)\1/, '、皃爨') +n(/(、「$)\1/, '、「、「') +n(/(、「、、\Z)\1/, '、「、、') +x(/(、「*\Z)\1/, '、「', 2, 2) +x(/.(、「*\Z)\1/, '、、、「', 2, 4) +x(/(.(、荀、、)\2)/, 'z、荀、、讀荀、、', 0, 13, 1) +x(/(.(..\d.)\2)/, '、「12341234', 0, 10, 1) +x(/((?i:、「v、コ))\1/, '、「v、コ、「v、コ', 0, 10) +x(/(?<カ、ォ>ハム|\(\g<カ、ォ>\))/, '((((((ハム))))))', 0, 14) +x(/\A(?:\g<ー、-1>|\g<アセ-2>|\zスェホサ (?<ー、-1>エム|シォ\g<アセ-2>シォ)(?<アセ-2>コ゚|ハサァ\g<ー、-1>ハサァ))$/, 'ハサァシォハサァシォコ゚シォハサァシォハサァ', 0, 26) +x(/[[、メ、ユ]]/, '、ユ', 0, 2) +x(/[[、、、ェ、ヲ]、ォ]/, '、ォ', 0, 2) +n(/[[^、「]]/, '、「') +n(/[^[、「]]/, '、「') +x(/[^[^、「]]/, '、「', 0, 2) +x(/[[、ォ、ュ、ッ]&&、ュ、ッ]/, '、ッ', 0, 2) +n(/[[、ォ、ュ、ッ]&&、ュ、ッ]/, '、ォ') +n(/[[、ォ、ュ、ッ]&&、ュ、ッ]/, '、ア') +x(/[、「-、&&、、-、&&、ヲ-、]/, '、', 0, 2) +n(/[^、「-、&&、、-、&&、ヲ-、]/, '、') +x(/[[^、「&&、「]&&、「-、]/, '、、', 0, 2) +n(/[[^、「&&、「]&&、「-、]/, '、「') +x(/[[^、「-、&&、、、ヲ、ィ、ェ]&&[^、ヲ-、ォ]]/, '、ュ', 0, 2) +n(/[[^、「-、&&、、、ヲ、ィ、ェ]&&[^、ヲ-、ォ]]/, '、、') +x(/[^[^、「、、、ヲ]&&[^、ヲ、ィ、ェ]]/, '、ヲ', 0, 2) +x(/[^[^、「、、、ヲ]&&[^、ヲ、ィ、ェ]]/, '、ィ', 0, 2) 
+n(/[^[^、「、、、ヲ]&&[^、ヲ、ィ、ェ]]/, '、ォ') +x(/[、「-&&-、「]/, '-', 0, 1) +x(/[^[^a-z、「、、、ヲ]&&[^bcdefg、ヲ、ィ、ェ]q-w]/, '、ィ', 0, 2) +x(/[^[^a-z、「、、、ヲ]&&[^bcdefg、ヲ、ィ、ェ]g-w]/, 'f', 0, 1) +x(/[^[^a-z、「、、、ヲ]&&[^bcdefg、ヲ、ィ、ェ]g-w]/, 'g', 0, 1) +n(/[^[^a-z、「、、、ヲ]&&[^bcdefg、ヲ、ィ、ェ]g-w]/, '2') +r(/、「/, '、「', 0) +r(/、「/, '、「', 0, 2) +r(/、、/, '、「、、、ヲ', 2) +r(/、、/, '、「、、、ヲ', 2, 4) +r(/./, '、「', 0) +r(/.*/, '、「、、、ヲ、ィ、ェ 、ォ、ュ、ッ', 17) +r(/.*、ィ、ェ/, '、「、、、ヲ、ィ、ェ 、ォ、ュ、ッ', 6) +r(/、「*/, '、「、「、「、、、、、ヲ', 12) +r(/、「+/, '、「、「、「、、、、、ヲ', 4) +r(/、「?/, '、、、「、ヲ', 6) +r(/チエ??/, 'ノ鮹エハム', 6) +r(/aハユcエチe/, 'aハユcエチeavcd', 0) +r(/\w\d\s/, ' 、「2 、ヲ、ヲ $3 ', 2) +r(/[、ヲ-、ェ]、「、「[、ネ-、]/, '3、ヲ、「、「、ハ、ヲ、「、「、「', 1) +r(/、「|、、/, '、、', 0) +r(/、「、、|、、、ヲ|、ヲ、ィ/, '、、、ヲ、ヲ', 0) +r(/(、ネ、ネ、チ)\1/, '、ネ、ネ、チ、ネ、ネ、チ、ネ、ネ、チ', 6) +r(/|、ィ/, '、ィ', 2) +r(/^、「、コ/, '、「、コ、「、コ', 0) +r(/、「、コ$/, '、「、コ、「、コ', 4) +r(/(((.、「)))\3/, 'z、「z、「、「、「', 0) +r(/(、「、ヲ*?、)\1/, '、「、「、ヲ、ヲ、、「、ヲ、ヲ、、「、ヲ、', 2) +r(/、「、「、{3,4}/, '、ニ、ニ、「、「、、、、、「、「、、、、「、「、、「、「、', 12) +r(/\000、「/, "、、\000、「", 2) +r(/、ネ、ネ\xfe\xfe/, "、ネ、ネ、ネ\xfe\xfe", 2) +r(/...、「、、、ヲ、ィ、ェ、ォ、ュ、ッ、ア、ウ、オ、キ、ケ、サ、ス/, 'zzzzz、「、、、ヲ、ィ、ェ、ォ、ュ、ッ、ア、ウ、オ、キ、ケ、サ、ス', 2) +end + +test_sb('ASCII') +test_sb('EUC') +test_sb('SJIS') +test_sb('UTF8') +test_euc('EUC') + + +# UTF-8 (by UENO Katsuhiro) +$KCODE = 'UTF-8' + +s = "\xe3\x81\x82\xe3\x81\x81\xf0\x90\x80\x85\xe3\x81\x8a\xe3\x81\x85" +x(/[\xc2\x80-\xed\x9f\xbf]+/u, s, 0, 6) + +s = "\xf0\x90\x80\x85\xe3\x81\x82" +x(/[\xc2\x80-\xed\x9f\xbf]/u, s, 4, 7) + +s = "\xed\x9f\xbf" +n(/[\xc2\x80-\xed\x9f\xbe]/u, s) + +s = "\xed\x9f\xbf" +n(/[\xc2\x80-\xed\x9f\xbe]/u, s) + +s = "\xed\x9f\xbf" +n(/[\xc2\x80-\xed\x9f\xbe]/u, s) + +s = "\xed\x9f\xbf" +n(/[\xc3\xad\xed\x9f\xbe]/u, s) + +s = "\xed\x9f\xbf" +n(/[\xc4\x80-\xed\x9f\xbe]/u, s) + +s = "\xed\x9f\xbf\xf0\x90\x80\x85\xed\x9f\xbf" +x(/[^\xc2\x80-\xed\x9f\xbe]/u, s, 0, 3) + +s = "\xed\x9f\xbf" +x(/[^\xc3\xad\xed\x9f\xbe]/u, s, 0, 3) + +s = "\xed\x9f\xbf\xf0\x90\x80\x85\xed\x9f\xbf" +x(/[^\xc4\x80-\xed\x9f\xbe]/u, s, 0, 3) + +s = 
"\xc3\xbe\xc3\xbf" +n(/[\xfe\xff\xc3\x80]/u, s) + + +# Japanese long text. +$KCODE = 'EUC' + +s = <<EOS +タク螟ホニヒワ、ヒ、ェ、、、ニ、マ。「オキウ、ヒ、ト、、、ニ、マトエココ、ヒエ、ナ、ォ、フネ翳ス、箏、オ、、ノトャ、簑ク、ク。「 +、ソ、ネ、ィ、ミサーススネャヌッシーハ簗シスニ、ホタゥトヌッ、ャニマェタチ隍ホスェ、テ、ソヌッ、ヌ、「、、ウ、ネ、、筅テ、ニキレケ、ヒ +オキウ、ホオシー、ヨ、熙クリト・、ケ、マタノセ、ャ、゙、ォ、熙ネ、ェ、テ、ニ、、、。」 +ヘュフセ、ハマタシヤ、ネ、キ、ニ、マ。「クホ。ヲサハヌマホヒツタマコ、オ、イ、、ウ、ネ、ャ、ヌ、ュ、、タ、、ヲ。」 + +ハシニ」ニススネャ 。ヨヘュコ菴ニ。ラ サヘテォ・鬣ヲ・・ノ (1998) +EOS + +x(/\((.+)\)/, s, 305, 309, 1) +x(/サハヌマホヒツタマコ/, s, 229, 239) +x(/。」$/, s, 202, 204) +x(/(^ハシニ」..ネャ)/, s, 269, 279, 1) +x(/^$/, s, 268, 268) + + +s = <<EOS +・ォ・ハ、茹。シ・゙サ、マーツホハクサ、ヌ、「、、ヲ、ォ。」 +、筅キ、ウ、ネ、ミ、、キ、、ケ、筅ホ、ャハクサ、ヌ、「、、ネ、ケ、、ネ。「、ス、、マ、ウ、ネ、ミ、、キ、、ケ、筅ホ、ヌ、マ、ハ、、。」 +ヒワ、臙ook、マ、ウ、ネ、ミ、ヌ、「、、ャ。「・ロ・、臧on、マイサ、、ハ、鬢ル、ソ、タ、ア、ヌ。「ススハャ、ハテアクタュ、 +、筅ト、筅ホ、ヌ、マ、ハ、、。」 +テアク、ネ、キ、ニ、ホニテト熙ホキチツヨ、、筅ソ、ハ、、、ォ、鬢ヌ、「、。」 +。ヨキチ、ヒ、隍ク。ラ、・「・鬣、マエチサ、ヒツミ、ケ、キレハホナェ、ハーユフ」、ヒヘム、、、ソ、ャ。「 +キチ、ホ、ハ、、、筅ホ、マヒワナ、マク、ヌ、マ、「、熙ィ、ハ、、、ホ、ヌ、「、。」 + +ヌタタナ 。ヨエチサノエマテ。ラ +EOS + +n(/\((.+)\)/, s) +x(/。ヨ(.*)。ラ/, s, 254, 264, 1) +x(/。」$/, s, 34, 36) +x(/(book)/, s, 120, 124, 1) +x(/^$/, s, 360, 360) + + +s = <<EOS +シ盍爨ャアテサウ、ヒ、ッ、タ、テ、ニ、ュ、ソ、ネ、ケ、、ミ。「、ス、ホ、ェ、モ、ソ、タ、キ、、フゥカオネスム、ホホフ、ネ。「 +、ス、ホシチ、ホケ筅オ、ヒ、ェ、ノ、、ッ、ヒ、チ、ャ、、、ハ、、。」 +、ウ、ホウミシヤ、ャ。「ーオナンナェ、ハカテ、ュ、、筅ト、ホ、マ。「、ェノヤニー、オ、、ホチ、ホチー、ヒホゥ、テ、ソ、ネ、ュ、タ、、ヲ。」 +。ン。ン 、ウ、、マ。「・ノ・鬣・」・タソヘ、ホセッヌッナロホ、ヌ、マ、ハ、、、ォ。」 + +サハヌマホヒツタマコ 。ヨアテサウネスム、ホナクウォ。ンノヤニーフタイヲ、ヒ、ユ、、ト、ト。ラ ・「・オ・メ・ー・鬣ユ(1986) +EOS + +x(/\((.+)\)/, s, 290, 296) +x(/。ヨ(.*)。ン(.+)。ラ/, s, 257, 275, 2) +x(/^。ン。ン /, s, 179, 184) +x(/(シ盍)/, s, 0, 4, 1) +x(/\w。「/, s, 30, 34) + + +s = <<EOS +、ォ、ネ、、、テ、ニ。「スチァ、マ。「サニ箙ッ、筍「ケエ莵ッ、筍「、ス、キ、ニ、筅ヲーソヘ、ホソヘハェ、筍「クタ螟ホナフ、ヒ、ケ、ョ、ハ、、、ウ、ネ、。「、ウ、ホツ霈キケ讀マヌ。シツ、ヒハェク、テ、ニ、、、。」 +、ォ、、鮟ーソヘ、ホセョタ筅マーケヤ、篆ミ、ニ、、、ハ、、、ホ、タ。」 +ス、ッ、メ、゙、ャ、ハ、ォ、テ、ソ、ホ、ヌ、「、、ヲ。」 +、キ、ォ、キ。「サィサ。ヨカ眥蠡簪テ。ラ、ャ。「、ハ、ェツ霾ャケ讀篦雜蟷讀篆ミ、ト、ナ、ア、ニ、讀ッ、ヌ、「、、ヲ、ウ、ネ、ヒ、ト、、、ニ、マ。「サ荀マ、ヨ、ュ、゚、ハ、ロ、ノ、ホウホソョ、、筅テ、ニ、、、。」、ウ、ホサィサ、ヒ、マ。「サフウヌスホマ、ホヒ簗ェ、ホ、隍ヲ、ハソヘハェ、ャ。「サーソヘ、筅、、。」 +、ス、、サラ、ヲ、ネ。「、ネ、ュ、ノ、ュ、ソ、眥ゥ、ホスミ、、隍ヲ、ハ、ェ、筅、、ャ、ケ、、ホ、ヌ、「、。」 + +サハヌマホヒツタマコ 。ヨ、ウ、、ハサィサ、荀皃ニ、キ、゙、、、ソ、、。ラ カ眥蠡簪テ ツ霈キスク (1961) +EOS + +x(/\((\d+)\)/, s, 496, 502) +x(/(。ヨ.+サィサ.*。ラ)/, s, 449, 479, 1) 
+x(/ツ(.)ケ/, s, 96, 98, 1) +x(/。」$/, s, 120, 122) +x(/カ眥蠡簪テ/, s, 209, 217) + + +s = <<EOS +ニススグヌワ、アロ、ィ、ヌツ遉ハホフ、ホイシ、ア、ヒツミケウ、ケ、シホゥ、ニ、ヒソエ、ナ、筅熙ャ、「、テ、ソ、ホ、マ。「タク、ュサト、テ、ニ、、、ソサヘススクョ、ホテマイ、アフ茣ー、ホ、ヲ、チーヒタェイー、ホヌテイーーヒハシアメ、ソ、ターソヘ、タ、テ、ソ。」 +ーク、ヒテマイ、ア、ネ、、、テ、ニ、筍「ケセクヘセク豼エイー、ヒヌシニ、ケ、ク賚ムア、ネ。「ケセクヘサヤテ讀ヒホョ、ケ、筅ホ、ネ、マナチウニア、クハェ、ヌ、マ、ハ、ォ、テ、ソ。」 +、ス、筅ス、筅ャタハェサ、チート、ネ、キ、ニケヘ、ィ、鬢、ソケヤニチア、マ。「ヘ「チ、ケ、ダ、ヒカスチハャ、ャヘマ、ア、タ、キフワクコ、熙キ、ソ、ホ、ヌ、ママテ、ヒ、ハ、鬢ハ、、。」、ス、ウ、ヌ。「ケセクヘセ、ヒヌシ、皃、筅ホ、マ。「セニ、ュ、「、イ、ソア、ーイニーアホーマ、、、ヒ、キ、ソセョイー、ヒタム、゚セ螟イ。「カスチハャ、ネエ、、、ニソソア、ヒサナホゥ、ニセ螟イ、ソ、筅ホ、タ、テ、ソ。」 + +ネモナ靠ツー 。ヨサマチトトサオュ。ラ (2000) +EOS + +x(/\((\d+)\)/, s, 506, 512) +x(/(。ヨ.*。ラ)/, s, 493, 505, 1) +x(/ケヤニチア/, s, 292, 298) + + +s = <<EOS +、ウ、ヲ、キ、ソニヒワソヘ、ホノエ、ヒツミ、ケ、ハム、、テ、ソツヨナル、ホホ「、ヒ、マ。「、ク、ト、マ。「 +ーエモ、キ、ソチェツエス爨ャニッ、、、ニ、、、ソ。」 +、ス、、マ。「、ス、ホノエ、ャ。ヨシ醋シ、ケ箏鬢ヒクォ、サ、、ォ、ノ、ヲ、ォ。ラ、ヌ、「、テ、ソ。」 + +ハシニ」ニススネャ 。ヨヘュコ菴ニ。ラ サヘテォ・鬣ヲ・・ノ (1998) +EOS + +x(/\((\d+)\)/, s, 185, 191) +x(/(。ヨ.*。ラ)/, s, 108, 138, 1) +x(/^、ス、、マ/, s, 90, 96) +x(/^.*$/, s, 0, 58) + +s = <<EOS + ノ」、マソヘ、篩ゥ、、。「ヌマ、ホサホチ、ヒ、筅キ、゙、キ、ソ。」ヌマ、ヒ、マノ」ーセ」、ヒニヲニケ遉、ソ、、、ニ、゙、シ、ソ、筅ホ、ーニ、ヒーイ、マ、ソ、ル、オ、サ、ソ。」ソヘエヨ、隍熙マセ蠻、ホ、筅ホ、、ソ、ル、オ、サ、ソ、筅、ヌ、「、熙゙、ケ。」 + ソヘエヨ、マニコ「、マ・リ・コ・ネモ、、ソ、ル、ソ。」エ・コレ、、讀ヌ、ニ。「、讀ヌ、ク、、ヌマ、ヒ、荀遙「コレ、、ウ、゙、ォ、ヒタレ、遙「コレ、ネノ」、ネハニ、、゙、シ、ニ、ソ、、、ニ、ソ、ル、ソ。」、コ、テ、ネタホ、マハニ、ネノ」、ャネセ。ケ、ー、鬢、、ヌ、「、テ、ソ、ャ。「フタシ」、簇ススヌッツ螟ヒ、ハ、、ネ。「ノ」、、ト、ッ、、ホ、ャ、リ、テ、ニヘ隍ニ。「ノ」、マハニ、ホサーハャ、ホー、ッ、鬢、、ヒ、ハ、テ、ソ。」・リ・コ・ネモ、ヒ、マア、セッ、キ、、、、ソ、筅、ヌ、ケ。」 + +オワヒワセー 。ヨヒコ、、鬢、ソニヒワソヘ。ラ (1960) +EOS + +x(/(ノ」、マハニ、ホサーハャ、ホー、ッ、鬢、、ヒ)/, s, 357, 381, 1) +x(/、「、熙゙、ケ。」$/, s, 140, 150) +x(/ ソヘエヨ(.*)。」/, s, 157, 423, 1) +x(/・リ・コ・ネモ[、、マ、ヌ]/, s, 165, 175) + +s = <<EOS +ソネ、マ、ソ、ネ、メ ノツ「、ホフハユ、ヒオ爨フ、ネ、 ホアテヨ、゙、キツ醯ツコイ + +オネナトセセー 。ヨホアコイマソ。ラ (1859) +EOS + +x(/\((.+)\)/, s, 68, 74) +x(/。ヨ(.*)。ラ/, s, 59, 65, 1) +x(/^(オネナトセセー)/, s, 48, 56, 1) + + +# result +printf("\n*** Result SUCCESS: %d, FAIL: %d ***\n", $rok, $rfail) + +# END. 
diff --git a/ext/mbstring/oniguruma/testconv.rb b/ext/mbstring/oniguruma/testconv.rb new file mode 100644 index 0000000000..afaa673d90 --- /dev/null +++ b/ext/mbstring/oniguruma/testconv.rb @@ -0,0 +1,223 @@ +#!/usr/local/bin/ruby -Ke +# testconv.rb +# Copyright (C) 2003 K.Kosako (kosako@sofnec.co.jp) + +WINDOWS = (ARGV.size > 0 && /^-win/i =~ ARGV[0]) +ARGV.shift if WINDOWS + +if WINDOWS + REGCODE = 'REGCODE_SJIS' + REGENC = 'REG_ENCODING_SJIS' +else + REGCODE = 'REGCODE_EUCJP' + REGENC = 'REG_ENCODING_EUC_JP' +end + +def conv_reg(s) + s = s.gsub(/\\/, '\\\\\\\\') #' + if (WINDOWS) + s = s.gsub(/\?\?/, '?\\\\?') # escape ANSI trigraph + end + s +end + +def conv_str(s) + if (s[0] == ?') + s = s[1..-2] + return s.gsub(/\\/, '\\\\\\\\') #' + else + return s[1..-2] + end +end + +print(<<"EOS") +/* + * This program was generated by testconv.rb. + */ +#include<stdio.h> + +#ifdef POSIX_TEST +#include "onigposix.h" +#else +#include "oniguruma.h" +#endif + +static int nsucc = 0; +static int nfail = 0; + +#ifndef POSIX_TEST +static RegRegion* region; +#endif + +static void xx(char* pattern, char* str, int from, int to, int mem, int not) +{ + int r; + +#ifdef POSIX_TEST + regex_t reg; + char buf[200]; + regmatch_t pmatch[20]; + + r = regcomp(&reg, pattern, REG_EXTENDED | REG_NEWLINE); + if (r) { + regerror(r, &reg, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\\n", buf); + exit(-1); + } + + r = regexec(&reg, str, reg.re_nsub + 1, pmatch, 0); + if (r != 0 && r != REG_NOMATCH) { + regerror(r, &reg, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\\n", buf); + exit(-1); + } + + if (r == REG_NOMATCH) { + if (not) { + fprintf(stdout, "OK(N): /%s/ '%s'\\n", pattern, str); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s'\\n", pattern, str); + nfail++; + } + } + else { + if (not) { + fprintf(stdout, "FAIL(N): /%s/ '%s'\\n", pattern, str); + nfail++; + } + else { + if (pmatch[mem].rm_so == from && pmatch[mem].rm_eo == to) { + fprintf(stdout, "OK: /%s/ '%s'\\n", pattern, str); +
nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d\\n", pattern, str, + from, to, pmatch[mem].rm_so, pmatch[mem].rm_eo); + nfail++; + } + } + } + regfree(&reg); + +#else + regex_t* reg; + RegErrorInfo einfo; + + r = regex_new(&reg, (UChar* )pattern, (UChar* )(pattern + strlen(pattern)), + REG_OPTION_DEFAULT, #{REGCODE}, REG_SYNTAX_DEFAULT, &einfo); + if (r) { + char s[REG_MAX_ERROR_MESSAGE_LEN]; + regex_error_code_to_str(s, r, &einfo); + fprintf(stderr, "ERROR: %s\\n", s); + exit(-1); + } + + r = regex_search(reg, (UChar* )str, (UChar* )(str + strlen(str)), + (UChar* )str, (UChar* )(str + strlen(str)), + region, REG_OPTION_NONE); + if (r < REG_MISMATCH) { + char s[REG_MAX_ERROR_MESSAGE_LEN]; + regex_error_code_to_str(s, r); + fprintf(stderr, "ERROR: %s\\n", s); + exit(-1); + } + + if (r == REG_MISMATCH) { + if (not) { + fprintf(stdout, "OK(N): /%s/ '%s'\\n", pattern, str); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s'\\n", pattern, str); + nfail++; + } + } + else { + if (not) { + fprintf(stdout, "FAIL(N): /%s/ '%s'\\n", pattern, str); + nfail++; + } + else { + if (region->beg[mem] == from && region->end[mem] == to) { + fprintf(stdout, "OK: /%s/ '%s'\\n", pattern, str); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d\\n", pattern, str, + from, to, region->beg[mem], region->end[mem]); + nfail++; + } + } + } + regex_free(reg); +#endif +} + +static void x2(char* pattern, char* str, int from, int to) +{ + xx(pattern, str, from, to, 0, 0); +} + +static void x3(char* pattern, char* str, int from, int to, int mem) +{ + xx(pattern, str, from, to, mem, 0); +} + +static void n(char* pattern, char* str) +{ + xx(pattern, str, 0, 0, 0, 1); +} + +extern int main(int argc, char* argv[]) +{ +#ifdef POSIX_TEST + reg_set_encoding(#{REGENC}); +#else + region = regex_region_new(); +#endif + +EOS + +CM = '\s*,\s*' +RX2 = %r{^x\(/([^\/]*)/#{CM}('[^']*'|"[^"]*")#{CM}(\S+)#{CM}(\S+)\)$} +RI2 =
%r{^i\(/([^\/]*)/#{CM}('[^']*'|"[^"]*")#{CM}(\S+)#{CM}(\S+)\)} +RX3 = %r{^x\(/([^\/]*)/#{CM}('[^']*'|"[^"]*")#{CM}(\S+)#{CM}(\S+)#{CM}(\S+)\)$} +RN = %r{^n\(/([^\/]*)/#{CM}('[^']*'|"[^"]*")\)$} #' + +while line = gets() + if (m = RX2.match(line)) + reg = conv_reg(m[1]) + str = conv_str(m[2]) + printf(" x2(\"%s\", \"%s\", %s, %s);\n", reg, str, m[3], m[4]) + elsif (m = RI2.match(line)) + reg = conv_reg(m[1]) + str = conv_str(m[2]) + printf(" x2(\"%s\", \"%s\", %s, %s);\n", reg, str, m[3], m[4]) + elsif (m = RX3.match(line)) + reg = conv_reg(m[1]) + str = conv_str(m[2]) + printf(" x3(\"%s\", \"%s\", %s, %s, %s);\n", reg, str, m[3], m[4], m[5]) + elsif (m = RN.match(line)) + reg = conv_reg(m[1]) + str = conv_str(m[2]) + printf(" n(\"%s\", \"%s\");\n", reg, str) + else + + end +end + +print(<<'EOS') + fprintf(stdout, "\nRESULT SUCC: %d, FAIL: %d\n", nsucc, nfail); + +#ifndef POSIX_TEST + regex_region_free(region, 1); + regex_end(); +#endif + + return 0; +} +EOS + +# END OF SCRIPT diff --git a/ext/mbstring/oniguruma/win32/Makefile b/ext/mbstring/oniguruma/win32/Makefile new file mode 100644 index 0000000000..bb20474e8f --- /dev/null +++ b/ext/mbstring/oniguruma/win32/Makefile @@ -0,0 +1,131 @@ +# Oniguruma Makefile for Win32 + +product_name = oniguruma + +CPPFLAGS = +CFLAGS = -O2 -nologo +LDFLAGS = +LOADLIBES = +ARLIB = lib +ARLIB_FLAGS = -nologo +ARDLL = cl +ARDLL_FLAGS = -nologo -LD $(LINKFLAGS) -dll +LINKFLAGS = -link -incremental:no -pdb:none + +INSTALL = install -c +CP = copy +CC = cl +DEFS = -DHAVE_CONFIG_H -DNOT_RUBY -DEXPORT +RUBYDIR = .. 
+ +subdirs = + +libbase = onig +libname = $(libbase)_s.lib +dllname = $(libbase).dll +dlllib = $(libbase).lib + +onigheaders = oniguruma.h regint.h regparse.h +posixheaders = onigposix.h +headers = $(posixheaders) $(onigheaders) + +onigobjs = reggnu.obj regerror.obj regparse.obj regcomp.obj regexec.obj +posixobjs = regposix.obj regposerr.obj +libobjs = $(onigobjs) $(posixobjs) + +onigsources = regerror.c regparse.c regcomp.c regexec.c reggnu.c +posixsources = regposix.c regposerr.c +libsources = $(posixsources) $(onigsources) +rubysources = regex.c $(onigsources) + +patchfiles = re.c.168.patch re.c.180.patch +distfiles = README COPYING INSTALL-RUBY HISTORY \ + Makefile.in configure.in config.h.in configure \ + $(headers) $(libsources) regex.c $(patchfiles) \ + test.rb testconv.rb +testc = testc +testp = testp + +makeargs = $(MFLAGS) CPPFLAGS='$(CPPFLAGS)' CFLAGS='$(CFLAGS)' CC='$(CC)' + +.SUFFIXES: +.SUFFIXES: .obj .c .h .ps .dvi .info .texinfo + +.c.obj: + $(CC) $(CFLAGS) $(CPPFLAGS) $(DEFS) /I. 
/c $< + +# targets +default: all + +all: $(libname) $(dllname) + +$(libname): $(libobjs) + $(ARLIB) $(ARLIB_FLAGS) -out:$@ $(libobjs) + +$(dllname): $(libobjs) + $(ARDLL) $(libobjs) -Fe$@ $(ARDLL_FLAGS) + +regparse.obj: regparse.c $(onigheaders) config.h +regcomp.obj: regcomp.c $(onigheaders) config.h +regexec.obj: regexec.c regint.h oniguruma.h config.h +reggnu.obj: reggnu.c regint.h oniguruma.h config.h +regerror.obj: regerror.c regint.h oniguruma.h config.h +regposix.obj: regposix.c $(posixheaders) oniguruma.h config.h +regposerr.obj: regposerr.c $(posixheaders) config.h + +# Ruby test +rtest: + $(RUBYDIR)\win32\ruby -w -Ke test.rb + +# C library test +ctest: $(testc) + .\$(testc) + +# POSIX C library test +ptest: $(testp) + .\$(testp) + +$(testc): $(testc).c $(libname) + $(CC) -nologo -o $(testc) $(testc).c $(libname) + +$(testp): $(testc).c $(dlllib) + $(CC) -nologo -DPOSIX_TEST -DIMPORT -o $(testp) $(testc).c $(dlllib) + +clean: + del *.obj *.lib *.exp *.dll $(testp).exe $(testc).exe $(testc).obj + + +16: cpruby + patch -d $(RUBYDIR) -p0 < re.c.168.patch + +18: cpruby + patch -d $(RUBYDIR) -p0 < re.c.180.patch + +# backup file suffix +SORIG = ruby_orig + +cpruby: + $(CP) $(RUBYDIR)\regex.c $(RUBYDIR)\regex.c.$(SORIG) + $(CP) $(RUBYDIR)\regex.h $(RUBYDIR)\regex.h.$(SORIG) + $(CP) $(RUBYDIR)\re.c $(RUBYDIR)\re.c.$(SORIG) +# $(rubysources) + $(CP) regex.c $(RUBYDIR) + $(CP) regerror.c $(RUBYDIR) + $(CP) regparse.c $(RUBYDIR) + $(CP) regcomp.c $(RUBYDIR) + $(CP) regexec.c $(RUBYDIR) + $(CP) reggnu.c $(RUBYDIR) +# $(onigheaders) + $(CP) oniguruma.h $(RUBYDIR)\regex.h + $(CP) regint.h $(RUBYDIR) + $(CP) regparse.h $(RUBYDIR) + +rback: + $(CP) $(RUBYDIR)\regex.c.$(SORIG) $(RUBYDIR)\regex.c + $(CP) $(RUBYDIR)\regex.h.$(SORIG) $(RUBYDIR)\regex.h + $(CP) $(RUBYDIR)\re.c.$(SORIG) $(RUBYDIR)\re.c + +samples: + $(CC) $(CFLAGS) -I. -DIMPORT -o simple sample\simple.c $(dlllib) + $(CC) $(CFLAGS) -I. -DIMPORT -o posix sample\posix.c $(dlllib) + $(CC) $(CFLAGS) -I. 
-DIMPORT -o names sample\names.c $(dlllib) diff --git a/ext/mbstring/oniguruma/win32/config.h b/ext/mbstring/oniguruma/win32/config.h new file mode 100644 index 0000000000..bdbdaf25c1 --- /dev/null +++ b/ext/mbstring/oniguruma/win32/config.h @@ -0,0 +1,84 @@ +#define STDC_HEADERS 1 +#define HAVE_SYS_TYPES_H 1 +#define HAVE_SYS_STAT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_MEMORY_H 1 +#define HAVE_FLOAT_H 1 +#define HAVE_OFF_T 1 +#define SIZEOF_INT 4 +#define SIZEOF_SHORT 2 +#define SIZEOF_LONG 4 +#define SIZEOF_LONG_LONG 0 +#define SIZEOF___INT64 8 +#define SIZEOF_OFF_T 4 +#define SIZEOF_VOIDP 4 +#define SIZEOF_FLOAT 4 +#define SIZEOF_DOUBLE 8 +#define HAVE_PROTOTYPES 1 +#define TOKEN_PASTE(x,y) x##y +#define HAVE_STDARG_PROTOTYPES 1 +#ifndef NORETURN +#if _MSC_VER > 1100 +#define NORETURN(x) __declspec(noreturn) x +#else +#define NORETURN(x) x +#endif +#endif +#define HAVE_DECL_SYS_NERR 1 +#define STDC_HEADERS 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_LIMITS_H 1 +#define HAVE_FCNTL_H 1 +#define HAVE_SYS_UTIME_H 1 +#define HAVE_MEMORY_H 1 +#define uid_t int +#define gid_t int +#define HAVE_STRUCT_STAT_ST_RDEV 1 +#define HAVE_ST_RDEV 1 +#define GETGROUPS_T int +#define RETSIGTYPE void +#define HAVE_ALLOCA 1 +#define HAVE_DUP2 1 +#define HAVE_MEMCMP 1 +#define HAVE_MEMMOVE 1 +#define HAVE_MKDIR 1 +#define HAVE_STRCASECMP 1 +#define HAVE_STRNCASECMP 1 +#define HAVE_STRERROR 1 +#define HAVE_STRFTIME 1 +#define HAVE_STRCHR 1 +#define HAVE_STRSTR 1 +#define HAVE_STRTOD 1 +#define HAVE_STRTOL 1 +#define HAVE_STRTOUL 1 +#define HAVE_FLOCK 1 +#define HAVE_VSNPRINTF 1 +#define HAVE_FINITE 1 +#define HAVE_FMOD 1 +#define HAVE_FREXP 1 +#define HAVE_HYPOT 1 +#define HAVE_MODF 1 +#define HAVE_WAITPID 1 +#define HAVE_CHSIZE 1 +#define HAVE_TIMES 1 +#define HAVE__SETJMP 1 +#define HAVE_TELLDIR 1 +#define HAVE_SEEKDIR 1 +#define HAVE_MKTIME 1 +#define HAVE_COSH 1 +#define HAVE_SINH 1 +#define HAVE_TANH 1 +#define HAVE_EXECVE 1 
+#define HAVE_TZNAME 1 +#define HAVE_DAYLIGHT 1 +#define SETPGRP_VOID 1 +#define inline __inline +#define NEED_IO_SEEK_BETWEEN_RW 1 +#define RSHIFT(x,y) ((x)>>(int)y) +#define FILE_COUNT _cnt +#define FILE_READPTR _ptr +#define DEFAULT_KCODE KCODE_NONE +#define DLEXT ".so" +#define DLEXT2 ".dll" diff --git a/ext/mbstring/oniguruma/win32/testc.c b/ext/mbstring/oniguruma/win32/testc.c new file mode 100644 index 0000000000..8ec392cd8c --- /dev/null +++ b/ext/mbstring/oniguruma/win32/testc.c @@ -0,0 +1,804 @@ +/* + * This program was generated by testconv.rb. + */ +#include<stdio.h> + +#ifdef POSIX_TEST +#include "onigposix.h" +#else +#include "oniguruma.h" +#endif + +static int nsucc = 0; +static int nfail = 0; + +#ifndef POSIX_TEST +static RegRegion* region; +#endif + +static void xx(char* pattern, char* str, int from, int to, int mem, int not) +{ + int r; + +#ifdef POSIX_TEST + regex_t reg; + char buf[200]; + regmatch_t pmatch[20]; + + r = regcomp(&reg, pattern, REG_EXTENDED | REG_NEWLINE); + if (r) { + regerror(r, &reg, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\n", buf); + exit(-1); + } + + r = regexec(&reg, str, reg.re_nsub + 1, pmatch, 0); + if (r != 0 && r != REG_NOMATCH) { + regerror(r, &reg, buf, sizeof(buf)); + fprintf(stderr, "ERROR: %s\n", buf); + exit(-1); + } + + if (r == REG_NOMATCH) { + if (not) { + fprintf(stdout, "OK(N): /%s/ '%s'\n", pattern, str); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s'\n", pattern, str); + nfail++; + } + } + else { + if (not) { + fprintf(stdout, "FAIL(N): /%s/ '%s'\n", pattern, str); + nfail++; + } + else { + if (pmatch[mem].rm_so == from && pmatch[mem].rm_eo == to) { + fprintf(stdout, "OK: /%s/ '%s'\n", pattern, str); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d\n", pattern, str, + from, to, pmatch[mem].rm_so, pmatch[mem].rm_eo); + nfail++; + } + } + } + regfree(&reg); + +#else + regex_t* reg; + RegErrorInfo einfo; + + r = regex_new(&reg, (UChar* )pattern, (UChar* )(pattern + strlen(pattern)), +
REG_OPTION_DEFAULT, REGCODE_SJIS, REG_SYNTAX_DEFAULT, &einfo); + if (r) { + char s[REG_MAX_ERROR_MESSAGE_LEN]; + regex_error_code_to_str(s, r, &einfo); + fprintf(stderr, "ERROR: %s\n", s); + exit(-1); + } + + r = regex_search(reg, (UChar* )str, (UChar* )(str + strlen(str)), + (UChar* )str, (UChar* )(str + strlen(str)), + region, REG_OPTION_NONE); + if (r < REG_MISMATCH) { + char s[REG_MAX_ERROR_MESSAGE_LEN]; + regex_error_code_to_str(s, r); + fprintf(stderr, "ERROR: %s\n", s); + exit(-1); + } + + if (r == REG_MISMATCH) { + if (not) { + fprintf(stdout, "OK(N): /%s/ '%s'\n", pattern, str); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s'\n", pattern, str); + nfail++; + } + } + else { + if (not) { + fprintf(stdout, "FAIL(N): /%s/ '%s'\n", pattern, str); + nfail++; + } + else { + if (region->beg[mem] == from && region->end[mem] == to) { + fprintf(stdout, "OK: /%s/ '%s'\n", pattern, str); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d\n", pattern, str, + from, to, region->beg[mem], region->end[mem]); + nfail++; + } + } + } + regex_free(reg); +#endif +} + +static void x2(char* pattern, char* str, int from, int to) +{ + xx(pattern, str, from, to, 0, 0); +} + +static void x3(char* pattern, char* str, int from, int to, int mem) +{ + xx(pattern, str, from, to, mem, 0); +} + +static void n(char* pattern, char* str) +{ + xx(pattern, str, 0, 0, 0, 1); +} + +extern int main(int argc, char* argv[]) +{ +#ifdef POSIX_TEST + reg_set_encoding(REG_ENCODING_SJIS); +#else + region = regex_region_new(); +#endif + + x2("", "", 0, 0); + x2("^", "", 0, 0); + x2("$", "", 0, 0); + x2("\\G", "", 0, 0); + x2("\\A", "", 0, 0); + x2("\\Z", "", 0, 0); + x2("\\z", "", 0, 0); + x2("^$", "", 0, 0); + x2("\\ca", "\001", 0, 1); + x2("\\C-b", "\002", 0, 1); + x2("\\M-Z", "\xDA", 0, 1); + x2("", "a", 0, 0); + x2("a", "a", 0, 1); + x2("aa", "aa", 0, 2); + x2("aaa", "aaa", 0, 3); + x2("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, 35); + 
x2("ab", "ab", 0, 2); + x2("b", "ab", 1, 2); + x2("bc", "abc", 1, 3); + x2("\\17", "\017", 0, 1); + x2("\\x1f", "\x1f", 0, 1); + x2("\\xFE", "\xfe", 0, 1); + x2("a(?#....\\\\JJJJ)b", "ab", 0, 2); + x2(".", "a", 0, 1); + n(".", ""); + x2("..", "ab", 0, 2); + x2("\\w", "e", 0, 1); + n("\\W", "e"); + x2("\\s", " ", 0, 1); + x2("\\S", "b", 0, 1); + x2("\\d", "4", 0, 1); + n("\\D", "4"); + x2("\\b", "z ", 0, 0); + x2("\\b", " z", 1, 1); + x2("\\B", "zz ", 1, 1); + x2("\\B", "z ", 2, 2); + x2("\\B", " z", 0, 0); + x2("[ab]", "b", 0, 1); + n("[ab]", "c"); + x2("[a-z]", "t", 0, 1); + n("[^a]", "a"); + x2("[^a]", "\n", 0, 1); + x2("[]]", "]", 0, 1); + n("[^]]", "]"); + x2("[b-]", "b", 0, 1); + x2("[b-]", "-", 0, 1); + x2("[\\w]", "z", 0, 1); + n("[\\w]", " "); + x2("[\\d]", "5", 0, 1); + n("[\\d]", "e"); + x2("[\\D]", "t", 0, 1); + n("[\\D]", "3"); + x2("[\\s]", " ", 0, 1); + n("[\\s]", "a"); + x2("[\\S]", "b", 0, 1); + n("[\\S]", " "); + x2("[\\w\\d]", "2", 0, 1); + n("[\\w\\d]", " "); + x2("[[:upper:]]", "B", 0, 1); + x2("[*[:xdigit:]+]", "+", 0, 1); + x2("[*[:xdigit:]+]", "GHIKK-9+*", 6, 7); + x2("[*[:xdigit:]+]", "-@^+", 3, 4); + n("[[:upper]]", "A"); + x2("[[:upper]]", ":", 0, 1); + x2("[\\044-\\047]", "\046", 0, 1); + x2("[\\x5a-\\x5c]", "\x5b", 0, 1); + x2("[\\x6A-\\x6D]", "\x6c", 0, 1); + n("[\\x6A-\\x6D]", "\x6E"); + n("^[0-9A-F]+ 0+ UNDEF ", "75F 00000000 SECT14A notype () External | _rb_apply"); + x2("[\\[]", "[", 0, 1); + x2("[\\]]", "]", 0, 1); + x2("[&]", "&", 0, 1); + x2("[[ab]]", "b", 0, 1); + x2("[[ab]c]", "c", 0, 1); + n("[[^a]]", "a"); + n("[^[a]]", "a"); + x2("[[ab]&&bc]", "b", 0, 1); + n("[[ab]&&bc]", "a"); + n("[[ab]&&bc]", "c"); + x2("[a-z&&b-y&&c-x]", "w", 0, 1); + n("[^a-z&&b-y&&c-x]", "w"); + x2("[[^a&&a]&&a-z]", "b", 0, 1); + n("[[^a&&a]&&a-z]", "a"); + x2("[[^a-z&&bcdef]&&[^c-g]]", "h", 0, 1); + n("[[^a-z&&bcdef]&&[^c-g]]", "c"); + x2("[^[^abc]&&[^cde]]", "c", 0, 1); + x2("[^[^abc]&&[^cde]]", "e", 0, 1); + n("[^[^abc]&&[^cde]]", "f"); + 
x2("[a-&&-a]", "-", 0, 1); + n("[a-&&-a]", "&"); + n("\\wabc", " abc"); + x2("a\\Wbc", "a bc", 0, 4); + x2("a.b.c", "aabbc", 0, 5); + x2(".\\wb\\W..c", "abb bcc", 0, 7); + x2("\\s\\wzzz", " zzzz", 0, 5); + x2("aa.b", "aabb", 0, 4); + n(".a", "ab"); + x2(".a", "aa", 0, 2); + x2("^a", "a", 0, 1); + x2("^a$", "a", 0, 1); + x2("^\\w$", "a", 0, 1); + n("^\\w$", " "); + x2("^\\wab$", "zab", 0, 3); + x2("^\\wabcdef$", "zabcdef", 0, 7); + x2("^\\w...def$", "zabcdef", 0, 7); + x2("\\w\\w\\s\\Waaa\\d", "aa aaa4", 0, 8); + x2("\\A\\Z", "", 0, 0); + x2("\\Axyz", "xyz", 0, 3); + x2("xyz\\Z", "xyz", 0, 3); + x2("xyz\\z", "xyz", 0, 3); + x2("\\Gaz", "az", 0, 2); + n("\\Gz", "bza"); + n("az\\G", "az"); + n("az\\A", "az"); + n("a\\Az", "az"); + x2("\\^\\$", "^$", 0, 2); + x2("\\w", "_", 0, 1); + n("\\W", "_"); + x2("(?=z)z", "z", 0, 1); + n("(?=z).", "a"); + x2("(?!z)a", "a", 0, 1); + n("(?!z)a", "z"); + x2("(?i:a)", "a", 0, 1); + x2("(?i:a)", "A", 0, 1); + x2("(?i:A)", "a", 0, 1); + n("(?i:A)", "b"); + x2("(?i:[A-Z])", "a", 0, 1); + x2("(?i:[f-m])", "H", 0, 1); + x2("(?i:[f-m])", "h", 0, 1); + n("(?i:[f-m])", "e"); + n("(?i:[b-C])", "A"); + x2("(?i:[a-C])", "B", 0, 1); + n("(?i:[c-X])", "["); + n("(?i:[!-k])", "Z"); + x2("(?i:[!-k])", "7", 0, 1); + n("(?i:[T-}])", "b"); + x2("(?i:[T-}])", "{", 0, 1); + x2("(?i:\\?a)", "?A", 0, 2); + x2("(?i:\\*A)", "*a", 0, 2); + n(".", "\n"); + x2("(?m:.)", "\n", 0, 1); + x2("(?m:a.)", "a\n", 0, 2); + x2("(?m:.b)", "a\nb", 1, 3); + x2("a?", "", 0, 0); + x2("a?", "b", 0, 0); + x2("a?", "a", 0, 1); + x2("a*", "", 0, 0); + x2("a*", "a", 0, 1); + x2("a*", "aaa", 0, 3); + x2("a*", "baaaa", 0, 0); + n("a+", ""); + x2("a+", "a", 0, 1); + x2("a+", "aaaa", 0, 4); + x2("a+", "aabbb", 0, 2); + x2("a+", "baaaa", 1, 5); + x2(".?", "", 0, 0); + x2(".?", "f", 0, 1); + x2(".?", "\n", 0, 0); + x2(".*", "", 0, 0); + x2(".*", "abcde", 0, 5); + x2(".+", "z", 0, 1); + x2(".+", "zdswer\n", 0, 6); + x2("a|b", "a", 0, 1); + x2("a|b", "b", 0, 1); + x2("|a", "a", 0, 0); + 
x2("(|a)", "a", 0, 0); + x2("ab|bc", "ab", 0, 2); + x2("ab|bc", "bc", 0, 2); + x2("z(?:ab|bc)", "zbc", 0, 3); + x2("a(?:ab|bc)c", "aabc", 0, 4); + x2("ab|(?:ac|az)", "az", 0, 2); + x2("a|b|c", "dc", 1, 2); + x2("a|b|cd|efg|h|ijk|lmn|o|pq|rstuvwx|yz", "pqr", 0, 2); + n("a|b|cd|efg|h|ijk|lmn|o|pq|rstuvwx|yz", "mn"); + x2("a|^z", "ba", 1, 2); + x2("a|^z", "za", 0, 1); + x2("a|\\Gz", "bza", 2, 3); + x2("a|\\Gz", "za", 0, 1); + x2("a|\\Az", "bza", 2, 3); + x2("a|\\Az", "za", 0, 1); + x2("a|b\\Z", "ba", 1, 2); + x2("a|b\\Z", "b", 0, 1); + x2("a|b\\z", "ba", 1, 2); + x2("a|b\\z", "b", 0, 1); + x2("\\w|\\s", " ", 0, 1); + n("\\w|\\w", " "); + x2("\\w|%", "%", 0, 1); + x2("\\w|[&$]", "&", 0, 1); + x2("[b-d]|[^e-z]", "a", 0, 1); + x2("(?:a|[c-f])|bz", "dz", 0, 1); + x2("(?:a|[c-f])|bz", "bz", 0, 2); + x2("abc|(?=zz)..f", "zzf", 0, 3); + x2("abc|(?!zz)..f", "abf", 0, 3); + x2("(?=za)..a|(?=zz)..a", "zza", 0, 3); + n("(?>a|abd)c", "abdc"); + x2("(?>abd|a)c", "abdc", 0, 4); + x2("a?|b", "a", 0, 1); + x2("a?|b", "b", 0, 0); + x2("a?|b", "", 0, 0); + x2("a*|b", "aa", 0, 2); + x2("a*|b*", "ba", 0, 0); + x2("a*|b*", "ab", 0, 1); + x2("a+|b*", "", 0, 0); + x2("a+|b*", "bbb", 0, 3); + x2("a+|b*", "abbb", 0, 1); + n("a+|b+", ""); + x2("(a|b)?", "b", 0, 1); + x2("(a|b)*", "ba", 0, 2); + x2("(a|b)+", "bab", 0, 3); + x2("(ab|ca)+", "caabbc", 0, 4); + x2("(ab|ca)+", "aabca", 1, 5); + x2("(ab|ca)+", "abzca", 0, 2); + x2("(a|bab)+", "ababa", 0, 5); + x2("(a|bab)+", "ba", 1, 2); + x2("(a|bab)+", "baaaba", 1, 4); + x2("(?:a|b)(?:a|b)", "ab", 0, 2); + x2("(?:a*|b*)(?:a*|b*)", "aaabbb", 0, 3); + x2("(?:a*|b*)(?:a+|b+)", "aaabbb", 0, 6); + x2("(?:a+|b+){2}", "aaabbb", 0, 6); + x2("h{0,}", "hhhh", 0, 4); + x2("(?:a+|b+){1,2}", "aaabbb", 0, 6); + x2("(?:a+|\\Ab*)cc", "cc", 0, 2); + n("(?:a+|\\Ab*)cc", "abcc"); + x2("(?:^a+|b+)*c", "aabbbabc", 6, 8); + x2("(?:^a+|b+)*c", "aabbbbc", 0, 7); + x2("a|(?i)c", "C", 0, 1); + x2("(?i)c|a", "C", 0, 1); + x2("(?i)c|a", "A", 0, 1); + x2("(?i:c)|a", "C", 0, 
1); + n("(?i:c)|a", "A"); + x2("[abc]?", "abc", 0, 1); + x2("[abc]*", "abc", 0, 3); + x2("[^abc]*", "abc", 0, 0); + n("[^abc]+", "abc"); + x2("a?\?", "aaa", 0, 0); + x2("ba?\?b", "bab", 0, 3); + x2("a*?", "aaa", 0, 0); + x2("ba*?", "baa", 0, 1); + x2("ba*?b", "baab", 0, 4); + x2("a+?", "aaa", 0, 1); + x2("ba+?", "baa", 0, 2); + x2("ba+?b", "baab", 0, 4); + x2("(?:a?)?\?", "a", 0, 0); + x2("(?:a?\?)?", "a", 0, 0); + x2("(?:a?)+?", "aaa", 0, 1); + x2("(?:a+)?\?", "aaa", 0, 0); + x2("(?:a+)?\?b", "aaab", 0, 4); + x2("(?:ab)?{2}", "", 0, 0); + x2("(?:ab)?{2}", "ababa", 0, 4); + x2("(?:ab)*{0}", "ababa", 0, 0); + x2("(?:ab){3,}", "abababab", 0, 8); + n("(?:ab){3,}", "abab"); + x2("(?:ab){2,4}", "ababab", 0, 6); + x2("(?:ab){2,4}", "ababababab", 0, 8); + x2("(?:ab){2,4}?", "ababababab", 0, 4); + x2("(?:ab){,}", "ab{,}", 0, 5); + x2("(?:abc)+?{2}", "abcabcabc", 0, 6); + x2("(?:X*)(?i:xa)", "XXXa", 0, 4); + x2("(d+)([^abc]z)", "dddz", 0, 4); + x2("([^abc]*)([^abc]z)", "dddz", 0, 4); + x2("(\\w+)(\\wz)", "dddz", 0, 4); + x3("(a)", "a", 0, 1, 1); + x3("(ab)", "ab", 0, 2, 1); + x2("((ab))", "ab", 0, 2); + x3("((ab))", "ab", 0, 2, 1); + x3("((ab))", "ab", 0, 2, 2); + x3("((((((((((((((((((((ab))))))))))))))))))))", "ab", 0, 2, 20); + x3("(ab)(cd)", "abcd", 0, 2, 1); + x3("(ab)(cd)", "abcd", 2, 4, 2); + x3("()(a)bc(def)ghijk", "abcdefghijk", 3, 6, 3); + x3("(()(a)bc(def)ghijk)", "abcdefghijk", 3, 6, 4); + x2("(^a)", "a", 0, 1); + x3("(a)|(a)", "ba", 1, 2, 1); + x3("(^a)|(a)", "ba", 1, 2, 2); + x3("(a?)", "aaa", 0, 1, 1); + x3("(a*)", "aaa", 0, 3, 1); + x3("(a*)", "", 0, 0, 1); + x3("(a+)", "aaaaaaa", 0, 7, 1); + x3("(a+|b*)", "bbbaa", 0, 3, 1); + x3("(a+|b?)", "bbbaa", 0, 1, 1); + x3("(abc)?", "abc", 0, 3, 1); + x3("(abc)*", "abc", 0, 3, 1); + x3("(abc)+", "abc", 0, 3, 1); + x3("(xyz|abc)+", "abc", 0, 3, 1); + x3("([xyz][abc]|abc)+", "abc", 0, 3, 1); + x3("((?i:abc))", "AbC", 0, 3, 1); + x2("(abc)(?i:\\1)", "abcABC", 0, 6); + x3("((?m:a.c))", "a\nc", 0, 3, 1); + x3("((?=az)a)", 
"azb", 0, 1, 1); + x3("abc|(.abd)", "zabd", 0, 4, 1); + x2("(?:abc)|(ABC)", "abc", 0, 3); + x3("(?i:(abc))|(zzz)", "ABC", 0, 3, 1); + x3("a*(.)", "aaaaz", 4, 5, 1); + x3("a*?(.)", "aaaaz", 0, 1, 1); + x3("a*?(c)", "aaaac", 4, 5, 1); + x3("[bcd]a*(.)", "caaaaz", 5, 6, 1); + x3("(\\Abb)cc", "bbcc", 0, 2, 1); + n("(\\Abb)cc", "zbbcc"); + x3("(^bb)cc", "bbcc", 0, 2, 1); + n("(^bb)cc", "zbbcc"); + x3("cc(bb$)", "ccbb", 2, 4, 1); + n("cc(bb$)", "ccbbb"); + n("(\\1)", ""); + n("\\1(a)", "aa"); + n("(a(b)\\1)\\2+", "ababb"); + n("(?:(?:\\1|z)(a))+$", "zaa"); + x2("(?:(?:\\1|z)(a))+$", "zaaa", 0, 4); + x2("(a)(?=\\1)", "aa", 0, 1); + n("(a)$|\\1", "az"); + x2("(a)\\1", "aa", 0, 2); + n("(a)\\1", "ab"); + x2("(a?)\\1", "aa", 0, 2); + x2("(a?\?)\\1", "aa", 0, 0); + x2("(a*)\\1", "aaaaa", 0, 4); + x3("(a*)\\1", "aaaaa", 0, 2, 1); + x2("a(b*)\\1", "abbbb", 0, 5); + x2("a(b*)\\1", "ab", 0, 1); + x2("(a*)(b*)\\1\\2", "aaabbaaabb", 0, 10); + x2("(a*)(b*)\\2", "aaabbbb", 0, 7); + x2("(((((((a*)b))))))c\\7", "aaabcaaa", 0, 8); + x3("(((((((a*)b))))))c\\7", "aaabcaaa", 0, 3, 7); + x2("(a)(b)(c)\\2\\1\\3", "abcbac", 0, 6); + x2("([a-d])\\1", "cc", 0, 2); + x2("(\\w\\d\\s)\\1", "f5 f5 ", 0, 6); + n("(\\w\\d\\s)\\1", "f5 f5"); + x2("(who|[a-c]{3})\\1", "whowho", 0, 6); + x2("...(who|[a-c]{3})\\1", "abcwhowho", 0, 9); + x2("(who|[a-c]{3})\\1", "cbccbc", 0, 6); + x2("(^a)\\1", "aa", 0, 2); + n("(^a)\\1", "baa"); + n("(a$)\\1", "aa"); + n("(ab\\Z)\\1", "ab"); + x2("(a*\\Z)\\1", "a", 1, 1); + x2(".(a*\\Z)\\1", "ba", 1, 2); + x3("(.(abc)\\2)", "zabcabc", 0, 7, 1); + x3("(.(..\\d.)\\2)", "z12341234", 0, 9, 1); + x2("((?i:az))\\1", "AzAz", 0, 4); + n("((?i:az))\\1", "Azaz"); + x2("(?<=a)b", "ab", 1, 2); + n("(?<=a)b", "bb"); + x2("(?<=a|b)b", "bb", 1, 2); + x2("(?<=a|bc)b", "bcb", 2, 3); + x2("(?<=a|bc)b", "ab", 1, 2); + x2("(?<=a|bc||defghij|klmnopq|r)z", "rz", 1, 2); + x2("(?<!a)b", "cb", 1, 2); + n("(?<!a)b", "ab"); + x2("(?<!a|bc)b", "bbb", 0, 1); + n("(?<!a|bc)z", "bcz"); + 
x2("(?<name1>a)", "a", 0, 1); + x2("(?<name-2>ab)\\1", "abab", 0, 4); + x2("(?<name-3>.zv.)\\k<name-3>", "azvbazvb", 0, 8); + x2("(?<=\\g<ab>)|-\\zEND (?<ab>XyZ)", "XyZ", 3, 3); + x2("(?<n>|a\\g<n>)+", "", 0, 0); + x2("(?<n>|\\(\\g<n>\\))+$", "()(())", 0, 6); + x3("\\g<n>(?<n>.){0}", "X", 0, 1, 1); + x2("\\g<n>(abc|df(?<n>.YZ){2,8}){0}", "XYZ", 0, 3); + x2("\\A(?<n>(a\\g<n>)|)\\z", "aaaa", 0, 4); + x2("(?<n>|\\g<m>\\g<n>)\\z|\\zEND (?<m>a|(b)\\g<m>)", "bbbbabba", 0, 8); + x2("(?<@:name[1240]>\\w+\\sx)a+\\k<@:name[1240]>", " fg xaaaaaaaafg x", 2, 18); + x3("(z)()()(?<9>a)\\4", "zaa", 1, 2, 4); + x2("(.)(((?<*>a)))\\k<*>", "zaa", 0, 3); + x2("((?<name1>\\d)|(?<name2>\\w))(\\k<name1>|\\k<name2>)", "ff", 0, 2); + x2("(?:(?<x>)|(?<x>efg))\\k<x>", "", 0, 0); + x2("(?:(?<@x>abc)|(?<@x>efg))\\k<@x>", "abcefgefg", 3, 9); + n("(?:(?<@x>abc)|(?<@x>efg))\\k<@x>", "abcefg"); + x2("(?:(?<n1>.)|(?<n1>..)|(?<n1>...)|(?<n1>....)|(?<n1>.....)|(?<n1>......)|(?<n1>.......)|(?<n1>........)|(?<n1>.........)|(?<n1>..........)|(?<n1>...........)|(?<n1>............)|(?<n1>.............)|(?<n1>..............))\\k<n1>$", "a-pyumpyum", 2, 10); + x3("(?:(?<n1>.)|(?<n1>..)|(?<n1>...)|(?<n1>....)|(?<n1>.....)|(?<n1>......)|(?<n1>.......)|(?<n1>........)|(?<n1>.........)|(?<n1>..........)|(?<n1>...........)|(?<n1>............)|(?<n1>.............)|(?<n1>..............))\\k<n1>$", "xxxxabcdefghijklmnabcdefghijklmn", 4, 18, 14); + x3("(?<name1>)(?<name2>)(?<name3>)(?<name4>)(?<name5>)(?<name6>)(?<name7>)(?<name8>)(?<name9>)(?<name10>)(?<name11>)(?<name12>)(?<name13>)(?<name14>)(?<name15>)(?<name16>aaa)(?<name17>)$", "aaa", 0, 3, 16); + x2("(?<foo>a|\\(\\g<foo>\\))", "a", 0, 1); + x2("(?<foo>a|\\(\\g<foo>\\))", "((((((a))))))", 0, 13); + x3("(?<foo>a|\\(\\g<foo>\\))", "((((((((a))))))))", 0, 17, 1); + x2("\\g<bar>|\\zEND(?<bar>.*abc$)", "abcxxxabc", 0, 9); + x2("\\g<1>|\\zEND(.a.)", "bac", 0, 3); + x3("\\g<2>\\g<1>|\\zEND(.a.)(?<?>.b.)", "xbxyay", 3, 6, 1); + x2("\\A(?:\\g<pon>|\\g<pan>|\\zEND 
(?<pan>a|c\\g<pon>c)(?<pon>b|d\\g<pan>d))$", "cdcbcdc", 0, 7); + x2("\\A(?<n>|a\\g<m>)\\z|\\zEND (?<m>\\g<n>)", "aaaa", 0, 4); + x2("(?<n>(a|b\\g<n>c){3,5})", "baaaaca", 1, 5); + x2("(?<n>(a|b\\g<n>c){3,5})", "baaaacaaaaa", 0, 10); + x2("", "あ", 0, 0); + x2("あ", "あ", 0, 2); + n("い", "あ"); + x2("うう", "うう", 0, 4); + x2("あいう", "あいう", 0, 6); + x2("こここここここここここここここここここここここここここここここここここ", "こここここここここここここここここここここここここここここここここここ", 0, 70); + x2("あ", "いあ", 2, 4); + x2("いう", "あいう", 2, 6); + x2("\\xca\\xb8", "\xca\xb8", 0, 2); + x2(".", "あ", 0, 2); + x2("..", "かき", 0, 4); + x2("\\w", "お", 0, 2); + n("\\W", "あ"); + x2("\\S", "そ", 0, 2); + x2("\\S", "漢", 0, 2); + x2("\\b", "気 ", 0, 0); + x2("\\b", " ほ", 1, 1); + x2("\\B", "せそ ", 2, 2); + x2("\\B", "う ", 3, 3); + x2("\\B", " い", 0, 0); + x2("[たち]", "ち", 0, 2); + n("[なに]", "ぬ"); + x2("[う-お]", "え", 0, 2); + n("[^け]", "け"); + x2("[\\w]", "ね", 0, 2); + n("[\\d]", "ふ"); + x2("[\\D]", "は", 0, 2); + n("[\\s]", "く"); + x2("[\\S]", "へ", 0, 2); + x2("[\\w\\d]", "よ", 0, 2); + x2("[\\w\\d]", " よ", 3, 5); + n("\\w鬼車", " 鬼車"); + x2("鬼\\W車", "鬼 車", 0, 5); + x2("あ.い.う", "ああいいう", 0, 10); + x2(".\\wう\\W..ぞ", "えうう うぞぞ", 0, 13); + x2("\\s\\wこここ", " ここここ", 0, 9); + x2("ああ.け", "ああけけ", 0, 8); + n(".い", "いえ"); + x2(".お", "おお", 0, 4); + x2("^あ", "あ", 0, 2); + x2("^む$", "む", 0, 2); + x2("^\\w$", "に", 0, 2); + x2("^\\wかきくけこ$", "zかきくけこ", 0, 11); + x2("^\\w...うえお$", "zあいううえお", 0, 13); + x2("\\w\\w\\s\\Wおおお\\d", "aお おおお4", 0, 12); + x2("\\Aたちつ", "たちつ", 0, 6); + x2("むめも\\Z", "むめも", 0, 6); + x2("かきく\\z", "かきく", 0, 6); + x2("かきく\\Z", "かきく\n", 0, 6); + x2("\\Gぽぴ", "ぽぴ", 0, 4); + n("\\Gえ", "うえお"); + n("とて\\G", "とて"); + n("まみ\\A", "まみ"); + n("ま\\Aみ", "まみ"); + x2("(?=せ)せ", "せ", 0, 2); + n("(?=う).", "い"); + x2("(?!う)か", "か", 0, 2); + n("(?!と)あ", "と"); + x2("(?i:あ)", "あ", 0, 2); + x2("(?i:ぶべ)", "ぶべ", 0, 4); + n("(?i:い)", "う"); + x2("(?m:よ.)", "よ\n", 0, 3); + x2("(?m:.め)", "ま\nめ", 2, 5); + x2("あ?", "", 0, 0); + x2("変?", "化", 0, 0); + x2("変?", "変", 0, 2); + x2("量*", "", 0, 
0); + x2("量*", "量", 0, 2); + x2("子*", "子子子", 0, 6); + x2("馬*", "鹿馬馬馬馬", 0, 0); + n("山+", ""); + x2("河+", "河", 0, 2); + x2("時+", "時時時時", 0, 8); + x2("え+", "ええううう", 0, 4); + x2("う+", "おうううう", 2, 10); + x2(".?", "た", 0, 2); + x2(".*", "ぱぴぷぺ", 0, 8); + x2(".+", "ろ", 0, 2); + x2(".+", "いうえか\n", 0, 8); + x2("あ|い", "あ", 0, 2); + x2("あ|い", "い", 0, 2); + x2("あい|いう", "あい", 0, 4); + x2("あい|いう", "いう", 0, 4); + x2("を(?:かき|きく)", "をかき", 0, 6); + x2("を(?:かき|きく)け", "をきくけ", 0, 8); + x2("あい|(?:あう|あを)", "あを", 0, 4); + x2("あ|い|う", "えう", 2, 4); + x2("あ|い|うえ|おかき|く|けこさ|しすせ|そ|たち|つてとなに|ぬね", "しすせ", 0, 6); + n("あ|い|うえ|おかき|く|けこさ|しすせ|そ|たち|つてとなに|ぬね", "すせ"); + x2("あ|^わ", "ぶあ", 2, 4); + x2("あ|^を", "をあ", 0, 2); + x2("鬼|\\G車", "け車鬼", 4, 6); + x2("鬼|\\G車", "車鬼", 0, 2); + x2("鬼|\\A車", "b車鬼", 3, 5); + x2("鬼|\\A車", "車", 0, 2); + x2("鬼|車\\Z", "車鬼", 2, 4); + x2("鬼|車\\Z", "車", 0, 2); + x2("鬼|車\\Z", "車\n", 0, 2); + x2("鬼|車\\z", "車鬼", 2, 4); + x2("鬼|車\\z", "車", 0, 2); + x2("\\w|\\s", "お", 0, 2); + x2("\\w|%", "%お", 0, 1); + x2("\\w|[&$]", "う&", 0, 2); + x2("[い-け]", "う", 0, 2); + x2("[い-け]|[^か-こ]", "あ", 0, 2); + x2("[い-け]|[^か-こ]", "か", 0, 2); + x2("(?:あ|[う-き])|いを", "うを", 0, 2); + x2("(?:あ|[う-き])|いを", "いを", 0, 4); + x2("あいう|(?=けけ)..ほ", "けけほ", 0, 6); + x2("あいう|(?!けけ)..ほ", "あいほ", 0, 6); + x2("(?=をあ)..あ|(?=をを)..あ", "ををあ", 0, 6); + x2("(?<=あ|いう)い", "いうい", 4, 6); + n("(?>あ|あいえ)う", "あいえう"); + x2("(?>あいえ|あ)う", "あいえう", 0, 8); + x2("あ?|い", "あ", 0, 2); + x2("あ?|い", "い", 0, 0); + x2("あ?|い", "", 0, 0); + x2("あ*|い", "ああ", 0, 4); + x2("あ*|い*", "いあ", 0, 0); + x2("あ*|い*", "あい", 0, 2); + x2("[aあ]*|い*", "aあいいい", 0, 3); + x2("あ+|い*", "", 0, 0); + x2("あ+|い*", "いいい", 0, 6); + x2("あ+|い*", "あいいい", 0, 2); + x2("あ+|い*", "aあいいい", 0, 0); + n("あ+|い+", ""); + x2("(あ|い)?", "い", 0, 2); + x2("(あ|い)*", "いあ", 0, 4); + x2("(あ|い)+", "いあい", 0, 6); + x2("(あい|うあ)+", "うああいうえ", 0, 8); + x2("(あい|うえ)+", "うああいうえ", 4, 12); + x2("(あい|うあ)+", "ああいうあ", 2, 10); + x2("(あい|うあ)+", "あいをうあ", 0, 4); + x2("(あい|うあ)+", "$$zzzzあいをうあ", 6, 10); + x2("(あ|いあい)+", "あいあいあ", 
0, 10); + x2("(あ|いあい)+", "いあ", 2, 4); + x2("(あ|いあい)+", "いあああいあ", 2, 8); + x2("(?:あ|い)(?:あ|い)", "あい", 0, 4); + x2("(?:あ*|い*)(?:あ*|い*)", "あああいいい", 0, 6); + x2("(?:あ*|い*)(?:あ+|い+)", "あああいいい", 0, 12); + x2("(?:あ+|い+){2}", "あああいいい", 0, 12); + x2("(?:あ+|い+){1,2}", "あああいいい", 0, 12); + x2("(?:あ+|\\Aい*)うう", "うう", 0, 4); + n("(?:あ+|\\Aい*)うう", "あいうう"); + x2("(?:^あ+|い+)*う", "ああいいいあいう", 12, 16); + x2("(?:^あ+|い+)*う", "ああいいいいう", 0, 14); + x2("う{0,}", "うううう", 0, 8); + x2("あ|(?i)c", "C", 0, 1); + x2("(?i)c|あ", "C", 0, 1); + x2("(?i:あ)|a", "a", 0, 1); + n("(?i:あ)|a", "A"); + x2("[あいう]?", "あいう", 0, 2); + x2("[あいう]*", "あいう", 0, 6); + x2("[^あいう]*", "あいう", 0, 0); + n("[^あいう]+", "あいう"); + x2("あ?\?", "あああ", 0, 0); + x2("いあ?\?い", "いあい", 0, 6); + x2("あ*?", "あああ", 0, 0); + x2("いあ*?", "いああ", 0, 2); + x2("いあ*?い", "いああい", 0, 8); + x2("あ+?", "あああ", 0, 2); + x2("いあ+?", "いああ", 0, 4); + x2("いあ+?い", "いああい", 0, 8); + x2("(?:天?)?\?", "天", 0, 0); + x2("(?:天?\?)?", "天", 0, 0); + x2("(?:夢?)+?", "夢夢夢", 0, 2); + x2("(?:風+)?\?", "風風風", 0, 0); + x2("(?:雪+)?\?霜", "雪雪雪霜", 0, 8); + x2("(?:あい)?{2}", "", 0, 0); + x2("(?:鬼車)?{2}", "鬼車鬼車鬼", 0, 8); + x2("(?:鬼車)*{0}", "鬼車鬼車鬼", 0, 0); + x2("(?:鬼車){3,}", "鬼車鬼車鬼車鬼車", 0, 16); + n("(?:鬼車){3,}", "鬼車鬼車"); + x2("(?:鬼車){2,4}", "鬼車鬼車鬼車", 0, 12); + x2("(?:鬼車){2,4}", "鬼車鬼車鬼車鬼車鬼車", 0, 16); + x2("(?:鬼車){2,4}?", "鬼車鬼車鬼車鬼車鬼車", 0, 8); + x2("(?:鬼車){,}", "鬼車{,}", 0, 7); + x2("(?:かきく)+?{2}", "かきくかきくかきく", 0, 12); + x3("(火)", "火", 0, 2, 1); + x3("(火水)", "火水", 0, 4, 1); + x2("((時間))", "時間", 0, 4); + x3("((風水))", "風水", 0, 4, 1); + x3("((昨日))", "昨日", 0, 4, 2); + x3("((((((((((((((((((((量子))))))))))))))))))))", "量子", 0, 4, 20); + x3("(あい)(うえ)", "あいうえ", 0, 4, 1); + x3("(あい)(うえ)", "あいうえ", 4, 8, 2); + x3("()(あ)いう(えおか)きくけこ", "あいうえおかきくけこ", 6, 12, 3); + x3("(()(あ)いう(えおか)きくけこ)", "あいうえおかきくけこ", 6, 12, 4); + x3(".*(フォ)ン・マ(ン()シュタ)イン", "フォン・マンシュタイン", 10, 18, 2); + x2("(^あ)", "あ", 0, 2); + x3("(あ)|(あ)", "いあ", 2, 4, 1); + x3("(^あ)|(あ)", "いあ", 2, 4, 2); + x3("(あ?)", "あああ", 0, 2, 1); + x3("(ま*)", "ままま", 0, 
6, 1); + x3("(と*)", "", 0, 0, 1); + x3("(る+)", "るるるるるるる", 0, 14, 1); + x3("(ふ+|へ*)", "ふふふへへ", 0, 6, 1); + x3("(あ+|い?)", "いいいああ", 0, 2, 1); + x3("(あいう)?", "あいう", 0, 6, 1); + x3("(あいう)*", "あいう", 0, 6, 1); + x3("(あいう)+", "あいう", 0, 6, 1); + x3("(さしす|あいう)+", "あいう", 0, 6, 1); + x3("([なにぬ][かきく]|かきく)+", "かきく", 0, 6, 1); + x3("((?i:あいう))", "あいう", 0, 6, 1); + x3("((?m:あ.う))", "あ\nう", 0, 5, 1); + x3("((?=あん)あ)", "あんい", 0, 2, 1); + x3("あいう|(.あいえ)", "んあいえ", 0, 8, 1); + x3("あ*(.)", "ああああん", 8, 10, 1); + x3("あ*?(.)", "ああああん", 0, 2, 1); + x3("あ*?(ん)", "ああああん", 8, 10, 1); + x3("[いうえ]あ*(.)", "えああああん", 10, 12, 1); + x3("(\\Aいい)うう", "いいうう", 0, 4, 1); + n("(\\Aいい)うう", "んいいうう"); + x3("(^いい)うう", "いいうう", 0, 4, 1); + n("(^いい)うう", "んいいうう"); + x3("ろろ(るる$)", "ろろるる", 4, 8, 1); + n("ろろ(るる$)", "ろろるるる"); + x2("(無)\\1", "無無", 0, 4); + n("(無)\\1", "無武"); + x2("(空?)\\1", "空空", 0, 4); + x2("(空?\?)\\1", "空空", 0, 0); + x2("(空*)\\1", "空空空空空", 0, 8); + x3("(空*)\\1", "空空空空空", 0, 4, 1); + x2("あ(い*)\\1", "あいいいい", 0, 10); + x2("あ(い*)\\1", "あい", 0, 2); + x2("(あ*)(い*)\\1\\2", "あああいいあああいい", 0, 20); + x2("(あ*)(い*)\\2", "あああいいいい", 0, 14); + x3("(あ*)(い*)\\2", "あああいいいい", 6, 10, 2); + x2("(((((((ぽ*)ぺ))))))ぴ\\7", "ぽぽぽぺぴぽぽぽ", 0, 16); + x3("(((((((ぽ*)ぺ))))))ぴ\\7", "ぽぽぽぺぴぽぽぽ", 0, 6, 7); + x2("(は)(ひ)(ふ)\\2\\1\\3", "はひふひはふ", 0, 12); + x2("([き-け])\\1", "くく", 0, 4); + x2("(\\w\\d\\s)\\1", "あ5 あ5 ", 0, 8); + n("(\\w\\d\\s)\\1", "あ5 あ5"); + x2("(誰?|[あ-う]{3})\\1", "誰?誰?", 0, 8); + x2("...(誰?|[あ-う]{3})\\1", "あaあ誰?誰?", 0, 13); + x2("(誰?|[あ-う]{3})\\1", "ういうういう", 0, 12); + x2("(^こ)\\1", "ここ", 0, 4); + n("(^む)\\1", "めむむ"); + n("(あ$)\\1", "ああ"); + n("(あい\\Z)\\1", "あい"); + x2("(あ*\\Z)\\1", "あ", 2, 2); + x2(".(あ*\\Z)\\1", "いあ", 2, 4); + x3("(.(やいゆ)\\2)", "zやいゆやいゆ", 0, 13, 1); + x3("(.(..\\d.)\\2)", "あ12341234", 0, 10, 1); + x2("((?i:あvず))\\1", "あvずあvず", 0, 10); + x2("(?<愚か>変|\\(\\g<愚か>\\))", "((((((変))))))", 0, 14); + x2("\\A(?:\\g<阿-1>|\\g<云-2>|\\z終了 (?<阿-1>観|自\\g<云-2>自)(?<云-2>在|菩薩\\g<阿-1>菩薩))$", "菩薩自菩薩自在自菩薩自菩薩", 0, 26); + 
x2("[[ひふ]]", "ふ", 0, 2); + x2("[[いおう]か]", "か", 0, 2); + n("[[^あ]]", "あ"); + n("[^[あ]]", "あ"); + x2("[^[^あ]]", "あ", 0, 2); + x2("[[かきく]&&きく]", "く", 0, 2); + n("[[かきく]&&きく]", "か"); + n("[[かきく]&&きく]", "け"); + x2("[あ-ん&&い-を&&う-ゑ]", "ゑ", 0, 2); + n("[^あ-ん&&い-を&&う-ゑ]", "ゑ"); + x2("[[^あ&&あ]&&あ-ん]", "い", 0, 2); + n("[[^あ&&あ]&&あ-ん]", "あ"); + x2("[[^あ-ん&&いうえお]&&[^う-か]]", "き", 0, 2); + n("[[^あ-ん&&いうえお]&&[^う-か]]", "い"); + x2("[^[^あいう]&&[^うえお]]", "う", 0, 2); + x2("[^[^あいう]&&[^うえお]]", "え", 0, 2); + n("[^[^あいう]&&[^うえお]]", "か"); + x2("[あ-&&-あ]", "-", 0, 1); + x2("[^[^a-zあいう]&&[^bcdefgうえお]q-w]", "え", 0, 2); + x2("[^[^a-zあいう]&&[^bcdefgうえお]g-w]", "f", 0, 1); + x2("[^[^a-zあいう]&&[^bcdefgうえお]g-w]", "g", 0, 1); + n("[^[^a-zあいう]&&[^bcdefgうえお]g-w]", "2"); + fprintf(stdout, "\nRESULT SUCC: %d, FAIL: %d\n", nsucc, nfail); + +#ifndef POSIX_TEST + regex_region_free(region, 1); + regex_end(); +#endif + + return 0; +} |