From 0424e152c684a85f4b0691f1e84aec203115333d Mon Sep 17 00:00:00 2001 From: naruse Date: Fri, 17 Feb 2012 07:42:23 +0000 Subject: * Merge Onigmo-5.13.1. [ruby-dev:45057] [Feature #5820] https://github.com/k-takata/Onigmo cp reg{comp,enc,error,exec,parse,syntax}.c reg{enc,int,parse}.h cp oniguruma.h cp tool/enc-unicode.rb cp -r enc/ git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@34663 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regcomp.c | 544 +++++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 451 insertions(+), 93 deletions(-) (limited to 'regcomp.c') diff --git a/regcomp.c b/regcomp.c index d81334a002..afb8e1cdc4 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1,8 +1,9 @@ /********************************************************************** - regcomp.c - Oniguruma (regular expression library) + regcomp.c - Onigmo (Oniguruma-mod) (regular expression library) **********************************************************************/ /*- * Copyright (c) 2002-2008 K.Kosako + * Copyright (c) 2011-2012 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -44,6 +45,7 @@ onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag) return 0; } + #ifndef PLATFORM_UNALIGNED_WORD_ACCESS static unsigned char PadBuf[WORD_ALIGNMENT_SIZE]; #endif @@ -152,7 +154,7 @@ onig_bbuf_init(BBuf* buf, OnigDistance size) if (IS_NULL(buf->p)) return(ONIGERR_MEMORY); } - buf->alloc = (unsigned int)size; + buf->alloc = (unsigned int )size; buf->used = 0; return 0; } @@ -433,7 +435,7 @@ add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance str_len, if (IS_NEED_STR_LEN_OP_EXACT(op)) len += SIZE_LENGTH; - len += mb_len * str_len; + len += mb_len * (int )str_len; return len; } @@ -1262,6 +1264,26 @@ compile_length_enclose_node(EncloseNode* node, regex_t* reg) } break; + case ENCLOSE_CONDITION: + len = SIZE_OP_CONDITION; + if (NTYPE(node->target) == NT_ALT) { + Node* x = node->target; + + tlen = compile_length_tree(NCAR(x), reg); /* yes-node */ + if (tlen < 0) return tlen; + len += tlen + SIZE_OP_JUMP; + if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG; + x = NCDR(x); + tlen = compile_length_tree(NCAR(x), reg); /* no-node */ + if (tlen < 0) return tlen; + len += tlen; + if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN; + } + else { + return ONIGERR_PARSER_BUG; + } + break; + default: return ONIGERR_TYPE_BUG; break; @@ -1365,6 +1387,39 @@ compile_enclose_node(EncloseNode* node, regex_t* reg) } break; + case ENCLOSE_CONDITION: + r = add_opcode(reg, OP_CONDITION); + if (r) return r; + r = add_mem_num(reg, node->regnum); + if (r) return r; + + if (NTYPE(node->target) == NT_ALT) { + Node* x = node->target; + int len2; + + len = compile_length_tree(NCAR(x), reg); /* yes-node */ + if (len < 0) return len; + if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG; + x = NCDR(x); + len2 = compile_length_tree(NCAR(x), reg); /* no-node */ + if (len2 < 0) return len2; + if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN; + + x = node->target; + r = add_rel_addr(reg, len + SIZE_OP_JUMP); + if (r) return r; + r = compile_tree(NCAR(x), reg); /* yes-node */ + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, len2); + if (r) return r; + x = NCDR(x); + r = compile_tree(NCAR(x), reg); /* no-node */ + } + else { + return ONIGERR_PARSER_BUG; + } + break; + default: return ONIGERR_TYPE_BUG; break; @@ -1419,12 +1474,28 @@ compile_anchor_node(AnchorNode* node, regex_t* reg) case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break; case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break; - case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break; - case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break; + /* used for implicit anchor optimization: /.*a/ ==> /(?:^|\G).*a/ */ + case ANCHOR_ANYCHAR_STAR: r = add_opcode(reg, OP_BEGIN_POS_OR_LINE); break; + + case ANCHOR_WORD_BOUND: + if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND); + else r = add_opcode(reg, OP_WORD_BOUND); + break; + case ANCHOR_NOT_WORD_BOUND: + if (node->ascii_range) r = add_opcode(reg, OP_NOT_ASCII_WORD_BOUND); + else r = add_opcode(reg, OP_NOT_WORD_BOUND); + break; #ifdef USE_WORD_BEGIN_END - case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break; - case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break; + case ANCHOR_WORD_BEGIN: + if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BEGIN); + else r = add_opcode(reg, OP_WORD_BEGIN); + break; + case ANCHOR_WORD_END: + if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_END); + else r = add_opcode(reg, OP_WORD_END); + break; #endif + case ANCHOR_KEEP: r = add_opcode(reg, OP_KEEP); break; case ANCHOR_PREC_READ: r = add_opcode(reg, OP_PUSH_POS); @@ -1642,8 +1713,14 @@ compile_tree(Node* node, regex_t* reg) switch (NCTYPE(node)->ctype) { case ONIGENC_CTYPE_WORD: - if (NCTYPE(node)->not != 0) op = OP_NOT_WORD; - else op = OP_WORD; + if (NCTYPE(node)->ascii_range != 0) { + if (NCTYPE(node)->not != 0) op = OP_NOT_ASCII_WORD; + else op = OP_ASCII_WORD; + } + else { + if (NCTYPE(node)->not != 0) op = OP_NOT_WORD; + else op = OP_WORD; + } break; default: return ONIGERR_TYPE_BUG; @@ -2031,6 +2108,7 @@ quantifiers_memory_node_info(Node* node) case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: r = quantifiers_memory_node_info(en->target); break; default: @@ -2162,6 +2240,7 @@ get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env) #endif case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: r = get_min_match_length(en->target, min, env); break; } @@ -2279,6 +2358,7 @@ get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env) #endif case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: r = get_max_match_length(en->target, max, env); break; } @@ -2310,7 +2390,7 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) do { r = get_char_length_tree1(NCAR(node), reg, &tlen, level); if (r == 0) - *len = (int)distance_add(*len, tlen); + *len = (int )distance_add(*len, tlen); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; @@ -2357,7 +2437,7 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) if (qn->lower == qn->upper) { r = get_char_length_tree1(qn->target, reg, &tlen, level); if (r == 0) - *len = (int)distance_multiply(tlen, qn->lower); + *len = (int )distance_multiply(tlen, qn->lower); } else r = GET_CHAR_LEN_VARLEN; @@ -2401,6 +2481,7 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) #endif case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: r = get_char_length_tree1(en->target, reg, len, level); break; default: @@ -2433,7 +2514,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) int i; OnigDistance len; OnigCodePoint code; - UChar *p; + UChar *p, c; int ytype; retry: @@ -2444,7 +2525,8 @@ is_not_included(Node* x, Node* y, regex_t* reg) switch (ytype) { case NT_CTYPE: if (NCTYPE(y)->ctype == NCTYPE(x)->ctype && - NCTYPE(y)->not != NCTYPE(x)->not) + NCTYPE(y)->not != NCTYPE(x)->not && + NCTYPE(y)->ascii_range == NCTYPE(x)->ascii_range) return 1; else return 0; @@ -2480,7 +2562,12 @@ is_not_included(Node* x, Node* y, regex_t* reg) if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) { for (i = 0; i < SINGLE_BYTE_SIZE; i++) { if (BITSET_AT(xc->bs, i)) { - if (IS_CODE_SB_WORD(reg->enc, i)) return 0; + if (NCTYPE(y)->ascii_range) { + if (IS_CODE_SB_WORD(reg->enc, i)) return 0; + } + else { + if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0; + } } } return 1; @@ -2489,7 +2576,12 @@ is_not_included(Node* x, Node* y, regex_t* reg) } else { for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (! IS_CODE_SB_WORD(reg->enc, i)) { + int is_word; + if (NCTYPE(y)->ascii_range) + is_word = IS_CODE_SB_WORD(reg->enc, i); + else + is_word = ONIGENC_IS_CODE_WORD(reg->enc, i); + if (! is_word) { if (!IS_NCCLASS_NOT(xc)) { if (BITSET_AT(xc->bs, i)) return 0; @@ -2547,14 +2639,23 @@ is_not_included(Node* x, Node* y, regex_t* reg) if (NSTRING_LEN(x) == 0) break; + c = *(xs->s); switch (ytype) { case NT_CTYPE: switch (NCTYPE(y)->ctype) { case ONIGENC_CTYPE_WORD: - if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end)) - return NCTYPE(y)->not; - else - return !(NCTYPE(y)->not); + if (NCTYPE(y)->ascii_range) { + if (ONIGENC_IS_MBC_ASCII_WORD(reg->enc, xs->s, xs->end)) + return NCTYPE(y)->not; + else + return !(NCTYPE(y)->not); + } + else { + if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end)) + return NCTYPE(y)->not; + else + return !(NCTYPE(y)->not); + } break; default: break; @@ -2582,7 +2683,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) return 0; } else { - for (i = 0, p = ys->s, q = xs->s; (OnigDistance)i < len; i++, p++, q++) { + for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) { if (*p != *q) return 1; } } @@ -2671,6 +2772,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) case ENCLOSE_MEMORY: case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: n = get_head_value_node(en->target, exact, reg); break; } @@ -3069,6 +3171,11 @@ setup_subexp_call(Node* node, ScanEnv* env) cn->unset_addr_list = env->unset_addr_list; } #ifdef USE_NAMED_GROUP +#ifdef USE_PERL_SUBEXP_CALL + else if (cn->name == cn->name_end) { + goto set_call_attr; + } +#endif else { int *refs; @@ -3079,7 +3186,8 @@ setup_subexp_call(Node* node, ScanEnv* env) ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } - else if (n > 1) { + else if (n > 1 && + ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL)) { onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end); return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL; @@ -3172,7 +3280,7 @@ setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) } static int -next_setup(Node* node, Node* next_node, regex_t* reg) +next_setup(Node* node, Node* next_node, int in_root, regex_t* reg) { int type; @@ -3188,7 +3296,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) qn->next_head_exact = n; } #endif - /* automatic posseivation a*b ==> (?>a*)b */ + /* automatic possessivation a*b ==> (?>a*)b */ if (qn->lower <= 1) { int ttype = NTYPE(qn->target); if (IS_NODE_TYPE_SIMPLE(ttype)) { @@ -3206,10 +3314,32 @@ next_setup(Node* node, Node* next_node, regex_t* reg) } } } + +#ifndef ONIG_DONT_OPTIMIZE + if (NTYPE(node) == NT_QTFR && /* the type may be changed by above block */ + in_root && /* qn->lower == 0 && */ + NTYPE(qn->target) == NT_CANY && + ! IS_MULTILINE(reg->options)) { + /* implicit anchor: /.*a/ ==> /(?:^|\G).*a/ */ + Node *np; + np = onig_node_new_list(NULL_NODE, NULL_NODE); + CHECK_NULL_RETURN_MEMERR(np); + swap_node(node, np); + NCDR(node) = onig_node_new_list(np, NULL_NODE); + if (IS_NULL(NCDR(node))) { + onig_node_free(np); + return ONIGERR_MEMORY; + } + np = onig_node_new_anchor(ANCHOR_ANYCHAR_STAR); /* (?:^|\G) */ + CHECK_NULL_RETURN_MEMERR(np); + NCAR(node) = np; + } +#endif } } else if (type == NT_ENCLOSE) { EncloseNode* en = NENCLOSE(node); + in_root = 0; if (en->type == ENCLOSE_MEMORY) { node = en->target; goto retry; @@ -3222,7 +3352,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) static int update_string_node_case_fold(regex_t* reg, Node *node) { - UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + UChar *p, *q, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar *sbuf, *ebuf, *sp; int r, i, len; OnigDistance sbuf_size; @@ -3238,10 +3368,15 @@ update_string_node_case_fold(regex_t* reg, Node *node) p = sn->s; while (p < end) { len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf); + q = buf; for (i = 0; i < len; i++) { if (sp >= ebuf) { - sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2); - CHECK_NULL_RETURN_MEMERR(sbuf); + UChar* p = (UChar* )xrealloc(sbuf, sbuf_size * 2); + if (IS_NULL(p)) { + xfree(sbuf); + return ONIGERR_MEMORY; + } + sbuf = p; sp = sbuf + sbuf_size; sbuf_size *= 2; ebuf = sbuf + sbuf_size; @@ -3288,18 +3423,22 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], UChar *p, int slen, UChar *end, regex_t* reg, Node **rnode) { - int r, i, j, len, varlen; + int r, i, j, len, varlen, varclen; Node *anode, *var_anode, *snode, *xnode, *an; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; *rnode = var_anode = NULL_NODE; varlen = 0; + varclen = 0; for (i = 0; i < item_num; i++) { if (items[i].byte_len != slen) { varlen = 1; break; } + if (items[i].code_len != 1) { + varclen = 1; + } } if (varlen != 0) { @@ -3384,6 +3523,8 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], } } + if (varclen && !varlen) + return 2; return varlen; mem_err2: @@ -3401,6 +3542,7 @@ expand_case_fold_string(Node* node, regex_t* reg) #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 int r, n, len, alt_num; + int varlen = 0; UChar *start, *end, *p; Node *top_root, *root, *snode, *prev_node; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; @@ -3463,6 +3605,7 @@ expand_case_fold_string(Node* node, regex_t* reg) r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node); if (r < 0) goto mem_err; + if (r > 0) varlen = 1; if (r == 1) { if (IS_NULL(root)) { top_root = prev_node; @@ -3476,7 +3619,7 @@ expand_case_fold_string(Node* node, regex_t* reg) root = NCAR(prev_node); } - else { /* r == 0 */ + else { /* r == 0 || r == 2 */ if (IS_NOT_NULL(root)) { if (IS_NULL(onig_node_list_add(root, prev_node))) { onig_node_free(prev_node); @@ -3519,9 +3662,20 @@ expand_case_fold_string(Node* node, regex_t* reg) /* ending */ top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node); - swap_node(node, top_root); + if (!varlen) { + /* When all expanded strings are same length, case-insensitive + BM search will be used. */ + r = update_string_node_case_fold(reg, node); + if (r == 0) { + NSTRING_SET_AMBIG(node); + } + } + else { + swap_node(node, top_root); + r = 0; + } onig_node_free(top_root); - return 0; + return r; mem_err: r = ONIGERR_MEMORY; @@ -3676,6 +3830,7 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env) #define IN_NOT (1<<1) #define IN_REPEAT (1<<2) #define IN_VAR_REPEAT (1<<3) +#define IN_ROOT (1<<4) /* setup_tree does the following work. 1. check empty loop. (set qn->target_empty_info) @@ -3690,19 +3845,25 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { int type; int r = 0; + int in_root = state & IN_ROOT; + state &= ~IN_ROOT; restart: type = NTYPE(node); switch (type) { case NT_LIST: { Node* prev = NULL_NODE; + int prev_in_root = 0; + state |= in_root; do { r = setup_tree(NCAR(node), reg, state, env); if (IS_NOT_NULL(prev) && r == 0) { - r = next_setup(prev, NCAR(node), reg); + r = next_setup(prev, NCAR(node), prev_in_root, reg); } prev = NCAR(node); + prev_in_root = state & IN_ROOT; + state &= ~IN_ROOT; } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); } break; @@ -3797,21 +3958,47 @@ restart: /* expand string */ #define EXPAND_STRING_MAX_LENGTH 100 if (NTYPE(target) == NT_STR) { - if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && - qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { + if (qn->lower > 1) { + int i, n = qn->lower; OnigDistance len = NSTRING_LEN(target); StrNode* sn = NSTR(target); + Node* np; + + np = onig_node_new_str(sn->s, sn->end); + if (IS_NULL(np)) return ONIGERR_MEMORY; + NSTR(np)->flag = sn->flag; + + for (i = 1; i < n && (i+1) * len <= EXPAND_STRING_MAX_LENGTH; i++) { + r = onig_node_str_cat(np, sn->s, sn->end); + if (r) { + onig_node_free(np); + return r; + } + } + if (i < qn->upper || IS_REPEAT_INFINITE(qn->upper)) { + Node *np1, *np2; - if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { - int i, n = qn->lower; - onig_node_conv_to_str_node(node, NSTR(target)->flag); - for (i = 0; i < n; i++) { - r = onig_node_str_cat(node, sn->s, sn->end); - if (r) break; + qn->lower -= i; + if (! IS_REPEAT_INFINITE(qn->upper)) + qn->upper -= i; + + np1 = onig_node_new_list(np, NULL); + if (IS_NULL(np1)) { + onig_node_free(np); + return ONIGERR_MEMORY; + } + swap_node(np1, node); + np2 = onig_node_list_add(node, np1); + if (IS_NULL(np2)) { + onig_node_free(np1); + return ONIGERR_MEMORY; } - onig_node_free(target); - break; /* break case NT_QTFR: */ } + else { + swap_node(np, node); + onig_node_free(np); + } + break; /* break case NT_QTFR: */ } } @@ -3840,6 +4027,7 @@ restart: case ENCLOSE_OPTION: { OnigOptionType options = reg->options; + state |= in_root; reg->options = NENCLOSE(node)->option; r = setup_tree(NENCLOSE(node)->target, reg, state, env); reg->options = options; @@ -3869,6 +4057,18 @@ restart: } } break; + + case ENCLOSE_CONDITION: +#ifdef USE_NAMED_GROUP + if (! IS_ENCLOSE_NAME_REF(NENCLOSE(node)) && + env->num_named > 0 && + IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && + !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) { + return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; + } +#endif + r = setup_tree(NENCLOSE(node)->target, reg, state, env); + break; } } break; @@ -3894,9 +4094,15 @@ restart: #define ALLOWED_ENCLOSE_IN_LB_NOT 0 #define ALLOWED_ANCHOR_IN_LB \ -( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION ) +( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \ + ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \ + ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \ + ANCHOR_WORD_BEGIN | ANCHOR_WORD_END ) #define ALLOWED_ANCHOR_IN_LB_NOT \ -( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION ) +( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \ + ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \ + ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \ + ANCHOR_WORD_BEGIN | ANCHOR_WORD_END ) case ANCHOR_LOOK_BEHIND: { @@ -3934,32 +4140,151 @@ restart: return r; } -/* set skip map for Boyer-Moor search */ +#ifndef USE_SUNDAY_QUICK_SEARCH +/* set skip map for Boyer-Moore search */ static int -set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED, - UChar skip[], int** int_skip) +set_bm_skip(UChar* s, UChar* end, regex_t* reg, + UChar skip[], int** int_skip, int ignore_case) { OnigDistance i, len; + int clen, flen, n, j, k; + UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN]; + OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; + OnigEncoding enc = reg->enc; len = end - s; if (len < ONIG_CHAR_TABLE_SIZE) { - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len; - - for (i = 0; i < len - 1; i++) - skip[s[i]] = len - 1 - i; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )len; + + n = 0; + for (i = 0; i < len - 1; i += clen) { + p = s + i; + if (ignore_case) + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, + p, end, items); + clen = enclen(enc, p, end); + + for (j = 0; j < n; j++) { + if ((items[j].code_len != 1) || (items[j].byte_len != clen)) + return 1; /* different length isn't supported. */ + flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); + if (flen != clen) + return 1; /* different length isn't supported. */ + } + for (j = 0; j < clen; j++) { + skip[s[i + j]] = (UChar )(len - 1 - i - j); + for (k = 0; k < n; k++) { + skip[buf[k][j]] = (UChar )(len - 1 - i - j); + } + } + } } else { if (IS_NULL(*int_skip)) { *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; } - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int)len; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )len; + + n = 0; + for (i = 0; i < len - 1; i += clen) { + p = s + i; + if (ignore_case) + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, + p, end, items); + clen = enclen(enc, p, end); + + for (j = 0; j < n; j++) { + if ((items[j].code_len != 1) || (items[j].byte_len != clen)) + return 1; /* different length isn't supported. */ + flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); + if (flen != clen) + return 1; /* different length isn't supported. */ + } + for (j = 0; j < clen; j++) { + (*int_skip)[s[i + j]] = (int )(len - 1 - i - j); + for (k = 0; k < n; k++) { + (*int_skip)[buf[k][j]] = (int )(len - 1 - i - j); + } + } + } + } + return 0; +} + +#else /* USE_SUNDAY_QUICK_SEARCH */ + +/* set skip map for Sunday's quick search */ +static int +set_bm_skip(UChar* s, UChar* end, regex_t* reg, + UChar skip[], int** int_skip, int ignore_case) +{ + OnigDistance i, len; + int clen, flen, n, j, k; + UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN]; + OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; + OnigEncoding enc = reg->enc; - for (i = 0; i < len - 1; i++) - (*int_skip)[s[i]] = (int)(len - 1 - i); + len = end - s; + if (len < ONIG_CHAR_TABLE_SIZE) { + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1); + + n = 0; + for (i = 0; i < len; i += clen) { + p = s + i; + if (ignore_case) + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, + p, end, items); + clen = enclen(enc, p, end); + + for (j = 0; j < n; j++) { + if ((items[j].code_len != 1) || (items[j].byte_len != clen)) + return 1; /* different length isn't supported. */ + flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); + if (flen != clen) + return 1; /* different length isn't supported. */ + } + for (j = 0; j < clen; j++) { + skip[s[i + j]] = (UChar )(len - i - j); + for (k = 0; k < n; k++) { + skip[buf[k][j]] = (UChar )(len - i - j); + } + } + } + } + else { + if (IS_NULL(*int_skip)) { + *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); + if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; + } + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )(len + 1); + + n = 0; + for (i = 0; i < len; i += clen) { + p = s + i; + if (ignore_case) + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, + p, end, items); + clen = enclen(enc, p, end); + + for (j = 0; j < n; j++) { + if ((items[j].code_len != 1) || (items[j].byte_len != clen)) + return 1; /* different length isn't supported. */ + flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); + if (flen != clen) + return 1; /* different length isn't supported. */ + } + for (j = 0; j < clen; j++) { + (*int_skip)[s[i + j]] = (int )(len - i - j); + for (k = 0; k < n; k++) { + (*int_skip)[buf[k][j]] = (int )(len - i - j); + } + } + } } return 0; } +#endif /* USE_SUNDAY_QUICK_SEARCH */ #define OPT_EXACT_MAXLEN 24 @@ -4539,7 +4864,7 @@ concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add) if (to->expr.len > 0) { if (add->len.max > 0) { if (to->expr.len > (int )add->len.max) - to->expr.len = (int)add->len.max; + to->expr.len = (int )add->len.max; if (to->expr.mmd.max == 0) select_opt_exact_info(enc, &to->exb, &to->expr); @@ -4652,7 +4977,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) set_mml(&opt->len, slen, max); } - if ((OnigDistance)opt->exb.len == slen) + if ((OnigDistance )opt->exb.len == slen) opt->exb.reach_end = 1; } break; @@ -4685,23 +5010,25 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case NT_CTYPE: { int i, min, max; + int maxcode; max = ONIGENC_MBC_MAXLEN_DIST(env->enc); if (max == 1) { min = 1; + maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE; switch (NCTYPE(node)->ctype) { case ONIGENC_CTYPE_WORD: if (NCTYPE(node)->not != 0) { for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (! ONIGENC_IS_CODE_WORD(env->enc, i)) { + if (! ONIGENC_IS_CODE_WORD(env->enc, i) || i >= maxcode) { add_char_opt_map_info(&opt->map, (UChar )i, env->enc); } } } else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + for (i = 0; i < maxcode; i++) { if (ONIGENC_IS_CODE_WORD(env->enc, i)) { add_char_opt_map_info(&opt->map, (UChar )i, env->enc); } @@ -4733,6 +5060,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case ANCHOR_END_BUF: case ANCHOR_SEMI_END_BUF: case ANCHOR_END_LINE: + case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */ add_opt_anc_info(&opt->anc, NANCHOR(node)->type); break; @@ -4756,7 +5084,6 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) break; case ANCHOR_PREC_READ_NOT: - case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */ case ANCHOR_LOOK_BEHIND_NOT: break; } @@ -4814,10 +5141,11 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) r = optimize_node_left(qn->target, &nopt, env); if (r) break; - if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) { + if (/*qn->lower == 0 &&*/ IS_REPEAT_INFINITE(qn->upper)) { if (env->mmd.max == 0 && NTYPE(qn->target) == NT_CANY && qn->greedy) { if (IS_MULTILINE(env->options)) + /* implicit anchor: /.*a/ ==> /\A.*a/ */ add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML); else add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR); @@ -4897,6 +5225,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) break; case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: r = optimize_node_left(en->target, opt, env); break; } @@ -4919,29 +5248,38 @@ static int set_optimize_exact_info(regex_t* reg, OptExactInfo* e) { int r; + int allow_reverse; if (e->len == 0) return 0; - if (e->ignore_case) { - reg->exact = (UChar* )xmalloc(e->len); - CHECK_NULL_RETURN_MEMERR(reg->exact); - xmemcpy(reg->exact, e->s, e->len); - reg->exact_end = reg->exact + e->len; - reg->optimize = ONIG_OPTIMIZE_EXACT_IC; - } - else { - int allow_reverse; - - reg->exact = str_dup(e->s, e->s + e->len); - CHECK_NULL_RETURN_MEMERR(reg->exact); - reg->exact_end = reg->exact + e->len; + reg->exact = (UChar* )xmalloc(e->len); + CHECK_NULL_RETURN_MEMERR(reg->exact); + xmemcpy(reg->exact, e->s, e->len); + reg->exact_end = reg->exact + e->len; - allow_reverse = + allow_reverse = ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); + if (e->ignore_case) { + if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { + r = set_bm_skip(reg->exact, reg->exact_end, reg, + reg->map, &(reg->int_map), 1); + if (r == 0) { + reg->optimize = (allow_reverse != 0 + ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC); + } + else { + reg->optimize = ONIG_OPTIMIZE_EXACT_IC; + } + } + else { + reg->optimize = ONIG_OPTIMIZE_EXACT_IC; + } + } + else { if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, - reg->map, &(reg->int_map)); + r = set_bm_skip(reg->exact, reg->exact_end, reg, + reg->map, &(reg->int_map), 0); if (r) return r; reg->optimize = (allow_reverse != 0 @@ -4956,7 +5294,7 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) reg->dmax = e->mmd.max; if (reg->dmin != ONIG_INFINITE_DISTANCE) { - reg->threshold_len = (int)(reg->dmin + (reg->exact_end - reg->exact)); + reg->threshold_len = (int )(reg->dmin + (reg->exact_end - reg->exact)); } return 0; @@ -4975,7 +5313,7 @@ set_optimize_map_info(regex_t* reg, OptMapInfo* m) reg->dmax = m->mmd.max; if (reg->dmin != ONIG_INFINITE_DISTANCE) { - reg->threshold_len = (int)(reg->dmin + 1); + reg->threshold_len = (int )(reg->dmin + 1); } } @@ -5008,7 +5346,8 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) if (r) return r; reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF | - ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML); + ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML | + ANCHOR_LOOK_BEHIND); reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF); @@ -5154,7 +5493,7 @@ print_anchor(FILE* f, int anchor) } if (anchor & ANCHOR_ANYCHAR_STAR_ML) { if (q) fprintf(f, ", "); - fprintf(f, "anychar-star-pl"); + fprintf(f, "anychar-star-ml"); } fprintf(f, "]"); @@ -5164,7 +5503,8 @@ static void print_optimize_info(FILE* f, regex_t* reg) { static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", - "EXACT_IC", "MAP" }; + "EXACT_IC", "MAP", + "EXACT_BM_IC", "EXACT_BM_NOT_REV_IC" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " anchor: "); print_anchor(f, reg->anchor); @@ -5244,7 +5584,7 @@ size_t onig_memsize(const regex_t *reg) { size_t size = sizeof(regex_t); - if (!reg) return 0; + if (IS_NULL(reg)) return 0; if (IS_NOT_NULL(reg->p)) size += reg->alloc; if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact; if (IS_NOT_NULL(reg->int_map)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE; @@ -5259,7 +5599,7 @@ size_t onig_region_memsize(const OnigRegion *regs) { size_t size = sizeof(*regs); - if (!regs) return 0; + if (IS_NULL(regs)) return 0; size += regs->allocated * (sizeof(*regs->beg) + sizeof(*regs->end)); return size; } @@ -5405,7 +5745,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->num_call = 0; #endif - r = setup_tree(root, reg, 0, &scan_env); + r = setup_tree(root, reg, IN_ROOT, &scan_env); if (r != 0) goto err_unset; #ifdef ONIG_DEBUG_PARSE_TREE @@ -5554,7 +5894,7 @@ onig_reg_init(regex_t* reg, OnigOptionType option, return ONIGERR_INVALID_ARGUMENT; if (ONIGENC_IS_UNDEF(enc)) - return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED; + return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET; if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) { @@ -5779,19 +6119,26 @@ OnigOpInfoType OnigOpInfo[] = { { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, { OP_WORD_BEGIN, "word-begin", ARG_NON }, { OP_WORD_END, "word-end", ARG_NON }, + { OP_ASCII_WORD, "ascii-word", ARG_NON }, + { OP_NOT_ASCII_WORD, "not-ascii-word", ARG_NON }, + { OP_ASCII_WORD_BOUND, "ascii-word-bound", ARG_NON }, + { OP_NOT_ASCII_WORD_BOUND,"not-ascii-word-bound", ARG_NON }, + { OP_ASCII_WORD_BEGIN, "ascii-word-begin", ARG_NON }, + { OP_ASCII_WORD_END, "ascii-word-end", ARG_NON }, { OP_BEGIN_BUF, "begin-buf", ARG_NON }, { OP_END_BUF, "end-buf", ARG_NON }, { OP_BEGIN_LINE, "begin-line", ARG_NON }, { OP_END_LINE, "end-line", ARG_NON }, { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, { OP_BEGIN_POSITION, "begin-position", ARG_NON }, + { OP_BEGIN_POS_OR_LINE, "begin-pos-or-line", ARG_NON }, { OP_BACKREF1, "backref1", ARG_NON }, { OP_BACKREF2, "backref2", ARG_NON }, { OP_BACKREFN, "backrefn", ARG_MEMNUM }, { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL }, - { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL }, + { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL }, { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, @@ -5800,6 +6147,7 @@ OnigOpInfoType OnigOpInfo[] = { { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, { OP_SET_OPTION, "set-option", ARG_OPTION }, + { OP_KEEP, "keep", ARG_NON }, { OP_FAIL, "fail", ARG_NON }, { OP_JUMP, "jump", ARG_RELADDR }, { OP_PUSH, "push", ARG_RELADDR }, @@ -5827,6 +6175,7 @@ OnigOpInfoType OnigOpInfo[] = { { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON }, { OP_CALL, "call", ARG_ABSADDR }, { OP_RETURN, "return", ARG_NON }, + { OP_CONDITION, "condition", ARG_SPECIAL }, { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL }, { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL }, { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK }, @@ -6118,6 +6467,12 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, fprintf(f, ":%d:(%d)", scn, addr); break; + case OP_CONDITION: + GET_MEMNUM_INC(mem, bp); + GET_RELADDR_INC(addr, bp); + fprintf(f, ":%d:(%d)", mem, addr); + break; + default: fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n", *--bp); @@ -6134,17 +6489,15 @@ print_compiled_byte_code_list(FILE* f, regex_t* reg) UChar* bp = reg->p; UChar* end = reg->p + reg->used; - fprintf(f, "code length: %d\n", reg->used); + fprintf(f, "code length: %d", reg->used); ncode = -1; while (bp < end) { ncode++; - if (bp > reg->p) { - if (ncode % 5 == 0) - fprintf(f, "\n%ld:", bp-reg->p); - else - fprintf(f, " %ld:", bp-reg->p); - } + if (ncode % 5 == 0) + fprintf(f, "\n%ld:", bp - reg->p); + else + fprintf(f, " %ld:", bp - reg->p); onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc); } @@ -6200,7 +6553,7 @@ print_indent_tree(FILE* f, Node* node, int indent) if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f); if (NCCLASS(node)->mbuf) { BBuf* bbuf = NCCLASS(node)->mbuf; - for (i = 0; i < (int)bbuf->used; i++) { + for (i = 0; i < (int )bbuf->used; i++) { if (i > 0) fprintf(f, ","); fprintf(f, "%0x", bbuf->p[i]); } @@ -6236,6 +6589,7 @@ print_indent_tree(FILE* f, Node* node, int indent) case ANCHOR_END_LINE: fputs("end line", f); break; case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break; case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break; + case ANCHOR_ANYCHAR_STAR: fputs("begin position/line", f); break; case ANCHOR_WORD_BOUND: fputs("word bound", f); break; case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break; @@ -6247,6 +6601,7 @@ print_indent_tree(FILE* f, Node* node, int indent) case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break; case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break; case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); container_p = TRUE; break; + case ANCHOR_KEEP: fputs("keep",f); break; default: fprintf(f, "ERROR: undefined anchor type.\n"); @@ -6296,6 +6651,9 @@ print_indent_tree(FILE* f, Node* node, int indent) case ENCLOSE_STOP_BACKTRACK: fprintf(f, "stop-bt"); break; + case ENCLOSE_CONDITION: + fprintf(f, "condition:%d", NENCLOSE(node)->regnum); + break; default: break; -- cgit v1.2.1