diff options
Diffstat (limited to 'ext/mbstring/oniguruma/regparse.c')
| -rw-r--r-- | ext/mbstring/oniguruma/regparse.c | 1823 |
1 files changed, 1043 insertions, 780 deletions
diff --git a/ext/mbstring/oniguruma/regparse.c b/ext/mbstring/oniguruma/regparse.c index af5c8593e6..01131300c6 100644 --- a/ext/mbstring/oniguruma/regparse.c +++ b/ext/mbstring/oniguruma/regparse.c @@ -2,7 +2,7 @@ regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,9 +28,13 @@ */ #include "regparse.h" +#include "st.h" #define WARN_BUFSIZE 256 +#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS + + OnigSyntaxType OnigSyntaxRuby = { (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | @@ -41,6 +45,8 @@ OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_OP2_OPTION_RUBY | ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | + ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | + ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | @@ -54,25 +60,20 @@ OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE + , + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + } }; OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; -extern void onig_null_warn(const char* s) { } - -#ifdef RUBY_PLATFORM -extern void -onig_rb_warn(const char* s) -{ - rb_warn("%s", s); -} - -extern void -onig_rb_warning(const char* s) -{ - rb_warning("%s", s); -} -#endif +extern void onig_null_warn(const char* s ARG_UNUSED) { } #ifdef DEFAULT_WARN_FUNCTION static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; @@ -112,7 +113,7 @@ bbuf_clone(BBuf** rto, BBuf* from) BBuf *to; *rto = to = (BBuf* )xmalloc(sizeof(BBuf)); - CHECK_NULL_RETURN_VAL(to, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(to); r = BBUF_INIT(to, from->alloc); if (r != 0) return r; to->used = from->used; @@ -120,6 +121,9 @@ bbuf_clone(BBuf** rto, BBuf* from) return 0; } +#define BACKREF_REL_TO_ABS(rel_no, env) \ + ((env)->num_mem + 1 + (rel_no)) + #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) #define MBCODE_START_POS(enc) \ @@ -139,7 +143,7 @@ bbuf_clone(BBuf** rto, BBuf* from) #define BITSET_IS_EMPTY(bs,empty) do {\ int i;\ empty = 1;\ - for (i = 0; i < BITSET_SIZE; i++) {\ + for (i = 0; i < (int )BITSET_SIZE; i++) {\ if ((bs)[i] != 0) {\ empty = 0; break;\ }\ @@ -160,9 +164,7 @@ static void bitset_set_all(BitSetRef bs) { int i; - for (i = 0; i < BITSET_SIZE; i++) { - bs[i] = ~((Bits )0); - } + for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } } #endif @@ -170,45 +172,35 @@ static void bitset_invert(BitSetRef bs) { int i; - for (i = 0; i < BITSET_SIZE; i++) { - bs[i] = ~(bs[i]); - } + for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); } } static void bitset_invert_to(BitSetRef from, BitSetRef to) { int i; - for (i = 0; i < BITSET_SIZE; i++) { - to[i] = ~(from[i]); - } + for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); } } static void bitset_and(BitSetRef dest, BitSetRef bs) { int i; - for (i = 0; i < BITSET_SIZE; i++) { - dest[i] &= bs[i]; - } + for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; } } static void bitset_or(BitSetRef dest, BitSetRef bs) { int i; - for (i = 0; i < BITSET_SIZE; i++) { - dest[i] |= bs[i]; - } + for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; } } static void bitset_copy(BitSetRef dest, BitSetRef bs) { int i; - for (i = 0; i < BITSET_SIZE; i++) { - dest[i] = bs[i]; - } + for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; } } extern int @@ -223,8 +215,8 @@ onig_strncmp(const UChar* s1, const UChar* s2, int n) return 0; } -static void -k_strcpy(UChar* dest, const UChar* src, const UChar* end) +extern void +onig_strcpy(UChar* dest, const UChar* src, const UChar* end) { int len = end - src; if (len > 0) { @@ -233,6 +225,7 @@ k_strcpy(UChar* dest, const UChar* src, const UChar* end) } } +#ifdef USE_NAMED_GROUP static UChar* strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) { @@ -251,7 +244,7 @@ strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) return r; } - +#endif /* scan pattern methods */ #define PEND_VALUE 0 @@ -273,7 +266,7 @@ strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) static UChar* -k_strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, +strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, int capa) { UChar* r; @@ -284,7 +277,7 @@ k_strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_e r = (UChar* )xmalloc(capa + 1); CHECK_NULL_RETURN(r); - k_strcpy(r + (dest_end - dest), src, src_end); + onig_strcpy(r + (dest_end - dest), src, src_end); return r; } @@ -297,78 +290,23 @@ strcat_capa_from_static(UChar* dest, UChar* dest_end, r = (UChar* )xmalloc(capa + 1); CHECK_NULL_RETURN(r); - k_strcpy(r, dest, dest_end); - k_strcpy(r + (dest_end - dest), src, src_end); + onig_strcpy(r, dest, dest_end); + onig_strcpy(r + (dest_end - dest), src, src_end); return r; } -#ifdef USE_NAMED_GROUP - -#define INIT_NAME_BACKREFS_ALLOC_NUM 8 - -typedef struct { - UChar* name; - int name_len; /* byte length */ - int back_num; /* number of backrefs */ - int back_alloc; - int back_ref1; - int* back_refs; -} NameEntry; - -#ifdef USE_ST_HASH_TABLE -#include "st.h" +#ifdef USE_ST_LIBRARY typedef struct { - unsigned char* s; - unsigned char* end; -} st_strend_key; - -static int strend_cmp(st_strend_key*, st_strend_key*); -static int strend_hash(st_strend_key*); - -static struct st_hash_type type_strend_hash = { - strend_cmp, - strend_hash, -}; - -static st_table* -onig_st_init_strend_table_with_size(int size) -{ - return onig_st_init_table_with_size(&type_strend_hash, size); -} - -static int -onig_st_lookup_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t *value) -{ - st_strend_key key; - - key.s = (unsigned char* )str_key; - key.end = (unsigned char* )end_key; - - return onig_st_lookup(table, (st_data_t )(&key), value); -} - -static int -onig_st_insert_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t value) -{ - st_strend_key* key; - int result; - - key = (st_strend_key* )xmalloc(sizeof(st_strend_key)); - key->s = (unsigned char* )str_key; - key->end = (unsigned char* )end_key; - result = onig_st_insert(table, (st_data_t )key, value); - if (result) { - xfree(key); - } - return result; -} + UChar* s; + UChar* end; +} st_str_end_key; static int -strend_cmp(st_strend_key* x, st_strend_key* y) +str_end_cmp(st_str_end_key* x, st_str_end_key* y) { - unsigned char *p, *q; + UChar *p, *q; int c; if ((x->end - x->s) != (y->end - y->s)) @@ -387,12 +325,11 @@ strend_cmp(st_strend_key* x, st_strend_key* y) } static int -strend_hash(st_strend_key* x) +str_end_hash(st_str_end_key* x) { - int val; - unsigned char *p; + UChar *p; + int val = 0; - val = 0; p = x->s; while (p < x->end) { val = val * 997 + (int )*p++; @@ -401,6 +338,65 @@ strend_hash(st_strend_key* x) return val + (val >> 5); } +extern hash_table_type* +onig_st_init_strend_table_with_size(int size) +{ + static struct st_hash_type hashType = { + str_end_cmp, + str_end_hash, + }; + + return (hash_table_type* ) + onig_st_init_table_with_size(&hashType, size); +} + +extern int +onig_st_lookup_strend(hash_table_type* table, const UChar* str_key, + const UChar* end_key, hash_data_type *value) +{ + st_str_end_key key; + + key.s = (UChar* )str_key; + key.end = (UChar* )end_key; + + return onig_st_lookup(table, (st_data_t )(&key), value); +} + +extern int +onig_st_insert_strend(hash_table_type* table, const UChar* str_key, + const UChar* end_key, hash_data_type value) +{ + st_str_end_key* key; + int result; + + key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key)); + key->s = (UChar* )str_key; + key->end = (UChar* )end_key; + result = onig_st_insert(table, (st_data_t )key, value); + if (result) { + xfree(key); + } + return result; +} + +#endif /* USE_ST_LIBRARY */ + + +#ifdef USE_NAMED_GROUP + +#define INIT_NAME_BACKREFS_ALLOC_NUM 8 + +typedef struct { + UChar* name; + int name_len; /* byte length */ + int back_num; /* number of backrefs */ + int back_alloc; + int back_ref1; + int* back_refs; +} NameEntry; + +#ifdef USE_ST_LIBRARY + typedef st_table NameTable; typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ @@ -441,10 +437,10 @@ onig_print_names(FILE* fp, regex_t* reg) } return 0; } -#endif +#endif /* ONIG_DEBUG */ static int -i_free_name_entry(UChar* key, NameEntry* e, void* arg) +i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED) { xfree(e->name); if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); @@ -501,10 +497,9 @@ typedef struct { } INamesArg; static int -i_names(UChar* key, NameEntry* e, INamesArg* arg) +i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg) { int r = (*(arg->func))(e->name, - /*e->name + onigenc_str_bytelen_null(arg->enc, e->name), */ e->name + e->name_len, e->back_num, (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), @@ -518,8 +513,7 @@ i_names(UChar* key, NameEntry* e, INamesArg* arg) extern int onig_foreach_name(regex_t* reg, - int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) { INamesArg narg; NameTable* t = (NameTable* )reg->name_table; @@ -536,7 +530,7 @@ onig_foreach_name(regex_t* reg, } static int -i_renumber_name(UChar* key, NameEntry* e, GroupNumRemap* map) +i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map) { int i; @@ -575,7 +569,7 @@ onig_number_of_names(regex_t* reg) return 0; } -#else /* USE_ST_HASH_TABLE */ +#else /* USE_ST_LIBRARY */ #define INIT_NAMES_ALLOC_NUM 8 @@ -585,7 +579,6 @@ typedef struct { int alloc; } NameTable; - #ifdef ONIG_DEBUG extern int onig_print_names(FILE* fp, regex_t* reg) @@ -683,8 +676,7 @@ name_find(regex_t* reg, UChar* name, UChar* name_end) extern int onig_foreach_name(regex_t* reg, - int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) { int i, r; NameEntry* e; @@ -713,7 +705,7 @@ onig_number_of_names(regex_t* reg) return 0; } -#endif /* else USE_ST_HASH_TABLE */ +#endif /* else USE_ST_LIBRARY */ static int name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) @@ -727,16 +719,18 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) e = name_find(reg, name, name_end); if (IS_NULL(e)) { -#ifdef USE_ST_HASH_TABLE +#ifdef USE_ST_LIBRARY if (IS_NULL(t)) { t = onig_st_init_strend_table_with_size(5); reg->name_table = (void* )t; } e = (NameEntry* )xmalloc(sizeof(NameEntry)); - CHECK_NULL_RETURN_VAL(e, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(e); e->name = strdup_with_null(reg->enc, name, name_end); - if (IS_NULL(e->name)) return ONIGERR_MEMORY; + if (IS_NULL(e->name)) { + xfree(e); return ONIGERR_MEMORY; + } onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), (HashDataType )e); @@ -750,7 +744,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) if (IS_NULL(t)) { alloc = INIT_NAMES_ALLOC_NUM; t = (NameTable* )xmalloc(sizeof(NameTable)); - CHECK_NULL_RETURN_VAL(t, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(t); t->e = NULL; t->alloc = 0; t->num = 0; @@ -769,7 +763,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) alloc = t->alloc * 2; t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); - CHECK_NULL_RETURN_VAL(t->e, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(t->e); t->alloc = alloc; clear: @@ -784,6 +778,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) e = &(t->e[t->num]); t->num++; e->name = strdup_with_null(reg->enc, name, name_end); + if (IS_NULL(e->name)) return ONIGERR_MEMORY; e->name_len = name_end - name; #endif } @@ -803,7 +798,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) if (e->back_num == 2) { alloc = INIT_NAME_BACKREFS_ALLOC_NUM; e->back_refs = (int* )xmalloc(sizeof(int) * alloc); - CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(e->back_refs); e->back_alloc = alloc; e->back_refs[0] = e->back_ref1; e->back_refs[1] = backref; @@ -812,7 +807,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) if (e->back_num > e->back_alloc) { alloc = e->back_alloc * 2; e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); - CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(e->back_refs); e->back_alloc = alloc; } e->back_refs[e->back_num - 1] = backref; @@ -826,9 +821,8 @@ extern int onig_name_to_group_numbers(regex_t* reg, const UChar* name, const UChar* name_end, int** nums) { - NameEntry* e; + NameEntry* e = name_find(reg, name, name_end); - e = name_find(reg, name, name_end); if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE; switch (e->back_num) { @@ -886,8 +880,7 @@ onig_name_to_backref_number(regex_t* reg, const UChar* name, extern int onig_foreach_name(regex_t* reg, - int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) { return ONIG_NO_SUPPORT_CONFIG; } @@ -928,12 +921,12 @@ scan_env_clear(ScanEnv* env) BIT_STATUS_CLEAR(env->bt_mem_start); BIT_STATUS_CLEAR(env->bt_mem_end); BIT_STATUS_CLEAR(env->backrefed_mem); - env->error = (UChar* )NULL; - env->error_end = (UChar* )NULL; - env->num_call = 0; - env->num_mem = 0; + env->error = (UChar* )NULL; + env->error_end = (UChar* )NULL; + env->num_call = 0; + env->num_mem = 0; #ifdef USE_NAMED_GROUP - env->num_named = 0; + env->num_named = 0; #endif env->mem_alloc = 0; env->mem_nodes_dynamic = (Node** )NULL; @@ -968,7 +961,7 @@ scan_env_add_mem_entry(ScanEnv* env) alloc = env->mem_alloc * 2; p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc); } - CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(p); for (i = env->num_mem + 1; i < alloc; i++) p[i] = NULL_NODE; @@ -993,7 +986,7 @@ scan_env_set_mem_node(ScanEnv* env, int num, Node* node) } -#ifdef USE_RECYCLE_NODE +#ifdef USE_PARSE_TREE_NODE_RECYCLE typedef struct _FreeNode { struct _FreeNode* next; } FreeNode; @@ -1008,20 +1001,20 @@ onig_node_free(Node* node) if (IS_NULL(node)) return ; switch (NTYPE(node)) { - case N_STRING: - if (IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) { - xfree(NSTRING(node).s); + case NT_STR: + if (NSTR(node)->capa != 0 && + IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { + xfree(NSTR(node)->s); } break; - case N_LIST: - case N_ALT: - onig_node_free(NCONS(node).left); - /* onig_node_free(NCONS(node).right); */ + case NT_LIST: + case NT_ALT: + onig_node_free(NCAR(node)); { - Node* next_node = NCONS(node).right; + Node* next_node = NCDR(node); -#ifdef USE_RECYCLE_NODE +#ifdef USE_PARSE_TREE_NODE_RECYCLE { FreeNode* n = (FreeNode* )node; @@ -1033,46 +1026,43 @@ onig_node_free(Node* node) #else xfree(node); #endif - node = next_node; goto start; } break; - case N_CCLASS: + case NT_CCLASS: { - CClassNode* cc = &(NCCLASS(node)); - - if (IS_CCLASS_SHARE(cc)) - return ; + CClassNode* cc = NCCLASS(node); + if (IS_NCCLASS_SHARE(cc)) return ; if (cc->mbuf) bbuf_free(cc->mbuf); } break; - case N_QUANTIFIER: - if (NQUANTIFIER(node).target) - onig_node_free(NQUANTIFIER(node).target); + case NT_QTFR: + if (NQTFR(node)->target) + onig_node_free(NQTFR(node)->target); break; - case N_EFFECT: - if (NEFFECT(node).target) - onig_node_free(NEFFECT(node).target); + case NT_ENCLOSE: + if (NENCLOSE(node)->target) + onig_node_free(NENCLOSE(node)->target); break; - case N_BACKREF: - if (IS_NOT_NULL(NBACKREF(node).back_dynamic)) - xfree(NBACKREF(node).back_dynamic); + case NT_BREF: + if (IS_NOT_NULL(NBREF(node)->back_dynamic)) + xfree(NBREF(node)->back_dynamic); break; - case N_ANCHOR: - if (NANCHOR(node).target) - onig_node_free(NANCHOR(node).target); + case NT_ANCHOR: + if (NANCHOR(node)->target) + onig_node_free(NANCHOR(node)->target); break; } -#ifdef USE_RECYCLE_NODE +#ifdef USE_PARSE_TREE_NODE_RECYCLE { FreeNode* n = (FreeNode* )node; @@ -1086,7 +1076,7 @@ onig_node_free(Node* node) #endif } -#ifdef USE_RECYCLE_NODE +#ifdef USE_PARSE_TREE_NODE_RECYCLE extern int onig_free_node_list(void) { @@ -1108,7 +1098,7 @@ node_new(void) { Node* node; -#ifdef USE_RECYCLE_NODE +#ifdef USE_PARSE_TREE_NODE_RECYCLE THREAD_ATOMIC_START; if (IS_NOT_NULL(FreeNodeList)) { node = (Node* )FreeNodeList; @@ -1120,6 +1110,7 @@ node_new(void) #endif node = (Node* )xmalloc(sizeof(Node)); + /* xmemset(node, 0, sizeof(Node)); */ return node; } @@ -1128,6 +1119,7 @@ static void initialize_cclass(CClassNode* cc) { BITSET_CLEAR(cc->bs); + /* cc->base.flags = 0; */ cc->flags = 0; cc->mbuf = NULL; } @@ -1137,53 +1129,55 @@ node_new_cclass(void) { Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_CCLASS; - initialize_cclass(&(NCCLASS(node))); + SET_NTYPE(node, NT_CCLASS); + initialize_cclass(NCCLASS(node)); return node; } static Node* -node_new_cclass_by_codepoint_range(int not, - const OnigCodePoint sbr[], const OnigCodePoint mbr[]) +node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out, + const OnigCodePoint ranges[]) { + int n, i; CClassNode* cc; - int n, i, j; + OnigCodePoint j; - Node* node = node_new(); + Node* node = node_new_cclass(); CHECK_NULL_RETURN(node); - node->type = N_CCLASS; - cc = &(NCCLASS(node)); - cc->flags = 0; - if (not != 0) CCLASS_SET_NOT(cc); + cc = NCCLASS(node); + if (not != 0) NCCLASS_SET_NOT(cc); BITSET_CLEAR(cc->bs); - if (IS_NOT_NULL(sbr)) { - n = ONIGENC_CODE_RANGE_NUM(sbr); + if (sb_out > 0 && IS_NOT_NULL(ranges)) { + n = ONIGENC_CODE_RANGE_NUM(ranges); for (i = 0; i < n; i++) { - for (j = ONIGENC_CODE_RANGE_FROM(sbr, i); - j <= (int )ONIGENC_CODE_RANGE_TO(sbr, i); j++) { + for (j = ONIGENC_CODE_RANGE_FROM(ranges, i); + j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) { + if (j >= sb_out) goto sb_end; + BITSET_SET_BIT(cc->bs, j); } } } - if (IS_NULL(mbr)) { + sb_end: + if (IS_NULL(ranges)) { is_null: cc->mbuf = NULL; } else { BBuf* bbuf; - n = ONIGENC_CODE_RANGE_NUM(mbr); + n = ONIGENC_CODE_RANGE_NUM(ranges); if (n == 0) goto is_null; bbuf = (BBuf* )xmalloc(sizeof(BBuf)); - CHECK_NULL_RETURN_VAL(bbuf, NULL); + CHECK_NULL_RETURN(bbuf); bbuf->alloc = n + 1; bbuf->used = n + 1; - bbuf->p = (UChar* )((void* )mbr); + bbuf->p = (UChar* )((void* )ranges); cc->mbuf = bbuf; } @@ -1192,12 +1186,14 @@ node_new_cclass_by_codepoint_range(int not, } static Node* -node_new_ctype(int type) +node_new_ctype(int type, int not) { Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_CTYPE; - NCTYPE(node).type = type; + + SET_NTYPE(node, NT_CTYPE); + NCTYPE(node)->ctype = type; + NCTYPE(node)->not = not; return node; } @@ -1206,7 +1202,8 @@ node_new_anychar(void) { Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_ANYCHAR; + + SET_NTYPE(node, NT_CANY); return node; } @@ -1215,9 +1212,10 @@ node_new_list(Node* left, Node* right) { Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_LIST; - NCONS(node).left = left; - NCONS(node).right = right; + + SET_NTYPE(node, NT_LIST); + NCAR(node) = left; + NCDR(node) = right; return node; } @@ -1227,14 +1225,33 @@ onig_node_new_list(Node* left, Node* right) return node_new_list(left, right); } -static Node* -node_new_alt(Node* left, Node* right) +extern Node* +onig_node_list_add(Node* list, Node* x) +{ + Node *n; + + n = onig_node_new_list(x, NULL); + if (IS_NULL(n)) return NULL_NODE; + + if (IS_NOT_NULL(list)) { + while (IS_NOT_NULL(NCDR(list))) + list = NCDR(list); + + NCDR(list) = n; + } + + return n; +} + +extern Node* +onig_node_new_alt(Node* left, Node* right) { Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_ALT; - NCONS(node).left = left; - NCONS(node).right = right; + + SET_NTYPE(node, NT_ALT); + NCAR(node) = left; + NCDR(node) = right; return node; } @@ -1243,16 +1260,17 @@ onig_node_new_anchor(int type) { Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_ANCHOR; - NANCHOR(node).type = type; - NANCHOR(node).target = NULL; - NANCHOR(node).char_len = -1; + + SET_NTYPE(node, NT_ANCHOR); + NANCHOR(node)->type = type; + NANCHOR(node)->target = NULL; + NANCHOR(node)->char_len = -1; return node; } static Node* node_new_backref(int back_num, int* backrefs, int by_name, -#ifdef USE_BACKREF_AT_LEVEL +#ifdef USE_BACKREF_WITH_LEVEL int exist_level, int nest_level, #endif ScanEnv* env) @@ -1261,31 +1279,32 @@ node_new_backref(int back_num, int* backrefs, int by_name, Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_BACKREF; - NBACKREF(node).state = 0; - NBACKREF(node).back_num = back_num; - NBACKREF(node).back_dynamic = (int* )NULL; + + SET_NTYPE(node, NT_BREF); + NBREF(node)->state = 0; + NBREF(node)->back_num = back_num; + NBREF(node)->back_dynamic = (int* )NULL; if (by_name != 0) - NBACKREF(node).state |= NST_NAME_REF; + NBREF(node)->state |= NST_NAME_REF; -#ifdef USE_BACKREF_AT_LEVEL +#ifdef USE_BACKREF_WITH_LEVEL if (exist_level != 0) { - NBACKREF(node).state |= NST_NEST_LEVEL; - NBACKREF(node).nest_level = nest_level; + NBREF(node)->state |= NST_NEST_LEVEL; + NBREF(node)->nest_level = nest_level; } #endif for (i = 0; i < back_num; i++) { if (backrefs[i] <= env->num_mem && IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) { - NBACKREF(node).state |= NST_RECURSION; /* /...(\1).../ */ + NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */ break; } } if (back_num <= NODE_BACKREFS_SIZE) { for (i = 0; i < back_num; i++) - NBACKREF(node).back_static[i] = backrefs[i]; + NBREF(node)->back_static[i] = backrefs[i]; } else { int* p = (int* )xmalloc(sizeof(int) * back_num); @@ -1293,7 +1312,7 @@ node_new_backref(int back_num, int* backrefs, int by_name, onig_node_free(node); return NULL; } - NBACKREF(node).back_dynamic = p; + NBREF(node)->back_dynamic = p; for (i = 0; i < back_num; i++) p[i] = backrefs[i]; } @@ -1302,17 +1321,17 @@ node_new_backref(int back_num, int* backrefs, int by_name, #ifdef USE_SUBEXP_CALL static Node* -node_new_call(UChar* name, UChar* name_end) +node_new_call(UChar* name, UChar* name_end, int gnum) { Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_CALL; - NCALL(node).state = 0; - NCALL(node).ref_num = CALLNODE_REFNUM_UNDEF; - NCALL(node).target = NULL_NODE; - NCALL(node).name = name; - NCALL(node).name_end = name_end; + SET_NTYPE(node, NT_CALL); + NCALL(node)->state = 0; + NCALL(node)->target = NULL_NODE; + NCALL(node)->name = name; + NCALL(node)->name_end = name_end; + NCALL(node)->group_num = gnum; /* call by number if gnum != 0 */ return node; } #endif @@ -1322,58 +1341,60 @@ node_new_quantifier(int lower, int upper, int by_number) { Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_QUANTIFIER; - NQUANTIFIER(node).state = 0; - NQUANTIFIER(node).target = NULL; - NQUANTIFIER(node).lower = lower; - NQUANTIFIER(node).upper = upper; - NQUANTIFIER(node).greedy = 1; - NQUANTIFIER(node).target_empty_info = NQ_TARGET_ISNOT_EMPTY; - NQUANTIFIER(node).head_exact = NULL_NODE; - NQUANTIFIER(node).next_head_exact = NULL_NODE; - NQUANTIFIER(node).is_refered = 0; + + SET_NTYPE(node, NT_QTFR); + NQTFR(node)->state = 0; + NQTFR(node)->target = NULL; + NQTFR(node)->lower = lower; + NQTFR(node)->upper = upper; + NQTFR(node)->greedy = 1; + NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY; + NQTFR(node)->head_exact = NULL_NODE; + NQTFR(node)->next_head_exact = NULL_NODE; + NQTFR(node)->is_refered = 0; if (by_number != 0) - NQUANTIFIER(node).state |= NST_BY_NUMBER; + NQTFR(node)->state |= NST_BY_NUMBER; #ifdef USE_COMBINATION_EXPLOSION_CHECK - NQUANTIFIER(node).comb_exp_check_num = 0; + NQTFR(node)->comb_exp_check_num = 0; #endif return node; } static Node* -node_new_effect(int type) +node_new_enclose(int type) { Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_EFFECT; - NEFFECT(node).type = type; - NEFFECT(node).state = 0; - NEFFECT(node).regnum = 0; - NEFFECT(node).option = 0; - NEFFECT(node).target = NULL; - NEFFECT(node).call_addr = -1; - NEFFECT(node).opt_count = 0; + + SET_NTYPE(node, NT_ENCLOSE); + NENCLOSE(node)->type = type; + NENCLOSE(node)->state = 0; + NENCLOSE(node)->regnum = 0; + NENCLOSE(node)->option = 0; + NENCLOSE(node)->target = NULL; + NENCLOSE(node)->call_addr = -1; + NENCLOSE(node)->opt_count = 0; return node; } extern Node* -onig_node_new_effect(int type) +onig_node_new_enclose(int type) { - return node_new_effect(type); + return node_new_enclose(type); } static Node* -node_new_effect_memory(OnigOptionType option, int is_named) +node_new_enclose_memory(OnigOptionType option, int is_named) { - Node* node = node_new_effect(EFFECT_MEMORY); + Node* node = node_new_enclose(ENCLOSE_MEMORY); CHECK_NULL_RETURN(node); if (is_named != 0) - SET_EFFECT_STATUS(node, NST_NAMED_GROUP); + SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP); #ifdef USE_SUBEXP_CALL - NEFFECT(node).option = option; + NENCLOSE(node)->option = option; #endif return node; } @@ -1381,9 +1402,9 @@ node_new_effect_memory(OnigOptionType option, int is_named) static Node* node_new_option(OnigOptionType option) { - Node* node = node_new_effect(EFFECT_OPTION); + Node* node = node_new_enclose(ENCLOSE_OPTION); CHECK_NULL_RETURN(node); - NEFFECT(node).option = option; + NENCLOSE(node)->option = option; return node; } @@ -1393,36 +1414,43 @@ onig_node_str_cat(Node* node, const UChar* s, const UChar* end) int addlen = end - s; if (addlen > 0) { - int len = NSTRING(node).end - NSTRING(node).s; + int len = NSTR(node)->end - NSTR(node)->s; - if (NSTRING(node).capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) { + if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) { UChar* p; int capa = len + addlen + NODE_STR_MARGIN; - if (capa <= NSTRING(node).capa) { - k_strcpy(NSTRING(node).s + len, s, end); + if (capa <= NSTR(node)->capa) { + onig_strcpy(NSTR(node)->s + len, s, end); } else { - if (NSTRING(node).s == NSTRING(node).buf) - p = strcat_capa_from_static(NSTRING(node).s, NSTRING(node).end, + if (NSTR(node)->s == NSTR(node)->buf) + p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end, s, end, capa); else - p = k_strcat_capa(NSTRING(node).s, NSTRING(node).end, s, end, capa); + p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa); - CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); - NSTRING(node).s = p; - NSTRING(node).capa = capa; + CHECK_NULL_RETURN_MEMERR(p); + NSTR(node)->s = p; + NSTR(node)->capa = capa; } } else { - k_strcpy(NSTRING(node).s + len, s, end); + onig_strcpy(NSTR(node)->s + len, s, end); } - NSTRING(node).end = NSTRING(node).s + len + addlen; + NSTR(node)->end = NSTR(node)->s + len + addlen; } return 0; } +extern int +onig_node_str_set(Node* node, const UChar* s, const UChar* end) +{ + onig_node_str_clear(node); + return onig_node_str_cat(node, s, end); +} + static int node_str_cat_char(Node* node, UChar c) { @@ -1435,26 +1463,25 @@ node_str_cat_char(Node* node, UChar c) extern void onig_node_conv_to_str_node(Node* node, int flag) { - node->type = N_STRING; - - NSTRING(node).flag = flag; - NSTRING(node).capa = 0; - NSTRING(node).s = NSTRING(node).buf; - NSTRING(node).end = NSTRING(node).buf; + SET_NTYPE(node, NT_STR); + NSTR(node)->flag = flag; + NSTR(node)->capa = 0; + NSTR(node)->s = NSTR(node)->buf; + NSTR(node)->end = NSTR(node)->buf; } extern void onig_node_str_clear(Node* node) { - if (NSTRING(node).capa != 0 && - IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) { - xfree(NSTRING(node).s); + if (NSTR(node)->capa != 0 && + IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { + xfree(NSTR(node)->s); } - NSTRING(node).capa = 0; - NSTRING(node).flag = 0; - NSTRING(node).s = NSTRING(node).buf; - NSTRING(node).end = NSTRING(node).buf; + NSTR(node)->capa = 0; + NSTR(node)->flag = 0; + NSTR(node)->s = NSTR(node)->buf; + NSTR(node)->end = NSTR(node)->buf; } static Node* @@ -1463,11 +1490,11 @@ node_new_str(const UChar* s, const UChar* end) Node* node = node_new(); CHECK_NULL_RETURN(node); - node->type = N_STRING; - NSTRING(node).capa = 0; - NSTRING(node).flag = 0; - NSTRING(node).s = NSTRING(node).buf; - NSTRING(node).end = NSTRING(node).buf; + SET_NTYPE(node, NT_STR); + NSTR(node)->capa = 0; + NSTR(node)->flag = 0; + NSTR(node)->s = NSTR(node)->buf; + NSTR(node)->end = NSTR(node)->buf; if (onig_node_str_cat(node, s, end)) { onig_node_free(node); return NULL; @@ -1481,7 +1508,6 @@ onig_node_new_str(const UChar* s, const UChar* end) return node_new_str(s, end); } -#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG static Node* node_new_str_raw(UChar* s, UChar* end) { @@ -1489,7 +1515,6 @@ node_new_str_raw(UChar* s, UChar* end) NSTRING_SET_RAW(node); return node; } -#endif static Node* node_new_empty(void) @@ -1498,12 +1523,12 @@ node_new_empty(void) } static Node* -node_new_str_char(UChar c) +node_new_str_raw_char(UChar c) { UChar p[1]; p[0] = c; - return node_new_str(p, p + 1); + return node_new_str_raw(p, p + 1); } static Node* @@ -1528,7 +1553,7 @@ static int str_node_can_be_split(StrNode* sn, OnigEncoding enc) { if (sn->end > sn->s) { - return ((enc_len(enc, sn->s) < sn->end - sn->s) ? 1 : 0); + return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0); } return 0; } @@ -1651,7 +1676,7 @@ new_code_range(BBuf** pbuf) BBuf* bbuf; bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf)); - CHECK_NULL_RETURN_VAL(*pbuf, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(*pbuf); r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE); if (r) return r; @@ -1952,10 +1977,10 @@ and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) BitSetRef bsr1, bsr2; BitSet bs1, bs2; - not1 = IS_CCLASS_NOT(dest); + not1 = IS_NCCLASS_NOT(dest); bsr1 = dest->bs; buf1 = dest->mbuf; - not2 = IS_CCLASS_NOT(cc); + not2 = IS_NCCLASS_NOT(cc); bsr2 = cc->bs; buf2 = cc->mbuf; @@ -2010,10 +2035,10 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) BitSetRef bsr1, bsr2; BitSet bs1, bs2; - not1 = IS_CCLASS_NOT(dest); + not1 = IS_NCCLASS_NOT(dest); bsr1 = dest->bs; buf1 = dest->mbuf; - not2 = IS_CCLASS_NOT(cc); + not2 = IS_NCCLASS_NOT(cc); bsr2 = cc->bs; buf2 = cc->mbuf; @@ -2066,13 +2091,13 @@ conv_backslash_value(int c, ScanEnv* env) { if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { switch (c) { - case 'n': return '\n'; - case 't': return '\t'; - case 'r': return '\r'; - case 'f': return '\f'; - case 'a': return '\007'; - case 'b': return '\010'; - case 'e': return '\033'; + case 'n': return '\n'; + case 't': return '\t'; + case 'r': return '\r'; + case 'f': return '\f'; + case 'a': return '\007'; + case 'b': return '\010'; + case 'e': return '\033'; case 'v': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB)) return '\v'; @@ -2089,26 +2114,26 @@ static int is_invalid_quantifier_target(Node* node) { switch (NTYPE(node)) { - case N_ANCHOR: + case NT_ANCHOR: return 1; break; - case N_EFFECT: - if (NEFFECT(node).type == EFFECT_OPTION) - return is_invalid_quantifier_target(NEFFECT(node).target); + case NT_ENCLOSE: + /* allow enclosed elements */ + /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */ break; - case N_LIST: /* ex. (?:\G\A)* */ + case NT_LIST: do { - if (! is_invalid_quantifier_target(NCONS(node).left)) return 0; - } while (IS_NOT_NULL(node = NCONS(node).right)); + if (! is_invalid_quantifier_target(NCAR(node))) return 0; + } while (IS_NOT_NULL(node = NCDR(node))); return 0; break; - case N_ALT: /* ex. (?:abc|\A)* */ + case NT_ALT: do { - if (is_invalid_quantifier_target(NCONS(node).left)) return 1; - } while (IS_NOT_NULL(node = NCONS(node).right)); + if (is_invalid_quantifier_target(NCAR(node))) return 1; + } while (IS_NOT_NULL(node = NCDR(node))); break; default: @@ -2119,24 +2144,24 @@ is_invalid_quantifier_target(Node* node) /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ static int -popular_quantifier_num(QuantifierNode* qf) +popular_quantifier_num(QtfrNode* q) { - if (qf->greedy) { - if (qf->lower == 0) { - if (qf->upper == 1) return 0; - else if (IS_REPEAT_INFINITE(qf->upper)) return 1; + if (q->greedy) { + if (q->lower == 0) { + if (q->upper == 1) return 0; + else if (IS_REPEAT_INFINITE(q->upper)) return 1; } - else if (qf->lower == 1) { - if (IS_REPEAT_INFINITE(qf->upper)) return 2; + else if (q->lower == 1) { + if (IS_REPEAT_INFINITE(q->upper)) return 2; } } else { - if (qf->lower == 0) { - if (qf->upper == 1) return 3; - else if (IS_REPEAT_INFINITE(qf->upper)) return 4; + if (q->lower == 0) { + if (q->upper == 1) return 3; + else if (IS_REPEAT_INFINITE(q->upper)) return 4; } - else if (qf->lower == 1) { - if (IS_REPEAT_INFINITE(qf->upper)) return 5; + else if (q->lower == 1) { + if (IS_REPEAT_INFINITE(q->upper)) return 5; } } return -1; @@ -2166,16 +2191,17 @@ extern void onig_reduce_nested_quantifier(Node* pnode, Node* cnode) { int pnum, cnum; - QuantifierNode *p, *c; + QtfrNode *p, *c; - p = &(NQUANTIFIER(pnode)); - c = &(NQUANTIFIER(cnode)); + p = NQTFR(pnode); + c = NQTFR(cnode); pnum = popular_quantifier_num(p); cnum = popular_quantifier_num(c); + if (pnum < 0 || cnum < 0) return ; switch(ReduceTypeTable[cnum][pnum]) { case RQ_DEL: - *p = *c; + *pnode = *cnode; break; case RQ_A: p->target = c->target; @@ -2262,7 +2288,7 @@ typedef struct { int ref1; int* refs; int by_name; -#ifdef USE_BACKREF_AT_LEVEL +#ifdef USE_BACKREF_WITH_LEVEL int exist_level; int level; /* \k<name+n> */ #endif @@ -2270,8 +2296,10 @@ typedef struct { struct { UChar* name; UChar* name_end; + int gnum; } call; struct { + int ctype; int not; } prop; } u; @@ -2346,7 +2374,7 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) if (PEND) goto invalid; PFETCH(c); if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { - if (c != MC_ESC(enc)) goto invalid; + if (c != MC_ESC(env->syntax)) goto invalid; PFETCH(c); } if (c != '}') goto invalid; @@ -2389,7 +2417,7 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) if (c != '-') return ONIGERR_META_CODE_SYNTAX; if (PEND) return ONIGERR_END_PATTERN_AT_META; PFETCH(c); - if (c == MC_ESC(enc)) { + if (c == MC_ESC(env->syntax)) { v = fetch_escaped_value(&p, end, env); if (v < 0) return v; c = (OnigCodePoint )v; @@ -2419,7 +2447,7 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) c = 0177; } else { - if (c == MC_ESC(enc)) { + if (c == MC_ESC(env->syntax)) { v = fetch_escaped_value(&p, end, env); if (v < 0) return v; c = (OnigCodePoint )v; @@ -2444,23 +2472,47 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); +static OnigCodePoint +get_name_end_code_point(OnigCodePoint start) +{ + switch (start) { + case '<': return (OnigCodePoint )'>'; break; + case '\'': return (OnigCodePoint )'\''; break; + default: + break; + } + + return (OnigCodePoint )0; +} + #ifdef USE_NAMED_GROUP -#ifdef USE_BACKREF_AT_LEVEL +#ifdef USE_BACKREF_WITH_LEVEL /* \k<name+n>, \k<name-n> + \k<num+n>, \k<num-n> + \k<-num+n>, \k<-num-n> */ static int -fetch_name_with_level(UChar** src, UChar* end, UChar** rname_end - , ScanEnv* env, int* level) +fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, + UChar** rname_end, ScanEnv* env, + int* rback_num, int* rlevel) { - int r, exist_level = 0; + int r, sign, is_num, exist_level; + OnigCodePoint end_code; OnigCodePoint c = 0; - OnigCodePoint first_code; OnigEncoding enc = env->enc; UChar *name_end; + UChar *pnum_head; UChar *p = *src; PFETCH_READY; + *rback_num = 0; + is_num = exist_level = 0; + sign = 1; + pnum_head = *src; + + end_code = get_name_end_code_point(start_code); + name_end = end; r = 0; if (PEND) { @@ -2468,11 +2520,18 @@ fetch_name_with_level(UChar** src, UChar* end, UChar** rname_end } else { PFETCH(c); - first_code = c; - if (c == '>') + if (c == end_code) return ONIGERR_EMPTY_GROUP_NAME; - if (!ONIGENC_IS_CODE_WORD(enc, c)) { + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { + is_num = 1; + } + else if (c == '-') { + is_num = 2; + sign = -1; + pnum_head = p; + } + else if (!ONIGENC_IS_CODE_WORD(enc, c)) { r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } } @@ -2480,43 +2539,58 @@ fetch_name_with_level(UChar** src, UChar* end, UChar** rname_end while (!PEND) { name_end = p; PFETCH(c); - if (c == '>' || c == ')' || c == '+' || c == '-') break; + if (c == end_code || c == ')' || c == '+' || c == '-') { + if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME; + break; + } - if (!ONIGENC_IS_CODE_WORD(enc, c)) { + if (is_num != 0) { + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { + is_num = 1; + } + else { + r = ONIGERR_INVALID_GROUP_NAME; + is_num = 0; + } + } + else if (!ONIGENC_IS_CODE_WORD(enc, c)) { r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } } - if (c != '>') { + if (r == 0 && c != end_code) { if (c == '+' || c == '-') { - int num; + int level; int flag = (c == '-' ? -1 : 1); PFETCH(c); if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err; PUNFETCH; - num = onig_scan_unsigned_number(&p, end, enc); - if (num < 0) return ONIGERR_TOO_BIG_NUMBER; - *level = (num * flag); + level = onig_scan_unsigned_number(&p, end, enc); + if (level < 0) return ONIGERR_TOO_BIG_NUMBER; + *rlevel = (level * flag); exist_level = 1; PFETCH(c); - if (c == '>') - goto first_check; + if (c == end_code) + goto end; } err: r = ONIGERR_INVALID_GROUP_NAME; name_end = end; } - else { - first_check: - if (ONIGENC_IS_CODE_ASCII(first_code) && - ONIGENC_IS_CODE_UPPER(enc, first_code)) - r = ONIGERR_INVALID_GROUP_NAME; - } + end: if (r == 0) { + if (is_num != 0) { + *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; + else if (*rback_num == 0) goto err; + + *rback_num *= sign; + } + *rname_end = name_end; *src = p; return (exist_level ? 1 : 0); @@ -2526,33 +2600,40 @@ fetch_name_with_level(UChar** src, UChar* end, UChar** rname_end return r; } } -#endif /* USE_BACKREF_AT_LEVEL */ +#endif /* USE_BACKREF_WITH_LEVEL */ /* def: 0 -> define name (don't allow number name) 1 -> reference name (allow number name) */ static int -fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) +fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, + UChar** rname_end, ScanEnv* env, int* rback_num, int ref) { - int r, is_num; + int r, is_num, sign; + OnigCodePoint end_code; OnigCodePoint c = 0; - OnigCodePoint first_code; OnigEncoding enc = env->enc; UChar *name_end; + UChar *pnum_head; UChar *p = *src; PFETCH_READY; + *rback_num = 0; + + end_code = get_name_end_code_point(start_code); + name_end = end; + pnum_head = *src; r = 0; is_num = 0; + sign = 1; if (PEND) { return ONIGERR_EMPTY_GROUP_NAME; } else { PFETCH(c); - first_code = c; - if (c == '>') + if (c == end_code) return ONIGERR_EMPTY_GROUP_NAME; if (ONIGENC_IS_CODE_DIGIT(enc, c)) { @@ -2560,6 +2641,18 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) is_num = 1; else { r = ONIGERR_INVALID_GROUP_NAME; + is_num = 0; + } + } + else if (c == '-') { + if (ref == 1) { + is_num = 2; + sign = -1; + pnum_head = p; + } + else { + r = ONIGERR_INVALID_GROUP_NAME; + is_num = 0; } } else if (!ONIGENC_IS_CODE_WORD(enc, c)) { @@ -2567,74 +2660,137 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) } } - while (!PEND) { - name_end = p; - PFETCH(c); - if (c == '>' || c == ')') break; + if (r == 0) { + while (!PEND) { + name_end = p; + PFETCH(c); + if (c == end_code || c == ')') { + if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME; + break; + } - if (is_num == 1) { - if (! ONIGENC_IS_CODE_DIGIT(enc, c)) { - if (!ONIGENC_IS_CODE_WORD(enc, c)) - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; - else - r = ONIGERR_INVALID_GROUP_NAME; + if (is_num != 0) { + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { + is_num = 1; + } + else { + if (!ONIGENC_IS_CODE_WORD(enc, c)) + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + else + r = ONIGERR_INVALID_GROUP_NAME; + + is_num = 0; + } } - } - else { - if (!ONIGENC_IS_CODE_WORD(enc, c)) { - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + else { + if (!ONIGENC_IS_CODE_WORD(enc, c)) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + } } } - } - if (c != '>') { - r = ONIGERR_INVALID_GROUP_NAME; - name_end = end; - } - else { - if (ONIGENC_IS_CODE_ASCII(first_code) && - ONIGENC_IS_CODE_UPPER(enc, first_code)) + if (c != end_code) { r = ONIGERR_INVALID_GROUP_NAME; - } + name_end = end; + } + + if (is_num != 0) { + *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; + else if (*rback_num == 0) { + r = ONIGERR_INVALID_GROUP_NAME; + goto err; + } + + *rback_num *= sign; + } - if (r == 0) { *rname_end = name_end; *src = p; return 0; } else { + while (!PEND) { + name_end = p; + PFETCH(c); + if (c == end_code || c == ')') + break; + } + if (PEND) + name_end = end; + + err: onig_scan_env_set_error_string(env, r, *src, name_end); return r; } } #else static int -fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) +fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, + UChar** rname_end, ScanEnv* env, int* rback_num, int ref) { - int r, len; + int r, is_num, sign; + OnigCodePoint end_code; OnigCodePoint c = 0; UChar *name_end; OnigEncoding enc = env->enc; + UChar *pnum_head; UChar *p = *src; PFETCH_READY; + *rback_num = 0; + + end_code = get_name_end_code_point(start_code); + + *rname_end = name_end = end; r = 0; + pnum_head = *src; + is_num = 0; + sign = 1; + + if (PEND) { + return ONIGERR_EMPTY_GROUP_NAME; + } + else { + PFETCH(c); + if (c == end_code) + return ONIGERR_EMPTY_GROUP_NAME; + + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { + is_num = 1; + } + else if (c == '-') { + is_num = 2; + sign = -1; + pnum_head = p; + } + else { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + } + } + while (!PEND) { name_end = p; - if (enc_len(enc, p) > 1) - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; PFETCH(c); - if (c == '>' || c == ')') break; + if (c == end_code || c == ')') break; if (! ONIGENC_IS_CODE_DIGIT(enc, c)) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } - if (c != '>') { + if (r == 0 && c != end_code) { r = ONIGERR_INVALID_GROUP_NAME; name_end = end; } if (r == 0) { + *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; + else if (*rback_num == 0) { + r = ONIGERR_INVALID_GROUP_NAME; + goto err; + } + *rback_num *= sign; + *rname_end = name_end; *src = p; return 0; @@ -2645,7 +2801,7 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) return r; } } -#endif +#endif /* USE_NAMED_GROUP */ static void CC_ESC_WARN(ScanEnv* env, UChar *c) @@ -2663,7 +2819,7 @@ CC_ESC_WARN(ScanEnv* env, UChar *c) } static void -CCEND_ESC_WARN(ScanEnv* env, UChar* c) +CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) { if (onig_warn == onig_null_warn) return ; @@ -2687,12 +2843,12 @@ find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, p); + q = p + enclen(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, q); + q += enclen(enc, q); } if (i >= n) { if (IS_NOT_NULL(next)) @@ -2707,7 +2863,7 @@ find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, static int str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, - OnigCodePoint bad, OnigEncoding enc) + OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn) { int i, in_esc; OnigCodePoint x; @@ -2718,24 +2874,24 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { if (in_esc) { in_esc = 0; - p += enc_len(enc, p); + p += enclen(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, p); + q = p + enclen(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, q); + q += enclen(enc, q); } if (i >= n) return 1; - p += enc_len(enc, p); + p += enclen(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); if (x == bad) return 0; - else if (x == MC_ESC(enc)) in_esc = 1; + else if (x == MC_ESC(syn)) in_esc = 1; p = q; } } @@ -2771,7 +2927,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) else if (c == '-') { tok->type = TK_CC_RANGE; } - else if (c == MC_ESC(enc)) { + else if (c == MC_ESC(syn)) { if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) goto end; @@ -2783,37 +2939,45 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) switch (c) { case 'w': tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.not = 0; break; case 'W': tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_NOT_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.not = 1; break; case 'd': tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.not = 0; break; case 'D': tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_NOT_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.not = 1; break; case 's': tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_WHITE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.not = 0; break; case 'S': tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_NOT_WHITE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.not = 1; break; case 'h': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_XDIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; + tok->u.prop.not = 0; break; case 'H': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_NOT_XDIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; + tok->u.prop.not = 1; break; case 'p': @@ -2850,7 +3014,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; } - if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) { + if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) { PINC; tok->type = TK_CODE_POINT; tok->base = 16; @@ -2922,7 +3086,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->backp = p; /* point at '[' is readed */ PINC; if (str_exist_check_with_esc(send, 2, p, end, - (OnigCodePoint )']', enc)) { + (OnigCodePoint )']', enc, syn)) { tok->type = TK_POSIX_BRACKET_OPEN; } else { @@ -2975,7 +3139,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->backp = p; PFETCH(c); - if (IS_MC_ESC_CODE(c, enc, syn)) { + if (IS_MC_ESC_CODE(c, syn)) { if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; tok->backp = p; @@ -3062,13 +3226,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'w': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.not = 0; break; case 'W': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_NOT_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.not = 1; break; case 'b': @@ -3100,37 +3266,43 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 's': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_WHITE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.not = 0; break; case 'S': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_NOT_WHITE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.not = 1; break; case 'd': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.not = 0; break; case 'D': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_NOT_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.not = 1; break; case 'h': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_XDIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; + tok->u.prop.not = 0; break; case 'H': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.subtype = CTYPE_NOT_XDIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; + tok->u.prop.not = 1; break; case 'A': @@ -3182,7 +3354,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; } - if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) { + if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; tok->u.code = (OnigCodePoint )num; @@ -3240,7 +3412,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.num = 1; tok->u.backref.ref1 = num; tok->u.backref.by_name = 0; -#ifdef USE_BACKREF_AT_LEVEL +#ifdef USE_BACKREF_WITH_LEVEL tok->u.backref.exist_level = 0; #endif break; @@ -3276,46 +3448,67 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'k': if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { PFETCH(c); - if (c == '<') { + if (c == '<' || c == '\'') { UChar* name_end; int* backs; + int back_num; prev = p; -#ifdef USE_BACKREF_AT_LEVEL +#ifdef USE_BACKREF_WITH_LEVEL name_end = NULL_UCHARP; /* no need. escape gcc warning. */ - r = fetch_name_with_level(&p, end, &name_end, env, &tok->u.backref.level); + r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end, + env, &back_num, &tok->u.backref.level); if (r == 1) tok->u.backref.exist_level = 1; else tok->u.backref.exist_level = 0; #else - r = fetch_name(&p, end, &name_end, env, 1); + r = fetch_name(&p, end, &name_end, env, &back_num, 1); #endif if (r < 0) return r; - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); - if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); - return ONIGERR_UNDEFINED_NAME_REFERENCE; - } - if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { - int i; - for (i = 0; i < num; i++) { - if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) + if (back_num != 0) { + if (back_num < 0) { + back_num = BACKREF_REL_TO_ABS(back_num, env); + if (back_num <= 0) return ONIGERR_INVALID_BACKREF; } - } - tok->type = TK_BACKREF; - tok->u.backref.by_name = 1; - if (num == 1) { + if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { + if (back_num > env->num_mem || + IS_NULL(SCANENV_MEM_NODES(env)[back_num])) + return ONIGERR_INVALID_BACKREF; + } + tok->type = TK_BACKREF; + tok->u.backref.by_name = 0; tok->u.backref.num = 1; - tok->u.backref.ref1 = backs[0]; + tok->u.backref.ref1 = back_num; } else { - tok->u.backref.num = num; - tok->u.backref.refs = backs; + num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + if (num <= 0) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { + int i; + for (i = 0; i < num; i++) { + if (backs[i] > env->num_mem || + IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) + return ONIGERR_INVALID_BACKREF; + } + } + + tok->type = TK_BACKREF; + tok->u.backref.by_name = 1; + if (num == 1) { + tok->u.backref.num = 1; + tok->u.backref.ref1 = backs[0]; + } + else { + tok->u.backref.num = num; + tok->u.backref.refs = backs; + } } } else @@ -3328,16 +3521,18 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'g': if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { PFETCH(c); - if (c == '<') { + if (c == '<' || c == '\'') { + int gnum; UChar* name_end; prev = p; - r = fetch_name(&p, end, &name_end, env, 1); + r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1); if (r < 0) return r; tok->type = TK_CALL; tok->u.call.name = prev; tok->u.call.name_end = name_end; + tok->u.call.gnum = gnum; } else PUNFETCH; @@ -3380,7 +3575,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.code = (OnigCodePoint )num; } else { /* string */ - p = tok->backp + enc_len(enc, tok->backp); + p = tok->backp + enclen(enc, tok->backp); } break; } @@ -3392,15 +3587,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_VARIABLE_META_CHARS if ((c != ONIG_INEFFECTIVE_META_CHAR) && IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { - if (c == MC_ANYCHAR(enc)) + if (c == MC_ANYCHAR(syn)) goto any_char; - else if (c == MC_ANYTIME(enc)) + else if (c == MC_ANYTIME(syn)) goto anytime; - else if (c == MC_ZERO_OR_ONE_TIME(enc)) + else if (c == MC_ZERO_OR_ONE_TIME(syn)) goto zero_or_one_time; - else if (c == MC_ONE_OR_MORE_TIME(enc)) + else if (c == MC_ONE_OR_MORE_TIME(syn)) goto one_or_more_time; - else if (c == MC_ANYCHAR_ANYTIME(enc)) { + else if (c == MC_ANYCHAR_ANYTIME(syn)) { tok->type = TK_ANYCHAR_ANYTIME; goto out; } @@ -3477,7 +3672,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) while (1) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); - if (c == MC_ESC(enc)) { + if (c == MC_ESC(syn)) { if (!PEND) PFETCH(c); } else { @@ -3519,7 +3714,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case ']': if (*src > env->pattern) /* /].../ is allowed. */ - CCEND_ESC_WARN(env, (UChar* )"]"); + CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); break; case '#': @@ -3553,24 +3748,36 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } static int -add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc, - const OnigCodePoint sbr[], const OnigCodePoint mbr[]) +add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, + OnigEncoding enc ARG_UNUSED, + OnigCodePoint sb_out, const OnigCodePoint mbr[]) { int i, r; OnigCodePoint j; - int nsb = ONIGENC_CODE_RANGE_NUM(sbr); - int nmb = ONIGENC_CODE_RANGE_NUM(mbr); + int n = ONIGENC_CODE_RANGE_NUM(mbr); if (not == 0) { - for (i = 0; i < nsb; i++) { - for (j = ONIGENC_CODE_RANGE_FROM(sbr, i); - j <= ONIGENC_CODE_RANGE_TO(sbr, i); j++) { + for (i = 0; i < n; i++) { + for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); + j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { + if (j >= sb_out) { + if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++; + else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { + r = add_code_range_to_buf(&(cc->mbuf), j, + ONIGENC_CODE_RANGE_TO(mbr, i)); + if (r != 0) return r; + i++; + } + + goto sb_end; + } BITSET_SET_BIT(cc->bs, j); } } - for (i = 0; i < nmb; i++) { + sb_end: + for ( ; i < n; i++) { r = add_code_range_to_buf(&(cc->mbuf), ONIGENC_CODE_RANGE_FROM(mbr, i), ONIGENC_CODE_RANGE_TO(mbr, i)); @@ -3580,24 +3787,24 @@ add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc, else { OnigCodePoint prev = 0; - if (ONIGENC_MBC_MINLEN(enc) == 1) { - for (i = 0; i < nsb; i++) { - for (j = prev; - j < ONIGENC_CODE_RANGE_FROM(sbr, i); j++) { - BITSET_SET_BIT(cc->bs, j); - } - prev = ONIGENC_CODE_RANGE_TO(sbr, i) + 1; - } - if (prev < 0x7f) { - for (j = prev; j < 0x7f; j++) { - BITSET_SET_BIT(cc->bs, j); - } + for (i = 0; i < n; i++) { + for (j = prev; + j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) { + if (j >= sb_out) { + goto sb_end2; + } + BITSET_SET_BIT(cc->bs, j); } - - prev = 0x80; + prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; + } + for (j = prev; j < sb_out; j++) { + BITSET_SET_BIT(cc->bs, j); } - for (i = 0; i < nmb; i++) { + sb_end2: + prev = sb_out; + + for (i = 0; i < n; i++) { if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) { r = add_code_range_to_buf(&(cc->mbuf), prev, ONIGENC_CODE_RANGE_FROM(mbr, i) - 1); @@ -3618,12 +3825,13 @@ static int add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) { int c, r; - const OnigCodePoint *sbr, *mbr; + const OnigCodePoint *ranges; + OnigCodePoint sb_out; OnigEncoding enc = env->enc; - r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sbr, &mbr); + r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); if (r == 0) { - return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sbr, mbr); + return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges); } else if (r != ONIG_NO_SUPPORT_CONFIG) { return r; @@ -3677,13 +3885,13 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_WORD: if (not == 0) { for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (ONIGENC_IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c); + if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c); } ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* 0: invalid code point */ + if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */ && ! ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c); } @@ -3699,61 +3907,10 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } static int -parse_ctype_to_enc_ctype(int pctype, int* not) -{ - int ctype; - - switch (pctype) { - case CTYPE_WORD: - ctype = ONIGENC_CTYPE_WORD; - *not = 0; - break; - case CTYPE_NOT_WORD: - ctype = ONIGENC_CTYPE_WORD; - *not = 1; - break; - case CTYPE_WHITE_SPACE: - ctype = ONIGENC_CTYPE_SPACE; - *not = 0; - break; - case CTYPE_NOT_WHITE_SPACE: - ctype = ONIGENC_CTYPE_SPACE; - *not = 1; - break; - case CTYPE_DIGIT: - ctype = ONIGENC_CTYPE_DIGIT; - *not = 0; - break; - case CTYPE_NOT_DIGIT: - ctype = ONIGENC_CTYPE_DIGIT; - *not = 1; - break; - case CTYPE_XDIGIT: - ctype = ONIGENC_CTYPE_XDIGIT; - *not = 0; - break; - case CTYPE_NOT_XDIGIT: - ctype = ONIGENC_CTYPE_XDIGIT; - *not = 1; - break; - default: - return ONIGERR_PARSER_BUG; - break; - } - return ctype; -} - -typedef struct { - UChar *name; - int ctype; - short int len; -} PosixBracketEntryType; - -static int parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) { #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 -#define POSIX_BRACKET_NAME_MAX_LEN 6 +#define POSIX_BRACKET_NAME_MIN_LEN 4 static PosixBracketEntryType PBS[] = { { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 }, @@ -3769,7 +3926,8 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 }, { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 }, { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 }, - { (UChar* )NULL, -1, 0 } + { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 }, + { (UChar* )NULL, -1, 0 } }; PosixBracketEntryType *pb; @@ -3786,7 +3944,7 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) else not = 0; - if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2) + if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3) goto not_posix_bracket; for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { @@ -3820,86 +3978,39 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) } } - return 1; /* 1: is not POSIX bracket, but no error. */ -} - -static int -property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc) -{ - static PosixBracketEntryType PBS[] = { - { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 }, - { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 }, - { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 }, - { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 }, - { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 }, - { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 }, - { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 }, - { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 }, - { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 }, - { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 }, - { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 }, - { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 }, - { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 }, - { (UChar* )NULL, -1, 0 } - }; - - PosixBracketEntryType *pb; - int len; - - len = onigenc_strlen(enc, p, end); - for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { - if (len == pb->len && - onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) - return pb->ctype; - } - - return -1; + return 1; /* 1: is not POSIX bracket, but no error. */ } static int fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) { - int ctype; + int r; OnigCodePoint c; OnigEncoding enc = env->enc; UChar *prev, *start, *p = *src; PFETCH_READY; - /* 'IsXXXX' => 'XXXX' */ - if (!PEND && - IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS)) { - c = PPEEK; - if (c == 'I') { - PINC; - if (! PEND) { - c = PPEEK; - if (c == 's') - PINC; - else - PUNFETCH; - } - } - } - + r = 0; start = prev = p; while (!PEND) { prev = p; PFETCH(c); if (c == '}') { - ctype = property_name_to_ctype(start, prev, enc); - if (ctype < 0) break; + r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); + if (r < 0) break; *src = p; - return ctype; + return r; } - else if (c == '(' || c == ')' || c == '{' || c == '|') + else if (c == '(' || c == ')' || c == '{' || c == '|') { + r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; break; + } } - onig_scan_env_set_error_string(env, ONIGERR_INVALID_CHAR_PROPERTY_NAME, - *src, prev); - return ONIGERR_INVALID_CHAR_PROPERTY_NAME; + onig_scan_env_set_error_string(env, r, *src, prev); + return r; } static int @@ -3913,11 +4024,11 @@ parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end, if (ctype < 0) return ctype; *np = node_new_cclass(); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - cc = &(NCCLASS(*np)); + CHECK_NULL_RETURN_MEMERR(*np); + cc = NCCLASS(*np); r = add_ctype_to_cc(cc, ctype, 0, env); if (r != 0) return r; - if (tok->u.prop.not != 0) CCLASS_SET_NOT(cc); + if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); return 0; } @@ -3981,7 +4092,7 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, if (intype == *type) { if (intype == CCV_SB) { if (*vs > 0xff || v > 0xff) - return ONIGERR_INVALID_WIDE_CHAR_VALUE; + return ONIGERR_INVALID_CODE_POINT_VALUE; if (*vs > v) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) @@ -4036,10 +4147,11 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, static int code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, - OnigEncoding enc) + ScanEnv* env) { int in_esc; OnigCodePoint code; + OnigEncoding enc = env->enc; UChar* p = from; PFETCH_READY; @@ -4051,7 +4163,7 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, else { PFETCH(code); if (code == c) return 1; - if (code == MC_ESC(enc)) in_esc = 1; + if (code == MC_ESC(env->syntax)) in_esc = 1; } } return 0; @@ -4086,7 +4198,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, if (r < 0) return r; if (r == TK_CC_CLOSE) { if (! code_exist_check((OnigCodePoint )']', - *src, env->pattern_end, 1, env->enc)) + *src, env->pattern_end, 1, env)) return ONIGERR_EMPTY_CHAR_CLASS; CC_ESC_WARN(env, (UChar* )"]"); @@ -4094,8 +4206,8 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, } *np = node = node_new_cclass(); - CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY); - cc = &(NCCLASS(node)); + CHECK_NULL_RETURN_MEMERR(node); + cc = NCCLASS(node); and_start = 0; state = CCS_START; @@ -4108,6 +4220,10 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, if (len > 1) { in_type = CCV_CODE_POINT; } + else if (len < 0) { + r = len; + goto err; + } else { sb_char: in_type = CCV_SB; @@ -4141,7 +4257,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, goto err; } - len = enc_len(env->enc, buf); + len = enclen(env->enc, buf); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; goto err; @@ -4202,12 +4318,8 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, break; case TK_CHAR_TYPE: - { - int ctype, not; - ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); - r = add_ctype_to_cc(cc, ctype, not, env); - if (r != 0) return r; - } + r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); + if (r != 0) return r; next_class: r = next_state_class(cc, &vs, &val_type, &state, env); @@ -4287,7 +4399,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, r = parse_char_class(&anode, tok, &p, end, env); if (r != 0) goto cc_open_err; - acc = &(NCCLASS(anode)); + acc = NCCLASS(anode); r = or_cclass(cc, acc, env->enc); onig_node_free(anode); @@ -4352,10 +4464,10 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, } if (neg != 0) - CCLASS_SET_NOT(cc); + NCCLASS_SET_NOT(cc); else - CCLASS_CLEAR_NOT(cc); - if (IS_CCLASS_NOT(cc) && + NCCLASS_CLEAR_NOT(cc); + if (IS_NCCLASS_NOT(cc) && IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { int is_empty; @@ -4378,7 +4490,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, return 0; err: - if (cc != &(NCCLASS(*np))) + if (cc != NCCLASS(*np)) bbuf_free(cc->mbuf); onig_node_free(*np); return r; @@ -4388,15 +4500,19 @@ static int parse_subexp(Node** top, OnigToken* tok, int term, UChar** src, UChar* end, ScanEnv* env); static int -parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env) +parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env) { int r, num; - int list_capture; Node *target; OnigOptionType option; - OnigEncoding enc = env->enc; OnigCodePoint c; + OnigEncoding enc = env->enc; + +#ifdef USE_NAMED_GROUP + int list_capture; +#endif + UChar* p = *src; PFETCH_READY; @@ -4428,9 +4544,19 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT); break; case '>': /* (?>...) stop backtrack */ - *np = node_new_effect(EFFECT_STOP_BACKTRACK); + *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); break; +#ifdef USE_NAMED_GROUP + case '\'': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { + goto named_group1; + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; +#endif + case '<': /* look behind (?<=...), (?<!...) */ PFETCH(c); if (c == '=') @@ -4438,35 +4564,45 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, else if (c == '!') *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT); #ifdef USE_NAMED_GROUP - else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { - UChar *name; - UChar *name_end; + else { + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { + UChar *name; + UChar *name_end; - PUNFETCH; - list_capture = 0; + PUNFETCH; + c = '<'; - named_group: - name = p; - r = fetch_name(&p, end, &name_end, env, 0); - if (r < 0) return r; + named_group1: + list_capture = 0; - num = scan_env_add_mem_entry(env); - if (num < 0) return num; - if (list_capture != 0 && num >= BIT_STATUS_BITS_NUM) - return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; + named_group2: + name = p; + r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); + if (r < 0) return r; - r = name_add(env->reg, name, name_end, num, env); - if (r != 0) return r; - *np = node_new_effect_memory(env->option, 1); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - NEFFECT(*np).regnum = num; - if (list_capture != 0) - BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); - env->num_named++; + num = scan_env_add_mem_entry(env); + if (num < 0) return num; + if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM) + return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; + + r = name_add(env->reg, name, name_end, num, env); + if (r != 0) return r; + *np = node_new_enclose_memory(env->option, 1); + CHECK_NULL_RETURN_MEMERR(*np); + NENCLOSE(*np)->regnum = num; + if (list_capture != 0) + BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); + env->num_named++; + } + else { + return ONIGERR_UNDEFINED_GROUP_OPTION; + } } -#endif - else +#else + else { return ONIGERR_UNDEFINED_GROUP_OPTION; + } +#endif break; case '@': @@ -4474,25 +4610,25 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, #ifdef USE_NAMED_GROUP if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { PFETCH(c); - if (c == '<') { + if (c == '<' || c == '\'') { list_capture = 1; - goto named_group; /* (?@<name>...) */ + goto named_group2; /* (?@<name>...) */ } PUNFETCH; } #endif - *np = node_new_effect_memory(env->option, 0); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + *np = node_new_enclose_memory(env->option, 0); + CHECK_NULL_RETURN_MEMERR(*np); num = scan_env_add_mem_entry(env); if (num < 0) { onig_node_free(*np); return num; } - else if (num >= BIT_STATUS_BITS_NUM) { + else if (num >= (int )BIT_STATUS_BITS_NUM) { onig_node_free(*np); return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; } - NEFFECT(*np).regnum = num; + NENCLOSE(*np)->regnum = num; BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); } else { @@ -4545,7 +4681,7 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, if (c == ')') { *np = node_new_option(option); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(*np); *src = p; return 2; /* option only */ } @@ -4559,8 +4695,8 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, env->option = prev; if (r < 0) return r; *np = node_new_option(option); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - NEFFECT(*np).target = target; + CHECK_NULL_RETURN_MEMERR(*np); + NENCLOSE(*np)->target = target; *src = p; return 0; } @@ -4579,26 +4715,26 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP)) goto group; - *np = node_new_effect_memory(env->option, 0); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + *np = node_new_enclose_memory(env->option, 0); + CHECK_NULL_RETURN_MEMERR(*np); num = scan_env_add_mem_entry(env); if (num < 0) return num; - NEFFECT(*np).regnum = num; + NENCLOSE(*np)->regnum = num; } - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(*np); r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = parse_subexp(&target, tok, term, &p, end, env); if (r < 0) return r; - if (NTYPE(*np) == N_ANCHOR) - NANCHOR(*np).target = target; + if (NTYPE(*np) == NT_ANCHOR) + NANCHOR(*np)->target = target; else { - NEFFECT(*np).target = target; - if (NEFFECT(*np).type == EFFECT_MEMORY) { + NENCLOSE(*np)->target = target; + if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) { /* Don't move this to previous of parse_subexp() */ - r = scan_env_set_mem_node(env, NEFFECT(*np).regnum, *np); + r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np); if (r != 0) return r; } } @@ -4618,17 +4754,17 @@ static const char* ReduceQStr[] = { static int set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) { - QuantifierNode* qn; + QtfrNode* qn; - qn = &(NQUANTIFIER(qnode)); + qn = NQTFR(qnode); if (qn->lower == 1 && qn->upper == 1) { return 1; } switch (NTYPE(target)) { - case N_STRING: + case NT_STR: if (! group) { - StrNode* sn = &(NSTRING(target)); + StrNode* sn = NSTR(target); if (str_node_can_be_split(sn, env->enc)) { Node* n = str_node_split_last_char(sn, env->enc); if (IS_NOT_NULL(n)) { @@ -4639,10 +4775,10 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) } break; - case N_QUANTIFIER: + case NT_QTFR: { /* check redundant double repeat. */ /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */ - QuantifierNode* qnt = &(NQUANTIFIER(target)); + QtfrNode* qnt = NQTFR(target); int nestq_num = popular_quantifier_num(qn); int targetq_num = popular_quantifier_num(qnt); @@ -4705,6 +4841,7 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) return 0; } + #ifdef USE_SHARED_CCLASS_TABLE #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8 @@ -4728,17 +4865,17 @@ static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y) static int type_cclass_hash(type_cclass_key* key) { int i, val; - unsigned char *p; + UChar *p; val = 0; - p = (unsigned char* )&(key->enc); - for (i = 0; i < sizeof(key->enc); i++) { + p = (UChar* )&(key->enc); + for (i = 0; i < (int )sizeof(key->enc); i++) { val = val * 997 + (int )*p++; } - p = (unsigned char* )(&key->type); - for (i = 0; i < sizeof(key->type); i++) { + p = (UChar* )(&key->type); + for (i = 0; i < (int )sizeof(key->type); i++) { val = val * 997 + (int )*p++; } @@ -4755,10 +4892,10 @@ static st_table* OnigTypeCClassTable; static int -i_free_shared_class(type_cclass_key* key, Node* node, void* arg) +i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED) { if (IS_NOT_NULL(node)) { - CClassNode* cc = &(NCCLASS(node)); + CClassNode* cc = NCCLASS(node); if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf); xfree(node); } @@ -4782,6 +4919,118 @@ onig_free_shared_cclass_table(void) #endif /* USE_SHARED_CCLASS_TABLE */ +#ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS +static int +clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) +{ + BBuf *tbuf; + int r; + + if (IS_NCCLASS_NOT(cc)) { + bitset_invert(cc->bs); + + if (! ONIGENC_IS_SINGLEBYTE(enc)) { + r = not_code_range_buf(enc, cc->mbuf, &tbuf); + if (r != 0) return r; + + bbuf_free(cc->mbuf); + cc->mbuf = tbuf; + } + + NCCLASS_CLEAR_NOT(cc); + } + + return 0; +} +#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ + +typedef struct { + ScanEnv* env; + CClassNode* cc; + Node* alt_root; + Node** ptail; +} IApplyCaseFoldArg; + +static int +i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], + int to_len, void* arg) +{ + IApplyCaseFoldArg* iarg; + ScanEnv* env; + CClassNode* cc; + BitSetRef bs; + + iarg = (IApplyCaseFoldArg* )arg; + env = iarg->env; + cc = iarg->cc; + bs = cc->bs; + + if (to_len == 1) { + int is_in = onig_is_code_in_cc(env->enc, from, cc); +#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS + if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || + (is_in == 0 && IS_NCCLASS_NOT(cc))) { + if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { + add_code_range(&(cc->mbuf), env, *to, *to); + } + else { + BITSET_SET_BIT(bs, *to); + } + } +#else + if (is_in != 0) { + if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { + if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); + add_code_range(&(cc->mbuf), env, *to, *to); + } + else { + if (IS_NCCLASS_NOT(cc)) { + BITSET_CLEAR_BIT(bs, *to); + } + else + BITSET_SET_BIT(bs, *to); + } + } +#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ + } + else { + int r, i, len; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + Node *snode = NULL_NODE; + + if (onig_is_code_in_cc(env->enc, from, cc) +#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS + && !IS_NCCLASS_NOT(cc) +#endif + ) { + for (i = 0; i < to_len; i++) { + len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); + if (i == 0) { + snode = onig_node_new_str(buf, buf + len); + CHECK_NULL_RETURN_MEMERR(snode); + + /* char-class expanded multi-char only + compare with string folded at match time. */ + NSTRING_SET_AMBIG(snode); + } + else { + r = onig_node_str_cat(snode, buf, buf + len); + if (r < 0) { + onig_node_free(snode); + return r; + } + } + } + + *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); + CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); + iarg->ptail = &(NCDR((*(iarg->ptail)))); + } + } + + return 0; +} + static int parse_exp(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, ScanEnv* env) @@ -4791,7 +5040,7 @@ parse_exp(Node** np, OnigToken* tok, int term, Node** targetp; *np = NULL; - if (tok->type == term) + if (tok->type == (enum TokenSyms )term) goto end_of_token; switch (tok->type) { @@ -4803,20 +5052,20 @@ parse_exp(Node** np, OnigToken* tok, int term, break; case TK_SUBEXP_OPEN: - r = parse_effect(np, tok, TK_SUBEXP_CLOSE, src, end, env); + r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env); if (r < 0) return r; if (r == 1) group = 1; else if (r == 2) { /* option only */ Node* target; OnigOptionType prev = env->option; - env->option = NEFFECT(*np).option; + env->option = NENCLOSE(*np)->option; r = fetch_token(tok, src, end, env); if (r < 0) return r; r = parse_subexp(&target, tok, term, src, end, env); env->option = prev; if (r < 0) return r; - NEFFECT(*np).target = target; + NENCLOSE(*np)->target = target; return tok->type; } break; @@ -4833,7 +5082,7 @@ parse_exp(Node** np, OnigToken* tok, int term, tk_byte: { *np = node_new_str(tok->backp, *src); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(*np); while (1) { r = fetch_token(tok, src, end, env); @@ -4853,13 +5102,14 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_RAW_BYTE: tk_raw_byte: { - *np = node_new_str_char((UChar )tok->u.c); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + *np = node_new_str_raw_char((UChar )tok->u.c); + CHECK_NULL_RETURN_MEMERR(*np); len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { - if (len == enc_len(env->enc, NSTRING(*np).s)) { + if (len == enclen(env->enc, NSTR(*np)->s)) { r = fetch_token(tok, src, end, env); + NSTRING_CLEAR_RAW(*np); goto string_end; } } @@ -4867,12 +5117,14 @@ parse_exp(Node** np, OnigToken* tok, int term, r = fetch_token(tok, src, end, env); if (r < 0) return r; if (r != TK_RAW_BYTE) { + /* Don't use this, it is wrong for little endian encodings. */ #ifdef USE_PAD_TO_SHORT_BYTE_CHAR int rem; if (len < ONIGENC_MBC_MINLEN(env->enc)) { rem = ONIGENC_MBC_MINLEN(env->enc) - len; - (void )node_str_head_pad(&NSTRING(*np), rem, (UChar )0); - if (len + rem == enc_len(env->enc, NSTRING(*np).s)) { + (void )node_str_head_pad(NSTR(*np), rem, (UChar )0); + if (len + rem == enclen(env->enc, NSTR(*np)->s)) { + NSTRING_CLEAR_RAW(*np); goto string_end; } } @@ -4898,7 +5150,7 @@ parse_exp(Node** np, OnigToken* tok, int term, #else *np = node_new_str(buf, buf + num); #endif - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(*np); } break; @@ -4907,7 +5159,7 @@ parse_exp(Node** np, OnigToken* tok, int term, OnigCodePoint end_op[2]; UChar *qstart, *qend, *nextp; - end_op[0] = (OnigCodePoint )MC_ESC(env->enc); + end_op[0] = (OnigCodePoint )MC_ESC(env->syntax); end_op[1] = (OnigCodePoint )'E'; qstart = *src; qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); @@ -4915,35 +5167,31 @@ parse_exp(Node** np, OnigToken* tok, int term, nextp = qend = end; } *np = node_new_str(qstart, qend); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(*np); *src = nextp; } break; case TK_CHAR_TYPE: { - switch (tok->u.subtype) { - case CTYPE_WORD: - case CTYPE_NOT_WORD: - *np = node_new_ctype(tok->u.subtype); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + switch (tok->u.prop.ctype) { + case ONIGENC_CTYPE_WORD: + *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not); + CHECK_NULL_RETURN_MEMERR(*np); break; - case CTYPE_WHITE_SPACE: - case CTYPE_NOT_WHITE_SPACE: - case CTYPE_DIGIT: - case CTYPE_NOT_DIGIT: - case CTYPE_XDIGIT: - case CTYPE_NOT_XDIGIT: + case ONIGENC_CTYPE_SPACE: + case ONIGENC_CTYPE_DIGIT: + case ONIGENC_CTYPE_XDIGIT: { CClassNode* cc; - int ctype, not; #ifdef USE_SHARED_CCLASS_TABLE - const OnigCodePoint *sbr, *mbr; + const OnigCodePoint *mbr; + OnigCodePoint sb_out; - ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); - r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, ctype, &sbr, &mbr); + r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype, + &sb_out, &mbr); if (r == 0 && ONIGENC_CODE_RANGE_NUM(mbr) >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) { @@ -4951,8 +5199,8 @@ parse_exp(Node** np, OnigToken* tok, int term, type_cclass_key* new_key; key.enc = env->enc; - key.not = not; - key.type = ctype; + key.not = tok->u.prop.not; + key.type = tok->u.prop.ctype; THREAD_ATOMIC_START; @@ -4972,14 +5220,17 @@ parse_exp(Node** np, OnigToken* tok, int term, } } - *np = node_new_cclass_by_codepoint_range(not, sbr, mbr); + *np = node_new_cclass_by_codepoint_range(tok->u.prop.not, + sb_out, mbr); if (IS_NULL(*np)) { THREAD_ATOMIC_END; return ONIGERR_MEMORY; } - CCLASS_SET_SHARE(&(NCCLASS(*np))); + cc = NCCLASS(*np); + NCCLASS_SET_SHARE(cc); new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); + xmemcpy(new_key, &key, sizeof(type_cclass_key)); onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key, (st_data_t )*np); @@ -4987,12 +5238,11 @@ parse_exp(Node** np, OnigToken* tok, int term, } else { #endif - ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); *np = node_new_cclass(); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - cc = &(NCCLASS(*np)); - add_ctype_to_cc(cc, ctype, 0, env); - if (not != 0) CCLASS_SET_NOT(cc); + CHECK_NULL_RETURN_MEMERR(*np); + cc = NCCLASS(*np); + add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); + if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); #ifdef USE_SHARED_CCLASS_TABLE } #endif @@ -5018,55 +5268,44 @@ parse_exp(Node** np, OnigToken* tok, int term, r = parse_char_class(np, tok, src, end, env); if (r != 0) return r; - cc = &(NCCLASS(*np)); - + cc = NCCLASS(*np); if (IS_IGNORECASE(env->option)) { - int i, n, in_cc; - const OnigPairAmbigCodes* ccs; - BitSetRef bs = cc->bs; - OnigAmbigType amb; - - for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { - if ((amb & env->ambig_flag) == 0) continue; - - n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs); - for (i = 0; i < n; i++) { - in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc); - - if ((in_cc != 0 && !IS_CCLASS_NOT(cc)) || - (in_cc == 0 && IS_CCLASS_NOT(cc))) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || - ccs[i].from >= SINGLE_BYTE_SIZE) { - /* if (cc->not) clear_not_flag_cclass(cc, env->enc); */ - add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to); - } - else { - if (BITSET_AT(bs, ccs[i].from)) { - /* /(?i:[^A-C])/.match("a") ==> fail. */ - BITSET_SET_BIT(bs, ccs[i].to); - } - if (BITSET_AT(bs, ccs[i].to)) { - BITSET_SET_BIT(bs, ccs[i].from); - } - } - } + IApplyCaseFoldArg iarg; + + iarg.env = env; + iarg.cc = cc; + iarg.alt_root = NULL_NODE; + iarg.ptail = &(iarg.alt_root); + + r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag, + i_apply_case_fold, &iarg); + if (r != 0) { + onig_node_free(iarg.alt_root); + return r; + } + if (IS_NOT_NULL(iarg.alt_root)) { + Node* work = onig_node_new_alt(*np, iarg.alt_root); + if (IS_NULL(work)) { + onig_node_free(iarg.alt_root); + return ONIGERR_MEMORY; } - } + *np = work; + } } } break; case TK_ANYCHAR: *np = node_new_anychar(); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(*np); break; case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(*np); qn = node_new_quantifier(0, REPEAT_INFINITE, 0); - CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY); - NQUANTIFIER(qn).target = *np; + CHECK_NULL_RETURN_MEMERR(qn); + NQTFR(qn)->target = *np; *np = qn; break; @@ -5075,19 +5314,28 @@ parse_exp(Node** np, OnigToken* tok, int term, *np = node_new_backref(len, (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)), tok->u.backref.by_name, -#ifdef USE_BACKREF_AT_LEVEL +#ifdef USE_BACKREF_WITH_LEVEL tok->u.backref.exist_level, tok->u.backref.level, #endif env); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(*np); break; #ifdef USE_SUBEXP_CALL case TK_CALL: - *np = node_new_call(tok->u.call.name, tok->u.call.name_end); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - env->num_call++; + { + int gnum = tok->u.call.gnum; + + if (gnum < 0) { + gnum = BACKREF_REL_TO_ABS(gnum, env); + if (gnum <= 0) + return ONIGERR_INVALID_BACKREF; + } + *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum); + CHECK_NULL_RETURN_MEMERR(*np); + env->num_call++; + } break; #endif @@ -5126,31 +5374,46 @@ parse_exp(Node** np, OnigToken* tok, int term, return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, - (r == TK_INTERVAL ? 1 : 0)); - CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY); - NQUANTIFIER(qn).greedy = tok->u.repeat.greedy; + (r == TK_INTERVAL ? 1 : 0)); + CHECK_NULL_RETURN_MEMERR(qn); + NQTFR(qn)->greedy = tok->u.repeat.greedy; r = set_quantifier(qn, *targetp, group, env); - if (r < 0) return r; - + if (r < 0) { + onig_node_free(qn); + return r; + } + if (tok->u.repeat.possessive != 0) { Node* en; - en = node_new_effect(EFFECT_STOP_BACKTRACK); - CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY); - NEFFECT(en).target = qn; + en = node_new_enclose(ENCLOSE_STOP_BACKTRACK); + if (IS_NULL(en)) { + onig_node_free(qn); + return ONIGERR_MEMORY; + } + NENCLOSE(en)->target = qn; qn = en; } if (r == 0) { *targetp = qn; } + else if (r == 1) { + onig_node_free(qn); + } else if (r == 2) { /* split case: /abc+/ */ Node *tmp; *targetp = node_new_list(*targetp, NULL); - CHECK_NULL_RETURN_VAL(*targetp, ONIGERR_MEMORY); - tmp = NCONS(*targetp).right = node_new_list(qn, NULL); - CHECK_NULL_RETURN_VAL(tmp, ONIGERR_MEMORY); - targetp = &(NCONS(tmp).left); + if (IS_NULL(*targetp)) { + onig_node_free(qn); + return ONIGERR_MEMORY; + } + tmp = NCDR(*targetp) = node_new_list(qn, NULL); + if (IS_NULL(tmp)) { + onig_node_free(qn); + return ONIGERR_MEMORY; + } + targetp = &(NCAR(tmp)); } goto re_entry; } @@ -5175,19 +5438,19 @@ parse_branch(Node** top, OnigToken* tok, int term, } else { *top = node_new_list(node, NULL); - headp = &(NCONS(*top).right); + headp = &(NCDR(*top)); while (r != TK_EOT && r != term && r != TK_ALT) { r = parse_exp(&node, tok, term, src, end, env); if (r < 0) return r; - if (NTYPE(node) == N_LIST) { + if (NTYPE(node) == NT_LIST) { *headp = node; - while (IS_NOT_NULL(NCONS(node).right)) node = NCONS(node).right; - headp = &(NCONS(node).right); + while (IS_NOT_NULL(NCDR(node))) node = NCDR(node); + headp = &(NCDR(node)); } else { *headp = node_new_list(node, NULL); - headp = &(NCONS(*headp).right); + headp = &(NCDR(*headp)); } } } @@ -5214,19 +5477,19 @@ parse_subexp(Node** top, OnigToken* tok, int term, *top = node; } else if (r == TK_ALT) { - *top = node_new_alt(node, NULL); - headp = &(NCONS(*top).right); + *top = onig_node_new_alt(node, NULL); + headp = &(NCDR(*top)); while (r == TK_ALT) { r = fetch_token(tok, src, end, env); if (r < 0) return r; r = parse_branch(&node, tok, term, src, end, env); if (r < 0) return r; - *headp = node_new_alt(node, NULL); - headp = &(NCONS(*headp).right); + *headp = onig_node_new_alt(node, NULL); + headp = &(NCDR(*headp)); } - if (tok->type != term) + if (tok->type != (enum TokenSyms )term) goto err; } else { @@ -5254,8 +5517,8 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) } extern int -onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_t* reg, - ScanEnv* env) +onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, + regex_t* reg, ScanEnv* env) { int r; UChar* p; @@ -5265,13 +5528,13 @@ onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_ #endif scan_env_clear(env); - env->option = reg->options; - env->ambig_flag = reg->ambig_flag; - env->enc = reg->enc; - env->syntax = reg->syntax; - env->pattern = (UChar* )pattern; - env->pattern_end = (UChar* )end; - env->reg = reg; + env->option = reg->options; + env->case_fold_flag = reg->case_fold_flag; + env->enc = reg->enc; + env->syntax = reg->syntax; + env->pattern = (UChar* )pattern; + env->pattern_end = (UChar* )end; + env->reg = reg; *root = NULL; p = (UChar* )pattern; @@ -5281,7 +5544,7 @@ onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_ } extern void -onig_scan_env_set_error_string(ScanEnv* env, int ecode, +onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, UChar* arg, UChar* arg_end) { env->error = arg; |
