diff options
Diffstat (limited to 'ext/mbstring/php_mbregex.c')
-rw-r--r-- | ext/mbstring/php_mbregex.c | 299 |
1 files changed, 230 insertions, 69 deletions
diff --git a/ext/mbstring/php_mbregex.c b/ext/mbstring/php_mbregex.c index 2ff304f277..319ee567c6 100644 --- a/ext/mbstring/php_mbregex.c +++ b/ext/mbstring/php_mbregex.c @@ -16,9 +16,6 @@ +----------------------------------------------------------------------+ */ -/* $Id$ */ - - #ifdef HAVE_CONFIG_H #include "config.h" #endif @@ -45,7 +42,7 @@ struct _zend_mb_regex_globals { HashTable ht_rc; zval search_str; zval *search_str_val; - unsigned int search_pos; + size_t search_pos; php_mb_regex_t *search_re; OnigRegion *search_regs; OnigOptionType regex_default_options; @@ -65,7 +62,6 @@ static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals) { pglobals->default_mbctype = ONIG_ENCODING_UTF8; pglobals->current_mbctype = ONIG_ENCODING_UTF8; - zend_hash_init(&(pglobals->ht_rc), 0, NULL, php_mb_regex_free_cache, 1); ZVAL_UNDEF(&pglobals->search_str); pglobals->search_re = (php_mb_regex_t*)NULL; pglobals->search_pos = 0; @@ -79,7 +75,6 @@ static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals) /* {{{ _php_mb_regex_globals_dtor */ static void _php_mb_regex_globals_dtor(zend_mb_regex_globals *pglobals) { - zend_hash_destroy(&pglobals->ht_rc); } /* }}} */ @@ -126,7 +121,9 @@ PHP_MSHUTDOWN_FUNCTION(mb_regex) /* {{{ PHP_RINIT_FUNCTION(mb_regex) */ PHP_RINIT_FUNCTION(mb_regex) { - return MBSTRG(mb_regex_globals) ? SUCCESS: FAILURE; + if (!MBSTRG(mb_regex_globals)) return FAILURE; + zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0); + return SUCCESS; } /* }}} */ @@ -145,7 +142,7 @@ PHP_RSHUTDOWN_FUNCTION(mb_regex) onig_region_free(MBREX(search_regs), 1); MBREX(search_regs) = (OnigRegion *)NULL; } - zend_hash_clean(&MBREX(ht_rc)); + zend_hash_destroy(&MBREX(ht_rc)); return SUCCESS; } @@ -183,7 +180,7 @@ typedef struct _php_mb_regex_enc_name_map_t { OnigEncoding code; } php_mb_regex_enc_name_map_t; -php_mb_regex_enc_name_map_t enc_name_map[] = { +static const php_mb_regex_enc_name_map_t enc_name_map[] = { #ifdef ONIG_ENCODING_EUC_JP { "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0", @@ -366,7 +363,7 @@ php_mb_regex_enc_name_map_t enc_name_map[] = { static OnigEncoding _php_mb_regex_name2mbctype(const char *pname) { const char *p; - php_mb_regex_enc_name_map_t *mapping; + const php_mb_regex_enc_name_map_t *mapping; if (pname == NULL || !*pname) { return ONIG_ENCODING_UNDEF; @@ -387,7 +384,7 @@ static OnigEncoding _php_mb_regex_name2mbctype(const char *pname) /* {{{ php_mb_regex_mbctype2name */ static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype) { - php_mb_regex_enc_name_map_t *mapping; + const php_mb_regex_enc_name_map_t *mapping; for (mapping = enc_name_map; mapping->names != NULL; mapping++) { if (mapping->code == mbctype) { @@ -441,7 +438,7 @@ const char *php_mb_regex_get_default_mbctype(void) * regex cache */ /* {{{ php_mbregex_compile_pattern */ -static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, int patlen, OnigOptionType options, OnigEncoding enc, OnigSyntaxType *syntax) +static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigEncoding enc, OnigSyntaxType *syntax) { int err_code = 0; php_mb_regex_t *retval = NULL, *rc = NULL; @@ -576,11 +573,11 @@ static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionT /* {{{ _php_mb_regex_init_options */ static void -_php_mb_regex_init_options(const char *parg, int narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval) +_php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval) { - int n; + size_t n; char c; - int optm = 0; + OnigOptionType optm = 0; *syntax = ONIG_SYNTAX_RUBY; @@ -646,6 +643,175 @@ _php_mb_regex_init_options(const char *parg, int narg, OnigOptionType *option, O } /* }}} */ + +/* + * Callbacks for named subpatterns + */ + +/* {{{ struct mb_ereg_groups_iter_arg */ +typedef struct mb_regex_groups_iter_args { + zval *groups; + char *search_str; + size_t search_len; + OnigRegion *region; +} mb_regex_groups_iter_args; +/* }}} */ + +/* {{{ mb_ereg_groups_iter */ +static int +mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg) +{ + mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg; + int gn, beg, end; + + /* + * In case of duplicate groups, keep only the last succeeding one + * to be consistent with preg_match with the PCRE_DUPNAMES option. + */ + gn = onig_name_to_backref_number(reg, name, name_end, args->region); + beg = args->region->beg[gn]; + end = args->region->end[gn]; + if (beg >= 0 && beg < end && end <= args->search_len) { + add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg); + } else { + add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0); + } + + return 0; +} +/* }}} */ + +/* + * Helper for _php_mb_regex_ereg_replace_exec + */ +/* {{{ mb_regex_substitute */ +static inline void mb_regex_substitute( + smart_str *pbuf, + const char *subject, + size_t subject_len, + char *replace, + size_t replace_len, + php_mb_regex_t *regexp, + OnigRegion *regs, + const mbfl_encoding *enc +) { + char *p, *sp, *eos; + int no; /* bakreference group number */ + int clen; /* byte-length of the current character */ + + p = replace; + eos = replace + replace_len; + + while (p < eos) { + clen = (int) php_mb_mbchar_bytes_ex(p, enc); + if (clen != 1 || p == eos || p[0] != '\\') { + /* skip anything that's not an ascii backslash */ + smart_str_appendl(pbuf, p, clen); + p += clen; + continue; + } + sp = p; /* save position */ + clen = (int) php_mb_mbchar_bytes_ex(++p, enc); + if (clen != 1 || p == eos) { + /* skip backslash followed by multibyte char */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + no = -1; + switch (p[0]) { + case '0': + no = 0; + p++; + break; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + if (!onig_noname_group_capture_is_active(regexp)) { + /* + * FIXME: + * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern. + * For now we just ignore them, but in the future we might want to raise a warning + * and abort the whole replace operation. + */ + p++; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + no = p[0] - '0'; + p++; + break; + case 'k': + clen = (int) php_mb_mbchar_bytes_ex(++p, enc); + if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) { + /* not a backref delimiter */ + p += clen; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + /* try to consume everything until next delimiter */ + char delim = p[0] == '<' ? '>' : '\''; + char *name, *name_end; + char maybe_num = 1; + name_end = name = p + 1; + while (name_end < eos) { + clen = (int) php_mb_mbchar_bytes_ex(name_end, enc); + if (clen != 1) { + name_end += clen; + maybe_num = 0; + continue; + } + if (name_end[0] == delim) break; + if (maybe_num && !isdigit(name_end[0])) maybe_num = 0; + name_end++; + } + p = name_end + 1; + if (name_end - name < 1 || name_end >= eos) { + /* the backref was empty or we failed to find the end delimiter */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + /* we have either a name or a number */ + if (maybe_num) { + if (!onig_noname_group_capture_is_active(regexp)) { + /* see above note on mixing numbered & named backrefs */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + if (name_end - name == 1) { + no = name[0] - '0'; + break; + } + if (name[0] == '0') { + /* 01 is not a valid number */ + break; + } + no = (int) strtoul(name, NULL, 10); + break; + } + no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs); + break; + default: + /* We're not treating \ as an escape character and will interpret something like + * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this + * function has not supported escaping of backslashes historically. */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + if (no < 0 || no >= regs->num_regs) { + /* invalid group number reference, keep the escape sequence in the output */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) { + smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]); + } + } + + if (p < eos) { + smart_str_appendl(pbuf, p, eos - p); + } +} +/* }}} */ + /* * php functions */ @@ -700,23 +866,19 @@ static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase) RETURN_FALSE; } + if (array != NULL) { + zval_ptr_dtor(array); + array_init(array); + } + if (!php_mb_check_encoding( - string, - string_len, - _php_mb_regex_mbctype2name(MBREX(current_mbctype)) + string, + string_len, + _php_mb_regex_mbctype2name(MBREX(current_mbctype)) )) { - if (array != NULL) { - zval_dtor(array); - array_init(array); - } RETURN_FALSE; } - if (array != NULL) { - zval_dtor(array); - array_init(array); - } - options = MBREX(regex_default_options); if (icase) { options |= ONIG_OPTION_IGNORECASE; @@ -766,6 +928,11 @@ static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase) add_index_bool(array, i, 0); } } + + if (onig_number_of_names(re) > 0) { + mb_regex_groups_iter_args args = {array, string, string_len, regs}; + onig_foreach_name(re, mb_regex_groups_iter, &args); + } } if (match_len == 0) { @@ -812,14 +979,12 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp char *string; size_t string_len; - char *p; php_mb_regex_t *re; OnigSyntaxType *syntax; OnigRegion *regs = NULL; smart_str out_buf = {0}; smart_str eval_buf = {0}; smart_str *pbuf; - size_t i; int err, eval, n; OnigUChar *pos; OnigUChar *string_lim; @@ -929,38 +1094,11 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp break; } if (err >= 0) { -#if moriyoshi_0 - if (regs->beg[0] == regs->end[0]) { - php_error_docref(NULL, E_WARNING, "Empty regular expression"); - break; - } -#endif /* copy the part of the string before the match */ smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos)); if (!is_callable) { - /* copy replacement and backrefs */ - i = 0; - p = replace; - while (i < replace_len) { - int fwd = (int) php_mb_mbchar_bytes_ex(p, enc); - n = -1; - if ((replace_len - i) >= 2 && fwd == 1 && - p[0] == '\\' && p[1] >= '0' && p[1] <= '9') { - n = p[1] - '0'; - } - if (n >= 0 && n < regs->num_regs) { - if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && (size_t)regs->end[n] <= string_len) { - smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]); - } - p += 2; - i += 2; - } else { - smart_str_appendl(pbuf, p, fwd); - p += fwd; - i += fwd; - } - } + mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc); } if (eval) { @@ -990,7 +1128,7 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp smart_str_appendl(&out_buf, Z_STRVAL(v), Z_STRLEN(v)); /* Clean up */ smart_str_free(&eval_buf); - zval_dtor(&v); + zval_ptr_dtor_str(&v); } else if (is_callable) { zval args[1]; zval subpats, retval; @@ -1000,6 +1138,10 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp for (i = 0; i < regs->num_regs; i++) { add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]); } + if (onig_number_of_names(re) > 0) { + mb_regex_groups_iter_args args = {&subpats, string, string_len, regs}; + onig_foreach_name(re, mb_regex_groups_iter, &args); + } ZVAL_COPY_VALUE(&args[0], &subpats); /* null terminate buffer */ @@ -1096,7 +1238,7 @@ PHP_FUNCTION(mb_split) OnigUChar *pos, *chunk_pos; size_t string_len; - int n, err; + int err; zend_long count = -1; if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) { @@ -1118,16 +1260,16 @@ PHP_FUNCTION(mb_split) err = 0; regs = onig_region_new(); /* churn through str, generating array entries as we go */ - while (count != 0 && (pos - (OnigUChar *)string) < (ptrdiff_t)string_len) { - int beg, end; + while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) { + size_t beg, end; err = onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0); if (err < 0) { break; } beg = regs->beg[0], end = regs->end[0]; /* add it to the array */ - if ((pos - (OnigUChar *)string) < end) { - if ((size_t)beg < string_len && beg >= (chunk_pos - (OnigUChar *)string)) { + if ((size_t)(pos - (OnigUChar *)string) < end) { + if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) { add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos)); --count; } else { @@ -1149,13 +1291,13 @@ PHP_FUNCTION(mb_split) OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str(err_str, err); php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str); - zval_dtor(return_value); + zend_array_destroy(Z_ARR_P(return_value)); RETURN_FALSE; } /* otherwise we just have one last element to add to the array */ - n = ((OnigUChar *)(string + string_len) - chunk_pos); - if (n > 0) { + if ((OnigUChar *)(string + string_len) > chunk_pos) { + size_t n = ((OnigUChar *)(string + string_len) - chunk_pos); add_next_index_stringl(return_value, (char *)chunk_pos, n); } else { add_next_index_stringl(return_value, "", 0); @@ -1217,7 +1359,8 @@ _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode) { char *arg_pattern = NULL, *arg_options = NULL; size_t arg_pattern_len, arg_options_len; - int n, i, err, pos, len, beg, end; + int err; + size_t n, i, pos, len, beg, end; OnigOptionType option; OnigUChar *str; OnigSyntaxType *syntax; @@ -1293,6 +1436,15 @@ _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode) add_index_bool(return_value, i, 0); } } + if (onig_number_of_names(MBREX(search_re)) > 0) { + mb_regex_groups_iter_args args = { + return_value, + Z_STRVAL(MBREX(search_str)), + Z_STRLEN(MBREX(search_str)), + MBREX(search_regs) + }; + onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args); + } break; default: RETVAL_TRUE; @@ -1341,7 +1493,7 @@ PHP_FUNCTION(mb_ereg_search_regs) Initialize string and regular expression for search. */ PHP_FUNCTION(mb_ereg_search_init) { - size_t argc = ZEND_NUM_ARGS(); + int argc = ZEND_NUM_ARGS(); zend_string *arg_str; char *arg_pattern = NULL, *arg_options = NULL; size_t arg_pattern_len = 0, arg_options_len = 0; @@ -1401,7 +1553,7 @@ PHP_FUNCTION(mb_ereg_search_init) Get matched substring of the last time */ PHP_FUNCTION(mb_ereg_search_getregs) { - int n, i, len, beg, end; + size_t n, i, len, beg, end; OnigUChar *str; if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) { @@ -1419,6 +1571,15 @@ PHP_FUNCTION(mb_ereg_search_getregs) add_index_bool(return_value, i, 0); } } + if (onig_number_of_names(MBREX(search_re)) > 0) { + mb_regex_groups_iter_args args = { + return_value, + Z_STRVAL(MBREX(search_str)), + len, + MBREX(search_regs) + }; + onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args); + } } else { RETVAL_FALSE; } |