summary | refs | log | tree | commit | diff
path: root/ext/mbstring/php_mbregex.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/mbstring/php_mbregex.c')
-rw-r--r-- ext/mbstring/php_mbregex.c 299
1 files changed, 230 insertions, 69 deletions
diff --git a/ext/mbstring/php_mbregex.c b/ext/mbstring/php_mbregex.c
index 2ff304f277..319ee567c6 100644
--- a/ext/mbstring/php_mbregex.c
+++ b/ext/mbstring/php_mbregex.c
@@ -16,9 +16,6 @@
+----------------------------------------------------------------------+
*/
-/* $Id$ */
-
-
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
@@ -45,7 +42,7 @@ struct _zend_mb_regex_globals {
HashTable ht_rc;
zval search_str;
zval *search_str_val;
- unsigned int search_pos;
+ size_t search_pos;
php_mb_regex_t *search_re;
OnigRegion *search_regs;
OnigOptionType regex_default_options;
@@ -65,7 +62,6 @@ static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
{
pglobals->default_mbctype = ONIG_ENCODING_UTF8;
pglobals->current_mbctype = ONIG_ENCODING_UTF8;
- zend_hash_init(&(pglobals->ht_rc), 0, NULL, php_mb_regex_free_cache, 1);
ZVAL_UNDEF(&pglobals->search_str);
pglobals->search_re = (php_mb_regex_t*)NULL;
pglobals->search_pos = 0;
@@ -79,7 +75,6 @@ static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals)
/* {{{ _php_mb_regex_globals_dtor */
static void _php_mb_regex_globals_dtor(zend_mb_regex_globals *pglobals)
{
- zend_hash_destroy(&pglobals->ht_rc);
}
/* }}} */
@@ -126,7 +121,9 @@ PHP_MSHUTDOWN_FUNCTION(mb_regex)
/* {{{ PHP_RINIT_FUNCTION(mb_regex) */
PHP_RINIT_FUNCTION(mb_regex)
{
- return MBSTRG(mb_regex_globals) ? SUCCESS: FAILURE;
+ /* Request startup: fail when the mbregex globals are missing, otherwise initialise
+ * the regex cache table (ht_rc) for this request. The table is torn down again by
+ * the zend_hash_destroy() call in PHP_RSHUTDOWN_FUNCTION(mb_regex). */
+ if (!MBSTRG(mb_regex_globals)) return FAILURE;
+ zend_hash_init(&MBREX(ht_rc), 0, NULL, php_mb_regex_free_cache, 0); /* last argument (persistent flag) was 1 in the call removed from the globals ctor */
+ return SUCCESS;
}
/* }}} */
@@ -145,7 +142,7 @@ PHP_RSHUTDOWN_FUNCTION(mb_regex)
onig_region_free(MBREX(search_regs), 1);
MBREX(search_regs) = (OnigRegion *)NULL;
}
- zend_hash_clean(&MBREX(ht_rc));
+ zend_hash_destroy(&MBREX(ht_rc));
return SUCCESS;
}
@@ -183,7 +180,7 @@ typedef struct _php_mb_regex_enc_name_map_t {
OnigEncoding code;
} php_mb_regex_enc_name_map_t;
-php_mb_regex_enc_name_map_t enc_name_map[] = {
+static const php_mb_regex_enc_name_map_t enc_name_map[] = {
#ifdef ONIG_ENCODING_EUC_JP
{
"EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
@@ -366,7 +363,7 @@ php_mb_regex_enc_name_map_t enc_name_map[] = {
static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
{
const char *p;
- php_mb_regex_enc_name_map_t *mapping;
+ const php_mb_regex_enc_name_map_t *mapping;
if (pname == NULL || !*pname) {
return ONIG_ENCODING_UNDEF;
@@ -387,7 +384,7 @@ static OnigEncoding _php_mb_regex_name2mbctype(const char *pname)
/* {{{ php_mb_regex_mbctype2name */
static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype)
{
- php_mb_regex_enc_name_map_t *mapping;
+ const php_mb_regex_enc_name_map_t *mapping;
for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
if (mapping->code == mbctype) {
@@ -441,7 +438,7 @@ const char *php_mb_regex_get_default_mbctype(void)
* regex cache
*/
/* {{{ php_mbregex_compile_pattern */
-static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, int patlen, OnigOptionType options, OnigEncoding enc, OnigSyntaxType *syntax)
+static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigEncoding enc, OnigSyntaxType *syntax)
{
int err_code = 0;
php_mb_regex_t *retval = NULL, *rc = NULL;
@@ -576,11 +573,11 @@ static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionT
/* {{{ _php_mb_regex_init_options */
static void
-_php_mb_regex_init_options(const char *parg, int narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval)
+_php_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval)
{
- int n;
+ size_t n;
char c;
- int optm = 0;
+ OnigOptionType optm = 0;
*syntax = ONIG_SYNTAX_RUBY;
@@ -646,6 +643,175 @@ _php_mb_regex_init_options(const char *parg, int narg, OnigOptionType *option, O
}
/* }}} */
+
+/*
+ * Callbacks for named subpatterns
+ */
+
+/* {{{ struct mb_regex_groups_iter_args */
+/* Context handed through onig_foreach_name() to mb_regex_groups_iter(). Callers fill it
+ * with a positional initializer, so the member order below must not change. */
+typedef struct mb_regex_groups_iter_args {
+ zval *groups; /* array that receives one entry per group name */
+ char *search_str; /* subject string that the region offsets index into */
+ size_t search_len; /* length of search_str; upper bound for a group's end offset */
+ OnigRegion *region; /* capture offsets of the match being reported */
+} mb_regex_groups_iter_args;
+/* }}} */
+
+/* {{{ mb_regex_groups_iter */
+/*
+ * onig_foreach_name() callback: stores the capture of one named group in args->groups.
+ *
+ * name, name_end          delimit the group name
+ * ngroup_num, group_nums  group numbers sharing this name (unused, the number is
+ *                         resolved through onig_name_to_backref_number() instead)
+ * reg                     compiled pattern the name belongs to
+ * parg                    mb_regex_groups_iter_args with the target array, the subject
+ *                         string and the match region
+ *
+ * The group is added under its name as a string when it has a non-empty capture that
+ * lies inside the subject, and as boolean false otherwise. Always returns 0.
+ */
+static int
+mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
+{
+	mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
+	int gn, beg = -1, end = -1;
+
+	/*
+	 * In case of duplicate groups, keep only the last succeeding one
+	 * to be consistent with preg_match with the PCRE_DUPNAMES option.
+	 */
+	gn = onig_name_to_backref_number(reg, name, name_end, args->region);
+	/*
+	 * The lookup reports failure with a negative error code. Never use that, or a
+	 * group number the region does not cover, as an index; such a group is
+	 * reported as "did not match" below.
+	 */
+	if (gn >= 0 && gn < args->region->num_regs) {
+		beg = args->region->beg[gn];
+		end = args->region->end[gn];
+	}
+	/* end is known to be positive once beg >= 0 && beg < end holds, so the cast is safe */
+	if (beg >= 0 && beg < end && (size_t) end <= args->search_len) {
+		add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
+	} else {
+		add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
+	}
+
+	return 0;
+}
+/* }}} */
+
+/*
+ * Helper for _php_mb_regex_ereg_replace_exec
+ *
+ * Appends the replacement template `replace` to pbuf, expanding backreferences to
+ * the groups of the current match:
+ *   \0 .. \9              numbered group
+ *   \k<name>  \k'name'    named group
+ *   \k<N>     \k'N'       numbered group, N written without a leading zero
+ * \1 .. \9 and the numeric \k forms are only honoured while
+ * onig_noname_group_capture_is_active() reports true for the pattern. Escapes that
+ * are malformed, unknown, or that refer to a group number outside regs are copied
+ * to the output unchanged; a valid group without a non-empty capture expands to
+ * nothing. No byte beyond replace + replace_len is ever read or copied.
+ *
+ * pbuf         output buffer the expansion is appended to
+ * subject      string the offsets in regs refer to
+ * subject_len  length of subject in bytes
+ * replace      replacement template
+ * replace_len  length of replace in bytes
+ * regexp       compiled pattern of the current match
+ * regs         capture offsets of the current match
+ * enc          encoding used to step through replace character by character
+ */
+/* {{{ mb_regex_substitute */
+static inline void mb_regex_substitute(
+	smart_str *pbuf,
+	const char *subject,
+	size_t subject_len,
+	char *replace,
+	size_t replace_len,
+	php_mb_regex_t *regexp,
+	OnigRegion *regs,
+	const mbfl_encoding *enc
+) {
+	char *p, *sp, *eos;
+	int no; /* backreference group number */
+	int clen; /* byte-length of the current character */
+
+	p = replace;
+	eos = replace + replace_len;
+
+	while (p < eos) {
+		clen = (int) php_mb_mbchar_bytes_ex(p, enc);
+		if (clen > eos - p) {
+			/* truncated trailing character: never copy bytes from beyond the template */
+			clen = (int) (eos - p);
+		}
+		if (clen != 1 || p[0] != '\\') {
+			/* skip anything that's not an ascii backslash */
+			smart_str_appendl(pbuf, p, clen);
+			p += clen;
+			continue;
+		}
+		sp = p++; /* save position of the backslash */
+		clen = (p < eos) ? (int) php_mb_mbchar_bytes_ex(p, enc) : 0;
+		if (clen != 1) {
+			/* trailing backslash, or backslash followed by multibyte char: emit only the backslash */
+			smart_str_appendl(pbuf, sp, p - sp);
+			continue;
+		}
+		no = -1;
+		switch (p[0]) {
+			case '0':
+				no = 0;
+				p++;
+				break;
+			case '1': case '2': case '3': case '4':
+			case '5': case '6': case '7': case '8': case '9':
+				if (!onig_noname_group_capture_is_active(regexp)) {
+					/*
+					 * FIXME:
+					 * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
+					 * For now we just ignore them, but in the future we might want to raise a warning
+					 * and abort the whole replace operation.
+					 */
+					p++;
+					smart_str_appendl(pbuf, sp, p - sp);
+					continue;
+				}
+				no = p[0] - '0';
+				p++;
+				break;
+			case 'k': {
+				char delim, *name, *name_end;
+				int maybe_num = 1;
+				unsigned long num;
+
+				p++;
+				clen = (p < eos) ? (int) php_mb_mbchar_bytes_ex(p, enc) : 0;
+				if (clen != 1 || (p[0] != '<' && p[0] != '\'')) {
+					/*
+					 * not a backref delimiter; the clamp keeps a trailing "\k" or a
+					 * truncated character from dragging bytes past eos into the output
+					 */
+					p = (clen <= eos - p) ? p + clen : eos;
+					smart_str_appendl(pbuf, sp, p - sp);
+					continue;
+				}
+				/* try to consume everything until next delimiter */
+				delim = p[0] == '<' ? '>' : '\'';
+				name_end = name = p + 1;
+				while (name_end < eos) {
+					clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
+					if (clen != 1) {
+						name_end = (clen <= eos - name_end) ? name_end + clen : eos;
+						maybe_num = 0;
+						continue;
+					}
+					if (name_end[0] == delim) break;
+					/* <ctype.h> functions require a value representable as unsigned char */
+					if (maybe_num && !isdigit((unsigned char) name_end[0])) maybe_num = 0;
+					name_end++;
+				}
+				/* step over the closing delimiter only when there is one */
+				p = (name_end < eos) ? name_end + 1 : eos;
+				if (name_end - name < 1 || name_end >= eos) {
+					/* the backref was empty or we failed to find the end delimiter */
+					smart_str_appendl(pbuf, sp, p - sp);
+					continue;
+				}
+				/* we have either a name or a number */
+				if (!maybe_num) {
+					no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
+					break;
+				}
+				if (!onig_noname_group_capture_is_active(regexp)) {
+					/* see above note on mixing numbered & named backrefs */
+					smart_str_appendl(pbuf, sp, p - sp);
+					continue;
+				}
+				if (name_end - name > 1 && name[0] == '0') {
+					/* 01 is not a valid number */
+					break;
+				}
+				num = strtoul(name, NULL, 10);
+				if (num < (unsigned long) regs->num_regs) {
+					/* range-checked before narrowing so a huge number cannot wrap onto a valid group */
+					no = (int) num;
+				}
+				break;
+			}
+			default:
+				/* We're not treating \ as an escape character and will interpret something like
+				 * \\1 as \ followed by \1, rather than \\ followed by 1. This is because this
+				 * function has not supported escaping of backslashes historically. */
+				smart_str_appendl(pbuf, sp, p - sp);
+				continue;
+		}
+		if (no < 0 || no >= regs->num_regs) {
+			/* invalid group number reference, keep the escape sequence in the output */
+			smart_str_appendl(pbuf, sp, p - sp);
+			continue;
+		}
+		if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
+			smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
+		}
+	}
+}
+/* }}} */
+
/*
* php functions
*/
@@ -700,23 +866,19 @@ static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
RETURN_FALSE;
}
+ if (array != NULL) {
+ zval_ptr_dtor(array);
+ array_init(array);
+ }
+
if (!php_mb_check_encoding(
- string,
- string_len,
- _php_mb_regex_mbctype2name(MBREX(current_mbctype))
+ string,
+ string_len,
+ _php_mb_regex_mbctype2name(MBREX(current_mbctype))
)) {
- if (array != NULL) {
- zval_dtor(array);
- array_init(array);
- }
RETURN_FALSE;
}
- if (array != NULL) {
- zval_dtor(array);
- array_init(array);
- }
-
options = MBREX(regex_default_options);
if (icase) {
options |= ONIG_OPTION_IGNORECASE;
@@ -766,6 +928,11 @@ static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
add_index_bool(array, i, 0);
}
}
+
+ if (onig_number_of_names(re) > 0) {
+ mb_regex_groups_iter_args args = {array, string, string_len, regs};
+ onig_foreach_name(re, mb_regex_groups_iter, &args);
+ }
}
if (match_len == 0) {
@@ -812,14 +979,12 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
char *string;
size_t string_len;
- char *p;
php_mb_regex_t *re;
OnigSyntaxType *syntax;
OnigRegion *regs = NULL;
smart_str out_buf = {0};
smart_str eval_buf = {0};
smart_str *pbuf;
- size_t i;
int err, eval, n;
OnigUChar *pos;
OnigUChar *string_lim;
@@ -929,38 +1094,11 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
break;
}
if (err >= 0) {
-#if moriyoshi_0
- if (regs->beg[0] == regs->end[0]) {
- php_error_docref(NULL, E_WARNING, "Empty regular expression");
- break;
- }
-#endif
/* copy the part of the string before the match */
smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
if (!is_callable) {
- /* copy replacement and backrefs */
- i = 0;
- p = replace;
- while (i < replace_len) {
- int fwd = (int) php_mb_mbchar_bytes_ex(p, enc);
- n = -1;
- if ((replace_len - i) >= 2 && fwd == 1 &&
- p[0] == '\\' && p[1] >= '0' && p[1] <= '9') {
- n = p[1] - '0';
- }
- if (n >= 0 && n < regs->num_regs) {
- if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && (size_t)regs->end[n] <= string_len) {
- smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]);
- }
- p += 2;
- i += 2;
- } else {
- smart_str_appendl(pbuf, p, fwd);
- p += fwd;
- i += fwd;
- }
- }
+ mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
}
if (eval) {
@@ -990,7 +1128,7 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
smart_str_appendl(&out_buf, Z_STRVAL(v), Z_STRLEN(v));
/* Clean up */
smart_str_free(&eval_buf);
- zval_dtor(&v);
+ zval_ptr_dtor_str(&v);
} else if (is_callable) {
zval args[1];
zval subpats, retval;
@@ -1000,6 +1138,10 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
for (i = 0; i < regs->num_regs; i++) {
add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
}
+ if (onig_number_of_names(re) > 0) {
+ mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
+ onig_foreach_name(re, mb_regex_groups_iter, &args);
+ }
ZVAL_COPY_VALUE(&args[0], &subpats);
/* null terminate buffer */
@@ -1096,7 +1238,7 @@ PHP_FUNCTION(mb_split)
OnigUChar *pos, *chunk_pos;
size_t string_len;
- int n, err;
+ int err;
zend_long count = -1;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) {
@@ -1118,16 +1260,16 @@ PHP_FUNCTION(mb_split)
err = 0;
regs = onig_region_new();
/* churn through str, generating array entries as we go */
- while (count != 0 && (pos - (OnigUChar *)string) < (ptrdiff_t)string_len) {
- int beg, end;
+ while (count != 0 && (size_t)(pos - (OnigUChar *)string) < string_len) {
+ size_t beg, end;
err = onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0);
if (err < 0) {
break;
}
beg = regs->beg[0], end = regs->end[0];
/* add it to the array */
- if ((pos - (OnigUChar *)string) < end) {
- if ((size_t)beg < string_len && beg >= (chunk_pos - (OnigUChar *)string)) {
+ if ((size_t)(pos - (OnigUChar *)string) < end) {
+ if (beg < string_len && beg >= (size_t)(chunk_pos - (OnigUChar *)string)) {
add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos));
--count;
} else {
@@ -1149,13 +1291,13 @@ PHP_FUNCTION(mb_split)
OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
onig_error_code_to_str(err_str, err);
php_error_docref(NULL, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str);
- zval_dtor(return_value);
+ zend_array_destroy(Z_ARR_P(return_value));
RETURN_FALSE;
}
/* otherwise we just have one last element to add to the array */
- n = ((OnigUChar *)(string + string_len) - chunk_pos);
- if (n > 0) {
+ if ((OnigUChar *)(string + string_len) > chunk_pos) {
+ size_t n = ((OnigUChar *)(string + string_len) - chunk_pos);
add_next_index_stringl(return_value, (char *)chunk_pos, n);
} else {
add_next_index_stringl(return_value, "", 0);
@@ -1217,7 +1359,8 @@ _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
{
char *arg_pattern = NULL, *arg_options = NULL;
size_t arg_pattern_len, arg_options_len;
- int n, i, err, pos, len, beg, end;
+ int err;
+ size_t n, i, pos, len, beg, end;
OnigOptionType option;
OnigUChar *str;
OnigSyntaxType *syntax;
@@ -1293,6 +1436,15 @@ _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
add_index_bool(return_value, i, 0);
}
}
+ if (onig_number_of_names(MBREX(search_re)) > 0) {
+ mb_regex_groups_iter_args args = {
+ return_value,
+ Z_STRVAL(MBREX(search_str)),
+ Z_STRLEN(MBREX(search_str)),
+ MBREX(search_regs)
+ };
+ onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
+ }
break;
default:
RETVAL_TRUE;
@@ -1341,7 +1493,7 @@ PHP_FUNCTION(mb_ereg_search_regs)
Initialize string and regular expression for search. */
PHP_FUNCTION(mb_ereg_search_init)
{
- size_t argc = ZEND_NUM_ARGS();
+ int argc = ZEND_NUM_ARGS();
zend_string *arg_str;
char *arg_pattern = NULL, *arg_options = NULL;
size_t arg_pattern_len = 0, arg_options_len = 0;
@@ -1401,7 +1553,7 @@ PHP_FUNCTION(mb_ereg_search_init)
Get matched substring of the last time */
PHP_FUNCTION(mb_ereg_search_getregs)
{
- int n, i, len, beg, end;
+ size_t n, i, len, beg, end;
OnigUChar *str;
if (MBREX(search_regs) != NULL && Z_TYPE(MBREX(search_str)) == IS_STRING) {
@@ -1419,6 +1571,15 @@ PHP_FUNCTION(mb_ereg_search_getregs)
add_index_bool(return_value, i, 0);
}
}
+ if (onig_number_of_names(MBREX(search_re)) > 0) {
+ mb_regex_groups_iter_args args = {
+ return_value,
+ Z_STRVAL(MBREX(search_str)),
+ len,
+ MBREX(search_regs)
+ };
+ onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
+ }
} else {
RETVAL_FALSE;
}