From 8f1782678e9a790cf696da1268a7dfcb38e22312 Mon Sep 17 00:00:00 2001 From: ju1ius Date: Sat, 30 Jul 2016 06:05:59 +0200 Subject: adds support for named subpatterns to `mb_ereg_replace` Named subpatterns are now passed to `mb_ereg_replace_callback`. This commit also adds a subset of the oniguruma back-reference syntax for replacements: * `\k` and `\k'name'` for named subpatterns. * `\k` and `\k'n'` for numbered subpatterns These last two notations allow referencing numbered groups where n > 9. --- ext/mbstring/php_mbregex.c | 165 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 135 insertions(+), 30 deletions(-) (limited to 'ext/mbstring/php_mbregex.c') diff --git a/ext/mbstring/php_mbregex.c b/ext/mbstring/php_mbregex.c index 957ee484ba..d048376062 100644 --- a/ext/mbstring/php_mbregex.c +++ b/ext/mbstring/php_mbregex.c @@ -690,6 +690,136 @@ mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngrou } /* }}} */ +/* + * Helper for _php_mb_regex_ereg_replace_exec + */ +/* {{{ mb_regex_substitute */ +static inline void mb_regex_substitute( + smart_str *pbuf, + const char *subject, + size_t subject_len, + char *replace, + size_t replace_len, + php_mb_regex_t *regexp, + OnigRegion *regs, + const mbfl_encoding *enc +) { + char *p, *sp, *eos; + int no; /* bakreference group number */ + int clen; /* byte-length of the current character */ + + p = replace; + eos = replace + replace_len; + + while (p < eos) { + clen = (int) php_mb_mbchar_bytes_ex(p, enc); + if (clen != 1 || p == eos || p[0] != '\\') { + /* skip anything that's not an ascii backslash */ + smart_str_appendl(pbuf, p, clen); + p += clen; + continue; + } + sp = p; /* save position */ + clen = (int) php_mb_mbchar_bytes_ex(++p, enc); + if (clen != 1 || p == eos) { + /* skip escaped multibyte char */ + p += clen; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + no = -1; + switch (p[0]) { + case '0': + no = 0; + p++; + break; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + if (!onig_noname_group_capture_is_active(regexp)) { + /* + * FIXME: + * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern. + * For now we just ignore them, but in the future we might want to raise a warning + * and abort the whole replace operation. + */ + p++; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + no = p[0] - '0'; + p++; + break; + case 'k': + clen = (int) php_mb_mbchar_bytes_ex(++p, enc); + if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) { + /* not a backref delimiter */ + p += clen; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + /* try to consume everything until next delimiter */ + char delim = p[0] == '<' ? '>' : '\''; + char *name, *name_end; + char maybe_num = 1; + name_end = name = p + 1; + while (name_end < eos) { + clen = (int) php_mb_mbchar_bytes_ex(name_end, enc); + if (clen != 1) { + name_end += clen; + maybe_num = 0; + continue; + } + if (name_end[0] == delim) break; + if (maybe_num && !isdigit(name_end[0])) maybe_num = 0; + name_end++; + } + p = name_end + 1; + if (name_end - name < 1 || name_end >= eos) { + /* the backref was empty or we failed to find the end delimiter */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + /* we have either a name or a number */ + if (maybe_num) { + if (!onig_noname_group_capture_is_active(regexp)) { + /* see above note on mixing numbered & named backrefs */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + if (name_end - name == 1) { + no = name[0] - '0'; + break; + } + if (name[0] == '0') { + /* 01 is not a valid number */ + break; + } + no = (int) strtoul(name, NULL, 10); + break; + } + no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs); + break; + default: + p += clen; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + if (no < 0 || no >= regs->num_regs) { + /* invalid group number reference, keep the escape sequence in the output */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) { + smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]); + } + } + + if (p < eos) { + smart_str_appendl(pbuf, p, eos - p); + } +} +/* }}} */ + /* * php functions */ @@ -857,14 +987,12 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp char *string; size_t string_len; - char *p; php_mb_regex_t *re; OnigSyntaxType *syntax; OnigRegion *regs = NULL; smart_str out_buf = {0}; smart_str eval_buf = {0}; smart_str *pbuf; - size_t i; int err, eval, n; OnigUChar *pos; OnigUChar *string_lim; @@ -974,38 +1102,11 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp break; } if (err >= 0) { -#if moriyoshi_0 - if (regs->beg[0] == regs->end[0]) { - php_error_docref(NULL, E_WARNING, "Empty regular expression"); - break; - } -#endif /* copy the part of the string before the match */ smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos)); if (!is_callable) { - /* copy replacement and backrefs */ - i = 0; - p = replace; - while (i < replace_len) { - int fwd = (int) php_mb_mbchar_bytes_ex(p, enc); - n = -1; - if ((replace_len - i) >= 2 && fwd == 1 && - p[0] == '\\' && p[1] >= '0' && p[1] <= '9') { - n = p[1] - '0'; - } - if (n >= 0 && n < regs->num_regs) { - if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && (size_t)regs->end[n] <= string_len) { - smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]); - } - p += 2; - i += 2; - } else { - smart_str_appendl(pbuf, p, fwd); - p += fwd; - i += fwd; - } - } + mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc); } if (eval) { @@ -1045,6 +1146,10 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp for (i = 0; i < regs->num_regs; i++) { add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]); } + if (onig_number_of_names(re) > 0) { + mb_regex_groups_iter_args args = {&subpats, string, string_len, regs}; + onig_foreach_name(re, mb_regex_groups_iter, &args); + } ZVAL_COPY_VALUE(&args[0], &subpats); /* null terminate buffer */ -- cgit v1.2.1