summaryrefslogtreecommitdiff
path: root/ext/pcre/php_pcre.c
diff options
context:
space:
mode:
author: Nikita Popov <nikita.ppv@gmail.com> 2019-03-18 12:57:43 +0100
committer: Nikita Popov <nikita.ppv@gmail.com> 2019-03-18 16:58:48 +0100
commit: 2b9acd37f0a13572684dde80e3e56d5c1b2ec045 (patch)
tree: ca4e1541d2e998a2d0d4f9ed5ac5c71418edd45e /ext/pcre/php_pcre.c
parent: 8c9d8c3f667e4cedc7499b49dcc52644dac17c53 (diff)
download: php-git-2b9acd37f0a13572684dde80e3e56d5c1b2ec045.tar.gz
Fixed bug #72685
We currently have a large performance problem when implementing lexers working on UTF-8 strings in PHP. This kind of code tends to perform a large number of matches at different offsets on a single string. This is generally fast. However, if /u mode is used, the full string will be UTF-8 validated on each match. This results in quadratic runtime.

This patch fixes the issue by adding an IS_STR_VALID_UTF8 flag, which is set once we have determined that the string is valid UTF-8; when the flag is set, further validation is skipped.

A limitation of this approach is that we can't set the flag for interned strings. I think this is not a problem for this use case, which will generally work on dynamic data. If we want to use this flag for other purposes as well (mbstring?), then it might be worthwhile to UTF-8 validate strings during interning. But right now this doesn't seem useful.
Diffstat (limited to 'ext/pcre/php_pcre.c')
-rw-r--r-- ext/pcre/php_pcre.c 9
1 files changed, 7 insertions, 2 deletions
diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c
index d18ab6ae32..e1c46842b9 100644
--- a/ext/pcre/php_pcre.c
+++ b/ext/pcre/php_pcre.c
@@ -1104,7 +1104,8 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
}
}
- options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
+ options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)
+ ? 0 : PCRE2_NO_UTF_CHECK;
/* Execute the regular expression. */
#ifdef HAVE_PCRE_JIT_SUPPORT
@@ -1403,8 +1404,12 @@ error:
efree(subpat_names);
}
- /* Did we encounter an error? */
if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
+ /* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
+ if ((pce->compile_options & PCRE2_UTF) && !ZSTR_IS_INTERNED(subject_str)) {
+ GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
+ }
+
RETVAL_LONG(matched);
} else {
RETVAL_FALSE;