summaryrefslogtreecommitdiff
path: root/ext/pcre/php_pcre.c
diff options
context:
space:
mode:
author Nikita Popov <nikita.ppv@gmail.com> 2020-02-07 16:39:06 +0100
committer Nikita Popov <nikita.ppv@gmail.com> 2020-02-07 16:49:28 +0100
commit c9e78e6d338cc46dcadb39b3e2df119fa969e72b (patch)
tree 90fdef22e19c9e9dbe0ea105512399902a01448b /ext/pcre/php_pcre.c
parent 0d49cf4ed25e406f00abefca0e2e3e8fd919bf94 (diff)
download php-git-c9e78e6d338cc46dcadb39b3e2df119fa969e72b.tar.gz
PCRE: Check whether start offset is on char boundary
We need not just the whole string to be UTF-8, but the start position to be on a character boundary as well. Check this by looking for a continuation byte.
Diffstat (limited to 'ext/pcre/php_pcre.c')
-rw-r--r-- ext/pcre/php_pcre.c 18
1 files changed, 17 insertions, 1 deletions
diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c
index 481d564f66..104b8d4c97 100644
--- a/ext/pcre/php_pcre.c
+++ b/ext/pcre/php_pcre.c
@@ -1130,6 +1130,22 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *
}
/* }}} */
+static zend_always_inline zend_bool is_known_valid_utf8(
+ zend_string *subject_str, PCRE2_SIZE start_offset) { /*
+  * Returns 1 only when subject_str carries the IS_STR_VALID_UTF8 flag and
+  * start_offset is either the end of the string or a byte that is not a
+  * UTF-8 continuation byte; returns 0 in every other case.
+  */
+ if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) {
+ /* The string is not flagged as validated, so we don't know whether it
+  * is valid UTF-8 or not. */
+ return 0;
+ }
+
+ if (start_offset == ZSTR_LEN(subject_str)) {
+ /* Degenerate case: Offset points to end of string, so there is no byte
+  * at that position to inspect. */
+ return 1;
+ }
+
+ /* Check that the offset does not point to a UTF-8 continuation byte,
+  * i.e. a byte whose top two bits are 10 (0x80 after masking with 0xc0).
+  * NOTE(review): an offset greater than the string length would index past
+  * the string here; presumably the caller rejects such offsets before
+  * calling this helper -- confirm. */
+ return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80;
+}
+
/* {{{ php_pcre_match_impl() */
PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset)
@@ -1247,7 +1263,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
}
}
- options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)
+ options = (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, start_offset2)
? 0 : PCRE2_NO_UTF_CHECK;
/* Execute the regular expression. */