diff options
author | Nikita Popov <nikita.ppv@gmail.com> | 2020-02-07 16:39:06 +0100 |
---|---|---|
committer | Nikita Popov <nikita.ppv@gmail.com> | 2020-02-07 16:49:28 +0100 |
commit | c9e78e6d338cc46dcadb39b3e2df119fa969e72b (patch) | |
tree | 90fdef22e19c9e9dbe0ea105512399902a01448b | |
parent | 0d49cf4ed25e406f00abefca0e2e3e8fd919bf94 (diff) | |
download | php-git-c9e78e6d338cc46dcadb39b3e2df119fa969e72b.tar.gz |
PCRE: Check whether start offset is on char boundary
It is not enough for the whole subject string to be valid UTF-8; the start
offset must also lie on a character boundary. Check this by testing whether
the byte at the start offset is a UTF-8 continuation byte.
-rw-r--r-- | ext/pcre/php_pcre.c | 18 | ||||
-rw-r--r-- | ext/pcre/tests/bug79241.phpt | 22 |
2 files changed, 39 insertions, 1 deletion
diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index 481d564f66..104b8d4c97 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -1130,6 +1130,22 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ * } /* }}} */ +static zend_always_inline zend_bool is_known_valid_utf8( + zend_string *subject_str, PCRE2_SIZE start_offset) { + if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) { + /* We don't know whether the string is valid UTF-8 or not. */ + return 0; + } + + if (start_offset == ZSTR_LEN(subject_str)) { + /* Degenerate case: Offset points to end of string. */ + return 1; + } + + /* Check that the offset does not point to an UTF-8 continuation byte. */ + return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80; +} + /* {{{ php_pcre_match_impl() */ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value, zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset) @@ -1247,7 +1263,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, } } - options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8) + options = (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, start_offset2) ? 0 : PCRE2_NO_UTF_CHECK; /* Execute the regular expression. 
*/ diff --git a/ext/pcre/tests/bug79241.phpt b/ext/pcre/tests/bug79241.phpt new file mode 100644 index 0000000000..92e5253735 --- /dev/null +++ b/ext/pcre/tests/bug79241.phpt @@ -0,0 +1,22 @@ +--TEST-- +Bug #79241: Segmentation fault on preg_match() +--FILE-- +<?php + +// if "’" string is used directly without json_decode, +// the issue does not reproduce +$text = json_decode('"’"'); + +$pattern = '/\b/u'; + +// it has to be exact two calls to preg_match(), +// with the second call offsetting after the tick symbol +var_dump(preg_match($pattern, $text, $matches, 0, 0)); +var_dump(preg_match($pattern, $text, $matches, 0, 1)); +var_dump(preg_last_error() == PREG_BAD_UTF8_OFFSET_ERROR); + +?> +--EXPECT-- +int(0) +bool(false) +bool(true) |