diff options
author | Nikita Popov <nikita.ppv@gmail.com> | 2017-07-27 22:48:00 +0200 |
---|---|---|
committer | Nikita Popov <nikita.ppv@gmail.com> | 2017-07-28 12:32:50 +0200 |
commit | 582a65b06f3de125887cab02d5c561168fcf94bc (patch) | |
tree | 8e1420959ee8f8216227cbc2f15e2fef5ac6d569 /ext/mbstring/mbstring.c | |
parent | 9ac7c1e71d956ddac63b042be6ad8b105e584c10 (diff) | |
download | php-git-582a65b06f3de125887cab02d5c561168fcf94bc.tar.gz |
Implement full case mapping
Implement full case mapping according to SpecialCasing.txt and
also full case folding according to CaseFolding.txt (F). There
are a number of caveats:
* Only language-agnostic and unconditional full case mapping
is implemented. The only language-agnostic conditional case
mapping rule relates to Greek sigma in final position
(Final_Sigma). Correctly handling this requires both arbitrary
lookahead and lookbehind, which would require some larger
changes to how the case mapping is implemented. This is a
possible future extension.
* The only language-specific handling that is implemented is
for the Turkish dotted and dotless I, and only if the ISO-8859-9
encoding is used. This matches the previous behavior and ensures
that only codepoints supported by the encoding are produced.
A future extension would be to also handle the
Turkish mappings specified by SpecialCasing.txt based on
the mbfl internal language.
* Full case folding is implemented, but case-insensitive mb_*
operations continue to use simple case folding. The reason is
that full case folding of the haystack string may change the
position at which a match occurred. This would have to be
mapped back into the position in the original string.
* mb_convert_case() exposes both the full and the simple case
mapping / folding, where full is the default. The constants
are:
* MB_CASE_LOWER (used by mb_strtolower)
* MB_CASE_UPPER (used by mb_strtoupper)
* MB_CASE_TITLE
* MB_CASE_FOLD
* MB_CASE_LOWER_SIMPLE
* MB_CASE_UPPER_SIMPLE
* MB_CASE_TITLE_SIMPLE
* MB_CASE_FOLD_SIMPLE (used by case-insensitive operations)
Diffstat (limited to 'ext/mbstring/mbstring.c')
-rw-r--r-- | ext/mbstring/mbstring.c | 17 |
1 files changed, 15 insertions, 2 deletions
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 139d6d5a79..215c6bd95e 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1603,6 +1603,11 @@ ZEND_TSRMLS_CACHE_UPDATE(); REGISTER_LONG_CONSTANT("MB_CASE_UPPER", PHP_UNICODE_CASE_UPPER, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("MB_CASE_LOWER", PHP_UNICODE_CASE_LOWER, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("MB_CASE_TITLE", PHP_UNICODE_CASE_TITLE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("MB_CASE_FOLD", PHP_UNICODE_CASE_FOLD, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("MB_CASE_UPPER_SIMPLE", PHP_UNICODE_CASE_UPPER_SIMPLE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("MB_CASE_LOWER_SIMPLE", PHP_UNICODE_CASE_LOWER_SIMPLE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("MB_CASE_TITLE_SIMPLE", PHP_UNICODE_CASE_TITLE_SIMPLE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("MB_CASE_FOLD_SIMPLE", PHP_UNICODE_CASE_FOLD_SIMPLE, CONST_CS | CONST_PERSISTENT); #if HAVE_MBREGEX PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU); @@ -3356,6 +3361,11 @@ PHP_FUNCTION(mb_convert_case) if (!enc) { return; } + + if (case_mode < 0 || case_mode > PHP_UNICODE_CASE_MODE_MAX) { + php_error_docref(NULL, E_WARNING, "Invalid case mode"); + return; + } newstr = php_unicode_convert_case(case_mode, str, str_len, &ret_len, enc); @@ -5379,8 +5389,11 @@ MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t ol needle.encoding = enc; do { + /* We're using simple case-folding here, because we'd have to deal with remapping of + * offsets otherwise. 
*/ + size_t len = 0; - haystack.val = (unsigned char *)php_unicode_convert_case(PHP_UNICODE_CASE_FOLD, (char *)old_haystack, old_haystack_len, &len, enc); + haystack.val = (unsigned char *)php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_haystack, old_haystack_len, &len, enc); haystack.len = len; if (!haystack.val) { @@ -5391,7 +5404,7 @@ MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t ol break; } - needle.val = (unsigned char *)php_unicode_convert_case(PHP_UNICODE_CASE_FOLD, (char *)old_needle, old_needle_len, &len, enc); + needle.val = (unsigned char *)php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_needle, old_needle_len, &len, enc); needle.len = len; if (!needle.val) { |