summaryrefslogtreecommitdiff
path: root/ext/mbstring/mbstring.c
diff options
context:
space:
mode:
authorNikita Popov <nikita.ppv@gmail.com>2017-07-27 22:48:00 +0200
committerNikita Popov <nikita.ppv@gmail.com>2017-07-28 12:32:50 +0200
commit582a65b06f3de125887cab02d5c561168fcf94bc (patch)
tree8e1420959ee8f8216227cbc2f15e2fef5ac6d569 /ext/mbstring/mbstring.c
parent9ac7c1e71d956ddac63b042be6ad8b105e584c10 (diff)
downloadphp-git-582a65b06f3de125887cab02d5c561168fcf94bc.tar.gz
Implement full case mapping
Implement full case mapping according to SpecialCasing.txt and also full case folding according to CaseFolding.txt (F). There are a number of caveats: * Only language-agnostic and unconditional full case mapping is implemented. The only language-agnostic conditional case mapping rule relates to Greek sigma in final position (Final_Sigma). Correctly handling this requires both arbitrary lookahead and lookbehind, which would require some larger changes to how the case mapping is implemented. This is a possible future extension. * The only language-specific handling that is implemented is for Turkish dotted/undotted Is, if the ISO-8859-9 encoding is used. This matches the previous behavior and makes sure that no codepoints not supported by the encoding are produced. A future extension would be to also handle the Turkish mappings specified by SpecialCasing.txt based on the mbfl internal language. * Full case folding is implemented, but case-insensitive mb_* operations continue to use simple case folding. The reason is that full case folding of the haystack string may change the position at which a match occurred. This would have to be mapped back into the position in the original string. * mb_convert_case() exposes both the full and the simple case mapping / folding, where full is the default. The constants are: * MB_CASE_LOWER (used by mb_strtolower) * MB_CASE_UPPER (used by mb_strtolower) * MB_CASE_TITLE * MB_CASE_FOLD * MB_CASE_LOWER_SIMPLE * MB_CASE_UPPER_SIMPLE * MB_CASE_TITLE_SIMPLE * MB_CASE_FOLD_SIMPLE (used by case-insensitive operations)
Diffstat (limited to 'ext/mbstring/mbstring.c')
-rw-r--r--ext/mbstring/mbstring.c17
1 files changed, 15 insertions, 2 deletions
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 139d6d5a79..215c6bd95e 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -1603,6 +1603,11 @@ ZEND_TSRMLS_CACHE_UPDATE();
REGISTER_LONG_CONSTANT("MB_CASE_UPPER", PHP_UNICODE_CASE_UPPER, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("MB_CASE_LOWER", PHP_UNICODE_CASE_LOWER, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("MB_CASE_TITLE", PHP_UNICODE_CASE_TITLE, CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("MB_CASE_FOLD", PHP_UNICODE_CASE_FOLD, CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("MB_CASE_UPPER_SIMPLE", PHP_UNICODE_CASE_UPPER_SIMPLE, CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("MB_CASE_LOWER_SIMPLE", PHP_UNICODE_CASE_LOWER_SIMPLE, CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("MB_CASE_TITLE_SIMPLE", PHP_UNICODE_CASE_TITLE_SIMPLE, CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("MB_CASE_FOLD_SIMPLE", PHP_UNICODE_CASE_FOLD_SIMPLE, CONST_CS | CONST_PERSISTENT);
#if HAVE_MBREGEX
PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
@@ -3356,6 +3361,11 @@ PHP_FUNCTION(mb_convert_case)
if (!enc) {
return;
}
+
+ if (case_mode < 0 || case_mode > PHP_UNICODE_CASE_MODE_MAX) {
+ php_error_docref(NULL, E_WARNING, "Invalid case mode");
+ return;
+ }
newstr = php_unicode_convert_case(case_mode, str, str_len, &ret_len, enc);
@@ -5379,8 +5389,11 @@ MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t ol
needle.encoding = enc;
do {
+ /* We're using simple case-folding here, because we'd have to deal with remapping of
+ * offsets otherwise. */
+
size_t len = 0;
- haystack.val = (unsigned char *)php_unicode_convert_case(PHP_UNICODE_CASE_FOLD, (char *)old_haystack, old_haystack_len, &len, enc);
+ haystack.val = (unsigned char *)php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_haystack, old_haystack_len, &len, enc);
haystack.len = len;
if (!haystack.val) {
@@ -5391,7 +5404,7 @@ MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t ol
break;
}
- needle.val = (unsigned char *)php_unicode_convert_case(PHP_UNICODE_CASE_FOLD, (char *)old_needle, old_needle_len, &len, enc);
+ needle.val = (unsigned char *)php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char *)old_needle, old_needle_len, &len, enc);
needle.len = len;
if (!needle.val) {