diff options
author | Nikita Popov <nikita.ppv@gmail.com> | 2017-07-27 22:48:00 +0200 |
---|---|---|
committer | Nikita Popov <nikita.ppv@gmail.com> | 2017-07-28 12:32:50 +0200 |
commit | 582a65b06f3de125887cab02d5c561168fcf94bc (patch) | |
tree | 8e1420959ee8f8216227cbc2f15e2fef5ac6d569 /ext/mbstring/php_unicode.c | |
parent | 9ac7c1e71d956ddac63b042be6ad8b105e584c10 (diff) | |
download | php-git-582a65b06f3de125887cab02d5c561168fcf94bc.tar.gz |
Implement full case mapping
Implement full case mapping according to SpecialCasing.txt and
also full case folding according to CaseFolding.txt (F). There
are a number of caveats:
* Only language-agnostic and unconditional full case mapping
is implemented. The only language-agnostic conditional case
mapping rule relates to Greek sigma in final position
(Final_Sigma). Correctly handling this requires both arbitrary
lookahead and lookbehind, which would require some larger
changes to how the case mapping is implemented. This is a
possible future extension.
* The only language-specific handling that is implemented is
for Turkish dotted/undotted Is, if the ISO-8859-9 encoding
is used. This matches the previous behavior and ensures
that only codepoints supported by the encoding are
produced. A future extension would be to also handle the
Turkish mappings specified by SpecialCasing.txt based on
the mbfl internal language.
* Full case folding is implemented, but case-insensitive mb_*
operations continue to use simple case folding. The reason is
that full case folding of the haystack string may change the
position at which a match occurred. This would have to be
mapped back into the position in the original string.
* mb_convert_case() exposes both the full and the simple case
mapping / folding, where full is the default. The constants
are:
* MB_CASE_LOWER (used by mb_strtolower)
* MB_CASE_UPPER (used by mb_strtoupper)
* MB_CASE_TITLE
* MB_CASE_FOLD
* MB_CASE_LOWER_SIMPLE
* MB_CASE_UPPER_SIMPLE
* MB_CASE_TITLE_SIMPLE
* MB_CASE_FOLD_SIMPLE (used by case-insensitive operations)
Diffstat (limited to 'ext/mbstring/php_unicode.c')
-rw-r--r-- | ext/mbstring/php_unicode.c | 157 |
1 files changed, 139 insertions, 18 deletions
diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c index d9f2f23634..1bd348b3d2 100644 --- a/ext/mbstring/php_unicode.c +++ b/ext/mbstring/php_unicode.c @@ -145,13 +145,13 @@ static inline unsigned mph_lookup( mph_lookup(code, _uccase_##type##_g, _uccase_##type##_g_size, \ _uccase_##type##_table, _uccase_##type##_table_size) -unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc) +static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc) { if (code < 0x80) { /* Fast path for ASCII */ if (code >= 0x61 && code <= 0x7A) { - if (enc == mbfl_no_encoding_8859_9 && code == 0x0069L) { - return 0x0130L; + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) { + return 0x130; } return code - 0x20; } @@ -165,12 +165,12 @@ unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc) } } -unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc) +static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc) { if (code < 0x80) { /* Fast path for ASCII */ if (code >= 0x41 && code <= 0x5A) { - if (enc == mbfl_no_encoding_8859_9 && code == 0x0049L) { + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) { return 0x0131L; } return code + 0x20; @@ -179,13 +179,16 @@ unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc) } else { unsigned new_code = CASE_LOOKUP(code, lower); if (new_code != CODE_NOT_FOUND) { + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) { + return 0x69; + } return new_code; } return code; } } -unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc) +static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc) { unsigned new_code = CASE_LOOKUP(code, title); if (new_code != CODE_NOT_FOUND) { @@ -193,16 +196,16 @@ unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc) } /* No dedicated title-case variant, use to-upper instead */ - return 
php_unicode_toupper(code, enc); + return php_unicode_toupper_raw(code, enc); } -unsigned php_unicode_tofold(unsigned code, enum mbfl_no_encoding enc) +unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc) { if (code < 0x80) { /* Fast path for ASCII */ if (code >= 0x41 && code <= 0x5A) { - if (enc == mbfl_no_encoding_8859_9 && code == 0x0049L) { - return 0x0131L; + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) { + return 0x131; } return code + 0x20; } @@ -210,12 +213,93 @@ unsigned php_unicode_tofold(unsigned code, enum mbfl_no_encoding enc) } else { unsigned new_code = CASE_LOOKUP(code, fold); if (new_code != CODE_NOT_FOUND) { + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) { + return 0x69; + } return new_code; } return code; } } +static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) { + code = php_unicode_tolower_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + return _uccase_extra_table[code & 0xffffff]; + } + return code; +} +static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) { + code = php_unicode_toupper_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + return _uccase_extra_table[code & 0xffffff]; + } + return code; +} +static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) { + code = php_unicode_totitle_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + return _uccase_extra_table[code & 0xffffff]; + } + return code; +} +static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) { + code = php_unicode_tofold_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + return _uccase_extra_table[code & 0xffffff]; + } + return code; +} + +static inline unsigned php_unicode_tolower_full( + unsigned code, enum mbfl_no_encoding enc, unsigned *out) { + code = php_unicode_tolower_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + unsigned len = code >> 
24; + const unsigned *p = &_uccase_extra_table[code & 0xffffff]; + memcpy(out, p + 1, len * sizeof(unsigned)); + return len; + } + *out = code; + return 1; +} +static inline unsigned php_unicode_toupper_full( + unsigned code, enum mbfl_no_encoding enc, unsigned *out) { + code = php_unicode_toupper_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + unsigned len = code >> 24; + const unsigned *p = &_uccase_extra_table[code & 0xffffff]; + memcpy(out, p + 1, len * sizeof(unsigned)); + return len; + } + *out = code; + return 1; +} +static inline unsigned php_unicode_totitle_full( + unsigned code, enum mbfl_no_encoding enc, unsigned *out) { + code = php_unicode_totitle_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + unsigned len = code >> 24; + const unsigned *p = &_uccase_extra_table[code & 0xffffff]; + memcpy(out, p + 1, len * sizeof(unsigned)); + return len; + } + *out = code; + return 1; +} +static inline unsigned php_unicode_tofold_full( + unsigned code, enum mbfl_no_encoding enc, unsigned *out) { + code = php_unicode_tofold_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + unsigned len = code >> 24; + const unsigned *p = &_uccase_extra_table[code & 0xffffff]; + memcpy(out, p + 1, len * sizeof(unsigned)); + return len; + } + *out = code; + return 1; +} + struct convert_case_data { mbfl_convert_filter *next_filter; enum mbfl_no_encoding no_encoding; @@ -226,39 +310,76 @@ struct convert_case_data { static int convert_case_filter(int c, void *void_data) { struct convert_case_data *data = (struct convert_case_data *) void_data; + unsigned out[3]; + unsigned len; switch (data->case_mode) { + case PHP_UNICODE_CASE_UPPER_SIMPLE: + out[0] = php_unicode_toupper_simple(c, data->no_encoding); + len = 1; + break; + case PHP_UNICODE_CASE_UPPER: - c = php_unicode_toupper(c, data->no_encoding); + len = php_unicode_toupper_full(c, data->no_encoding, out); + break; + + case PHP_UNICODE_CASE_LOWER_SIMPLE: + out[0] = php_unicode_tolower_simple(c, data->no_encoding); + 
len = 1; break; case PHP_UNICODE_CASE_LOWER: - c = php_unicode_tolower(c, data->no_encoding); + len = php_unicode_tolower_full(c, data->no_encoding, out); break; + case PHP_UNICODE_CASE_FOLD: + len = php_unicode_tofold_full(c, data->no_encoding, out); + break; + + case PHP_UNICODE_CASE_FOLD_SIMPLE: + out[0] = php_unicode_tofold_simple(c, data->no_encoding); + len = 1; + break; + + case PHP_UNICODE_CASE_TITLE_SIMPLE: case PHP_UNICODE_CASE_TITLE: { int res = php_unicode_is_prop(c, UC_MN, UC_ME, UC_CF, UC_LM, UC_SK, UC_LU, UC_LL, UC_LT, UC_PO, UC_OS, -1); + out[0] = c; + len = 1; if (data->title_mode) { if (res) { - c = php_unicode_tolower(c, data->no_encoding); + if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) { + out[0] = php_unicode_tolower_simple(c, data->no_encoding); + len = 1; + } else { + len = php_unicode_tolower_full(c, data->no_encoding, out); + } } else { data->title_mode = 0; } } else { if (res) { data->title_mode = 1; - c = php_unicode_totitle(c, data->no_encoding); + if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) { + out[0] = php_unicode_totitle_simple(c, data->no_encoding); + len = 1; + } else { + len = php_unicode_totitle_full(c, data->no_encoding, out); + } } } break; } - - case PHP_UNICODE_CASE_FOLD: - c = php_unicode_tofold(c, data->no_encoding); + default: + assert(0); break; } - return (*data->next_filter->filter_function)(c, data->next_filter); + + for (unsigned i = 0; i < len; i++) { + (*data->next_filter->filter_function)(out[i], data->next_filter); + } + return 0; } MBSTRING_API char *php_unicode_convert_case( |