From 582a65b06f3de125887cab02d5c561168fcf94bc Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 27 Jul 2017 22:48:00 +0200 Subject: Implement full case mapping Implement full case mapping according to SpecialCasing.txt and also full case folding according to CaseFolding.txt (F). There are a number of caveats: * Only language-agnostic and unconditional full case mapping is implemented. The only language-agnostic conditional case mapping rule relates to Greek sigma in final position (Final_Sigma). Correctly handling this requires both arbitrary lookahead and lookbehind, which would require some larger changes to how the case mapping is implemented. This is a possible future extension. * The only language-specific handling that is implemented is for Turkish dotted/undotted Is, if the ISO-8859-9 encoding is used. This matches the previous behavior and makes sure that no codepoints not supported by the encoding are produced. A future extension would be to also handle the Turkish mappings specified by SpecialCasing.txt based on the mbfl internal language. * Full case folding is implemented, but case-insensitive mb_* operations continue to use simple case folding. The reason is that full case folding of the haystack string may change the position at which a match occurred. This would have to be mapped back into the position in the original string. * mb_convert_case() exposes both the full and the simple case mapping / folding, where full is the default. 
The constants are: * MB_CASE_LOWER (used by mb_strtolower) * MB_CASE_UPPER (used by mb_strtoupper) * MB_CASE_TITLE * MB_CASE_FOLD * MB_CASE_LOWER_SIMPLE * MB_CASE_UPPER_SIMPLE * MB_CASE_TITLE_SIMPLE * MB_CASE_FOLD_SIMPLE (used by case-insensitive operations) --- ext/mbstring/php_unicode.c | 157 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 139 insertions(+), 18 deletions(-) (limited to 'ext/mbstring/php_unicode.c') diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c index d9f2f23634..1bd348b3d2 100644 --- a/ext/mbstring/php_unicode.c +++ b/ext/mbstring/php_unicode.c @@ -145,13 +145,13 @@ static inline unsigned mph_lookup( mph_lookup(code, _uccase_##type##_g, _uccase_##type##_g_size, \ _uccase_##type##_table, _uccase_##type##_table_size) -unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc) +static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc) { if (code < 0x80) { /* Fast path for ASCII */ if (code >= 0x61 && code <= 0x7A) { - if (enc == mbfl_no_encoding_8859_9 && code == 0x0069L) { - return 0x0130L; + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) { + return 0x130; } return code - 0x20; } @@ -165,12 +165,12 @@ unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc) } } -unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc) +static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc) { if (code < 0x80) { /* Fast path for ASCII */ if (code >= 0x41 && code <= 0x5A) { - if (enc == mbfl_no_encoding_8859_9 && code == 0x0049L) { + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) { return 0x0131L; } return code + 0x20; @@ -179,13 +179,16 @@ unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc) } else { unsigned new_code = CASE_LOOKUP(code, lower); if (new_code != CODE_NOT_FOUND) { + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) { + return 0x69; + } return new_code; } 
return code; } } -unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc) +static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc) { unsigned new_code = CASE_LOOKUP(code, title); if (new_code != CODE_NOT_FOUND) { @@ -193,16 +196,16 @@ unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc) } /* No dedicated title-case variant, use to-upper instead */ - return php_unicode_toupper(code, enc); + return php_unicode_toupper_raw(code, enc); } -unsigned php_unicode_tofold(unsigned code, enum mbfl_no_encoding enc) +unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc) { if (code < 0x80) { /* Fast path for ASCII */ if (code >= 0x41 && code <= 0x5A) { - if (enc == mbfl_no_encoding_8859_9 && code == 0x0049L) { - return 0x0131L; + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) { + return 0x131; } return code + 0x20; } @@ -210,12 +213,93 @@ unsigned php_unicode_tofold(unsigned code, enum mbfl_no_encoding enc) } else { unsigned new_code = CASE_LOOKUP(code, fold); if (new_code != CODE_NOT_FOUND) { + if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) { + return 0x69; + } return new_code; } return code; } } +static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) { + code = php_unicode_tolower_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + return _uccase_extra_table[code & 0xffffff]; + } + return code; +} +static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) { + code = php_unicode_toupper_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + return _uccase_extra_table[code & 0xffffff]; + } + return code; +} +static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) { + code = php_unicode_totitle_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + return _uccase_extra_table[code & 0xffffff]; + } + return code; +} +static inline unsigned 
php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) { + code = php_unicode_tofold_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + return _uccase_extra_table[code & 0xffffff]; + } + return code; +} + +static inline unsigned php_unicode_tolower_full( + unsigned code, enum mbfl_no_encoding enc, unsigned *out) { + code = php_unicode_tolower_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + unsigned len = code >> 24; + const unsigned *p = &_uccase_extra_table[code & 0xffffff]; + memcpy(out, p + 1, len * sizeof(unsigned)); + return len; + } + *out = code; + return 1; +} +static inline unsigned php_unicode_toupper_full( + unsigned code, enum mbfl_no_encoding enc, unsigned *out) { + code = php_unicode_toupper_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + unsigned len = code >> 24; + const unsigned *p = &_uccase_extra_table[code & 0xffffff]; + memcpy(out, p + 1, len * sizeof(unsigned)); + return len; + } + *out = code; + return 1; +} +static inline unsigned php_unicode_totitle_full( + unsigned code, enum mbfl_no_encoding enc, unsigned *out) { + code = php_unicode_totitle_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + unsigned len = code >> 24; + const unsigned *p = &_uccase_extra_table[code & 0xffffff]; + memcpy(out, p + 1, len * sizeof(unsigned)); + return len; + } + *out = code; + return 1; +} +static inline unsigned php_unicode_tofold_full( + unsigned code, enum mbfl_no_encoding enc, unsigned *out) { + code = php_unicode_tofold_raw(code, enc); + if (UNEXPECTED(code > 0xffffff)) { + unsigned len = code >> 24; + const unsigned *p = &_uccase_extra_table[code & 0xffffff]; + memcpy(out, p + 1, len * sizeof(unsigned)); + return len; + } + *out = code; + return 1; +} + struct convert_case_data { mbfl_convert_filter *next_filter; enum mbfl_no_encoding no_encoding; @@ -226,39 +310,76 @@ struct convert_case_data { static int convert_case_filter(int c, void *void_data) { struct convert_case_data *data = (struct convert_case_data *) 
void_data; + unsigned out[3]; + unsigned len; switch (data->case_mode) { + case PHP_UNICODE_CASE_UPPER_SIMPLE: + out[0] = php_unicode_toupper_simple(c, data->no_encoding); + len = 1; + break; + case PHP_UNICODE_CASE_UPPER: - c = php_unicode_toupper(c, data->no_encoding); + len = php_unicode_toupper_full(c, data->no_encoding, out); + break; + + case PHP_UNICODE_CASE_LOWER_SIMPLE: + out[0] = php_unicode_tolower_simple(c, data->no_encoding); + len = 1; break; case PHP_UNICODE_CASE_LOWER: - c = php_unicode_tolower(c, data->no_encoding); + len = php_unicode_tolower_full(c, data->no_encoding, out); break; + case PHP_UNICODE_CASE_FOLD: + len = php_unicode_tofold_full(c, data->no_encoding, out); + break; + + case PHP_UNICODE_CASE_FOLD_SIMPLE: + out[0] = php_unicode_tofold_simple(c, data->no_encoding); + len = 1; + break; + + case PHP_UNICODE_CASE_TITLE_SIMPLE: case PHP_UNICODE_CASE_TITLE: { int res = php_unicode_is_prop(c, UC_MN, UC_ME, UC_CF, UC_LM, UC_SK, UC_LU, UC_LL, UC_LT, UC_PO, UC_OS, -1); + out[0] = c; + len = 1; if (data->title_mode) { if (res) { - c = php_unicode_tolower(c, data->no_encoding); + if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) { + out[0] = php_unicode_tolower_simple(c, data->no_encoding); + len = 1; + } else { + len = php_unicode_tolower_full(c, data->no_encoding, out); + } } else { data->title_mode = 0; } } else { if (res) { data->title_mode = 1; - c = php_unicode_totitle(c, data->no_encoding); + if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) { + out[0] = php_unicode_totitle_simple(c, data->no_encoding); + len = 1; + } else { + len = php_unicode_totitle_full(c, data->no_encoding, out); + } } } break; } - - case PHP_UNICODE_CASE_FOLD: - c = php_unicode_tofold(c, data->no_encoding); + default: + assert(0); break; } - return (*data->next_filter->filter_function)(c, data->next_filter); + + for (unsigned i = 0; i < len; i++) { + (*data->next_filter->filter_function)(out[i], data->next_filter); + } + return 0; } MBSTRING_API char 
*php_unicode_convert_case( -- cgit v1.2.1