summaryrefslogtreecommitdiff
path: root/ext/mbstring/php_unicode.c
diff options
context:
space:
mode:
authorNikita Popov <nikita.ppv@gmail.com>2017-07-27 22:48:00 +0200
committerNikita Popov <nikita.ppv@gmail.com>2017-07-28 12:32:50 +0200
commit582a65b06f3de125887cab02d5c561168fcf94bc (patch)
tree8e1420959ee8f8216227cbc2f15e2fef5ac6d569 /ext/mbstring/php_unicode.c
parent9ac7c1e71d956ddac63b042be6ad8b105e584c10 (diff)
downloadphp-git-582a65b06f3de125887cab02d5c561168fcf94bc.tar.gz
Implement full case mapping
Implement full case mapping according to SpecialCasing.txt and also full case folding according to CaseFolding.txt (F). There are a number of caveats: * Only language-agnostic and unconditional full case mapping is implemented. The only language-agnostic conditional case mapping rule relates to Greek sigma in final position (Final_Sigma). Correctly handling this requires both arbitrary lookahead and lookbehind, which would require some larger changes to how the case mapping is implemented. This is a possible future extension. * The only language-specific handling that is implemented is for Turkish dotted/undotted Is, if the ISO-8859-9 encoding is used. This matches the previous behavior and makes sure that no codepoints not supported by the encoding are produced. A future extension would be to also handle the Turkish mappings specified by SpecialCasing.txt based on the mbfl internal language. * Full case folding is implemented, but case-insensitive mb_* operations continue to use simple case folding. The reason is that full case folding of the haystack string may change the position at which a match occurred. This would have to be mapped back into the position in the original string. * mb_convert_case() exposes both the full and the simple case mapping / folding, where full is the default. The constants are: * MB_CASE_LOWER (used by mb_strtolower) * MB_CASE_UPPER (used by mb_strtoupper) * MB_CASE_TITLE * MB_CASE_FOLD * MB_CASE_LOWER_SIMPLE * MB_CASE_UPPER_SIMPLE * MB_CASE_TITLE_SIMPLE * MB_CASE_FOLD_SIMPLE (used by case-insensitive operations)
Diffstat (limited to 'ext/mbstring/php_unicode.c')
-rw-r--r--ext/mbstring/php_unicode.c157
1 files changed, 139 insertions, 18 deletions
diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c
index d9f2f23634..1bd348b3d2 100644
--- a/ext/mbstring/php_unicode.c
+++ b/ext/mbstring/php_unicode.c
@@ -145,13 +145,13 @@ static inline unsigned mph_lookup(
mph_lookup(code, _uccase_##type##_g, _uccase_##type##_g_size, \
_uccase_##type##_table, _uccase_##type##_table_size)
-unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc)
+static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
{
if (code < 0x80) {
/* Fast path for ASCII */
if (code >= 0x61 && code <= 0x7A) {
- if (enc == mbfl_no_encoding_8859_9 && code == 0x0069L) {
- return 0x0130L;
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
+ return 0x130;
}
return code - 0x20;
}
@@ -165,12 +165,12 @@ unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc)
}
}
-unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc)
+static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
{
if (code < 0x80) {
/* Fast path for ASCII */
if (code >= 0x41 && code <= 0x5A) {
- if (enc == mbfl_no_encoding_8859_9 && code == 0x0049L) {
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
return 0x0131L;
}
return code + 0x20;
@@ -179,13 +179,16 @@ unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc)
} else {
unsigned new_code = CASE_LOOKUP(code, lower);
if (new_code != CODE_NOT_FOUND) {
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
+ return 0x69;
+ }
return new_code;
}
return code;
}
}
-unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc)
+static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
{
unsigned new_code = CASE_LOOKUP(code, title);
if (new_code != CODE_NOT_FOUND) {
@@ -193,16 +196,16 @@ unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc)
}
/* No dedicated title-case variant, use to-upper instead */
- return php_unicode_toupper(code, enc);
+ return php_unicode_toupper_raw(code, enc);
}
-unsigned php_unicode_tofold(unsigned code, enum mbfl_no_encoding enc)
+unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
{
if (code < 0x80) {
/* Fast path for ASCII */
if (code >= 0x41 && code <= 0x5A) {
- if (enc == mbfl_no_encoding_8859_9 && code == 0x0049L) {
- return 0x0131L;
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
+ return 0x131;
}
return code + 0x20;
}
@@ -210,12 +213,93 @@ unsigned php_unicode_tofold(unsigned code, enum mbfl_no_encoding enc)
} else {
unsigned new_code = CASE_LOOKUP(code, fold);
if (new_code != CODE_NOT_FOUND) {
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
+ return 0x69;
+ }
return new_code;
}
return code;
}
}
+static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
+ code = php_unicode_tolower_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ return _uccase_extra_table[code & 0xffffff];
+ }
+ return code;
+}
+static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
+ code = php_unicode_toupper_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ return _uccase_extra_table[code & 0xffffff];
+ }
+ return code;
+}
+static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
+ code = php_unicode_totitle_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ return _uccase_extra_table[code & 0xffffff];
+ }
+ return code;
+}
+static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
+ code = php_unicode_tofold_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ return _uccase_extra_table[code & 0xffffff];
+ }
+ return code;
+}
+
+static inline unsigned php_unicode_tolower_full(
+ unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
+ code = php_unicode_tolower_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ unsigned len = code >> 24;
+ const unsigned *p = &_uccase_extra_table[code & 0xffffff];
+ memcpy(out, p + 1, len * sizeof(unsigned));
+ return len;
+ }
+ *out = code;
+ return 1;
+}
+static inline unsigned php_unicode_toupper_full(
+ unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
+ code = php_unicode_toupper_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ unsigned len = code >> 24;
+ const unsigned *p = &_uccase_extra_table[code & 0xffffff];
+ memcpy(out, p + 1, len * sizeof(unsigned));
+ return len;
+ }
+ *out = code;
+ return 1;
+}
+static inline unsigned php_unicode_totitle_full(
+ unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
+ code = php_unicode_totitle_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ unsigned len = code >> 24;
+ const unsigned *p = &_uccase_extra_table[code & 0xffffff];
+ memcpy(out, p + 1, len * sizeof(unsigned));
+ return len;
+ }
+ *out = code;
+ return 1;
+}
+static inline unsigned php_unicode_tofold_full(
+ unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
+ code = php_unicode_tofold_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ unsigned len = code >> 24;
+ const unsigned *p = &_uccase_extra_table[code & 0xffffff];
+ memcpy(out, p + 1, len * sizeof(unsigned));
+ return len;
+ }
+ *out = code;
+ return 1;
+}
+
struct convert_case_data {
mbfl_convert_filter *next_filter;
enum mbfl_no_encoding no_encoding;
@@ -226,39 +310,76 @@ struct convert_case_data {
static int convert_case_filter(int c, void *void_data)
{
struct convert_case_data *data = (struct convert_case_data *) void_data;
+ unsigned out[3];
+ unsigned len;
switch (data->case_mode) {
+ case PHP_UNICODE_CASE_UPPER_SIMPLE:
+ out[0] = php_unicode_toupper_simple(c, data->no_encoding);
+ len = 1;
+ break;
+
case PHP_UNICODE_CASE_UPPER:
- c = php_unicode_toupper(c, data->no_encoding);
+ len = php_unicode_toupper_full(c, data->no_encoding, out);
+ break;
+
+ case PHP_UNICODE_CASE_LOWER_SIMPLE:
+ out[0] = php_unicode_tolower_simple(c, data->no_encoding);
+ len = 1;
break;
case PHP_UNICODE_CASE_LOWER:
- c = php_unicode_tolower(c, data->no_encoding);
+ len = php_unicode_tolower_full(c, data->no_encoding, out);
break;
+ case PHP_UNICODE_CASE_FOLD:
+ len = php_unicode_tofold_full(c, data->no_encoding, out);
+ break;
+
+ case PHP_UNICODE_CASE_FOLD_SIMPLE:
+ out[0] = php_unicode_tofold_simple(c, data->no_encoding);
+ len = 1;
+ break;
+
+ case PHP_UNICODE_CASE_TITLE_SIMPLE:
case PHP_UNICODE_CASE_TITLE:
{
int res = php_unicode_is_prop(c,
UC_MN, UC_ME, UC_CF, UC_LM, UC_SK, UC_LU, UC_LL, UC_LT, UC_PO, UC_OS, -1);
+ out[0] = c;
+ len = 1;
if (data->title_mode) {
if (res) {
- c = php_unicode_tolower(c, data->no_encoding);
+ if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
+ out[0] = php_unicode_tolower_simple(c, data->no_encoding);
+ len = 1;
+ } else {
+ len = php_unicode_tolower_full(c, data->no_encoding, out);
+ }
} else {
data->title_mode = 0;
}
} else {
if (res) {
data->title_mode = 1;
- c = php_unicode_totitle(c, data->no_encoding);
+ if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
+ out[0] = php_unicode_totitle_simple(c, data->no_encoding);
+ len = 1;
+ } else {
+ len = php_unicode_totitle_full(c, data->no_encoding, out);
+ }
}
}
break;
}
-
- case PHP_UNICODE_CASE_FOLD:
- c = php_unicode_tofold(c, data->no_encoding);
+ default:
+ assert(0);
break;
}
- return (*data->next_filter->filter_function)(c, data->next_filter);
+
+ for (unsigned i = 0; i < len; i++) {
+ (*data->next_filter->filter_function)(out[i], data->next_filter);
+ }
+ return 0;
}
MBSTRING_API char *php_unicode_convert_case(