summaryrefslogtreecommitdiff
path: root/ext/mbstring/php_unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/mbstring/php_unicode.c')
-rw-r--r--ext/mbstring/php_unicode.c157
1 files changed, 139 insertions, 18 deletions
diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c
index d9f2f23634..1bd348b3d2 100644
--- a/ext/mbstring/php_unicode.c
+++ b/ext/mbstring/php_unicode.c
@@ -145,13 +145,13 @@ static inline unsigned mph_lookup(
mph_lookup(code, _uccase_##type##_g, _uccase_##type##_g_size, \
_uccase_##type##_table, _uccase_##type##_table_size)
-unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc)
+static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
{
if (code < 0x80) {
/* Fast path for ASCII */
if (code >= 0x61 && code <= 0x7A) {
- if (enc == mbfl_no_encoding_8859_9 && code == 0x0069L) {
- return 0x0130L;
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
+ return 0x130;
}
return code - 0x20;
}
@@ -165,12 +165,12 @@ unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc)
}
}
-unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc)
+static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
{
if (code < 0x80) {
/* Fast path for ASCII */
if (code >= 0x41 && code <= 0x5A) {
- if (enc == mbfl_no_encoding_8859_9 && code == 0x0049L) {
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
return 0x0131L;
}
return code + 0x20;
@@ -179,13 +179,16 @@ unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc)
} else {
unsigned new_code = CASE_LOOKUP(code, lower);
if (new_code != CODE_NOT_FOUND) {
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
+ return 0x69;
+ }
return new_code;
}
return code;
}
}
-unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc)
+static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
{
unsigned new_code = CASE_LOOKUP(code, title);
if (new_code != CODE_NOT_FOUND) {
@@ -193,16 +196,16 @@ unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc)
}
/* No dedicated title-case variant, use to-upper instead */
- return php_unicode_toupper(code, enc);
+ return php_unicode_toupper_raw(code, enc);
}
-unsigned php_unicode_tofold(unsigned code, enum mbfl_no_encoding enc)
+unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
{
if (code < 0x80) {
/* Fast path for ASCII */
if (code >= 0x41 && code <= 0x5A) {
- if (enc == mbfl_no_encoding_8859_9 && code == 0x0049L) {
- return 0x0131L;
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
+ return 0x131;
}
return code + 0x20;
}
@@ -210,12 +213,93 @@ unsigned php_unicode_tofold(unsigned code, enum mbfl_no_encoding enc)
} else {
unsigned new_code = CASE_LOOKUP(code, fold);
if (new_code != CODE_NOT_FOUND) {
+ if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
+ return 0x69;
+ }
return new_code;
}
return code;
}
}
+static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
+ code = php_unicode_tolower_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ return _uccase_extra_table[code & 0xffffff];
+ }
+ return code;
+}
+static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
+ code = php_unicode_toupper_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ return _uccase_extra_table[code & 0xffffff];
+ }
+ return code;
+}
+static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
+ code = php_unicode_totitle_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ return _uccase_extra_table[code & 0xffffff];
+ }
+ return code;
+}
+static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
+ code = php_unicode_tofold_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ return _uccase_extra_table[code & 0xffffff];
+ }
+ return code;
+}
+
+static inline unsigned php_unicode_tolower_full(
+ unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
+ code = php_unicode_tolower_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ unsigned len = code >> 24;
+ const unsigned *p = &_uccase_extra_table[code & 0xffffff];
+ memcpy(out, p + 1, len * sizeof(unsigned));
+ return len;
+ }
+ *out = code;
+ return 1;
+}
+static inline unsigned php_unicode_toupper_full(
+ unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
+ code = php_unicode_toupper_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ unsigned len = code >> 24;
+ const unsigned *p = &_uccase_extra_table[code & 0xffffff];
+ memcpy(out, p + 1, len * sizeof(unsigned));
+ return len;
+ }
+ *out = code;
+ return 1;
+}
+static inline unsigned php_unicode_totitle_full(
+ unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
+ code = php_unicode_totitle_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ unsigned len = code >> 24;
+ const unsigned *p = &_uccase_extra_table[code & 0xffffff];
+ memcpy(out, p + 1, len * sizeof(unsigned));
+ return len;
+ }
+ *out = code;
+ return 1;
+}
+static inline unsigned php_unicode_tofold_full(
+ unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
+ code = php_unicode_tofold_raw(code, enc);
+ if (UNEXPECTED(code > 0xffffff)) {
+ unsigned len = code >> 24;
+ const unsigned *p = &_uccase_extra_table[code & 0xffffff];
+ memcpy(out, p + 1, len * sizeof(unsigned));
+ return len;
+ }
+ *out = code;
+ return 1;
+}
+
struct convert_case_data {
mbfl_convert_filter *next_filter;
enum mbfl_no_encoding no_encoding;
@@ -226,39 +310,76 @@ struct convert_case_data {
static int convert_case_filter(int c, void *void_data)
{
struct convert_case_data *data = (struct convert_case_data *) void_data;
+ unsigned out[3];
+ unsigned len;
switch (data->case_mode) {
+ case PHP_UNICODE_CASE_UPPER_SIMPLE:
+ out[0] = php_unicode_toupper_simple(c, data->no_encoding);
+ len = 1;
+ break;
+
case PHP_UNICODE_CASE_UPPER:
- c = php_unicode_toupper(c, data->no_encoding);
+ len = php_unicode_toupper_full(c, data->no_encoding, out);
+ break;
+
+ case PHP_UNICODE_CASE_LOWER_SIMPLE:
+ out[0] = php_unicode_tolower_simple(c, data->no_encoding);
+ len = 1;
break;
case PHP_UNICODE_CASE_LOWER:
- c = php_unicode_tolower(c, data->no_encoding);
+ len = php_unicode_tolower_full(c, data->no_encoding, out);
break;
+ case PHP_UNICODE_CASE_FOLD:
+ len = php_unicode_tofold_full(c, data->no_encoding, out);
+ break;
+
+ case PHP_UNICODE_CASE_FOLD_SIMPLE:
+ out[0] = php_unicode_tofold_simple(c, data->no_encoding);
+ len = 1;
+ break;
+
+ case PHP_UNICODE_CASE_TITLE_SIMPLE:
case PHP_UNICODE_CASE_TITLE:
{
int res = php_unicode_is_prop(c,
UC_MN, UC_ME, UC_CF, UC_LM, UC_SK, UC_LU, UC_LL, UC_LT, UC_PO, UC_OS, -1);
+ out[0] = c;
+ len = 1;
if (data->title_mode) {
if (res) {
- c = php_unicode_tolower(c, data->no_encoding);
+ if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
+ out[0] = php_unicode_tolower_simple(c, data->no_encoding);
+ len = 1;
+ } else {
+ len = php_unicode_tolower_full(c, data->no_encoding, out);
+ }
} else {
data->title_mode = 0;
}
} else {
if (res) {
data->title_mode = 1;
- c = php_unicode_totitle(c, data->no_encoding);
+ if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
+ out[0] = php_unicode_totitle_simple(c, data->no_encoding);
+ len = 1;
+ } else {
+ len = php_unicode_totitle_full(c, data->no_encoding, out);
+ }
}
}
break;
}
-
- case PHP_UNICODE_CASE_FOLD:
- c = php_unicode_tofold(c, data->no_encoding);
+ default:
+ assert(0);
break;
}
- return (*data->next_filter->filter_function)(c, data->next_filter);
+
+ for (unsigned i = 0; i < len; i++) {
+ (*data->next_filter->filter_function)(out[i], data->next_filter);
+ }
+ return 0;
}
MBSTRING_API char *php_unicode_convert_case(