summaryrefslogtreecommitdiff
path: root/ext/mbstring/php_unicode.c
diff options
context:
space:
mode:
authorNikita Popov <nikita.ppv@gmail.com>2017-07-26 00:06:17 +0200
committerNikita Popov <nikita.ppv@gmail.com>2017-07-28 12:32:50 +0200
commit80a0601fe52b9dddbef34a168a2c1136177bda23 (patch)
treebdebc656b159fa037c458b05ec75c211eccd2cbe /ext/mbstring/php_unicode.c
parentf56b0afe6eb3ec0dd6ec5ee6ec5cee429e41c7e5 (diff)
downloadphp-git-80a0601fe52b9dddbef34a168a2c1136177bda23.tar.gz
Use MPH for case maps
Instead of performing a binary search, use a hashtable to store the case maps. In particular a minimal perfect hash construction is used, which does not require collision resolution (but does use an auxiliary table for the hash perturbation).
Diffstat (limited to 'ext/mbstring/php_unicode.c')
-rw-r--r--ext/mbstring/php_unicode.c62
1 files changed, 30 insertions, 32 deletions
diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c
index 25d9947713..0d4ccdcec5 100644
--- a/ext/mbstring/php_unicode.c
+++ b/ext/mbstring/php_unicode.c
@@ -113,35 +113,39 @@ MBSTRING_API int php_unicode_is_prop(unsigned long code, ...)
return result;
}
-#define CODE_NOT_FOUND ((unsigned long) -1)
+static inline unsigned mph_hash(unsigned d, unsigned x) {
+ x ^= d;
+ x = ((x >> 16) ^ x) * 0x45d9f3b;
+ return x;
+}
+
+#define CODE_NOT_FOUND ((unsigned) -1)
-static unsigned long case_lookup(unsigned long code, long l, long r)
+static inline unsigned mph_lookup(
+ unsigned code,
+ const short *g_table, unsigned g_table_size,
+ const unsigned *table, unsigned table_size)
{
- long m;
- const unsigned int *tmp;
+ short g = g_table[mph_hash(0, code) % g_table_size];
- /*
- * Do the binary search.
- */
- while (l <= r) {
- /*
- * Determine a "mid" point and adjust to make sure the mid point is at
- * the beginning of a case mapping triple.
- */
- m = (l + r) >> 1;
- tmp = &_uccase_map[m*2];
- if (code > *tmp)
- l = m + 1;
- else if (code < *tmp)
- r = m - 1;
- else if (code == *tmp)
- return tmp[1];
+ unsigned idx;
+ if (g <= 0) {
+ idx = -g;
+ } else {
+ idx = mph_hash(g, code) % table_size;
}
+ if (table[2*idx] == code) {
+ return table[2*idx + 1];
+ }
return CODE_NOT_FOUND;
}
-MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc)
+#define CASE_LOOKUP(code, type) \
+ mph_lookup(code, _uccase_##type##_g, _uccase_##type##_g_size, \
+ _uccase_##type##_table, _uccase_##type##_table_size)
+
+unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc)
{
if (code < 0x80) {
/* Fast path for ASCII */
@@ -153,9 +157,7 @@ MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_
}
return code;
} else {
- long l = 0;
- long r = _uccase_len[0] - 1;
- unsigned long new_code = case_lookup(code, l, r);
+ unsigned new_code = CASE_LOOKUP(code, upper);
if (new_code != CODE_NOT_FOUND) {
return new_code;
}
@@ -163,7 +165,7 @@ MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_
}
}
-MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc)
+unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc)
{
if (code < 0x80) {
/* Fast path for ASCII */
@@ -175,9 +177,7 @@ MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_
}
return code;
} else {
- long l = _uccase_len[0];
- long r = _uccase_len[0] + _uccase_len[1] - 1;
- unsigned long new_code = case_lookup(code, l, r);
+ unsigned new_code = CASE_LOOKUP(code, lower);
if (new_code != CODE_NOT_FOUND) {
return new_code;
}
@@ -185,11 +185,9 @@ MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_
}
}
-MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc)
+unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc)
{
- long l = _uccase_len[0] + _uccase_len[1];
- long r = _uccase_size - 1;
- unsigned long new_code = case_lookup(code, l, r);
+ unsigned new_code = CASE_LOOKUP(code, title);
if (new_code != CODE_NOT_FOUND) {
return new_code;
}