diff options
author | Nikita Popov <nikita.ppv@gmail.com> | 2017-07-26 00:06:17 +0200 |
---|---|---|
committer | Nikita Popov <nikita.ppv@gmail.com> | 2017-07-28 12:32:50 +0200 |
commit | 80a0601fe52b9dddbef34a168a2c1136177bda23 (patch) | |
tree | bdebc656b159fa037c458b05ec75c211eccd2cbe /ext/mbstring/php_unicode.c | |
parent | f56b0afe6eb3ec0dd6ec5ee6ec5cee429e41c7e5 (diff) | |
download | php-git-80a0601fe52b9dddbef34a168a2c1136177bda23.tar.gz |
Use MPH for case maps
Instead of performing a binary search, use a hashtable to store
the case maps. In particular a minimal perfect hash construction
is used, which does not require collision resolution (but does
use an auxiliary table for the hash perturbation).
Diffstat (limited to 'ext/mbstring/php_unicode.c')
-rw-r--r-- | ext/mbstring/php_unicode.c | 62 |
1 files changed, 30 insertions, 32 deletions
diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c index 25d9947713..0d4ccdcec5 100644 --- a/ext/mbstring/php_unicode.c +++ b/ext/mbstring/php_unicode.c @@ -113,35 +113,39 @@ MBSTRING_API int php_unicode_is_prop(unsigned long code, ...) return result; } -#define CODE_NOT_FOUND ((unsigned long) -1) +static inline unsigned mph_hash(unsigned d, unsigned x) { + x ^= d; + x = ((x >> 16) ^ x) * 0x45d9f3b; + return x; +} + +#define CODE_NOT_FOUND ((unsigned) -1) -static unsigned long case_lookup(unsigned long code, long l, long r) +static inline unsigned mph_lookup( + unsigned code, + const short *g_table, unsigned g_table_size, + const unsigned *table, unsigned table_size) { - long m; - const unsigned int *tmp; + short g = g_table[mph_hash(0, code) % g_table_size]; - /* - * Do the binary search. - */ - while (l <= r) { - /* - * Determine a "mid" point and adjust to make sure the mid point is at - * the beginning of a case mapping triple. - */ - m = (l + r) >> 1; - tmp = &_uccase_map[m*2]; - if (code > *tmp) - l = m + 1; - else if (code < *tmp) - r = m - 1; - else if (code == *tmp) - return tmp[1]; + unsigned idx; + if (g <= 0) { + idx = -g; + } else { + idx = mph_hash(g, code) % table_size; } + if (table[2*idx] == code) { + return table[2*idx + 1]; + } return CODE_NOT_FOUND; } -MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc) +#define CASE_LOOKUP(code, type) \ + mph_lookup(code, _uccase_##type##_g, _uccase_##type##_g_size, \ + _uccase_##type##_table, _uccase_##type##_table_size) + +unsigned php_unicode_toupper(unsigned code, enum mbfl_no_encoding enc) { if (code < 0x80) { /* Fast path for ASCII */ @@ -153,9 +157,7 @@ MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_ } return code; } else { - long l = 0; - long r = _uccase_len[0] - 1; - unsigned long new_code = case_lookup(code, l, r); + unsigned new_code = CASE_LOOKUP(code, upper); if (new_code != CODE_NOT_FOUND) { return new_code; } @@ -163,7 +165,7 @@ MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_ } } -MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc) +unsigned php_unicode_tolower(unsigned code, enum mbfl_no_encoding enc) { if (code < 0x80) { /* Fast path for ASCII */ @@ -175,9 +177,7 @@ MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_ } return code; } else { - long l = _uccase_len[0]; - long r = _uccase_len[0] + _uccase_len[1] - 1; - unsigned long new_code = case_lookup(code, l, r); + unsigned new_code = CASE_LOOKUP(code, lower); if (new_code != CODE_NOT_FOUND) { return new_code; } @@ -185,11 +185,9 @@ MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_ } } -MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc) +unsigned php_unicode_totitle(unsigned code, enum mbfl_no_encoding enc) { - long l = _uccase_len[0] + _uccase_len[1]; - long r = _uccase_size - 1; - unsigned long new_code = case_lookup(code, l, r); + unsigned new_code = CASE_LOOKUP(code, title); if (new_code != CODE_NOT_FOUND) { return new_code; } |