diff options
Diffstat (limited to 'ext/mbstring/php_unicode.c')
-rw-r--r-- | ext/mbstring/php_unicode.c | 230 |
1 files changed, 127 insertions, 103 deletions
diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c index 8b6a52156a..208c10319e 100644 --- a/ext/mbstring/php_unicode.c +++ b/ext/mbstring/php_unicode.c @@ -43,22 +43,10 @@ #include "mbstring.h" #include "php_unicode.h" #include "unicode_data.h" +#include "libmbfl/mbfl/mbfilter_wchar.h" ZEND_EXTERN_MODULE_GLOBALS(mbstring) -/* - * A simple array of 32-bit masks for lookup. - */ -static unsigned long masks32[32] = { - 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, - 0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800, - 0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000, - 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000, - 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, - 0x40000000, 0x80000000 -}; - - static int prop_lookup(unsigned long code, unsigned long n) { long l, r, m; @@ -98,25 +86,31 @@ static int prop_lookup(unsigned long code, unsigned long n) } -MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1, - unsigned long mask2) +MBSTRING_API int php_unicode_is_prop1(unsigned long code, int prop) { - unsigned long i; + return prop_lookup(code, prop); +} - if (mask1 == 0 && mask2 == 0) - return 0; +MBSTRING_API int php_unicode_is_prop(unsigned long code, ...) +{ + int result = 0; + va_list va; + va_start(va, code); - for (i = 0; mask1 && i < 32; i++) { - if ((mask1 & masks32[i]) && prop_lookup(code, i)) - return 1; - } + while (1) { + int prop = va_arg(va, int); + if (prop < 0) { + break; + } - for (i = 32; mask2 && i < _ucprop_size; i++) { - if ((mask2 & masks32[i & 31]) && prop_lookup(code, i)) - return 1; + if (prop_lookup(code, prop)) { + result = 1; + break; + } } - return 0; + va_end(va); + return result; } static unsigned long case_lookup(unsigned long code, long l, long r, int field) @@ -144,27 +138,22 @@ static unsigned long case_lookup(unsigned long code, long l, long r, int field) return code; } -MBSTRING_API unsigned long php_turkish_toupper(unsigned long code, long l, long r, int field) -{ - if (code == 0x0069L) { - return 0x0130L; - } - return case_lookup(code, l, r, field); -} - -MBSTRING_API unsigned long php_turkish_tolower(unsigned long code, long l, long r, int field) -{ - if (code == 0x0049L) { - return 0x0131L; - } - return case_lookup(code, l, r, field); -} - MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc) { int field; long l, r; + if (code < 0x80) { + /* Fast path for ASCII */ + if (code >= 0x61 && code <= 0x7A) { + if (enc == mbfl_no_encoding_8859_9 && code == 0x0069L) { + return 0x0130L; + } + return code - 0x20; + } + return code; + } + if (php_unicode_is_upper(code)) return code; @@ -175,11 +164,6 @@ MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_ field = 2; l = _uccase_len[0]; r = (l + _uccase_len[1]) - 3; - - if (enc == mbfl_no_encoding_8859_9) { - return php_turkish_toupper(code, l, r, field); - } - } else { /* * The character is title case. @@ -196,6 +180,17 @@ MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_ int field; long l, r; + if (code < 0x80) { + /* Fast path for ASCII */ + if (code >= 0x41 && code <= 0x5A) { + if (enc == mbfl_no_encoding_8859_9 && code == 0x0049L) { + return 0x0131L; + } + return code + 0x20; + } + return code; + } + if (php_unicode_is_lower(code)) return code; @@ -206,11 +201,6 @@ MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_ field = 1; l = 0; r = _uccase_len[0] - 3; - - if (enc == mbfl_no_encoding_8859_9) { - return php_turkish_tolower(code, l, r, field); - } - } else { /* * The character is title case. @@ -267,71 +257,105 @@ MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_ ((unsigned char*)(ptr))[3] = (v ) & 0xff;\ } -MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len, - const char *src_encoding) -{ - char *unicode, *newstr; - size_t unicode_len; - unsigned char *unicode_ptr; - size_t i; - enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding); - - if (_src_encoding == mbfl_no_encoding_invalid) { - php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", src_encoding); - return NULL; - } - - unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len); - if (unicode == NULL) - return NULL; - - unicode_ptr = (unsigned char *)unicode; +struct convert_case_data { + mbfl_convert_filter *next_filter; + enum mbfl_no_encoding no_encoding; + int case_mode; + int title_mode; +}; - switch(case_mode) { +static int convert_case_filter(int c, void *void_data) +{ + struct convert_case_data *data = (struct convert_case_data *) void_data; + switch (data->case_mode) { case PHP_UNICODE_CASE_UPPER: - for (i = 0; i < unicode_len; i+=4) { - UINT32_TO_BE_ARY(&unicode_ptr[i], - php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding)); - } + c = php_unicode_toupper(c, data->no_encoding); break; case PHP_UNICODE_CASE_LOWER: - for (i = 0; i < unicode_len; i+=4) { - UINT32_TO_BE_ARY(&unicode_ptr[i], - php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding)); - } + c = php_unicode_tolower(c, data->no_encoding); break; - case PHP_UNICODE_CASE_TITLE: { - int mode = 0; - - for (i = 0; i < unicode_len; i+=4) { - int res = php_unicode_is_prop( - BE_ARY_TO_UINT32(&unicode_ptr[i]), - UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT|UC_PO|UC_OS, 0); - if (mode) { - if (res) { - UINT32_TO_BE_ARY(&unicode_ptr[i], - php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding)); - } else { - mode = 0; - } + case PHP_UNICODE_CASE_TITLE: + { + int res = php_unicode_is_prop(c, + UC_MN, UC_ME, UC_CF, UC_LM, UC_SK, UC_LU, UC_LL, UC_LT, UC_PO, UC_OS, -1); + if (data->title_mode) { + if (res) { + c = php_unicode_tolower(c, data->no_encoding); } else { - if (res) { - mode = 1; - UINT32_TO_BE_ARY(&unicode_ptr[i], - php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding)); - } + data->title_mode = 0; + } + } else { + if (res) { + data->title_mode = 1; + c = php_unicode_totitle(c, data->no_encoding); } } - } break; + break; + } + } + return (*data->next_filter->filter_function)(c, data->next_filter); +} +MBSTRING_API char *php_unicode_convert_case( + int case_mode, const char *srcstr, size_t srclen, size_t *ret_len, + const mbfl_encoding *src_encoding) +{ + struct convert_case_data data; + mbfl_convert_filter *from_wchar, *to_wchar; + mbfl_string result, *result_ptr; + + mbfl_memory_device device; + mbfl_memory_device_init(&device, srclen + 1, 0); + + /* encoding -> wchar filter */ + to_wchar = mbfl_convert_filter_new(src_encoding, + &mbfl_encoding_wchar, convert_case_filter, NULL, &data); + if (to_wchar == NULL) { + mbfl_memory_device_clear(&device); + return NULL; } - newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len); - efree(unicode); + /* wchar -> encoding filter */ + from_wchar = mbfl_convert_filter_new( + &mbfl_encoding_wchar, src_encoding, + mbfl_memory_device_output, NULL, &device); + if (from_wchar == NULL) { + mbfl_convert_filter_delete(to_wchar); + mbfl_memory_device_clear(&device); + return NULL; + } + + data.next_filter = from_wchar; + data.no_encoding = src_encoding->no_encoding; + data.case_mode = case_mode; + data.title_mode = 0; + + { + /* feed data */ + const unsigned char *p = (const unsigned char *) srcstr; + size_t n = srclen; + while (n > 0) { + if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) { + break; + } + n--; + } + } + + mbfl_convert_filter_flush(to_wchar); + mbfl_convert_filter_flush(from_wchar); + result_ptr = mbfl_memory_device_result(&device, &result); + mbfl_convert_filter_delete(to_wchar); + mbfl_convert_filter_delete(from_wchar); + + if (!result_ptr) { + return NULL; + } - return newstr; + *ret_len = result.len; + return (char *) result.val; } |