diff options
author | Alexander Barkov <bar@mnogosearch.org> | 2013-03-12 18:33:19 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mnogosearch.org> | 2013-03-12 18:33:19 +0400 |
commit | f5c3c2855d9add187d8b4465d2d88cc87729ef6a (patch) | |
tree | e9acbf22ed9325df568ead5571fc3772592e432f /strings | |
parent | 62222eb5185438ab809e9e3d372457f32aa2b8bf (diff) | |
download | mariadb-git-f5c3c2855d9add187d8b4465d2d88cc87729ef6a.tar.gz |
Performance improvements in "from latin1" and "to utf8" conversion.
Mini-benchmarking demonstrates up to 10% improvement in latin1->utf8
conversion.
Modified files:
- strings/ctype-latin1.c: removed a redundant test in my_mb_wc_latin1.
- strings/ctype-utf8.c: rewrote my_uni_utf8 in a more efficient way.
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-latin1.c | 11 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 63 |
2 files changed, 33 insertions, 41 deletions
```diff
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index 80ae11c82c2..b642d6095c5 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -364,9 +364,14 @@ int my_mb_wc_latin1(CHARSET_INFO *cs __attribute__((unused)),
 {
   if (str >= end)
     return MY_CS_TOOSMALL;
-
-  *wc=cs_to_uni[*str];
-  return (!wc[0] && str[0]) ? -1 : 1;
+  /*
+    There are no unassigned characters in latin1.
+    Every code point in latin1 is mapped to some Unicode code point.
+    We can always return 1, no needs to check the value of cs_to_uni[*str].
+  */
+  *wc= cs_to_uni[*str];
+  DBUG_ASSERT(wc[0] || !str[0]);
+  return 1;
 }
 
 static
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 88bab1fac76..c439b6023fb 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -2404,46 +2404,33 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
 static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)),
                         my_wc_t wc, uchar *r, uchar *e)
 {
-  int count;
-
-  if (r >= e)
-    return MY_CS_TOOSMALL;
-
   if (wc < 0x80)
-    count = 1;
-  else if (wc < 0x800)
-    count = 2;
-  else if (wc < 0x10000)
-    count = 3;
-#ifdef UNICODE_32BIT
-  else if (wc < 0x200000)
-    count = 4;
-  else if (wc < 0x4000000)
-    count = 5;
-  else if (wc <= 0x7fffffff)
-    count = 6;
-#endif
-  else return MY_CS_ILUNI;
-
-  /*
-    e is a character after the string r, not the last character of it.
-    Because of it (r+count > e), not (r+count-1 >e )
-   */
-  if ( r+count > e )
-    return MY_CS_TOOSMALLN(count);
-
-  switch (count) {
-    /* Fall through all cases!!! */
-#ifdef UNICODE_32BIT
-    case 6: r[5] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000;
-    case 5: r[4] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x200000;
-    case 4: r[3] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000;
-#endif
-    case 3: r[2] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800;
-    case 2: r[1] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0;
-    case 1: r[0] = (uchar) wc;
+  {
+    if (r >= e)
+      return MY_CS_TOOSMALL;
+    *r= (uchar) wc;
+    return 1;
   }
-  return count;
+  if (wc < 0x800)
+  {
+    if (r + 2 > e)
+      return MY_CS_TOOSMALLN(2);
+    /* U+0080..U+07FF: 00000xxx.xxyyyyyy -> 110xxxxx 10yyyyyy */
+    *r++= (uchar) (0xC0 | (wc >> 6));
+    *r= (uchar) (0x80 | (wc & 0x3F));
+    return 2;
+  }
+  if (wc < 0x10000)
+  {
+    if (r + 3 > e)
+      return MY_CS_TOOSMALLN(3);
+    /* U+0800..U+FFFF: xxxxyyyy.yyzzzzzz -> 1110xxxx 10yyyyyy 10zzzzzz */
+    *r++= (uchar) (0xE0 | (wc >> 12));
+    *r++= (uchar) (0x80 | ((wc >> 6) & 0x3f));
+    *r= (uchar) (0x80 | (wc & 0x3f));
+    return 3;
+  }
+  return MY_CS_ILUNI;
 }
 
 
```