summary | refs | log | tree | commit | diff
path: root/strings
diff options
context:
space:
mode:
author Alexander Barkov <bar@mnogosearch.org> 2013-03-12 18:33:19 +0400
committer Alexander Barkov <bar@mnogosearch.org> 2013-03-12 18:33:19 +0400
commit f5c3c2855d9add187d8b4465d2d88cc87729ef6a (patch)
tree e9acbf22ed9325df568ead5571fc3772592e432f /strings
parent 62222eb5185438ab809e9e3d372457f32aa2b8bf (diff)
download mariadb-git-f5c3c2855d9add187d8b4465d2d88cc87729ef6a.tar.gz
Performance improvements in "from latin1" and "to utf8" conversion.
Mini-benchmarking demonstrates up to a 10% improvement in latin1->utf8 conversion.
Modified:
@ strings/ctype-latin1.c: a redundant test in my_mb_wc_latin1 was removed.
@ strings/ctype-utf8.c: my_uni_utf8 was rewritten in a more efficient way.
Diffstat (limited to 'strings')
-rw-r--r-- strings/ctype-latin1.c 11
-rw-r--r-- strings/ctype-utf8.c 63
2 files changed, 33 insertions, 41 deletions
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index 80ae11c82c2..b642d6095c5 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -364,9 +364,14 @@ int my_mb_wc_latin1(CHARSET_INFO *cs __attribute__((unused)),
{
if (str >= end)
return MY_CS_TOOSMALL;
-
- *wc=cs_to_uni[*str];
- return (!wc[0] && str[0]) ? -1 : 1;
+ /*
+ There are no unassigned characters in latin1.
+ Every code point in latin1 is mapped to some Unicode code point.
+ We can always return 1; there is no need to check the value of cs_to_uni[*str].
+ */
+ *wc= cs_to_uni[*str];
+ DBUG_ASSERT(wc[0] || !str[0]);
+ return 1;
}
static
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 88bab1fac76..c439b6023fb 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -2404,46 +2404,33 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)),
my_wc_t wc, uchar *r, uchar *e)
{
- int count;
-
- if (r >= e)
- return MY_CS_TOOSMALL;
-
if (wc < 0x80)
- count = 1;
- else if (wc < 0x800)
- count = 2;
- else if (wc < 0x10000)
- count = 3;
-#ifdef UNICODE_32BIT
- else if (wc < 0x200000)
- count = 4;
- else if (wc < 0x4000000)
- count = 5;
- else if (wc <= 0x7fffffff)
- count = 6;
-#endif
- else return MY_CS_ILUNI;
-
- /*
- e is a character after the string r, not the last character of it.
- Because of it (r+count > e), not (r+count-1 >e )
- */
- if ( r+count > e )
- return MY_CS_TOOSMALLN(count);
-
- switch (count) {
- /* Fall through all cases!!! */
-#ifdef UNICODE_32BIT
- case 6: r[5] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000;
- case 5: r[4] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x200000;
- case 4: r[3] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000;
-#endif
- case 3: r[2] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800;
- case 2: r[1] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0;
- case 1: r[0] = (uchar) wc;
+ {
+ if (r >= e)
+ return MY_CS_TOOSMALL;
+ *r= (uchar) wc;
+ return 1;
}
- return count;
+ if (wc < 0x800)
+ {
+ if (r + 2 > e)
+ return MY_CS_TOOSMALLN(2);
+ /* U+0080..U+07FF: 00000xxx.xxyyyyyy -> 110xxxxx 10yyyyyy */
+ *r++= (uchar) (0xC0 | (wc >> 6));
+ *r= (uchar) (0x80 | (wc & 0x3F));
+ return 2;
+ }
+ if (wc < 0x10000)
+ {
+ if (r + 3 > e)
+ return MY_CS_TOOSMALLN(3);
+ /* U+0800..U+FFFF: xxxxyyyy.yyzzzzzz -> 1110xxxx 10yyyyyy 10zzzzzz */
+ *r++= (uchar) (0xE0 | (wc >> 12));
+ *r++= (uchar) (0x80 | ((wc >> 6) & 0x3f));
+ *r= (uchar) (0x80 | (wc & 0x3f));
+ return 3;
+ }
+ return MY_CS_ILUNI;
}