summaryrefslogtreecommitdiff
path: root/strings/ctype-utf8.c
diff options
context:
space:
mode:
authormithun <mithun.c.y@oracle.com>2013-11-12 16:42:46 +0530
committermithun <mithun.c.y@oracle.com>2013-11-12 16:42:46 +0530
commit7c9112b9c73d22f2b1b9a3be5b68607156e129e9 (patch)
tree72a2ead5cb505a8ac8e6999655ebc9309c9651bc /strings/ctype-utf8.c
parent4189e05c1388f1161331c4fe0b3755286221de97 (diff)
downloadmariadb-git-7c9112b9c73d22f2b1b9a3be5b68607156e129e9.tar.gz
Bug #14057034 : WASTED CPU CYCLES IN MY_UTF8_UNI WHERE
RESULTING MY_WC_T RESULT IS NOT USED Issue: the handler functions my_ismbchar_utf8 and my_well_formed_len_mb for charset utf8 call the Unicode conversion function to validate a character and to find its length. Because of this, the instructions that convert UTF-8 to Unicode are executed for no purpose. A similar issue exists with charset utf8mb4. Solution: reorganized the code so that the character-validation part of the Unicode conversion handler is extracted (duplicated) into a separate function. Hence my_ismbchar_utf8 and my_well_formed_len_mb will call the new function, which only validates and returns the length of the multi-byte (utf8) character. A similar fix was made for charset utf8mb4. strings/ctype-utf8.c: New functions have been added for charset utf8 and utf8mb4 to validate a character and to get its length.
Diffstat (limited to 'strings/ctype-utf8.c')
-rw-r--r--strings/ctype-utf8.c264
1 files changed, 229 insertions, 35 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 62d5fbe0111..52e05f17d61 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -27,6 +27,7 @@
#define EILSEQ ENOENT
#endif
+#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40)
#define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
#define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
@@ -57,6 +58,46 @@
#define HAVE_UNIDATA
#endif
+
+#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4)
+
+static inline
+int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e)
+{
+ uchar c;
+
+ DBUG_ASSERT(s < e);
+ c= s[0];
+ if (c < 0x80)
+ return 1;
+
+ if (c < 0xc2)
+ return MY_CS_ILSEQ;
+
+ if (c < 0xe0)
+ {
+ if (s+2 > e) /* We need 2 characters */
+ return MY_CS_TOOSMALL2;
+
+ if (!(IS_CONTINUATION_BYTE(s[1])))
+ return MY_CS_ILSEQ;
+
+ return 2;
+ }
+
+ DBUG_ASSERT(c < 0xf0);
+ if (s+3 > e) /* We need 3 characters */
+ return MY_CS_TOOSMALL3;
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
+ (c >= 0xe1 || s[1] >= 0xa0)))
+ return MY_CS_ILSEQ;
+
+ return 3;
+}
+
+#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/
+
#ifdef HAVE_UNIDATA
#include "my_uctype.h"
@@ -2287,7 +2328,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
- if (!((s[1] ^ 0x80) < 0x40))
+ if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -2298,7 +2339,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
- if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@@ -2314,9 +2355,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+4 > e) /* We need 4 characters */
return MY_CS_TOOSMALL4;
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90)))
return MY_CS_ILSEQ;
@@ -2332,10 +2373,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+5 >e) /* We need 5 characters */
return MY_CS_TOOSMALL5;
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
- (s[4] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ IS_CONTINUATION_BYTE(s[4]) &&
(c >= 0xf9 || s[1] >= 0x88)))
return MY_CS_ILSEQ;
@@ -2351,11 +2392,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if ( s+6 >e ) /* We need 6 characters */
return MY_CS_TOOSMALL6;
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
- (s[4] ^ 0x80) < 0x40 &&
- (s[5] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ IS_CONTINUATION_BYTE(s[4]) &&
+ IS_CONTINUATION_BYTE(s[5]) &&
(c >= 0xfd || s[1] >= 0x84)))
return MY_CS_ILSEQ;
@@ -2399,11 +2440,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
return 2;
}
-
+
if (c < 0xf0)
{
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@@ -2892,10 +2933,90 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
}
+static
+int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
+{
+ uchar c;
+
+ if (s >= e)
+ return MY_CS_TOOSMALL;
+
+ c= s[0];
+ if (c < 0xf0)
+ return my_valid_mbcharlen_utf8mb3(s, e);
+
+#ifdef UNICODE_32BIT
+ if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32)
+ {
+ if (s+4 > e) /* We need 4 characters */
+ return MY_CS_TOOSMALL4;
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ (c >= 0xf1 || s[1] >= 0x90)))
+ return MY_CS_ILSEQ;
+
+ return 4;
+ }
+ if (c < 0xfc && sizeof(my_wc_t)*8 >= 32)
+ {
+ if (s+5 >e) /* We need 5 characters */
+ return MY_CS_TOOSMALL5;
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ IS_CONTINUATION_BYTE(s[4]) &&
+ (c >= 0xf9 || s[1] >= 0x88)))
+ return MY_CS_ILSEQ;
+
+ return 5;
+ }
+ if (c < 0xfe && sizeof(my_wc_t)*8 >= 32)
+ {
+ if ( s+6 >e ) /* We need 6 characters */
+ return MY_CS_TOOSMALL6;
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ IS_CONTINUATION_BYTE(s[4]) &&
+ IS_CONTINUATION_BYTE(s[5]) &&
+ (c >= 0xfd || s[1] >= 0x84)))
+ return MY_CS_ILSEQ;
+
+ return 6;
+ }
+#endif
+ return MY_CS_ILSEQ;
+}
+
+static size_t
+my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
+ size_t pos, int *error)
+{
+ const char *b_start= b;
+ *error= 0;
+ while (pos)
+ {
+ int mb_len;
+
+ if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
+ {
+ *error= b < e ? 1 : 0;
+ break;
+ }
+ b+= mb_len;
+ pos--;
+ }
+ return (size_t) (b - b_start);
+}
+
static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
{
- my_wc_t wc;
- int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e);
+ int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e);
return (res>1) ? res : 0;
}
@@ -2944,7 +3065,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
my_mbcharlen_utf8,
my_numchars_mb,
my_charpos_mb,
- my_well_formed_len_mb,
+ my_well_formed_len_utf8,
my_lengthsp_8bit,
my_numcells_mb,
my_utf8_uni,
@@ -4714,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
- if (!((s[1] ^ 0x80) < 0x40))
+ if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -4725,7 +4846,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
- if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@@ -4758,9 +4879,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
[F4][80..8F][80..BF][80..BF]
*/
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90) &&
(c <= 0xf3 || s[1] <= 0x8F)))
return MY_CS_ILSEQ;
@@ -4796,17 +4917,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
if (c < 0xe0)
{
- if (!((s[1] ^ 0x80) < 0x40))
+ if (!IS_CONTINUATION_BYTE(s[1]))
return MY_CS_ILSEQ;
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
return 2;
}
-
+
if (c < 0xf0)
{
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
*pwc= ((my_wc_t) (c & 0x0f) << 12) |
@@ -4817,9 +4938,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
}
else if (c < 0xf5)
{
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90) &&
(c <= 0xf3 || s[1] <= 0x8F)))
return MY_CS_ILSEQ;
@@ -5308,11 +5429,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len)
}
+static int
+my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
+{
+ uchar c;
+
+ if (s >= e)
+ return MY_CS_TOOSMALL;
+
+ c= s[0];
+ if (c < 0xf0)
+ return my_valid_mbcharlen_utf8mb3(s, e);
+
+ if (c < 0xf5)
+ {
+ if (s + 4 > e) /* We need 4 characters */
+ return MY_CS_TOOSMALL4;
+
+ /*
+ UTF-8 quick four-byte mask:
+ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ Encoding allows to encode U+00010000..U+001FFFFF
+
+ The maximum character defined in the Unicode standard is U+0010FFFF.
+ Higher characters U+00110000..U+001FFFFF are not used.
+
+ 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
+ 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
+
+ Valid codes:
+ [F0][90..BF][80..BF][80..BF]
+ [F1][80..BF][80..BF][80..BF]
+ [F2][80..BF][80..BF][80..BF]
+ [F3][80..BF][80..BF][80..BF]
+ [F4][80..8F][80..BF][80..BF]
+ */
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ (c >= 0xf1 || s[1] >= 0x90) &&
+ (c <= 0xf3 || s[1] <= 0x8F)))
+ return MY_CS_ILSEQ;
+
+ return 4;
+ }
+
+ return MY_CS_ILSEQ;
+}
+
+
+static
+size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
+ const char *b, const char *e,
+ size_t pos, int *error)
+{
+ const char *b_start= b;
+ *error= 0;
+ while (pos)
+ {
+ int mb_len;
+
+ if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
+ {
+ *error= b < e ? 1 : 0;
+ break;
+ }
+ b+= mb_len;
+ pos--;
+ }
+ return (size_t) (b - b_start);
+}
+
+
static uint
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
{
- my_wc_t wc;
- int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e);
+ int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e);
return (res > 1) ? res : 0;
}
@@ -5373,7 +5567,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
my_mbcharlen_utf8mb4,
my_numchars_mb,
my_charpos_mb,
- my_well_formed_len_mb,
+ my_well_formed_len_utf8mb4,
my_lengthsp_8bit,
my_numcells_mb,
my_mb_wc_utf8mb4,