diff options
author | mithun <mithun.c.y@oracle.com> | 2013-11-12 16:42:46 +0530 |
---|---|---|
committer | mithun <mithun.c.y@oracle.com> | 2013-11-12 16:42:46 +0530 |
commit | 7c9112b9c73d22f2b1b9a3be5b68607156e129e9 (patch) | |
tree | 72a2ead5cb505a8ac8e6999655ebc9309c9651bc /strings | |
parent | 4189e05c1388f1161331c4fe0b3755286221de97 (diff) | |
download | mariadb-git-7c9112b9c73d22f2b1b9a3be5b68607156e129e9.tar.gz |
Bug #14057034 : WASTED CPU CYCLES IN MY_UTF8_UNI WHERE
RESULTING MY_WC_T RESULT IS NOT USED
Issue : handler functions my_ismbchar_utf8,
my_well_formed_len_mb for charset utf8
is calling unicode converion function
to validate and to find the character
length. Because of this, instructions
which will convert the utf8 to unicode
are executed for no use.
A similar issue exist with charset utf8mb4
Solution : reorganized the code such that character
validation part of unicode conversion
handler is extracted(duplicated) in to
separate function. Hence
my_ismbchar_utf8, my_well_formed_len_mb
will call the new function which only
validates and return the length of mb(utf8).
A similar fix for charset utf8mb4.
strings/ctype-utf8.c:
New functions has been added for charset utf8 and utf8mb4
to validate and to get the length of the character.
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-utf8.c | 264 |
1 files changed, 229 insertions, 35 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 62d5fbe0111..52e05f17d61 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -27,6 +27,7 @@ #define EILSEQ ENOENT #endif +#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40) #define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci" #define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs" @@ -57,6 +58,46 @@ #define HAVE_UNIDATA #endif + +#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4) + +static inline +int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e) +{ + uchar c; + + DBUG_ASSERT(s < e); + c= s[0]; + if (c < 0x80) + return 1; + + if (c < 0xc2) + return MY_CS_ILSEQ; + + if (c < 0xe0) + { + if (s+2 > e) /* We need 2 characters */ + return MY_CS_TOOSMALL2; + + if (!(IS_CONTINUATION_BYTE(s[1]))) + return MY_CS_ILSEQ; + + return 2; + } + + DBUG_ASSERT(c < 0xf0); + if (s+3 > e) /* We need 3 characters */ + return MY_CS_TOOSMALL3; + + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && + (c >= 0xe1 || s[1] >= 0xa0))) + return MY_CS_ILSEQ; + + return 3; +} + +#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/ + #ifdef HAVE_UNIDATA #include "my_uctype.h" @@ -2287,7 +2328,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -2298,7 +2339,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -2314,9 +2355,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+4 > e) /* We need 4 characters */ return MY_CS_TOOSMALL4; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90))) return MY_CS_ILSEQ; @@ -2332,10 +2373,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+5 >e) /* We need 5 characters */ return MY_CS_TOOSMALL5; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && (c >= 0xf9 || s[1] >= 0x88))) return MY_CS_ILSEQ; @@ -2351,11 +2392,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if ( s+6 >e ) /* We need 6 characters */ return MY_CS_TOOSMALL6; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && - (s[5] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && (c >= 0xfd || s[1] >= 0x84))) return MY_CS_ILSEQ; @@ -2399,11 +2440,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)), *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -2892,10 +2933,90 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)), } +static +int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + +#ifdef UNICODE_32BIT + if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) + { + if (s+4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90))) + return MY_CS_ILSEQ; + + return 4; + } + if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) + { + if (s+5 >e) /* We need 5 characters */ + return MY_CS_TOOSMALL5; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + (c >= 0xf9 || s[1] >= 0x88))) + return MY_CS_ILSEQ; + + return 5; + } + if (c < 0xfe && sizeof(my_wc_t)*8 >= 32) + { + if ( s+6 >e ) /* We need 6 characters */ + return MY_CS_TOOSMALL6; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && + (c >= 0xfd || s[1] >= 0x84))) + return MY_CS_ILSEQ; + + return 6; + } +#endif + return MY_CS_ILSEQ; +} + +static size_t +my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e) { - my_wc_t wc; - int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e); return (res>1) ? res : 0; } @@ -2944,7 +3065,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_mbcharlen_utf8, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8, my_lengthsp_8bit, my_numcells_mb, my_utf8_uni, @@ -4714,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -4725,7 +4846,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -4758,9 +4879,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), [F4][80..8F][80..BF][80..BF] */ - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -4796,17 +4917,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), if (c < 0xe0) { - if (!((s[1] ^ 0x80) < 0x40)) + if (!IS_CONTINUATION_BYTE(s[1])) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x0f) << 12) | @@ -4817,9 +4938,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), } else if (c < 0xf5) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -5308,11 +5429,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len) } +static int +my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + + if (c < 0xf5) + { + if (s + 4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + /* + UTF-8 quick four-byte mask: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Encoding allows to encode U+00010000..U+001FFFFF + + The maximum character defined in the Unicode standard is U+0010FFFF. + Higher characters U+00110000..U+001FFFFF are not used. + + 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) + 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) + + Valid codes: + [F0][90..BF][80..BF][80..BF] + [F1][80..BF][80..BF][80..BF] + [F2][80..BF][80..BF][80..BF] + [F3][80..BF][80..BF][80..BF] + [F4][80..8F][80..BF][80..BF] + */ + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90) && + (c <= 0xf3 || s[1] <= 0x8F))) + return MY_CS_ILSEQ; + + return 4; + } + + return MY_CS_ILSEQ; +} + + +static +size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, + const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + + static uint my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) { - my_wc_t wc; - int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e); return (res > 1) ? res : 0; } @@ -5373,7 +5567,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_mbcharlen_utf8mb4, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8mb4, my_lengthsp_8bit, my_numcells_mb, my_mb_wc_utf8mb4, |