diff options
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-utf8.c | 264 |
1 files changed, 229 insertions, 35 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 62d5fbe0111..52e05f17d61 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -27,6 +27,7 @@ #define EILSEQ ENOENT #endif +#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40) #define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci" #define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs" @@ -57,6 +58,46 @@ #define HAVE_UNIDATA #endif + +#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4) + +static inline +int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e) +{ + uchar c; + + DBUG_ASSERT(s < e); + c= s[0]; + if (c < 0x80) + return 1; + + if (c < 0xc2) + return MY_CS_ILSEQ; + + if (c < 0xe0) + { + if (s+2 > e) /* We need 2 characters */ + return MY_CS_TOOSMALL2; + + if (!(IS_CONTINUATION_BYTE(s[1]))) + return MY_CS_ILSEQ; + + return 2; + } + + DBUG_ASSERT(c < 0xf0); + if (s+3 > e) /* We need 3 characters */ + return MY_CS_TOOSMALL3; + + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && + (c >= 0xe1 || s[1] >= 0xa0))) + return MY_CS_ILSEQ; + + return 3; +} + +#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/ + #ifdef HAVE_UNIDATA #include "my_uctype.h" @@ -2287,7 +2328,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -2298,7 +2339,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -2314,9 +2355,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+4 > e) /* We need 4 characters */ return MY_CS_TOOSMALL4; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90))) return MY_CS_ILSEQ; @@ -2332,10 +2373,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+5 >e) /* We need 5 characters */ return MY_CS_TOOSMALL5; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && (c >= 0xf9 || s[1] >= 0x88))) return MY_CS_ILSEQ; @@ -2351,11 +2392,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if ( s+6 >e ) /* We need 6 characters */ return MY_CS_TOOSMALL6; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && - (s[5] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && (c >= 0xfd || s[1] >= 0x84))) return MY_CS_ILSEQ; @@ -2399,11 +2440,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)), *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -2892,10 +2933,90 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)), } +static +int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + +#ifdef UNICODE_32BIT + if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) + { + if (s+4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90))) + return MY_CS_ILSEQ; + + return 4; + } + if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) + { + if (s+5 >e) /* We need 5 characters */ + return MY_CS_TOOSMALL5; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + (c >= 0xf9 || s[1] >= 0x88))) + return MY_CS_ILSEQ; + + return 5; + } + if (c < 0xfe && sizeof(my_wc_t)*8 >= 32) + { + if ( s+6 >e ) /* We need 6 characters */ + return MY_CS_TOOSMALL6; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && + (c >= 0xfd || s[1] >= 0x84))) + return MY_CS_ILSEQ; + + return 6; + } +#endif + return MY_CS_ILSEQ; +} + +static size_t +my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e) { - my_wc_t wc; - int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e); return (res>1) ? res : 0; } @@ -2944,7 +3065,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_mbcharlen_utf8, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8, my_lengthsp_8bit, my_numcells_mb, my_utf8_uni, @@ -4714,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -4725,7 +4846,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -4758,9 +4879,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), [F4][80..8F][80..BF][80..BF] */ - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -4796,17 +4917,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), if (c < 0xe0) { - if (!((s[1] ^ 0x80) < 0x40)) + if (!IS_CONTINUATION_BYTE(s[1])) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x0f) << 12) | @@ -4817,9 +4938,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), } else if (c < 0xf5) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -5308,11 +5429,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len) } +static int +my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + + if (c < 0xf5) + { + if (s + 4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + /* + UTF-8 quick four-byte mask: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Encoding allows to encode U+00010000..U+001FFFFF + + The maximum character defined in the Unicode standard is U+0010FFFF. + Higher characters U+00110000..U+001FFFFF are not used. + + 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) + 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) + + Valid codes: + [F0][90..BF][80..BF][80..BF] + [F1][80..BF][80..BF][80..BF] + [F2][80..BF][80..BF][80..BF] + [F3][80..BF][80..BF][80..BF] + [F4][80..8F][80..BF][80..BF] + */ + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90) && + (c <= 0xf3 || s[1] <= 0x8F))) + return MY_CS_ILSEQ; + + return 4; + } + + return MY_CS_ILSEQ; +} + + +static +size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, + const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + + static uint my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) { - my_wc_t wc; - int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e); return (res > 1) ? res : 0; } @@ -5373,7 +5567,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_mbcharlen_utf8mb4, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8mb4, my_lengthsp_8bit, my_numcells_mb, my_mb_wc_utf8mb4, |