diff options
author | Sergei Golubchik <sergii@pisem.net> | 2014-02-25 16:04:35 +0100 |
---|---|---|
committer | Sergei Golubchik <sergii@pisem.net> | 2014-02-25 16:04:35 +0100 |
commit | 0b9a0a3517ca2b75655f3af5c372cf333d3d5fe2 (patch) | |
tree | 5c67457ff8abbb89b203a7f55cda776b738c385b /strings | |
parent | 6324c36bd703a0f55dcd49dd721af262f73cf7aa (diff) | |
parent | ff2e82f4a175b7b023cd167b2fa6e6fcd1bd192e (diff) | |
download | mariadb-git-0b9a0a3517ca2b75655f3af5c372cf333d3d5fe2.tar.gz |
5.5 merge
Diffstat (limited to 'strings')
-rw-r--r-- | strings/CMakeLists.txt | 2 | ||||
-rw-r--r-- | strings/ctype-mb.c | 4 | ||||
-rw-r--r-- | strings/ctype-simple.c | 5 | ||||
-rw-r--r-- | strings/ctype-uca.c | 4 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 6 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 268 | ||||
-rw-r--r-- | strings/ctype-win1250ch.c | 5 | ||||
-rw-r--r-- | strings/ctype.c | 6 | ||||
-rw-r--r-- | strings/t_ctype.h | 3 |
9 files changed, 248 insertions, 55 deletions
diff --git a/strings/CMakeLists.txt b/strings/CMakeLists.txt index 18fae59d394..cafd3292930 100644 --- a/strings/CMakeLists.txt +++ b/strings/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2006, 2010, Oracle and/or its affiliates +# Copyright (c) 2006, 2013, Oracle and/or its affiliates # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index 0c0332ea3da..9f845511866 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. - Copyright (c) 2009-2011, Monty Program Ab +/* Copyright (c) 2000, 2013, Oracle and/or its affiliates. + Copyright (c) 2009, 2014, SkySQL Ab. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index 3cd6805158e..4556ed75f7e 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -1,6 +1,5 @@ -/* Copyright (c) 2002-2007 MySQL AB, 2009 Sun Microsystems, Inc. - Copyright (c) 2009-2011, Monty Program Ab - Use is subject to license terms. +/* Copyright (c) 2002, 2013, Oracle and/or its affiliates. + Copyright (c) 2009, 2014, SkySQL Ab. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 0f464be607d..ad484acd21e 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2004, 2011, Oracle and/or its affiliates. - Copyright (c) 2009, 2011, Monty Program Ab +/* Copyright (c) 2004, 2013, Oracle and/or its affiliates. + Copyright (c) 2009, 2014, SkySQL Ab. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 89df4ae0bc4..fee272e5d35 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1,5 +1,5 @@ -/* Copyright (c) 2003, 2012, Oracle and/or its affiliates - Copyright (c) 2009, 2013, Monty Program Ab. +/* Copyright (c) 2003, 2013, Oracle and/or its affiliates + Copyright (c) 2009, 2014, SkySQL Ab. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public @@ -1664,7 +1664,7 @@ struct charset_info_st my_charset_utf16_general_ci= struct charset_info_st my_charset_utf16_bin= { 55,0,0, /* number */ - MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII, + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, "utf16", /* cs name */ "utf16_bin", /* name */ "UTF-16 Unicode", /* comment */ diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index cc19148b973..03fba7e51e7 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -26,6 +26,7 @@ #define EILSEQ ENOENT #endif +#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40) #define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci" #define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs" @@ -56,6 +57,46 @@ #define HAVE_UNIDATA #endif + +#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4) + +static inline +int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e) +{ + uchar c; + + DBUG_ASSERT(s < e); + c= s[0]; + if (c < 0x80) + return 1; + + if (c < 0xc2) + return MY_CS_ILSEQ; + + if (c < 0xe0) + { + if (s+2 > e) /* We need 2 characters */ + return MY_CS_TOOSMALL2; + + if (!(IS_CONTINUATION_BYTE(s[1]))) + return MY_CS_ILSEQ; + + return 2; + } + + DBUG_ASSERT(c < 0xf0); + if (s+3 > e) /* We need 3 characters */ + return MY_CS_TOOSMALL3; + + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && + (c >= 0xe1 || s[1] >= 0xa0))) + return MY_CS_ILSEQ; + + return 3; +} + +#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/ + #ifdef HAVE_UNIDATA #include "my_uctype.h" @@ -2285,7 +2326,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -2296,7 +2337,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -2312,9 +2353,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+4 > e) /* We need 4 characters */ return MY_CS_TOOSMALL4; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90))) return MY_CS_ILSEQ; @@ -2330,10 +2371,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+5 >e) /* We need 5 characters */ return MY_CS_TOOSMALL5; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && (c >= 0xf9 || s[1] >= 0x88))) return MY_CS_ILSEQ; @@ -2349,11 +2390,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if ( s+6 >e ) /* We need 6 characters */ return MY_CS_TOOSMALL6; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && - (s[5] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && (c >= 0xfd || s[1] >= 0x84))) return MY_CS_ILSEQ; @@ -2397,11 +2438,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)), *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -2876,10 +2917,91 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)), return (len * 2 + 2) / 3; } + +static +int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + +#ifdef UNICODE_32BIT + if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) + { + if (s+4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90))) + return MY_CS_ILSEQ; + + return 4; + } + if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) + { + if (s+5 >e) /* We need 5 characters */ + return MY_CS_TOOSMALL5; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + (c >= 0xf9 || s[1] >= 0x88))) + return MY_CS_ILSEQ; + + return 5; + } + if (c < 0xfe && sizeof(my_wc_t)*8 >= 32) + { + if ( s+6 >e ) /* We need 6 characters */ + return MY_CS_TOOSMALL6; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && + (c >= 0xfd || s[1] >= 0x84))) + return MY_CS_ILSEQ; + + return 6; + } +#endif + return MY_CS_ILSEQ; +} + +static size_t +my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e) { - my_wc_t wc; - int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e); return (res>1) ? res : 0; } @@ -2928,7 +3050,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_mbcharlen_utf8, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8, my_lengthsp_8bit, my_numcells_mb, my_utf8_uni, @@ -4702,7 +4824,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -4713,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -4746,9 +4868,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), [F4][80..8F][80..BF][80..BF] */ - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -4784,17 +4906,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), if (c < 0xe0) { - if (!((s[1] ^ 0x80) < 0x40)) + if (!IS_CONTINUATION_BYTE(s[1])) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x0f) << 12) | @@ -4805,9 +4927,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), } else if (c < 0xf5) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -5296,11 +5418,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len) } +static int +my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + + if (c < 0xf5) + { + if (s + 4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + /* + UTF-8 quick four-byte mask: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Encoding allows to encode U+00010000..U+001FFFFF + + The maximum character defined in the Unicode standard is U+0010FFFF. + Higher characters U+00110000..U+001FFFFF are not used. + + 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) + 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) + + Valid codes: + [F0][90..BF][80..BF][80..BF] + [F1][80..BF][80..BF][80..BF] + [F2][80..BF][80..BF][80..BF] + [F3][80..BF][80..BF][80..BF] + [F4][80..8F][80..BF][80..BF] + */ + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90) && + (c <= 0xf3 || s[1] <= 0x8F))) + return MY_CS_ILSEQ; + + return 4; + } + + return MY_CS_ILSEQ; +} + + +static +size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, + const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + + static uint my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) { - my_wc_t wc; - int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e); return (res > 1) ? res : 0; } @@ -5361,7 +5556,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_mbcharlen_utf8mb4, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8mb4, my_lengthsp_8bit, my_numcells_mb, my_mb_wc_utf8mb4, @@ -5423,7 +5618,8 @@ struct charset_info_st my_charset_utf8mb4_general_ci= struct charset_info_st my_charset_utf8mb4_bin= { 46,0,0, /* number */ - MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT, /* state */ + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE| + MY_CS_UNICODE_SUPPLEMENT, /* state */ MY_UTF8MB4, /* cs name */ MY_UTF8MB4_BIN, /* name */ "UTF-8 Unicode", /* comment */ diff --git a/strings/ctype-win1250ch.c b/strings/ctype-win1250ch.c index ef54101cf7f..5e33d9ccbd6 100644 --- a/strings/ctype-win1250ch.c +++ b/strings/ctype-win1250ch.c @@ -1,6 +1,5 @@ -/* Copyright (c) 2001 Jan Pazdziora. - Copyright (c) 2002-2007 MySQL AB - Copyright (c) 2009-2011, Monty Program Ab +/* Copyright (c) 2002, 2013, Oracle and/or its affiliates. + Copyright (c) 2009, 2014, SkySQL Ab. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/strings/ctype.c b/strings/ctype.c index 23f18b6617b..df22d0f19e5 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -1,7 +1,5 @@ -/* Copyright (c) 2000-2007 MySQL AB, 2008, 2009 Sun Microsystems, Inc. - Copyright (c) 2009-2011, Monty Program Ab - Use is subject to license terms. - Copyright (c) 2009-2011, Monty Program Ab +/* Copyright (c) 2000, 2013, Oracle and/or its affiliates. + Copyright (c) 2009, 2014, SkySQL Ab. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/strings/t_ctype.h b/strings/t_ctype.h index 8198d3eada8..a4fdd267c3f 100644 --- a/strings/t_ctype.h +++ b/strings/t_ctype.h @@ -1,4 +1,5 @@ -/* Copyright (C) 2000 MySQL AB +/* Copyright (c) 2000, 2001, 2003 MySQL AB + Use is subject to license terms This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by |