From 88680a99c6acdcd8be84c16e970c7616c912ff59 Mon Sep 17 00:00:00 2001 From: Neeraj Bisht Date: Thu, 7 Nov 2013 16:46:24 +0530 Subject: Bug#16691598 - ORDER BY LOWER(COLUMN) PRODUCES OUT-OF-ORDER RESULTS Problem:- We have created a table with UTF8_BIN collation. When a query has an ORDER BY clause over a function call, the result is returned in incorrect order. Note: the bug is not there in 5.5. Analysis: In 5.5, for UTF16_BIN, the min and max multi-byte lengths are 2 and 4 respectively. In make_sortkey(), for a 2-byte character we assume that the resultant length will be 2 bytes/character. But when we use my_strnxfrm_unicode_full_bin(), we store sorting weights using 3 bytes per character. This results in a truncated result. The same thing happens for UTF8MB4, where we have a 1-byte min multi-byte length and a 4-byte max multi-byte length. We assume the resultant data is 1 byte/character, which results in a truncated result. Solution:- Use strnxfrm (i.e. use of the MY_CS_STRNXFRM macro) for sorting, in which the resultant length is not dependent on the source length. 
--- strings/ctype-ucs2.c | 2 +- strings/ctype-utf8.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'strings') diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index cecd4424108..f1d0e775804 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1664,7 +1664,7 @@ CHARSET_INFO my_charset_utf16_general_ci= CHARSET_INFO my_charset_utf16_bin= { 55,0,0, /* number */ - MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII, + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, "utf16", /* cs name */ "utf16_bin", /* name */ "UTF-16 Unicode", /* comment */ diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 4976a9cf31a..62d5fbe0111 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -5435,7 +5435,8 @@ CHARSET_INFO my_charset_utf8mb4_general_ci= CHARSET_INFO my_charset_utf8mb4_bin= { 46,0,0, /* number */ - MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT, /* state */ + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE| + MY_CS_UNICODE_SUPPLEMENT, /* state */ MY_UTF8MB4, /* cs name */ MY_UTF8MB4_BIN, /* name */ "UTF-8 Unicode", /* comment */ -- cgit v1.2.1 From 7c9112b9c73d22f2b1b9a3be5b68607156e129e9 Mon Sep 17 00:00:00 2001 From: mithun Date: Tue, 12 Nov 2013 16:42:46 +0530 Subject: Bug #14057034 : WASTED CPU CYCLES IN MY_UTF8_UNI WHERE RESULTING MY_WC_T RESULT IS NOT USED Issue : The handler functions my_ismbchar_utf8 and my_well_formed_len_mb for charset utf8 call the Unicode conversion function to validate a character and to find its length. Because of this, instructions which convert the utf8 to Unicode are executed for no use. A similar issue exists with charset utf8mb4. Solution : Reorganized the code such that the character validation part of the Unicode conversion handler is extracted (duplicated) into a separate function. Hence my_ismbchar_utf8 and my_well_formed_len_mb will call the new function, which only validates and returns the length of the mb (utf8) character. 
A similar fix for charset utf8mb4. strings/ctype-utf8.c: New functions has been added for charset utf8 and utf8mb4 to validate and to get the length of the character. --- strings/ctype-utf8.c | 264 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 229 insertions(+), 35 deletions(-) (limited to 'strings') diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 62d5fbe0111..52e05f17d61 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -27,6 +27,7 @@ #define EILSEQ ENOENT #endif +#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40) #define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci" #define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs" @@ -57,6 +58,46 @@ #define HAVE_UNIDATA #endif + +#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4) + +static inline +int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e) +{ + uchar c; + + DBUG_ASSERT(s < e); + c= s[0]; + if (c < 0x80) + return 1; + + if (c < 0xc2) + return MY_CS_ILSEQ; + + if (c < 0xe0) + { + if (s+2 > e) /* We need 2 characters */ + return MY_CS_TOOSMALL2; + + if (!(IS_CONTINUATION_BYTE(s[1]))) + return MY_CS_ILSEQ; + + return 2; + } + + DBUG_ASSERT(c < 0xf0); + if (s+3 > e) /* We need 3 characters */ + return MY_CS_TOOSMALL3; + + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && + (c >= 0xe1 || s[1] >= 0xa0))) + return MY_CS_ILSEQ; + + return 3; +} + +#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/ + #ifdef HAVE_UNIDATA #include "my_uctype.h" @@ -2287,7 +2328,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -2298,7 +2339,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 
0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -2314,9 +2355,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+4 > e) /* We need 4 characters */ return MY_CS_TOOSMALL4; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90))) return MY_CS_ILSEQ; @@ -2332,10 +2373,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+5 >e) /* We need 5 characters */ return MY_CS_TOOSMALL5; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && (c >= 0xf9 || s[1] >= 0x88))) return MY_CS_ILSEQ; @@ -2351,11 +2392,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if ( s+6 >e ) /* We need 6 characters */ return MY_CS_TOOSMALL6; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && - (s[5] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && (c >= 0xfd || s[1] >= 0x84))) return MY_CS_ILSEQ; @@ -2399,11 +2440,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)), *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -2892,10 +2933,90 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)), } +static +int my_valid_mbcharlen_utf8(CHARSET_INFO *cs 
__attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + +#ifdef UNICODE_32BIT + if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) + { + if (s+4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90))) + return MY_CS_ILSEQ; + + return 4; + } + if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) + { + if (s+5 >e) /* We need 5 characters */ + return MY_CS_TOOSMALL5; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + (c >= 0xf9 || s[1] >= 0x88))) + return MY_CS_ILSEQ; + + return 5; + } + if (c < 0xfe && sizeof(my_wc_t)*8 >= 32) + { + if ( s+6 >e ) /* We need 6 characters */ + return MY_CS_TOOSMALL6; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && + (c >= 0xfd || s[1] >= 0x84))) + return MY_CS_ILSEQ; + + return 6; + } +#endif + return MY_CS_ILSEQ; +} + +static size_t +my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e) { - my_wc_t wc; - int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e); return (res>1) ? 
res : 0; } @@ -2944,7 +3065,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_mbcharlen_utf8, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8, my_lengthsp_8bit, my_numcells_mb, my_utf8_uni, @@ -4714,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -4725,7 +4846,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -4758,9 +4879,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), [F4][80..8F][80..BF][80..BF] */ - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -4796,17 +4917,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), if (c < 0xe0) { - if (!((s[1] ^ 0x80) < 0x40)) + if (!IS_CONTINUATION_BYTE(s[1])) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x0f) << 12) | @@ -4817,9 +4938,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), } else if (c < 0xf5) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + 
IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -5308,11 +5429,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len) } +static int +my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + + if (c < 0xf5) + { + if (s + 4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + /* + UTF-8 quick four-byte mask: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Encoding allows to encode U+00010000..U+001FFFFF + + The maximum character defined in the Unicode standard is U+0010FFFF. + Higher characters U+00110000..U+001FFFFF are not used. + + 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) + 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) + + Valid codes: + [F0][90..BF][80..BF][80..BF] + [F1][80..BF][80..BF][80..BF] + [F2][80..BF][80..BF][80..BF] + [F3][80..BF][80..BF][80..BF] + [F4][80..8F][80..BF][80..BF] + */ + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90) && + (c <= 0xf3 || s[1] <= 0x8F))) + return MY_CS_ILSEQ; + + return 4; + } + + return MY_CS_ILSEQ; +} + + +static +size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, + const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 
1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + + static uint my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) { - my_wc_t wc; - int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e); return (res > 1) ? res : 0; } @@ -5373,7 +5567,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_mbcharlen_utf8mb4, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8mb4, my_lengthsp_8bit, my_numcells_mb, my_mb_wc_utf8mb4, -- cgit v1.2.1 From c92223e19860bdef8ed9d8958b31a17fdb6173aa Mon Sep 17 00:00:00 2001 From: Murthy Narkedimilli Date: Mon, 6 Jan 2014 10:52:35 +0530 Subject: Updated/added copyright headers --- strings/CMakeLists.txt | 2 +- strings/ctype-mb.c | 2 +- strings/ctype-simple.c | 2 +- strings/ctype-uca.c | 2 +- strings/ctype-ucs2.c | 2 +- strings/ctype-win1250ch.c | 2 +- strings/ctype.c | 3 +-- strings/my_strtoll10.c | 2 +- strings/t_ctype.h | 3 ++- 9 files changed, 10 insertions(+), 10 deletions(-) (limited to 'strings') diff --git a/strings/CMakeLists.txt b/strings/CMakeLists.txt index 77b093c4fb1..35b4a472686 100644 --- a/strings/CMakeLists.txt +++ b/strings/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2006, 2013, Oracle and/or its affiliates. All rights reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index 102680d7c45..fddb8d2a16b 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index 09ea5c08ccb..95598efa03a 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index f0899f6b77e..8cd850b06df 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2004, 2011, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index f1d0e775804..37fd1b5349f 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public diff --git a/strings/ctype-win1250ch.c b/strings/ctype-win1250ch.c index a8181afa776..e5bc919405c 100644 --- a/strings/ctype-win1250ch.c +++ b/strings/ctype-win1250ch.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/strings/ctype.c b/strings/ctype.c index d3ec0f5dc34..ec492a3bf80 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -1,5 +1,4 @@ -/* Copyright (c) 2000-2007 MySQL AB, 2008, 2009 Sun Microsystems, Inc. - Use is subject to license terms. +/* Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/strings/my_strtoll10.c b/strings/my_strtoll10.c index ce935077e4a..dc776e1d3a4 100644 --- a/strings/my_strtoll10.c +++ b/strings/my_strtoll10.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/strings/t_ctype.h b/strings/t_ctype.h index 8198d3eada8..a4fdd267c3f 100644 --- a/strings/t_ctype.h +++ b/strings/t_ctype.h @@ -1,4 +1,5 @@ -/* Copyright (C) 2000 MySQL AB +/* Copyright (c) 2000, 2001, 2003 MySQL AB + Use is subject to license terms This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -- cgit v1.2.1