From e463ee942152e22c77d52403dbfb772648d4f35d Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 30 Oct 2006 14:40:15 +0400 Subject: Bug#20404: SHOW CREATE TABLE fails with Turkish I Problem: SHOW CREATE TABLE printed garbage in table name for tables having TURKISH I (i.e. LATIN CAPITABLE LETTER I WITH DOT ABOVE) when lower-case-table-name=1. Reason: In some cases during lower/upper conversion in utf8, the result string can be shorter the original string (including the above letter). Old implementation of caseup_str() and casedn_str() didn't handle the result length properly, assuming that length cannot change. This fix changes the result type of cs->cset->casedn_str() and cs->cset->caseup_str() from VOID to UINT, to return the result length, as well as put '\0' terminator on a proper place. Also, my_caseup_str_utf8() and my_casedn_str_utf8() were rewritten not to use strlen() for performance purposes. It was done with help of adding of new functions - my_utf8_uni_no_range() and my_uni_utf8_no_range() - for null terminated strings. include/m_ctype.h: Changeing return type from void to int for caseup_str() and casedn_str() mysql-test/r/lowercase_table.result: Adding test case mysql-test/t/lowercase_table.test: Adding test case sql/sql_parse.cc: Set table->table.length to result of my_casedn_str(). strings/ctype-bin.c: Changeing return type from void to int for caseup_str() and casedn_str() strings/ctype-mb.c: Changeing return type from void to int for caseup_str() and casedn_str() strings/ctype-simple.c: Changeing return type from void to int for caseup_str() and casedn_str() strings/ctype-ucs2.c: Changeing return type from void to int for caseup_str() and casedn_str() strings/ctype-utf8.c: Changeing return type from void to int for caseup_str() and casedn_str(). Optimization, to get rid of strlen(): Adding my_utf8_uni_no_range() and my_uni_utf8_no_range() - for null terninated strings. --- strings/ctype-bin.c | 3 +- strings/ctype-mb.c | 24 +++++---- strings/ctype-simple.c | 18 ++++--- strings/ctype-ucs2.c | 8 +-- strings/ctype-utf8.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 162 insertions(+), 26 deletions(-) (limited to 'strings') diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 0bd5a1fda76..3e8b05580f6 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -211,9 +211,10 @@ static int my_strnncollsp_8bit_bin(CHARSET_INFO * cs __attribute__((unused)), /* This function is used for all conversion functions */ -static void my_case_str_bin(CHARSET_INFO *cs __attribute__((unused)), +static uint my_case_str_bin(CHARSET_INFO *cs __attribute__((unused)), char *str __attribute__((unused))) { + return 0; } static uint my_case_bin(CHARSET_INFO *cs __attribute__((unused)), diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index 394111be3bc..3ef245015d7 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -21,40 +21,44 @@ #ifdef USE_MB -void my_caseup_str_mb(CHARSET_INFO * cs, char *str) +uint my_caseup_str_mb(CHARSET_INFO * cs, char *str) { register uint32 l; - register uchar *map=cs->to_upper; + register uchar *map= cs->to_upper; + char *str_orig= str; while (*str) { /* Pointing after the '\0' is safe here. */ - if ((l=my_ismbchar(cs, str, str + cs->mbmaxlen))) - str+=l; + if ((l= my_ismbchar(cs, str, str + cs->mbmaxlen))) + str+= l; else { - *str=(char) map[(uchar)*str]; + *str= (char) map[(uchar)*str]; str++; } } + return str - str_orig; } -void my_casedn_str_mb(CHARSET_INFO * cs, char *str) +uint my_casedn_str_mb(CHARSET_INFO * cs, char *str) { register uint32 l; - register uchar *map=cs->to_lower; + register uchar *map= cs->to_lower; + char *str_orig= str; while (*str) { /* Pointing after the '\0' is safe here. */ - if ((l=my_ismbchar(cs, str, str + cs->mbmaxlen))) - str+=l; + if ((l= my_ismbchar(cs, str, str + cs->mbmaxlen))) + str+= l; else { - *str=(char) map[(uchar)*str]; + *str= (char) map[(uchar)*str]; str++; } } + return str - str_orig; } uint my_caseup_mb(CHARSET_INFO * cs, char *src, uint srclen, diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index e40a1948dcf..7484f3c0d92 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -188,20 +188,26 @@ int my_strnncollsp_simple(CHARSET_INFO * cs, const uchar *a, uint a_length, } -void my_caseup_str_8bit(CHARSET_INFO * cs,char *str) +uint my_caseup_str_8bit(CHARSET_INFO * cs,char *str) { - register uchar *map=cs->to_upper; - while ((*str = (char) map[(uchar) *str]) != 0) + register uchar *map= cs->to_upper; + char *str_orig= str; + while ((*str= (char) map[(uchar) *str]) != 0) str++; + return str - str_orig; } -void my_casedn_str_8bit(CHARSET_INFO * cs,char *str) + +uint my_casedn_str_8bit(CHARSET_INFO * cs,char *str) { - register uchar *map=cs->to_lower; - while ((*str = (char) map[(uchar)*str]) != 0) + register uchar *map= cs->to_lower; + char *str_orig= str; + while ((*str= (char) map[(uchar) *str]) != 0) str++; + return str - str_orig; } + uint my_caseup_8bit(CHARSET_INFO * cs, char *src, uint srclen, char *dst __attribute__((unused)), uint dstlen __attribute__((unused))) diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 4a60220f73e..3c69a314b45 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -159,13 +159,13 @@ static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, uint slen, } -static void my_caseup_str_ucs2(CHARSET_INFO * cs __attribute__((unused)), +static uint my_caseup_str_ucs2(CHARSET_INFO * cs __attribute__((unused)), char * s __attribute__((unused))) { + return 0; } - static uint my_casedn_ucs2(CHARSET_INFO *cs, char *src, uint srclen, char *dst __attribute__((unused)), uint dstlen __attribute__((unused))) @@ -188,9 +188,11 @@ static uint my_casedn_ucs2(CHARSET_INFO *cs, char *src, uint srclen, return srclen; } -static void my_casedn_str_ucs2(CHARSET_INFO *cs __attribute__((unused)), + +static uint my_casedn_str_ucs2(CHARSET_INFO *cs __attribute__((unused)), char * s __attribute__((unused))) { + return 0; } diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index ae2c04fb068..a21fe4961ec 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -2045,6 +2045,52 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), return MY_CS_ILSEQ; } + +/* + The same as above, but without range check + for example, for a null-terminated string +*/ +static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)), + my_wc_t * pwc, const uchar *s) +{ + unsigned char c; + + c= s[0]; + if (c < 0x80) + { + *pwc = c; + return 1; + } + + if (c < 0xc2) + return MY_CS_ILSEQ; + + if (c < 0xe0) + { + if (!((s[1] ^ 0x80) < 0x40)) + return MY_CS_ILSEQ; + + *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); + return 2; + } + + if (c < 0xf0) + { + if (!((s[1] ^ 0x80) < 0x40 && + (s[2] ^ 0x80) < 0x40 && + (c >= 0xe1 || s[1] >= 0xa0))) + return MY_CS_ILSEQ; + + *pwc= ((my_wc_t) (c & 0x0f) << 12) | + ((my_wc_t) (s[1] ^ 0x80) << 6) | + (my_wc_t) (s[2] ^ 0x80); + + return 3; + } + return MY_CS_ILSEQ; +} + + static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)) , my_wc_t wc, uchar *r, uchar *e) { @@ -2091,6 +2137,34 @@ static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)) , } +/* + The same as above, but without range check. +*/ +static int my_uni_utf8_no_range(CHARSET_INFO *cs __attribute__((unused)), + my_wc_t wc, uchar *r) +{ + int count; + + if (wc < 0x80) + count= 1; + else if (wc < 0x800) + count= 2; + else if (wc < 0x10000) + count= 3; + else + return MY_CS_ILUNI; + + switch (count) + { + /* Fall through all cases!!! */ + case 3: r[2]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0x800; + case 2: r[1]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0xc0; + case 1: r[0]= (uchar) wc; + } + return count; +} + + static uint my_caseup_utf8(CHARSET_INFO *cs, char *src, uint srclen, char *dst, uint dstlen) { @@ -2141,10 +2215,26 @@ static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, uint slen, } -static void my_caseup_str_utf8(CHARSET_INFO * cs, char * s) +static uint my_caseup_str_utf8(CHARSET_INFO *cs, char *src) { - uint len= (uint) strlen(s); - my_caseup_utf8(cs, s, len, s, len); + my_wc_t wc; + int srcres, dstres; + char *dst= src, *dst0= src; + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + DBUG_ASSERT(cs->caseup_multiply == 1); + + while (*src && + (srcres= my_utf8_uni_no_range(cs, &wc, (uchar *) src)) > 0) + { + int plane= (wc>>8) & 0xFF; + wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].toupper : wc; + if ((dstres= my_uni_utf8_no_range(cs, wc, (uchar*) dst)) <= 0) + break; + src+= srcres; + dst+= dstres; + } + *dst= '\0'; + return (uint) (dst - dst0); } @@ -2170,10 +2260,43 @@ static uint my_casedn_utf8(CHARSET_INFO *cs, char *src, uint srclen, return (uint) (dst - dst0); } -static void my_casedn_str_utf8(CHARSET_INFO *cs, char * s) + +static uint my_casedn_str_utf8(CHARSET_INFO *cs, char *src) { - uint len= (uint) strlen(s); - my_casedn_utf8(cs, s, len, s, len); + my_wc_t wc; + int srcres, dstres; + char *dst= src, *dst0= src; + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + DBUG_ASSERT(cs->casedn_multiply == 1); + + while (*src && + (srcres= my_utf8_uni_no_range(cs, &wc, (uchar *) src)) > 0) + { + int plane= (wc>>8) & 0xFF; + wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].tolower : wc; + if ((dstres= my_uni_utf8_no_range(cs, wc, (uchar*) dst)) <= 0) + break; + src+= srcres; + dst+= dstres; + } + + /* + In rare cases lower string can be shorter than + the original string, for example: + + "U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE" + (which is 0xC4B0 in utf8, i.e. two bytes) + + is converted into + + "U+0069 LATIN SMALL LETTER I" + (which is 0x69 in utf8, i.e. one byte) + + So, we need to put '\0' terminator after converting. + */ + + *dst= '\0'; + return (uint) (dst - dst0); } -- cgit v1.2.1 From b5d53f2031acabef3cf7baa6cac95079caef06af Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 3 Nov 2006 10:18:13 +0400 Subject: After merge fix for BUG#18908 ERROR 1406 (22001): Data too long for column :: using utf8 strings/ctype-utf8.c: Fix for compilation warning - forgot to add a new structure member in one of the previous changes. --- strings/ctype-utf8.c | 1 + 1 file changed, 1 insertion(+) (limited to 'strings') diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 6c3ceaf868b..6ed7736e24a 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -4051,6 +4051,7 @@ static MY_CHARSET_HANDLER my_charset_filename_handler= my_strntoull_8bit, my_strntod_8bit, my_strtoll10_8bit, + my_strntoull10rnd_8bit, my_scan_8bit }; -- cgit v1.2.1