diff options
author | unknown <bar@mysql.com> | 2007-03-28 18:57:30 +0500 |
---|---|---|
committer | unknown <bar@mysql.com> | 2007-03-28 18:57:30 +0500 |
commit | b5cc4fa61d615800ab13d428fae0296dbef82b81 (patch) | |
tree | b8f6eee64c7e1d9c576584f7fd0c94bb6afb1581 | |
parent | 685d21b72f201a2eb16718e73c76e62ee708458d (diff) | |
download | mariadb-git-b5cc4fa61d615800ab13d428fae0296dbef82b81.tar.gz |
Bug#22638 SOUNDEX broken for international characters
Problem: SOUNDEX returned an invalid string for international
characters in multi-byte character sets.
For example: for a Chinese/Japanese 3-byte long character
_utf8 0xE99885 it took only the very first byte 0xE9,
put it into the output string and then appended three
DIGIT ZERO characters, so the result was 0xE9303030 - which
is an invalid utf8 string.
Fix: make SOUNDEX() multi-byte aware and put only complete
characters into the result, thus returning only valid strings.
This patch also makes SOUNDEX() compatible with UCS2.
mysql-test/r/ctype_ucs.result:
Adding tests
mysql-test/r/ctype_utf8.result:
Adding tests
mysql-test/t/ctype_ucs.test:
Adding tests
mysql-test/t/ctype_utf8.test:
Adding tests
sql/item_strfunc.cc:
Making soundex multi-byte aware.
-rw-r--r-- | mysql-test/r/ctype_ucs.result | 18 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8.result | 12 | ||||
-rw-r--r-- | mysql-test/t/ctype_ucs.test | 14 | ||||
-rw-r--r-- | mysql-test/t/ctype_utf8.test | 8 | ||||
-rw-r--r-- | sql/item_strfunc.cc | 135 |
5 files changed, 158 insertions, 29 deletions
diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result index e32c1e8aae0..960953b3c5e 100644 --- a/mysql-test/r/ctype_ucs.result +++ b/mysql-test/r/ctype_ucs.result @@ -839,6 +839,24 @@ lily river drop table t1; deallocate prepare stmt; +set names latin1; +set character_set_connection=ucs2; +select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb'); +soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb') + H000 H4142 I51231 +select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb')); +hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb')) + 0048003000300030 00480034003100340032 004900350031003200330031 +select 'mood' sounds like 'mud'; +'mood' sounds like 'mud' +1 +select hex(soundex(_ucs2 0x041004110412)); +hex(soundex(_ucs2 0x041004110412)) +0410003000300030 +select hex(soundex(_ucs2 0x00BF00C0)); +hex(soundex(_ucs2 0x00BF00C0)) +00C0003000300030 +set names latin1; create table t1(a blob, b text charset utf8, c text charset ucs2); select data_type, character_octet_length, character_maximum_length from information_schema.columns where table_name='t1'; diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index be1e1742ba6..1c6bc0e05b6 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -854,6 +854,18 @@ select * from t1 where soundex(a) = soundex('test'); id a 1 Test drop table t1; +select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB); +soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB) +阅000 +select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)); +hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)) +E99885303030 +select soundex(_utf8 0xD091D092D093); +soundex(_utf8 0xD091D092D093) +Б000 +select hex(soundex(_utf8 0xD091D092D093)); +hex(soundex(_utf8 
0xD091D092D093)) +D091303030 SET collation_connection='utf8_general_ci'; create table t1 select repeat('a',4000) a; delete from t1; diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test index 5a3720dc431..c3320159c41 100644 --- a/mysql-test/t/ctype_ucs.test +++ b/mysql-test/t/ctype_ucs.test @@ -573,6 +573,20 @@ drop table t1; deallocate prepare stmt; # +# Bug#22638 SOUNDEX broken for international characters +# +set names latin1; +set character_set_connection=ucs2; +select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb'); +select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb')); +select 'mood' sounds like 'mud'; +# Cyrillic A, BE, VE +select hex(soundex(_ucs2 0x041004110412)); +# Make sure that "U+00BF INVERTED QUESTION MARK" is not considered as letter +select hex(soundex(_ucs2 0x00BF00C0)); +set names latin1; + +# # Bug #14290: character_maximum_length for text fields # create table t1(a blob, b text charset utf8, c text charset ucs2); diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index 04b7ec78842..79b73fc7880 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('TEST'); select * from t1 where soundex(a) = soundex('test'); drop table t1; +# +# Bug#22638 SOUNDEX broken for international characters +# +select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB); +select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)); +select soundex(_utf8 0xD091D092D093); +select hex(soundex(_utf8 0xD091D092D093)); + SET collation_connection='utf8_general_ci'; -- source include/ctype_filesort.inc diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc index 6b1921e5bc8..03887629519 100644 --- a/sql/item_strfunc.cc +++ b/sql/item_strfunc.cc @@ -1805,7 +1805,8 @@ void Item_func_soundex::fix_length_and_dec() { 
collation.set(args[0]->collation); max_length=args[0]->max_length; - set_if_bigger(max_length,4); + set_if_bigger(max_length, 4 * collation.collation->mbminlen); + tmp_value.set_charset(collation.collation); } @@ -1815,14 +1816,15 @@ void Item_func_soundex::fix_length_and_dec() else return 0 */ -static char soundex_toupper(char ch) +static int soundex_toupper(int ch) { return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch; } -static char get_scode(char *ptr) + +static char get_scode(int wc) { - uchar ch= soundex_toupper(*ptr); + int ch= soundex_toupper(wc); if (ch < 'A' || ch > 'Z') { // Thread extended alfa (country spec) @@ -1832,46 +1834,121 @@ static char get_scode(char *ptr) } +static bool my_uni_isalpha(int wc) +{ + /* + Return true for all Basic Latin letters: a..z A..Z. + Return true for all Unicode characters with code higher than U+00C0: + - characters between 'z' and U+00C0 are controls and punctuations. + - "U+00C0 LATIN CAPITAL LETTER A WITH GRAVE" is the first letter after 'z'. + */ + return (wc >= 'a' && wc <= 'z') || + (wc >= 'A' && wc <= 'Z') || + (wc >= 0xC0); +} + + String *Item_func_soundex::val_str(String *str) { DBUG_ASSERT(fixed == 1); String *res =args[0]->val_str(str); char last_ch,ch; CHARSET_INFO *cs= collation.collation; + my_wc_t wc; + uint nchars; + int rc; - if ((null_value=args[0]->null_value)) + if ((null_value= args[0]->null_value)) return 0; /* purecov: inspected */ - if (tmp_value.alloc(max(res->length(),4))) + if (tmp_value.alloc(max(res->length(), 4 * cs->mbminlen))) return str; /* purecov: inspected */ char *to= (char *) tmp_value.ptr(); - char *from= (char *) res->ptr(), *end=from+res->length(); - tmp_value.set_charset(cs); + char *to_end= to + tmp_value.alloced_length(); + char *from= (char *) res->ptr(), *end= from + res->length(); - while (from != end && !my_isalpha(cs,*from)) // Skip pre-space - from++; /* purecov: inspected */ - if (from == end) - return &my_empty_string; // No alpha characters. 
- *to++ = soundex_toupper(*from); // Copy first letter - last_ch = get_scode(from); // code of the first letter - // for the first 'double-letter check. - // Loop on input letters until - // end of input (null) or output - // letter code count = 3 - for (from++ ; from < end ; from++) - { - if (!my_isalpha(cs,*from)) - continue; - ch=get_scode(from); + for ( ; ; ) /* Skip pre-space */ + { + if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0) + return &my_empty_string; /* EOL or invalid byte sequence */ + + if (rc == 1 && cs->ctype) + { + /* Single byte letter found */ + if (my_isalpha(cs, *from)) + { + last_ch= get_scode(*from); // Code of the first letter + *to++= soundex_toupper(*from++); // Copy first letter + break; + } + from++; + } + else + { + from+= rc; + if (my_uni_isalpha(wc)) + { + /* Multibyte letter found */ + wc= soundex_toupper(wc); + last_ch= get_scode(wc); // Code of the first letter + if ((rc= cs->cset->wc_mb(cs, wc, (uchar*) to, (uchar*) to_end)) <= 0) + { + /* Extra safety - should not really happen */ + DBUG_ASSERT(false); + return &my_empty_string; + } + to+= rc; + break; + } + } + } + + /* + last_ch is now set to the first 'double-letter' check. 
+ loop on input letters until end of input + */ + for (nchars= 1 ; ; ) + { + if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0) + break; /* EOL or invalid byte sequence */ + + if (rc == 1 && cs->ctype) + { + if (!my_isalpha(cs, *from++)) + continue; + } + else + { + from+= rc; + if (!my_uni_isalpha(wc)) + continue; + } + + ch= get_scode(wc); if ((ch != '0') && (ch != last_ch)) // if not skipped or double { - *to++ = ch; // letter, copy to output - last_ch = ch; // save code of last input letter - } // for next double-letter check + // letter, copy to output + if ((rc= cs->cset->wc_mb(cs, (my_wc_t) ch, + (uchar*) to, (uchar*) to_end)) <= 0) + { + // Extra safety - should not really happen + DBUG_ASSERT(false); + break; + } + to+= rc; + nchars++; + last_ch= ch; // save code of last input letter + } // for next double-letter check } - for (end=(char*) tmp_value.ptr()+4 ; to < end ; to++) - *to = '0'; - *to=0; // end string + + /* Pad up to 4 characters with DIGIT ZERO, if the string is shorter */ + if (nchars < 4) + { + uint nbytes= (4 - nchars) * cs->mbminlen; + cs->cset->fill(cs, to, nbytes, '0'); + to+= nbytes; + } + tmp_value.length((uint) (to-tmp_value.ptr())); return &tmp_value; } |