summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorunknown <bar@mysql.com>2007-03-28 18:57:30 +0500
committerunknown <bar@mysql.com>2007-03-28 18:57:30 +0500
commitb5cc4fa61d615800ab13d428fae0296dbef82b81 (patch)
treeb8f6eee64c7e1d9c576584f7fd0c94bb6afb1581
parent685d21b72f201a2eb16718e73c76e62ee708458d (diff)
downloadmariadb-git-b5cc4fa61d615800ab13d428fae0296dbef82b81.tar.gz
Bug#22638 SOUNDEX broken for international characters
Problem: SOUNDEX returned an invalid string for international characters in multi-byte character sets. For example: for a Chinese/Japanese 3-byte long character _utf8 0xE99885 it took only the very first byte 0xE9, put it into the output string and then appended three DIGIT ZERO characters, so the result was 0xE9303030 - which is an invalid utf8 string. Fix: make SOUNDEX() multi-byte aware and put only complete characters into the result, thus returning only valid strings. This patch also makes SOUNDEX() compatible with UCS2. mysql-test/r/ctype_ucs.result: Adding tests mysql-test/r/ctype_utf8.result: Adding tests mysql-test/t/ctype_ucs.test: Adding tests mysql-test/t/ctype_utf8.test: Adding tests sql/item_strfunc.cc: Making soundex multi-byte aware.
-rw-r--r--mysql-test/r/ctype_ucs.result18
-rw-r--r--mysql-test/r/ctype_utf8.result12
-rw-r--r--mysql-test/t/ctype_ucs.test14
-rw-r--r--mysql-test/t/ctype_utf8.test8
-rw-r--r--sql/item_strfunc.cc135
5 files changed, 158 insertions, 29 deletions
diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result
index e32c1e8aae0..960953b3c5e 100644
--- a/mysql-test/r/ctype_ucs.result
+++ b/mysql-test/r/ctype_ucs.result
@@ -839,6 +839,24 @@ lily
river
drop table t1;
deallocate prepare stmt;
+set names latin1;
+set character_set_connection=ucs2;
+select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
+soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb')
+ H000 H4142 I51231
+select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
+hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb'))
+ 0048003000300030 00480034003100340032 004900350031003200330031
+select 'mood' sounds like 'mud';
+'mood' sounds like 'mud'
+1
+select hex(soundex(_ucs2 0x041004110412));
+hex(soundex(_ucs2 0x041004110412))
+0410003000300030
+select hex(soundex(_ucs2 0x00BF00C0));
+hex(soundex(_ucs2 0x00BF00C0))
+00C0003000300030
+set names latin1;
create table t1(a blob, b text charset utf8, c text charset ucs2);
select data_type, character_octet_length, character_maximum_length
from information_schema.columns where table_name='t1';
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index be1e1742ba6..1c6bc0e05b6 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -854,6 +854,18 @@ select * from t1 where soundex(a) = soundex('test');
id a
1 Test
drop table t1;
+select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
+soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)
+阅000
+select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
+hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB))
+E99885303030
+select soundex(_utf8 0xD091D092D093);
+soundex(_utf8 0xD091D092D093)
+Б000
+select hex(soundex(_utf8 0xD091D092D093));
+hex(soundex(_utf8 0xD091D092D093))
+D091303030
SET collation_connection='utf8_general_ci';
create table t1 select repeat('a',4000) a;
delete from t1;
diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test
index 5a3720dc431..c3320159c41 100644
--- a/mysql-test/t/ctype_ucs.test
+++ b/mysql-test/t/ctype_ucs.test
@@ -573,6 +573,20 @@ drop table t1;
deallocate prepare stmt;
#
+# Bug#22638 SOUNDEX broken for international characters
+#
+set names latin1;
+set character_set_connection=ucs2;
+select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
+select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
+select 'mood' sounds like 'mud';
+# Cyrillic A, BE, VE
+select hex(soundex(_ucs2 0x041004110412));
+# Make sure that "U+00BF INVERTED QUESTION MARK" is not considered as letter
+select hex(soundex(_ucs2 0x00BF00C0));
+set names latin1;
+
+#
# Bug #14290: character_maximum_length for text fields
#
create table t1(a blob, b text charset utf8, c text charset ucs2);
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 04b7ec78842..79b73fc7880 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('TEST');
select * from t1 where soundex(a) = soundex('test');
drop table t1;
+#
+# Bug#22638 SOUNDEX broken for international characters
+#
+select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
+select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
+select soundex(_utf8 0xD091D092D093);
+select hex(soundex(_utf8 0xD091D092D093));
+
SET collation_connection='utf8_general_ci';
-- source include/ctype_filesort.inc
diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc
index 6b1921e5bc8..03887629519 100644
--- a/sql/item_strfunc.cc
+++ b/sql/item_strfunc.cc
@@ -1805,7 +1805,8 @@ void Item_func_soundex::fix_length_and_dec()
{
collation.set(args[0]->collation);
max_length=args[0]->max_length;
- set_if_bigger(max_length,4);
+ set_if_bigger(max_length, 4 * collation.collation->mbminlen);
+ tmp_value.set_charset(collation.collation);
}
@@ -1815,14 +1816,15 @@ void Item_func_soundex::fix_length_and_dec()
else return 0
*/
-static char soundex_toupper(char ch)
+static int soundex_toupper(int ch)
{
return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch;
}
-static char get_scode(char *ptr)
+
+static char get_scode(int wc)
{
- uchar ch= soundex_toupper(*ptr);
+ int ch= soundex_toupper(wc);
if (ch < 'A' || ch > 'Z')
{
// Thread extended alfa (country spec)
@@ -1832,46 +1834,121 @@ static char get_scode(char *ptr)
}
+static bool my_uni_isalpha(int wc)
+{
+ /*
+ Return true for all Basic Latin letters: a..z A..Z.
+ Return true for all Unicode characters with code U+00C0 or higher:
+ - characters between 'z' and U+00C0 are controls and punctuations.
+ - "U+00C0 LATIN CAPITAL LETTER A WITH GRAVE" is the first letter after 'z'.
+ */
+ return (wc >= 'a' && wc <= 'z') ||
+ (wc >= 'A' && wc <= 'Z') ||
+ (wc >= 0xC0);
+}
+
+
String *Item_func_soundex::val_str(String *str)
{
DBUG_ASSERT(fixed == 1);
String *res =args[0]->val_str(str);
char last_ch,ch;
CHARSET_INFO *cs= collation.collation;
+ my_wc_t wc;
+ uint nchars;
+ int rc;
- if ((null_value=args[0]->null_value))
+ if ((null_value= args[0]->null_value))
return 0; /* purecov: inspected */
- if (tmp_value.alloc(max(res->length(),4)))
+ if (tmp_value.alloc(max(res->length(), 4 * cs->mbminlen)))
return str; /* purecov: inspected */
char *to= (char *) tmp_value.ptr();
- char *from= (char *) res->ptr(), *end=from+res->length();
- tmp_value.set_charset(cs);
+ char *to_end= to + tmp_value.alloced_length();
+ char *from= (char *) res->ptr(), *end= from + res->length();
- while (from != end && !my_isalpha(cs,*from)) // Skip pre-space
- from++; /* purecov: inspected */
- if (from == end)
- return &my_empty_string; // No alpha characters.
- *to++ = soundex_toupper(*from); // Copy first letter
- last_ch = get_scode(from); // code of the first letter
- // for the first 'double-letter check.
- // Loop on input letters until
- // end of input (null) or output
- // letter code count = 3
- for (from++ ; from < end ; from++)
- {
- if (!my_isalpha(cs,*from))
- continue;
- ch=get_scode(from);
+ for ( ; ; ) /* Skip pre-space */
+ {
+ if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
+ return &my_empty_string; /* EOL or invalid byte sequence */
+
+ if (rc == 1 && cs->ctype)
+ {
+ /* Single byte letter found */
+ if (my_isalpha(cs, *from))
+ {
+ last_ch= get_scode(*from); // Code of the first letter
+ *to++= soundex_toupper(*from++); // Copy first letter
+ break;
+ }
+ from++;
+ }
+ else
+ {
+ from+= rc;
+ if (my_uni_isalpha(wc))
+ {
+ /* Multibyte letter found */
+ wc= soundex_toupper(wc);
+ last_ch= get_scode(wc); // Code of the first letter
+ if ((rc= cs->cset->wc_mb(cs, wc, (uchar*) to, (uchar*) to_end)) <= 0)
+ {
+ /* Extra safety - should not really happen */
+ DBUG_ASSERT(false);
+ return &my_empty_string;
+ }
+ to+= rc;
+ break;
+ }
+ }
+ }
+
+ /*
+ last_ch is now set to the first 'double-letter' check.
+ loop on input letters until end of input
+ */
+ for (nchars= 1 ; ; )
+ {
+ if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
+ break; /* EOL or invalid byte sequence */
+
+ if (rc == 1 && cs->ctype)
+ {
+ if (!my_isalpha(cs, *from++))
+ continue;
+ }
+ else
+ {
+ from+= rc;
+ if (!my_uni_isalpha(wc))
+ continue;
+ }
+
+ ch= get_scode(wc);
if ((ch != '0') && (ch != last_ch)) // if not skipped or double
{
- *to++ = ch; // letter, copy to output
- last_ch = ch; // save code of last input letter
- } // for next double-letter check
+ // letter, copy to output
+ if ((rc= cs->cset->wc_mb(cs, (my_wc_t) ch,
+ (uchar*) to, (uchar*) to_end)) <= 0)
+ {
+ // Extra safety - should not really happen
+ DBUG_ASSERT(false);
+ break;
+ }
+ to+= rc;
+ nchars++;
+ last_ch= ch; // save code of last input letter
+ } // for next double-letter check
}
- for (end=(char*) tmp_value.ptr()+4 ; to < end ; to++)
- *to = '0';
- *to=0; // end string
+
+ /* Pad up to 4 characters with DIGIT ZERO, if the string is shorter */
+ if (nchars < 4)
+ {
+ uint nbytes= (4 - nchars) * cs->mbminlen;
+ cs->cset->fill(cs, to, nbytes, '0');
+ to+= nbytes;
+ }
+
tmp_value.length((uint) (to-tmp_value.ptr()));
return &tmp_value;
}