diff options
author | unknown <svoj@may.pils.ru> | 2006-05-29 16:46:46 +0500 |
---|---|---|
committer | unknown <svoj@may.pils.ru> | 2006-05-29 16:46:46 +0500 |
commit | 528e85a4c0de5b6cfe65f1c664089628db6cf1b4 (patch) | |
tree | 2d99880d147ef4ce559982554840a1011fb091e9 | |
parent | 52078846fc635b913c2090c20adc18e1f06c9e56 (diff) | |
download | mariadb-git-528e85a4c0de5b6cfe65f1c664089628db6cf1b4.tar.gz |
BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
The problem was that MySQL did not have a true ctype implementation. As a
result, many multibyte punctuation/whitespace characters were
treated as word characters.
This fix uses the recently added CTYPE table for unicode character sets
(WL1386) to detect unicode punctuation/whitespace characters
correctly.
Note: this is an incompatible change, since it changes parser behavior.
One will have to use the REPAIR TABLE statement to rebuild fulltext
indexes.
mysql-test/r/fulltext2.result:
Testcase for BUG#19580.
mysql-test/t/fulltext2.test:
Testcase for BUG#19580.
storage/myisam/ft_parser.c:
Use WL1386 "CTYPE table for unicode character sets" functionality.
storage/myisam/ft_update.c:
Use WL1386 "CTYPE table for unicode character sets" functionality.
Revert the fix for BUG#16489 "utf8 + fulltext leads to corrupt index
file.". It is no longer needed, since we now have a true ctype
implementation.
storage/myisam/ftdefs.h:
Use WL1386 "CTYPE table for unicode character sets" functionality.
Rework the true_word_char macro so that it accepts ctype instead of charset
as its first param. It no longer uses my_isalnum, but instead
checks ctype directly.
Remove the obsolete word_char macro.
-rw-r--r-- | mysql-test/r/fulltext2.result | 8 | ||||
-rw-r--r-- | mysql-test/t/fulltext2.test | 10 | ||||
-rw-r--r-- | storage/myisam/ft_parser.c | 32 | ||||
-rw-r--r-- | storage/myisam/ft_update.c | 5 | ||||
-rw-r--r-- | storage/myisam/ftdefs.h | 5 |
5 files changed, 43 insertions, 17 deletions
diff --git a/mysql-test/r/fulltext2.result b/mysql-test/r/fulltext2.result index f6a4b20bc22..7e3e25370d3 100644 --- a/mysql-test/r/fulltext2.result +++ b/mysql-test/r/fulltext2.result @@ -241,3 +241,11 @@ select * from t1 where match a against('ab c' in boolean mode); a drop table t1; set names latin1; +SET NAMES utf8; +CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8; +INSERT INTO t1 VALUES('„MySQL“'); +SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE); +a +„MySQL“ +DROP TABLE t1; +SET NAMES latin1; diff --git a/mysql-test/t/fulltext2.test b/mysql-test/t/fulltext2.test index fd97f795534..88967a5dd04 100644 --- a/mysql-test/t/fulltext2.test +++ b/mysql-test/t/fulltext2.test @@ -221,3 +221,13 @@ drop table t1; set names latin1; # End of 4.1 tests + +# +# BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns +# +SET NAMES utf8; +CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8; +INSERT INTO t1 VALUES('„MySQL“'); +SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE); +DROP TABLE t1; +SET NAMES latin1; diff --git a/storage/myisam/ft_parser.c b/storage/myisam/ft_parser.c index 89ede813a2b..d3f83afbd7b 100644 --- a/storage/myisam/ft_parser.c +++ b/storage/myisam/ft_parser.c @@ -114,6 +114,7 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end, FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) { byte *doc=*start; + int ctype; uint mwc, length, mbl; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); @@ -122,9 +123,11 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end, while (doc<end) { - for (;doc<end;doc++) + for (; doc < end; doc+= (mbl > 0 ? 
mbl : 1)) { - if (true_word_char(cs,*doc)) break; + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (true_word_char(ctype, *doc)) + break; if (*doc == FTB_RQUOT && param->quot) { param->quot=doc; @@ -158,14 +161,16 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end, } mwc=length=0; - for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1)) - if (true_word_char(cs,*doc)) + for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1)) + { + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (true_word_char(ctype, *doc)) mwc=0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; - + } param->prev='A'; /* be sure *prev is true_word_char */ word->len= (uint)(doc-word->pos) - mwc; if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) @@ -200,24 +205,31 @@ byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end, { byte *doc= *start; uint mwc, length, mbl; + int ctype; DBUG_ENTER("ft_simple_get_word"); do { - for (;; doc++) + for (;; doc+= (mbl > 0 ? mbl : 1)) { - if (doc >= end) DBUG_RETURN(0); - if (true_word_char(cs, *doc)) break; + if (doc >= end) + DBUG_RETURN(0); + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (true_word_char(ctype, *doc)) + break; } mwc= length= 0; - for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1)) - if (true_word_char(cs,*doc)) + for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? 
mbl : 1)) + { + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (true_word_char(ctype, *doc)) mwc= 0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; + } word->len= (uint)(doc-word->pos) - mwc; diff --git a/storage/myisam/ft_update.c b/storage/myisam/ft_update.c index f5501792dcd..adcdf81222f 100644 --- a/storage/myisam/ft_update.c +++ b/storage/myisam/ft_update.c @@ -174,11 +174,6 @@ int _mi_ft_cmp(MI_INFO *info, uint keynr, const byte *rec1, const byte *rec2) FT_SEG_ITERATOR ftsi1, ftsi2; CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset; DBUG_ENTER("_mi_ft_cmp"); -#ifndef MYSQL_HAS_TRUE_CTYPE_IMPLEMENTATION - if (cs->mbmaxlen > 1) - DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT); -#endif - _mi_ft_segiterator_init(info, keynr, rec1, &ftsi1); _mi_ft_segiterator_init(info, keynr, rec2, &ftsi2); diff --git a/storage/myisam/ftdefs.h b/storage/myisam/ftdefs.h index 2c4cfa1ffd6..82429773824 100644 --- a/storage/myisam/ftdefs.h +++ b/storage/myisam/ftdefs.h @@ -24,9 +24,10 @@ #include <queues.h> #include <mysql/plugin.h> -#define true_word_char(s,X) (my_isalnum(s,X) || (X)=='_') +#define true_word_char(ctype, character) \ + ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \ + (character) == '_') #define misc_word_char(X) 0 -#define word_char(s,X) (true_word_char(s,X) || misc_word_char(X)) #define FT_MAX_WORD_LEN_FOR_SORT 31 |