summaryrefslogtreecommitdiff
path: root/storage/myisam
diff options
context:
space:
mode:
author: unknown <svoj@may.pils.ru> 2006-05-29 16:46:46 +0500
committer: unknown <svoj@may.pils.ru> 2006-05-29 16:46:46 +0500
commit: 528e85a4c0de5b6cfe65f1c664089628db6cf1b4 (patch)
tree: 2d99880d147ef4ce559982554840a1011fb091e9 /storage/myisam
parent: 52078846fc635b913c2090c20adc18e1f06c9e56 (diff)
download: mariadb-git-528e85a4c0de5b6cfe65f1c664089628db6cf1b4.tar.gz
BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
The problem was that MySQL did not have a true ctype implementation. As a result, many multibyte punctuation/whitespace characters were treated as word characters. This fix uses the recently added CTYPE table for Unicode character sets (WL1386) to detect Unicode punctuation/whitespace characters correctly. Note: this is an incompatible change, since it changes parser behavior. One will have to use the REPAIR TABLE statement to rebuild fulltext indexes. mysql-test/r/fulltext2.result: Testcase for BUG#19580. mysql-test/t/fulltext2.test: Testcase for BUG#19580. storage/myisam/ft_parser.c: Use WL1386 "CTYPE table for unicode character sets" functionality. storage/myisam/ft_update.c: Use WL1386 "CTYPE table for unicode character sets" functionality. Revert the fix for BUG#16489 "utf8 + fulltext leads to corrupt index file.". It is no longer needed, since we now have a true ctype implementation. storage/myisam/ftdefs.h: Use WL1386 "CTYPE table for unicode character sets" functionality. Rework the true_word_char macro so that it accepts a ctype instead of a charset as its first parameter. It no longer uses my_isalnum, but instead checks the ctype directly. The obsolete word_char macro is removed.
Diffstat (limited to 'storage/myisam')
-rw-r--r-- storage/myisam/ft_parser.c 32
-rw-r--r-- storage/myisam/ft_update.c 5
-rw-r--r-- storage/myisam/ftdefs.h 5
3 files changed, 25 insertions, 17 deletions
diff --git a/storage/myisam/ft_parser.c b/storage/myisam/ft_parser.c
index 89ede813a2b..d3f83afbd7b 100644
--- a/storage/myisam/ft_parser.c
+++ b/storage/myisam/ft_parser.c
@@ -114,6 +114,7 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
{
byte *doc=*start;
+ int ctype;
uint mwc, length, mbl;
param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
@@ -122,9 +123,11 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
while (doc<end)
{
- for (;doc<end;doc++)
+ for (; doc < end; doc+= (mbl > 0 ? mbl : 1))
{
- if (true_word_char(cs,*doc)) break;
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
+ break;
if (*doc == FTB_RQUOT && param->quot)
{
param->quot=doc;
@@ -158,14 +161,16 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
}
mwc=length=0;
- for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
- if (true_word_char(cs,*doc))
+ for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
mwc=0;
else if (!misc_word_char(*doc) || mwc)
break;
else
mwc++;
-
+ }
param->prev='A'; /* be sure *prev is true_word_char */
word->len= (uint)(doc-word->pos) - mwc;
if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
@@ -200,24 +205,31 @@ byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end,
{
byte *doc= *start;
uint mwc, length, mbl;
+ int ctype;
DBUG_ENTER("ft_simple_get_word");
do
{
- for (;; doc++)
+ for (;; doc+= (mbl > 0 ? mbl : 1))
{
- if (doc >= end) DBUG_RETURN(0);
- if (true_word_char(cs, *doc)) break;
+ if (doc >= end)
+ DBUG_RETURN(0);
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
+ break;
}
mwc= length= 0;
- for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
- if (true_word_char(cs,*doc))
+ for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
mwc= 0;
else if (!misc_word_char(*doc) || mwc)
break;
else
mwc++;
+ }
word->len= (uint)(doc-word->pos) - mwc;
diff --git a/storage/myisam/ft_update.c b/storage/myisam/ft_update.c
index f5501792dcd..adcdf81222f 100644
--- a/storage/myisam/ft_update.c
+++ b/storage/myisam/ft_update.c
@@ -174,11 +174,6 @@ int _mi_ft_cmp(MI_INFO *info, uint keynr, const byte *rec1, const byte *rec2)
FT_SEG_ITERATOR ftsi1, ftsi2;
CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
DBUG_ENTER("_mi_ft_cmp");
-#ifndef MYSQL_HAS_TRUE_CTYPE_IMPLEMENTATION
- if (cs->mbmaxlen > 1)
- DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT);
-#endif
-
_mi_ft_segiterator_init(info, keynr, rec1, &ftsi1);
_mi_ft_segiterator_init(info, keynr, rec2, &ftsi2);
diff --git a/storage/myisam/ftdefs.h b/storage/myisam/ftdefs.h
index 2c4cfa1ffd6..82429773824 100644
--- a/storage/myisam/ftdefs.h
+++ b/storage/myisam/ftdefs.h
@@ -24,9 +24,10 @@
#include <queues.h>
#include <mysql/plugin.h>
-#define true_word_char(s,X) (my_isalnum(s,X) || (X)=='_')
+#define true_word_char(ctype, character) \
+ ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
+ (character) == '_')
#define misc_word_char(X) 0
-#define word_char(s,X) (true_word_char(s,X) || misc_word_char(X))
#define FT_MAX_WORD_LEN_FOR_SORT 31