diff options
author | unknown <monty@hundin.mysql.fi> | 2002-06-03 12:59:31 +0300 |
---|---|---|
committer | unknown <monty@hundin.mysql.fi> | 2002-06-03 12:59:31 +0300 |
commit | 702256ebaa12ca37724f68da2bd5f5d2959b1e8f (patch) | |
tree | be04186411dc657ef6bbcbe01267d30f2675c914 /myisam/ft_boolean_search.c | |
parent | 775d09780fb32c101c4e5da2807817c1d1231ce9 (diff) | |
parent | 7daf5a5d0ee7b52508943c7095a2cc150abcf616 (diff) | |
download | mariadb-git-702256ebaa12ca37724f68da2bd5f5d2959b1e8f.tar.gz |
merge with 4.0
BitKeeper/etc/ignore:
auto-union
BitKeeper/etc/logging_ok:
auto-union
BUILD/SETUP.sh:
Auto merged
BUILD/compile-pentium-debug:
Auto merged
BitKeeper/triggers/post-commit:
Auto merged
configure.in:
Auto merged
Docs/manual.texi:
Auto merged
client/mysql.cc:
Auto merged
client/mysqldump.c:
Auto merged
client/mysqltest.c:
Auto merged
extra/mysql_install.c:
Auto merged
extra/resolve_stack_dump.c:
Auto merged
extra/resolveip.c:
Auto merged
include/my_sys.h:
Auto merged
include/mysqld_error.h:
Auto merged
isam/pack_isam.c:
Auto merged
libmysql/Makefile.shared:
Auto merged
libmysql/libmysql.c:
Auto merged
myisam/ft_dump.c:
Auto merged
myisam/ft_test1.c:
Auto merged
myisam/ftdefs.h:
Auto merged
myisam/mi_check.c:
Auto merged
myisam/mi_test1.c:
Auto merged
myisam/mi_write.c:
Auto merged
myisam/myisamchk.c:
Auto merged
myisam/myisampack.c:
Auto merged
mysql-test/mysql-test-run.sh:
Auto merged
mysql-test/r/select_found.result:
Auto merged
mysql-test/t/select_found.test:
Auto merged
mysys/charset.c:
Auto merged
mysys/default.c:
Auto merged
mysys/hash.c:
Auto merged
sql/field.cc:
Auto merged
sql/gen_lex_hash.cc:
Auto merged
sql/ha_innodb.cc:
Auto merged
sql/hostname.cc:
Auto merged
sql/item_cmpfunc.h:
Auto merged
sql/item_strfunc.cc:
Auto merged
sql/item_timefunc.h:
Auto merged
sql/lex.h:
Auto merged
sql/log.cc:
Auto merged
sql/mysql_priv.h:
Auto merged
sql/repl_failsafe.cc:
Auto merged
sql/slave.cc:
Auto merged
sql/sql_acl.cc:
Auto merged
sql/sql_base.cc:
Auto merged
sql/sql_cache.cc:
Auto merged
sql/sql_class.cc:
Auto merged
sql/sql_class.h:
Auto merged
sql/sql_db.cc:
Auto merged
sql/sql_parse.cc:
Auto merged
sql/sql_select.cc:
Auto merged
sql/sql_string.cc:
Auto merged
sql/sql_table.cc:
Auto merged
sql/sql_union.cc:
Auto merged
sql/share/czech/errmsg.txt:
Auto merged
sql/share/danish/errmsg.txt:
Auto merged
sql/share/dutch/errmsg.txt:
Auto merged
sql/share/english/errmsg.txt:
Auto merged
sql/share/estonian/errmsg.txt:
Auto merged
sql/share/german/errmsg.txt:
Auto merged
sql/share/greek/errmsg.txt:
Auto merged
sql/share/hungarian/errmsg.txt:
Auto merged
sql/share/italian/errmsg.txt:
Auto merged
sql/share/japanese/errmsg.txt:
Auto merged
sql/share/korean/errmsg.txt:
Auto merged
sql/share/norwegian-ny/errmsg.txt:
Auto merged
sql/share/norwegian/errmsg.txt:
Auto merged
sql/sql_update.cc:
Auto merged
sql/structs.h:
Auto merged
sql/share/polish/errmsg.txt:
Auto merged
sql/share/portuguese/errmsg.txt:
Auto merged
sql/share/romanian/errmsg.txt:
Auto merged
sql/share/russian/errmsg.txt:
Auto merged
sql/share/slovak/errmsg.txt:
Auto merged
sql/share/spanish/errmsg.txt:
Auto merged
sql/share/swedish/errmsg.txt:
Auto merged
sql/share/ukrainian/errmsg.txt:
Auto merged
strings/Makefile.am:
Auto merged
strings/ctype-ujis.c:
Auto merged
tools/mysqlmanager.c:
Auto merged
Diffstat (limited to 'myisam/ft_boolean_search.c')
-rw-r--r-- | myisam/ft_boolean_search.c | 182 |
1 files changed, 132 insertions, 50 deletions
diff --git a/myisam/ft_boolean_search.c b/myisam/ft_boolean_search.c index 0c5efcb50df..f2f3a806892 100644 --- a/myisam/ft_boolean_search.c +++ b/myisam/ft_boolean_search.c @@ -59,6 +59,7 @@ static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */ typedef struct st_ftb_expr FTB_EXPR; struct st_ftb_expr { FTB_EXPR *up; + byte *quot, *qend; float weight; uint flags; my_off_t docid[2]; /* for index search and for scan */ @@ -76,7 +77,7 @@ typedef struct st_ftb_word { my_off_t docid[2]; /* for index search and for scan */ uint ndepth; int len; - /* ... there can be docid cache added here. SerG */ + /* ... docid cache can be added here. SerG */ byte word[1]; } FTB_WORD; @@ -84,10 +85,12 @@ typedef struct st_ft_info { struct _ft_vft *please; MI_INFO *info; uint keynr; + CHARSET_INFO *charset; enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE /*, SCAN*/ } state; uint with_scan; FTB_EXPR *root; QUEUE queue; + TREE no_dupes; FTB_WORD **list; MEM_ROOT mem_root; } FTB; @@ -101,18 +104,18 @@ int FTB_WORD_cmp(void *v __attribute__((unused)), FTB_WORD *a, FTB_WORD *b) return i; } -int FTB_WORD_cmp_list(void *v __attribute__((unused)), FTB_WORD **a, FTB_WORD **b) +int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b) { /* ORDER BY word DESC, ndepth DESC */ - int i= mi_compare_text(default_charset_info, (*b)->word+1,(*b)->len-1, - (*a)->word+1,(*a)->len-1,0); + int i= mi_compare_text(cs, (*b)->word+1,(*b)->len-1, + (*a)->word+1,(*a)->len-1,0); if (!i) i=CMP_NUM((*b)->ndepth,(*a)->ndepth); return i; } void _ftb_parse_query(FTB *ftb, byte **start, byte *end, - FTB_EXPR *up, uint depth) + FTB_EXPR *up, uint depth) { byte res; FTB_PARAM param; @@ -125,16 +128,17 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end, return; param.prev=' '; + param.quot=up->quot; while ((res=ft_get_word(start,end,&w,¶m))) { - int r=param.plusminus; + int r=param.plusminus; float weight= (float) (param.pmsign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)]; switch (res) { case 1: /* word found */ ftbw=(FTB_WORD *)alloc_root(&ftb->mem_root, - sizeof(FTB_WORD) + - (param.trunc ? MI_MAX_KEY_BUFF : - w.len+extra)); + sizeof(FTB_WORD) + + (param.trunc ? MI_MAX_KEY_BUFF : + w.len+extra)); ftbw->len=w.len+1; ftbw->flags=0; if (param.yesno>0) ftbw->flags|=FTB_FLAG_YES; @@ -148,7 +152,7 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end, ftbw->word[0]=w.len; if (param.yesno > 0) up->ythresh++; queue_insert(& ftb->queue, (byte *)ftbw); - ftb->with_scan|=param.trunc; + ftb->with_scan|=(param.trunc & FTB_FLAG_TRUNC); break; case 2: /* left bracket */ ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR)); @@ -159,17 +163,25 @@ void _ftb_parse_query(FTB *ftb, byte **start, byte *end, ftbe->up=up; ftbe->ythresh=ftbe->yweaks=0; ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR; + if ((ftbe->quot=param.quot)) ftb->with_scan|=2; if (param.yesno > 0) up->ythresh++; _ftb_parse_query(ftb, start, end, ftbe, depth+1); + param.quot=0; break; case 3: /* right bracket */ + if (up->quot) up->qend=param.quot; return; } } return; } -void _ftb_init_index_search(FT_INFO *ftb) +static int _ftb_no_dupes_cmp(void* not_used, const void *a,const void *b) +{ + return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b))); +} + +void _ftb_init_index_search(FT_INFO *ftb) { int i, r; FTB_WORD *ftbw; @@ -188,27 +200,48 @@ void _ftb_init_index_search(FT_INFO *ftb) { ftbw=(FTB_WORD *)(ftb->queue.root[i]); - if (ftbw->flags&FTB_FLAG_TRUNC && - (ftbw->up->ythresh > test(ftbw->flags&FTB_FLAG_YES))) - { - /* no need to search for this prefix in the index - - * it cannot ADD new matches, and to REMOVE half-matched - * rows we do scan anyway */ - ftbw->docid[0]=HA_POS_ERROR; - ftbw->up->yweaks++; - continue; - } + if (ftbw->flags&FTB_FLAG_TRUNC) + /* special treatment for truncation operator :(( + 1. +trunc* and there're other (not +trunc*) words + | no need to search in the index, it can never ADD new rows + | to the result, and to remove half-matched rows we do scan anyway + 2. -trunc* + | same as 1. + 3. trunc* + | We have to index-search for this prefix. + | It may cause duplicates, as in the index (sorted by <word,docid>) + | <aaaa,row1> + | <aabb,row2> + | <aacc,row1> + | Searching for "aa*" will find row1 twice... + */ + if ( test(ftbw->flags&FTB_FLAG_NO) || /* 2 */ + (test(ftbw->flags&FTB_FLAG_YES) && /* 1 */ + ftbw->up->ythresh - ftbw->up->yweaks >1)) /* 1 */ + { + ftbw->docid[0]=HA_POS_ERROR; + ftbw->up->yweaks++; + continue; + } + else /* 3 */ + { + if (!is_tree_inited(& ftb->no_dupes)) + { + init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t), + _ftb_no_dupes_cmp,0,0,0); + } + } r=_mi_search(info, keyinfo, (uchar*) ftbw->word, ftbw->len, SEARCH_FIND | SEARCH_BIGGER, keyroot); if (!r) { - r= mi_compare_text(default_charset_info, + r= mi_compare_text(ftb->charset, info->lastkey + (ftbw->flags&FTB_FLAG_TRUNC), ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC), ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), - 0); + 0); } if (r) /* not found */ { @@ -241,14 +274,18 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query, ftb->state=UNINITIALIZED; ftb->info=info; ftb->keynr=keynr; + ftb->charset= ((keynr==NO_SUCH_KEY) ? + default_charset_info : + info->s->keyinfo[keynr].seg->charset); ftb->with_scan=0; + bzero(& ftb->no_dupes, sizeof(TREE)); init_alloc_root(&ftb->mem_root, 1024, 1024); /* hack: instead of init_queue, we'll use reinit queue to be able * to alloc queue with alloc_root() */ - res=ftb->queue.max_elements=query_len/(ft_min_word_len+1); + res=ftb->queue.max_elements=1+query_len/(ft_min_word_len+1); ftb->queue.root=(byte **)alloc_root(&ftb->mem_root, (res+1)*sizeof(void*)); reinit_queue(& ftb->queue, res, 0, 0, (int (*)(void*,byte*,byte*))FTB_WORD_cmp, ftb); @@ -256,6 +293,7 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query, ftbe->weight=1; ftbe->flags=FTB_FLAG_YES; ftbe->nos=1; + ftbe->quot=0; ftbe->up=0; ftbe->ythresh=ftbe->yweaks=0; ftbe->docid[0]=ftbe->docid[1]=HA_POS_ERROR; @@ -263,19 +301,44 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, byte *query, _ftb_parse_query(ftb, &query, query+query_len, ftbe, 0); ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root, sizeof(FTB_WORD *)*ftb->queue.elements); - memcpy(ftb->list, ftb->queue.root, sizeof(FTB_WORD *)*ftb->queue.elements); + memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements); qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *), - (qsort2_cmp)FTB_WORD_cmp_list, 0); - if (ftb->queue.elements<2) ftb->with_scan=0; + (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset); + if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC; ftb->state=READY; return ftb; } -void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode) +/* returns 1 if str0 contain str1 */ +int _ftb_strstr(const byte *s0, const byte *e0, + const byte *s1, const byte *e1, + CHARSET_INFO *cs) { + const byte *p; + + while (s0 < e0) + { + while (s0 < e0 && cs->to_upper[(uint) (uchar) *s0++] != + cs->to_upper[(uint) (uchar) *s1]) + /* no-op */; + if (s0 >= e0) + return 0; + p=s1+1; + while (s0 < e0 && p < e1 && cs->to_upper[(uint) (uchar) *s0++] == + cs->to_upper[(uint) (uchar) *p++]) + /* no-op */; + if (p >= e1) + return 1; + } + return 0; +} + +void _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig) +{ + FT_SEG_ITERATOR ftsi; FTB_EXPR *ftbe; float weight=ftbw->weight; - int yn=ftbw->flags, ythresh; + int yn=ftbw->flags, ythresh, mode=(ftsi_orig != 0); my_off_t curdoc=ftbw->docid[mode]; for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up) @@ -291,11 +354,26 @@ void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode) break; if (yn & FTB_FLAG_YES) { - ftbe->cur_weight+=weight; + weight /= ftbe->ythresh; + ftbe->cur_weight += weight; if (++ftbe->yesses == ythresh) { yn=ftbe->flags; weight=ftbe->cur_weight*ftbe->weight; + if (mode && ftbe->quot) + { + int not_found=1; + + memcpy(&ftsi, ftsi_orig, sizeof(ftsi)); + while (_mi_ft_segiterator(&ftsi) && not_found) + { + if (!ftsi.pos) + continue; + not_found = ! _ftb_strstr(ftsi.pos, ftsi.pos+ftsi.len, + ftbe->quot, ftbe->qend, ftb->charset); + } + if (not_found) break; + } /* ftbe->quot */ } else break; @@ -315,7 +393,8 @@ void _ftb_climb_the_tree(FTB_WORD *ftbw, uint mode) } else { - ftbe->cur_weight+=weight; + if (ftbe->ythresh) weight/=3; + ftbe->cur_weight += weight; if (ftbe->yesses < ythresh) break; yn= (ftbe->yesses++ == ythresh) ? ftbe->flags : 0 ; @@ -352,18 +431,18 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record) { while (curdoc==(ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0]) { - _ftb_climb_the_tree(ftbw,0); + _ftb_climb_the_tree(ftb, ftbw, 0); /* update queue */ r=_mi_search(info, keyinfo, (uchar*) ftbw->word, USE_WHOLE_KEY, SEARCH_BIGGER , keyroot); if (!r) { - r= mi_compare_text(default_charset_info, + r= mi_compare_text(ftb->charset, info->lastkey + (ftbw->flags&FTB_FLAG_TRUNC), ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), ftbw->word + (ftbw->flags&FTB_FLAG_TRUNC), - ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), + ftbw->len - (ftbw->flags&FTB_FLAG_TRUNC), 0); } if (r) /* not found */ @@ -388,6 +467,11 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record) ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos) { /* curdoc matched ! */ + if (is_tree_inited(& ftb->no_dupes) && + tree_insert(& ftb->no_dupes, &curdoc, 0)->count >1) + /* but it managed to get past this line once */ + continue; + info->lastpos=curdoc; info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); /* why is this ? */ @@ -410,7 +494,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) FT_WORD word; FTB_WORD *ftbw; FTB_EXPR *ftbe; - FT_SEG_ITERATOR ftsi; + FT_SEG_ITERATOR ftsi, ftsi2; const byte *end; my_off_t docid=ftb->info->lastpos; @@ -419,17 +503,11 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) if (!ftb->queue.elements) return 0; -#if NOT_USED - if (ftb->state == READY || ftb->state == INDEX_DONE) - ftb->state=SCAN; - else if (ftb->state != SCAN) - return -3.0; -#endif - if (ftb->keynr==NO_SUCH_KEY) _mi_ft_segiterator_dummy_init(record, length, &ftsi); else _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi); + memcpy(&ftsi2, &ftsi, sizeof(ftsi)); while (_mi_ft_segiterator(&ftsi)) { @@ -437,15 +515,15 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) continue; end=ftsi.pos+ftsi.len; - while (ft_simple_get_word((byte **)&ftsi.pos,(byte *)end,&word)) + while (ft_simple_get_word((byte **) &ftsi.pos,(byte *) end, &word)) { int a, b, c; for (a=0, b=ftb->queue.elements, c=(a+b)/2; b-a>1; c=(a+b)/2) { ftbw=(FTB_WORD *)(ftb->list[c]); - if (mi_compare_text(default_charset_info, word.pos,word.len, - (uchar*) ftbw->word+1,ftbw->len-1, - (ftbw->flags&FTB_FLAG_TRUNC) ) >0) + if (mi_compare_text(ftb->charset, word.pos, word.len, + (uchar*) ftbw->word+1, ftbw->len-1, + (my_bool) (ftbw->flags&FTB_FLAG_TRUNC)) >0) b=c; else a=c; @@ -453,14 +531,14 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) for (; c>=0; c--) { ftbw=(FTB_WORD *)(ftb->list[c]); - if (mi_compare_text(default_charset_info, word.pos,word.len, - (uchar*) ftbw->word+1,ftbw->len-1, - (ftbw->flags&FTB_FLAG_TRUNC) )) + if (mi_compare_text(ftb->charset, word.pos,word.len, + (uchar*) ftbw->word+1,ftbw->len-1, + (my_bool) (ftbw->flags&FTB_FLAG_TRUNC))) break; if (ftbw->docid[1] == docid) continue; ftbw->docid[1]=docid; - _ftb_climb_the_tree(ftbw,1); + _ftb_climb_the_tree(ftb, ftbw, &ftsi2); } } } @@ -479,6 +557,10 @@ float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length) void ft_boolean_close_search(FT_INFO *ftb) { + if (is_tree_inited(& ftb->no_dupes)) + { + delete_tree(& ftb->no_dupes); + } free_root(& ftb->mem_root, MYF(0)); my_free((gptr)ftb,MYF(0)); } |