diff options
Diffstat (limited to 'myisam/ftdefs.h')
-rw-r--r-- | myisam/ftdefs.h | 79 |
1 files changed, 69 insertions, 10 deletions
diff --git a/myisam/ftdefs.h b/myisam/ftdefs.h index d9b4ff6b44d..62fa4362e19 100644 --- a/myisam/ftdefs.h +++ b/myisam/ftdefs.h @@ -1,15 +1,15 @@ /* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB - + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -22,15 +22,18 @@ #include <m_ctype.h> #include <my_tree.h> -#define MIN_WORD_LEN 4 - #define HYPHEN_IS_DELIM #define HYPHEN_IS_CONCAT /* not used for now */ #define COMPILE_STOPWORDS_IN -/* Most of the formulae were shamelessly stolen from SMART distribution - ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z +/* Interested readers may consult SMART + (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z) + for an excellent implementation of the vector space model we use. + It also demonstrates the usage of different weighting techniques. + This code, though, is completely original and is not based on the + SMART code but was in some cases inspired by it. 
+ NORM_PIVOT was taken from the article A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization", ACM SIGIR'96, 21-29, 1996 @@ -75,13 +78,26 @@ extern ulong collstat; /* Mysterious, but w/o (double) GWS_IDF performs better :-o */ #define GWS_IDF log(aio->info->state->records/doc_cnt) #define GWS_IDF1 log((double)aio->info->state->records/doc_cnt) -#define GWS_PROB log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) +#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 ) #define GWS_FREQ (1.0/doc_cnt) #define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2) #define GWS_CUBIC pow(log((double)aio->info->state->records/doc_cnt),3) #define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records)) /*=================================================================*/ +/* Boolean search operators */ +#define FTB_YES (ft_boolean_syntax[0]) +#define FTB_EGAL (ft_boolean_syntax[1]) +#define FTB_NO (ft_boolean_syntax[2]) +#define FTB_INC (ft_boolean_syntax[3]) +#define FTB_DEC (ft_boolean_syntax[4]) +#define FTB_LBR (ft_boolean_syntax[5]) +#define FTB_RBR (ft_boolean_syntax[6]) +#define FTB_NEG (ft_boolean_syntax[7]) +#define FTB_TRUNC (ft_boolean_syntax[8]) +#define FTB_LQUOT (ft_boolean_syntax[10]) +#define FTB_RQUOT (ft_boolean_syntax[11]) + typedef struct st_ft_word { byte * pos; uint len; @@ -91,9 +107,52 @@ typedef struct st_ft_word { #endif /* EVAL_RUN */ } FT_WORD; +typedef struct st_ftb_param { + byte prev; + int yesno; + int plusminus; + bool pmsign; + bool trunc; + byte *quot; +} FTB_PARAM; + int is_stopword(char *word, uint len); uint _ft_make_key(MI_INFO *, uint , byte *, FT_WORD *, my_off_t); -TREE * ft_parse(TREE *, byte *, int); -FT_WORD * ft_linearize(MI_INFO *, uint, byte *, TREE *); +byte ft_get_word(byte **, byte *, FT_WORD *, FTB_PARAM *); +byte ft_simple_get_word(byte **, byte *, FT_WORD *); + +typedef struct _st_ft_seg_iterator { + uint num, 
len; + MI_KEYSEG *seg; + const byte *rec, *pos; +} FT_SEG_ITERATOR; + +void _mi_ft_segiterator_init(MI_INFO *, uint, const byte *, FT_SEG_ITERATOR *); +void _mi_ft_segiterator_dummy_init(const byte *, uint, FT_SEG_ITERATOR *); +uint _mi_ft_segiterator(FT_SEG_ITERATOR *); + +void ft_parse_init(TREE *, CHARSET_INFO *); +int ft_parse(TREE *, byte *, int); +FT_WORD * ft_linearize(TREE *); +FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, byte *, const byte *); +uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record); + +extern const struct _ft_vft _ft_vft_nlq; +FT_INFO *ft_init_nlq_search(MI_INFO *, uint, byte *, uint, my_bool); +int ft_nlq_read_next(FT_INFO *, char *); +float ft_nlq_find_relevance(FT_INFO *, byte *, uint); +void ft_nlq_close_search(FT_INFO *); +float ft_nlq_get_relevance(FT_INFO *); +my_off_t ft_nlq_get_docid(FT_INFO *); +void ft_nlq_reinit_search(FT_INFO *); + +extern const struct _ft_vft _ft_vft_boolean; +FT_INFO *ft_init_boolean_search(MI_INFO *, uint, byte *, uint, my_bool); +int ft_boolean_read_next(FT_INFO *, char *); +float ft_boolean_find_relevance(FT_INFO *, byte *, uint); +void ft_boolean_close_search(FT_INFO *); +float ft_boolean_get_relevance(FT_INFO *); +my_off_t ft_boolean_get_docid(FT_INFO *); +void ft_boolean_reinit_search(FT_INFO *); |