summaryrefslogtreecommitdiff
path: root/myisam/myisam_ftdump.c
diff options
context:
space:
mode:
Diffstat (limited to 'myisam/myisam_ftdump.c')
-rw-r--r--myisam/myisam_ftdump.c284
1 files changed, 284 insertions, 0 deletions
diff --git a/myisam/myisam_ftdump.c b/myisam/myisam_ftdump.c
new file mode 100644
index 00000000000..8ab6a7600b2
--- /dev/null
+++ b/myisam/myisam_ftdump.c
@@ -0,0 +1,284 @@
+/* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+ added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ftdefs.h"
+#include <my_getopt.h>
+
+/* Helpers defined after main(). usage() and complain() both end in exit(1). */
+static void usage();
+static void complain(int val);
+/* Per-option callback passed to handle_options() in main(). */
+static my_bool get_one_option(int, const struct my_option *, char *);
+
+/*
+  Mode flags, set to 1 by get_one_option() for -c, -s, -d and -l
+  respectively. main() additionally turns 'stats' on when no other mode
+  was requested.
+*/
+static int count=0, stats=0, dump=0, lstats=0;
+/* Bound to --verbose in my_long_options; main() clears it for -c and -d. */
+static my_bool verbose;
+/*
+  Only referenced by the "execute" option entry, which is compiled in
+  solely when DISABLED is defined; otherwise this stays NULL.
+*/
+static char *query=NULL;
+/* Histogram filled by main(): lengths[n] counts index entries of key length n. */
+static uint lengths[256];
+
+/* Size of the word buffers main() uses. */
+#define MAX_LEN (HA_FT_MAXBYTELEN+10)
+/* In verbose mode main() prints a progress counter every this many entries. */
+#define HOW_OFTEN_TO_WRITE 10000
+
+/*
+  Command line options understood by the tool, passed to handle_options(),
+  my_print_help() and my_print_variables().
+
+  Each entry starts with the long name, the short option character and the
+  help text. Only "verbose" (and "execute", when compiled in) supplies
+  variable pointers; the remaining options carry no storage here and are
+  acted upon in get_one_option() by their short option character.
+  NOTE(review): the meaning of the remaining numeric fields depends on the
+  layout of struct my_option in <my_getopt.h> - confirm there.
+*/
+static struct my_option my_long_options[] =
+{
+ {"dump", 'd', "Dump index (incl. data offsets and word weights).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"stats", 's', "Report global stats.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"verbose", 'v', "Be verbose.",
+ (gptr*) &verbose, (gptr*) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"count", 'c', "Calculate per-word stats (counts and global weights).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"length", 'l', "Report length distribution.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+/* The query option exists only in builds that define DISABLED. */
+#ifdef DISABLED
+ {"execute", 'e', "Execute given query.", (gptr*) &query, (gptr*) &query, 0,
+ GET_STR_ALLOC, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+ /* Both -h and -? are listed under the same long name "help". */
+ {"help", 'h', "Display help and exit.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"help", '?', "Synonym for -h.",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ /* All-zero entry closing the table. */
+ { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+/*
+  Dump or summarise one FULLTEXT index of a MyISAM table.
+
+  After handle_options() has consumed the options, argv[0] is opened as the
+  table and argv[1] is parsed as the index number. The index is then walked
+  with mi_rnext() and, depending on the mode flags, every entry is dumped
+  (-d), tallied per word (-c) and/or folded into the global (-s) and
+  length-distribution (-l) statistics.
+
+  RETURN
+    0   the index was processed up to HA_ERR_END_OF_FILE
+    1   the table could not be opened, the requested key is not a FULLTEXT
+        key, or the scan stopped on any other error
+  Bad command lines leave through usage()/exit() before any table is opened.
+*/
+int main(int argc,char *argv[])
+{
+  int error=0, subkeys, exit_status=0;
+  uint keylen, keylen2=0, inx, doc_cnt=0;
+  float weight= 1.0;
+  double gws, min_gws=0, avg_gws=0;
+  MI_INFO *info= NULL;
+  char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN];
+  ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0, cumulative;
+  /* 'aio' and 'doc_cnt' keep their names: GWS_IN_USE is a macro defined elsewhere */
+  struct { MI_INFO *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */
+
+  MY_INIT(argv[0]);
+  if ((error=handle_options(&argc, &argv, my_long_options, get_one_option)))
+    exit(error);
+  /* Per-entry output and the progress counter would interleave on stdout. */
+  if (count || dump)
+    verbose=0;
+  /* With no explicit mode, fall back to the global statistics report. */
+  if (!count && !dump && !lstats && !query)
+    stats=1;
+
+  if (verbose)
+    setbuf(stdout,NULL);
+
+  if (argc < 2)
+    usage();
+
+  {
+    char *end;
+    inx= strtoll(argv[1], &end, 10);
+    /* Reject trailing garbage and an argument that holds no digits at all. */
+    if (end == argv[1] || *end)
+      usage();
+  }
+
+  init_key_cache(dflt_key_cache,MI_KEY_BLOCK_LENGTH,USE_BUFFER_INIT, 0, 0);
+
+  if (!(info=mi_open(argv[0],2,HA_OPEN_ABORT_IF_LOCKED)))
+  {
+    error=my_errno;
+    goto err;
+  }
+
+  /*
+    Every word buffer starts out empty so that the reports below never
+    print indeterminate bytes, even for an index holding no words.
+  */
+  *buf= *buf2= *buf_maxlen= *buf_min_gws= 0;
+  aio->info=info;
+
+  if ((inx >= info->s->base.keys) ||
+      !(info->s->keyinfo[inx].flag & HA_FULLTEXT))
+  {
+    printf("Key %u in table %s is not a FULLTEXT key\n", inx, info->filename);
+    exit_status= 1;
+    goto err;
+  }
+
+  mi_lock_database(info, F_EXTRA_LCK);
+
+  if (query)
+  {
+#if 0
+    FT_DOCLIST *result;
+    int i;
+
+    ft_init_stopwords(ft_precompiled_stopwords);
+
+    result=ft_nlq_init_search(info,inx,query,strlen(query),1);
+    if(!result)
+      goto err;
+
+    if (verbose)
+      printf("%d rows matched\n",result->ndocs);
+
+    for(i=0 ; i<result->ndocs ; i++)
+      printf("%9lx %20.7f\n",(ulong)result->doc[i].dpos,result->doc[i].weight);
+
+    ft_nlq_close_search(result);
+#else
+    printf("-e option is disabled\n");
+#endif
+  }
+  else
+  {
+    info->lastpos= HA_OFFSET_ERROR;
+    info->update|= HA_STATE_PREV_FOUND;
+
+    for (;;)
+    {
+      int at_end;
+
+      error= mi_rnext(info,NULL,inx);
+      /* Any non-zero result (end of index or a real error) ends the scan. */
+      at_end= (error != 0);
+
+      if (!at_end)
+      {
+        /* The first byte of lastkey is the length of the word after it. */
+        keylen=*(info->lastkey);
+
+        subkeys=ft_sintXkorr(info->lastkey+keylen+1);
+        if (subkeys >= 0)
+        {
+          /*
+            A non-negative value is reinterpreted bit-for-bit as the float
+            weight. Going through a union instead of a pointer cast avoids
+            the aliasing violation (int and float are assumed to have the
+            same size, exactly as the cast did).
+          */
+          union { int i; float f; } pun;
+          pun.i= subkeys;
+          weight= pun.f;
+        }
+
+#ifdef HAVE_SNPRINTF
+        snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey+1);
+#else
+        sprintf(buf,"%.*s",(int) keylen,info->lastkey+1);
+#endif
+        my_casedn_str(default_charset_info,buf);
+        total++;
+        lengths[keylen]++;
+      }
+
+      if (count || stats)
+      {
+        /*
+          doc_cnt is bumped before the comparison. The first entry of a
+          word is therefore not counted (doc_cnt is reset right after it),
+          but the entry that terminates the word is, so at tally time
+          doc_cnt equals the number of entries of the word held in buf2.
+          The end of the scan acts as one more terminating entry, which is
+          what gets the last word of the index tallied as well.
+        */
+        doc_cnt++;
+        if (at_end || strcmp(buf, buf2))
+        {
+          if (*buf2)
+          {
+            uniq++;
+            avg_gws+=gws=GWS_IN_USE;
+            if (count)
+              printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
+            if (maxlen<keylen2)
+            {
+              maxlen=keylen2;
+              strmov(buf_maxlen, buf2);
+            }
+            if (max_doc_cnt < doc_cnt)
+            {
+              max_doc_cnt=doc_cnt;
+              strmov(buf_min_gws, buf2);
+              min_gws=gws;
+            }
+          }
+          if (!at_end)
+          {
+            /* Start collecting the new word. */
+            strmov(buf2, buf);
+            keylen2=keylen;
+            doc_cnt=0;
+          }
+        }
+      }
+
+      if (at_end)
+        break;
+
+      if (dump)
+      {
+        if (subkeys>=0)
+          printf("%9lx %20.7f %s\n", (ulong) info->lastpos,weight,buf);
+        else
+          printf("%9lx => %17d %s\n",(ulong) info->lastpos,-subkeys,buf);
+      }
+      if (verbose && (total%HOW_OFTEN_TO_WRITE)==0)
+        printf("%10lu\r",total);
+    }
+    mi_lock_database(info, F_UNLCK);
+
+    if (stats)
+    {
+      /*
+        Median length: the first length at which the running total reaches
+        half of all entries. A local accumulator is used so the 'count'
+        option flag is not overwritten and no signed overflow can occur.
+      */
+      cumulative=0;
+      for (inx=0;inx<256;inx++)
+      {
+        cumulative+=lengths[inx];
+        if (cumulative >= total/2)
+          break;
+      }
+      printf("Total rows: %lu\nTotal words: %lu\n"
+             "Unique words: %lu\nLongest word: %lu chars (%s)\n"
+             "Median length: %u\n"
+             "Average global weight: %f\n"
+             "Most common word: %lu times, weight: %f (%s)\n",
+             (ulong) info->state->records, total, uniq, maxlen, buf_maxlen,
+             inx, uniq ? avg_gws/uniq : 0.0, max_doc_cnt, min_gws,
+             buf_min_gws);
+    }
+    if (lstats)
+    {
+      cumulative=0;
+      for (inx=0; inx<256; inx++)
+      {
+        cumulative+=lengths[inx];
+        /* A non-empty bucket implies total > 0, so the divisions are safe. */
+        if (lengths[inx])
+          printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx,
+                 (ulong) lengths[inx],100.0*lengths[inx]/total,cumulative,
+                 100.0*cumulative/total);
+      }
+    }
+  }
+
+err:
+  if (error && error != HA_ERR_END_OF_FILE)
+  {
+    printf("got error %d\n",my_errno);
+    exit_status= 1;
+  }
+  if (info)
+    mi_close(info);
+  return exit_status;
+}
+
+
+/*
+  Callback given to handle_options(): records the mode selected by one
+  option and refuses combinations that exclude each other.
+
+  optid is compared against the short option characters; 'opt' and
+  'argument' are not used. -h and -? leave through usage(). Option
+  characters not listed here are ignored. Always returns 0.
+*/
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+               char *argument __attribute__((unused)))
+{
+  /* Non-zero once the current option clashes with one seen earlier. */
+  int conflict= 0;
+
+  if (optid == '?' || optid == 'h')
+    usage();
+  else if (optid == 'd')
+  {
+    dump= 1;
+    conflict= (count || query);
+  }
+  else if (optid == 's')
+  {
+    stats= 1;
+    conflict= (query != 0);
+  }
+  else if (optid == 'c')
+  {
+    count= 1;
+    conflict= (dump || query);
+  }
+  else if (optid == 'l')
+  {
+    lstats= 1;
+    conflict= (query != 0);
+  }
+  else if (optid == 'e')
+    conflict= (dump || count || stats);
+
+  /* complain() prints a message and exits when handed a non-zero value. */
+  complain(conflict);
+  return 0;
+}
+
+
+/*
+  Print the one-line synopsis followed by whatever my_print_help() and
+  my_print_variables() emit for my_long_options, then terminate the
+  process with exit status 1. Never returns.
+*/
+static void usage()
+{
+  fputs("Use: myisam_ftdump <table_name> <index_num>\n", stdout);
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+  exit(1);
+}
+
+
+/*
+  Kinda assert :-)  A zero 'val' is a no-op; any other value prints the
+  "cannot use these options together" message to stdout and terminates the
+  process with exit status 1.
+*/
+static void complain(int val)
+{
+  if (!val)
+    return;
+
+  fputs("You cannot use these options together!\n", stdout);
+  exit(1);
+}