summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/Makefile.am2
-rw-r--r--include/my_trie.h142
-rw-r--r--mysys/Makefile.am2
-rw-r--r--mysys/trie.c237
4 files changed, 381 insertions, 2 deletions
diff --git a/include/Makefile.am b/include/Makefile.am
index 5f426843950..53b83bd7d88 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -28,7 +28,7 @@ noinst_HEADERS = config-win.h config-os2.h config-netware.h \
myisam.h myisampack.h myisammrg.h ft_global.h\
mysys_err.h my_base.h help_start.h help_end.h \
my_nosys.h my_alarm.h queues.h rijndael.h sha1.h \
- my_aes.h my_tree.h hash.h thr_alarm.h \
+ my_aes.h my_tree.h my_trie.h hash.h thr_alarm.h \
thr_lock.h t_ctype.h violite.h md5.h \
mysql_version.h.in my_handler.h my_time.h decimal.h
diff --git a/include/my_trie.h b/include/my_trie.h
new file mode 100644
index 00000000000..a8534cb11c1
--- /dev/null
+++ b/include/my_trie.h
@@ -0,0 +1,142 @@
+/* Copyright (C) 2005 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _trie_h
+#define _trie_h
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct st_trie_node
+{
+ uint16 leaf; /* Depth from root node if match, 0 else */
+ byte c; /* Label on this edge */
+ struct st_trie_node *next; /* Next label */
+ struct st_trie_node *links; /* Array of edges leaving this node */
+ struct st_trie_node *fail; /* AC failure function */
+} TRIE_NODE;
+
+typedef struct st_trie
+{
+ TRIE_NODE root;
+ MEM_ROOT mem_root;
+ CHARSET_INFO *charset;
+ uint32 nnodes;
+ uint32 nwords;
+} TRIE;
+
+typedef struct st_ac_trie_state
+{
+ TRIE *trie;
+ TRIE_NODE *node;
+} AC_TRIE_STATE;
+
+extern TRIE *trie_init (TRIE *trie, CHARSET_INFO *charset);
+extern void trie_free (TRIE *trie);
+extern my_bool trie_insert (TRIE *trie, const byte *key, uint keylen);
+extern my_bool ac_trie_prepare (TRIE *trie);
+extern void ac_trie_init (TRIE *trie, AC_TRIE_STATE *state);
+
+
+/* `trie_goto' is internal function and shouldn't be used. */
+
+static inline TRIE_NODE *trie_goto (TRIE_NODE *root, TRIE_NODE *node, byte c)
+{
+ TRIE_NODE *next;
+ DBUG_ENTER("trie_goto");
+ for (next= node->links; next; next= next->next)
+ if (next->c == c)
+ DBUG_RETURN(next);
+ if (root == node)
+ DBUG_RETURN(root);
+ DBUG_RETURN(NULL);
+}
+
+
+/*
+ SYNOPSIS
+ int ac_trie_next (AC_TRIE_STATE *state, byte *c);
+ state - valid pointer to `AC_TRIE_STATE'
+ c - character to lookup
+
+ DESCRIPTION
+ Implementation of search using Aho-Corasick automaton.
+ Performs char-by-char search.
+
+ RETURN VALUE
+ `ac_trie_next' returns length of matched word or 0.
+*/
+
+static inline int ac_trie_next (AC_TRIE_STATE *state, byte *c)
+{
+ TRIE_NODE *root, *node;
+ DBUG_ENTER("ac_trie_next");
+ DBUG_ASSERT(state && c);
+ root= &state->trie->root;
+ node= state->node;
+ while (! (state->node= trie_goto(root, node, *c)))
+ node= node->fail;
+ DBUG_RETURN(state->node->leaf);
+}
+
+
+/*
+ SYNOPSIS
+ my_bool trie_search (TRIE *trie, const byte *key, uint keylen);
+ trie - valid pointer to `TRIE'
+ key - valid pointer to key to insert
+ keylen - non-0 key length
+
+ DESCRIPTION
+ Performs key lookup in trie.
+
+ RETURN VALUE
+ `trie_search' returns `true' if key is in `trie'. Otherwise,
+ `false' is returned.
+
+ NOTES
+ Consecutive search here is "best by test". arrays are very short, so
+ binary search or hashing would add too much complexity that would
+ overweight speed gain. Especially because compiler can optimize simple
+ consecutive loop better (tested)
+*/
+
+static inline my_bool trie_search (TRIE *trie, const byte *key, uint keylen)
+{
+ TRIE_NODE *node;
+ uint k;
+ DBUG_ENTER("trie_search");
+ DBUG_ASSERT(trie && key && keylen);
+ node= &trie->root;
+
+ for (k= 0; k < keylen; k++)
+ {
+ byte p;
+ if (! (node= node->links))
+ DBUG_RETURN(FALSE);
+ p= key[k];
+ while (p != node->c)
+ if (! (node= node->next))
+ DBUG_RETURN(FALSE);
+ }
+
+ DBUG_RETURN(node->leaf > 0);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mysys/Makefile.am b/mysys/Makefile.am
index 3031edcc77d..f61c95f523a 100644
--- a/mysys/Makefile.am
+++ b/mysys/Makefile.am
@@ -42,7 +42,7 @@ libmysys_a_SOURCES = my_init.c my_getwd.c mf_getdate.c my_mmap.c \
mf_wcomp.c mf_wfile.c my_gethwaddr.c \
mf_qsort.c mf_qsort2.c mf_sort.c \
ptr_cmp.c mf_radix.c queues.c \
- tree.c list.c hash.c array.c string.c typelib.c \
+ tree.c trie.c list.c hash.c array.c string.c typelib.c \
my_copy.c my_append.c my_lib.c \
my_delete.c my_rename.c my_redel.c my_tempnam.c \
my_chsize.c my_lread.c my_lwrite.c my_clock.c \
diff --git a/mysys/trie.c b/mysys/trie.c
new file mode 100644
index 00000000000..1f638f8f732
--- /dev/null
+++ b/mysys/trie.c
@@ -0,0 +1,237 @@
+/* Copyright (C) 2005 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ Implementation of trie and Aho-Corasick automaton.
+ Supports only charsets that can be compared byte-wise.
+
+ TODO:
+ Add character frequencies. Can increase lookup speed
+ up to 30%.
+ Implement character-wise comparision.
+*/
+
+
+#include "mysys_priv.h"
+#include <m_string.h>
+#include <my_trie.h>
+#include <my_base.h>
+
+
+/*
+ SYNOPSIS
+ TRIE *trie_init (TRIE *trie, CHARSET_INFO *charset);
+
+ DESCRIPTION
+ Allocates or initializes a `TRIE' object. If `trie' is a `NULL'
+ pointer, the function allocates, initializes, and returns a new
+ object. Otherwise, the object is initialized and the address of
+ the object is returned. If `trie_init()' allocates a new object,
+ it will be freed when `trie_free()' is called.
+
+ RETURN VALUE
+ An initialized `TRIE*' object. `NULL' if there was insufficient
+ memory to allocate a new object.
+*/
+
+TRIE *trie_init (TRIE *trie, CHARSET_INFO *charset)
+{
+ MEM_ROOT mem_root;
+ DBUG_ENTER("trie_init");
+ DBUG_ASSERT(charset);
+ init_alloc_root(&mem_root,
+ (sizeof(TRIE_NODE) * 128) + ALLOC_ROOT_MIN_BLOCK_SIZE,
+ sizeof(TRIE_NODE) * 128);
+ if (! trie)
+ {
+ if (! (trie= (TRIE *)alloc_root(&mem_root, sizeof(TRIE))))
+ {
+ free_root(&mem_root, MYF(0));
+ DBUG_RETURN(NULL);
+ }
+ }
+
+ memcpy(&trie->mem_root, &mem_root, sizeof(MEM_ROOT));
+ trie->root.leaf= 0;
+ trie->root.c= 0;
+ trie->root.next= NULL;
+ trie->root.links= NULL;
+ trie->root.fail= NULL;
+ trie->charset= charset;
+ trie->nnodes= 0;
+ trie->nwords= 0;
+ DBUG_RETURN(trie);
+}
+
+
+/*
+ SYNOPSIS
+ void trie_free (TRIE *trie);
+ trie - valid pointer to `TRIE'
+
+ DESCRIPTION
+ Frees the memory allocated for a `trie'.
+
+ RETURN VALUE
+ None.
+*/
+
+void trie_free (TRIE *trie)
+{
+ MEM_ROOT mem_root;
+ DBUG_ENTER("trie_free");
+ DBUG_ASSERT(trie);
+ memcpy(&mem_root, &trie->mem_root, sizeof(MEM_ROOT));
+ free_root(&mem_root, MYF(0));
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ SYNOPSIS
+ my_bool trie_insert (TRIE *trie, const byte *key, uint keylen);
+ trie - valid pointer to `TRIE'
+ key - valid pointer to key to insert
+ keylen - non-0 key length
+
+ DESCRIPTION
+ Inserts new key into trie.
+
+ RETURN VALUE
+ Upon successful completion, `trie_insert' returns `FALSE'. Otherwise
+ `TRUE' is returned.
+
+ NOTES
+ If this function fails you must assume `trie' is broken.
+ However it can be freed with trie_free().
+*/
+
+my_bool trie_insert (TRIE *trie, const byte *key, uint keylen)
+{
+ TRIE_NODE *node;
+ TRIE_NODE *next;
+ byte p;
+ uint k;
+ DBUG_ENTER("trie_insert");
+ DBUG_ASSERT(trie && key && keylen);
+ node= &trie->root;
+ trie->root.fail= NULL;
+ for (k= 0; k < keylen; k++)
+ {
+ p= key[k];
+ for (next= node->links; next; next= next->next)
+ if (next->c == p)
+ break;
+
+ if (! next)
+ {
+ TRIE_NODE *tmp= (TRIE_NODE *)alloc_root(&trie->mem_root,
+ sizeof(TRIE_NODE));
+ if (! tmp)
+ DBUG_RETURN(TRUE);
+ tmp->leaf= 0;
+ tmp->c= p;
+ tmp->links= tmp->fail= tmp->next= NULL;
+ trie->nnodes++;
+ if (! node->links)
+ {
+ node->links= tmp;
+ }
+ else
+ {
+ for (next= node->links; next->next; next= next->next) /* no-op */;
+ next->next= tmp;
+ }
+ node= tmp;
+ }
+ else
+ {
+ node= next;
+ }
+ }
+ node->leaf= keylen;
+ trie->nwords++;
+ DBUG_RETURN(FALSE);
+}
+
+
+/*
+ SYNOPSIS
+ my_bool trie_prepare (TRIE *trie);
+ trie - valid pointer to `TRIE'
+
+ DESCRIPTION
+ Constructs Aho-Corasick automaton.
+
+ RETURN VALUE
+ Upon successful completion, `trie_prepare' returns `FALSE'. Otherwise
+ `TRUE' is returned.
+*/
+
+my_bool ac_trie_prepare (TRIE *trie)
+{
+ TRIE_NODE **tmp_nodes;
+ TRIE_NODE *node;
+ uint32 fnode= 0;
+ uint32 lnode= 0;
+ DBUG_ENTER("trie_prepare");
+ DBUG_ASSERT(trie);
+
+ tmp_nodes= (TRIE_NODE **)my_malloc(trie->nnodes * sizeof(TRIE_NODE *), MYF(0));
+ if (! tmp_nodes)
+ DBUG_RETURN(TRUE);
+
+ trie->root.fail= &trie->root;
+ for (node= trie->root.links; node; node= node->next)
+ {
+ node->fail= &trie->root;
+ tmp_nodes[lnode++]= node;
+ }
+
+ while (fnode < lnode)
+ {
+ TRIE_NODE *current= (TRIE_NODE *)tmp_nodes[fnode++];
+ for (node= current->links; node; node= node->next)
+ {
+ TRIE_NODE *fail= current->fail;
+ tmp_nodes[lnode++]= node;
+ while (! (node->fail= trie_goto(&trie->root, fail, node->c)))
+ fail= fail->fail;
+ }
+ }
+ my_free((gptr)tmp_nodes, MYF(0));
+ DBUG_RETURN(FALSE);
+}
+
+
+/*
+ SYNOPSIS
+ void ac_trie_init (TRIE *trie, AC_TRIE_STATE *state);
+ trie - valid pointer to `TRIE'
+ state - value pointer to `AC_TRIE_STATE'
+
+ DESCRIPTION
+ Initializes `AC_TRIE_STATE' object.
+*/
+
+void ac_trie_init (TRIE *trie, AC_TRIE_STATE *state)
+{
+ DBUG_ENTER("ac_trie_init");
+ DBUG_ASSERT(trie && state);
+ state->trie= trie;
+ state->node= &trie->root;
+ DBUG_VOID_RETURN;
+}