diff options
-rw-r--r-- | include/Makefile.am | 2 | ||||
-rw-r--r-- | include/my_trie.h | 142 | ||||
-rw-r--r-- | mysys/Makefile.am | 2 | ||||
-rw-r--r-- | mysys/trie.c | 237 |
4 files changed, 381 insertions, 2 deletions
diff --git a/include/Makefile.am b/include/Makefile.am index 5f426843950..53b83bd7d88 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -28,7 +28,7 @@ noinst_HEADERS = config-win.h config-os2.h config-netware.h \ myisam.h myisampack.h myisammrg.h ft_global.h\ mysys_err.h my_base.h help_start.h help_end.h \ my_nosys.h my_alarm.h queues.h rijndael.h sha1.h \ - my_aes.h my_tree.h hash.h thr_alarm.h \ + my_aes.h my_tree.h my_trie.h hash.h thr_alarm.h \ thr_lock.h t_ctype.h violite.h md5.h \ mysql_version.h.in my_handler.h my_time.h decimal.h diff --git a/include/my_trie.h b/include/my_trie.h new file mode 100644 index 00000000000..a8534cb11c1 --- /dev/null +++ b/include/my_trie.h @@ -0,0 +1,142 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef _trie_h +#define _trie_h +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct st_trie_node +{ + uint16 leaf; /* Depth from root node if match, 0 else */ + byte c; /* Label on this edge */ + struct st_trie_node *next; /* Next label */ + struct st_trie_node *links; /* Array of edges leaving this node */ + struct st_trie_node *fail; /* AC failure function */ +} TRIE_NODE; + +typedef struct st_trie +{ + TRIE_NODE root; + MEM_ROOT mem_root; + CHARSET_INFO *charset; + uint32 nnodes; + uint32 nwords; +} TRIE; + +typedef struct st_ac_trie_state +{ + TRIE *trie; + TRIE_NODE *node; +} AC_TRIE_STATE; + +extern TRIE *trie_init (TRIE *trie, CHARSET_INFO *charset); +extern void trie_free (TRIE *trie); +extern my_bool trie_insert (TRIE *trie, const byte *key, uint keylen); +extern my_bool ac_trie_prepare (TRIE *trie); +extern void ac_trie_init (TRIE *trie, AC_TRIE_STATE *state); + + +/* `trie_goto' is internal function and shouldn't be used. */ + +static inline TRIE_NODE *trie_goto (TRIE_NODE *root, TRIE_NODE *node, byte c) +{ + TRIE_NODE *next; + DBUG_ENTER("trie_goto"); + for (next= node->links; next; next= next->next) + if (next->c == c) + DBUG_RETURN(next); + if (root == node) + DBUG_RETURN(root); + DBUG_RETURN(NULL); +} + + +/* + SYNOPSIS + int ac_trie_next (AC_TRIE_STATE *state, byte *c); + state - valid pointer to `AC_TRIE_STATE' + c - character to lookup + + DESCRIPTION + Implementation of search using Aho-Corasick automaton. + Performs char-by-char search. + + RETURN VALUE + `ac_trie_next' returns length of matched word or 0. +*/ + +static inline int ac_trie_next (AC_TRIE_STATE *state, byte *c) +{ + TRIE_NODE *root, *node; + DBUG_ENTER("ac_trie_next"); + DBUG_ASSERT(state && c); + root= &state->trie->root; + node= state->node; + while (! (state->node= trie_goto(root, node, *c))) + node= node->fail; + DBUG_RETURN(state->node->leaf); +} + + +/* + SYNOPSIS + my_bool trie_search (TRIE *trie, const byte *key, uint keylen); + trie - valid pointer to `TRIE' + key - valid pointer to key to insert + keylen - non-0 key length + + DESCRIPTION + Performs key lookup in trie. + + RETURN VALUE + `trie_search' returns `true' if key is in `trie'. Otherwise, + `false' is returned. + + NOTES + Consecutive search here is "best by test". arrays are very short, so + binary search or hashing would add too much complexity that would + overweight speed gain. Especially because compiler can optimize simple + consecutive loop better (tested) +*/ + +static inline my_bool trie_search (TRIE *trie, const byte *key, uint keylen) +{ + TRIE_NODE *node; + uint k; + DBUG_ENTER("trie_search"); + DBUG_ASSERT(trie && key && keylen); + node= &trie->root; + + for (k= 0; k < keylen; k++) + { + byte p; + if (! (node= node->links)) + DBUG_RETURN(FALSE); + p= key[k]; + while (p != node->c) + if (! (node= node->next)) + DBUG_RETURN(FALSE); + } + + DBUG_RETURN(node->leaf > 0); +} + +#ifdef __cplusplus +} +#endif +#endif diff --git a/mysys/Makefile.am b/mysys/Makefile.am index 3031edcc77d..f61c95f523a 100644 --- a/mysys/Makefile.am +++ b/mysys/Makefile.am @@ -42,7 +42,7 @@ libmysys_a_SOURCES = my_init.c my_getwd.c mf_getdate.c my_mmap.c \ mf_wcomp.c mf_wfile.c my_gethwaddr.c \ mf_qsort.c mf_qsort2.c mf_sort.c \ ptr_cmp.c mf_radix.c queues.c \ - tree.c list.c hash.c array.c string.c typelib.c \ + tree.c trie.c list.c hash.c array.c string.c typelib.c \ my_copy.c my_append.c my_lib.c \ my_delete.c my_rename.c my_redel.c my_tempnam.c \ my_chsize.c my_lread.c my_lwrite.c my_clock.c \ diff --git a/mysys/trie.c b/mysys/trie.c new file mode 100644 index 00000000000..1f638f8f732 --- /dev/null +++ b/mysys/trie.c @@ -0,0 +1,237 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Implementation of trie and Aho-Corasick automaton. + Supports only charsets that can be compared byte-wise. + + TODO: + Add character frequencies. Can increase lookup speed + up to 30%. + Implement character-wise comparision. +*/ + + +#include "mysys_priv.h" +#include <m_string.h> +#include <my_trie.h> +#include <my_base.h> + + +/* + SYNOPSIS + TRIE *trie_init (TRIE *trie, CHARSET_INFO *charset); + + DESCRIPTION + Allocates or initializes a `TRIE' object. If `trie' is a `NULL' + pointer, the function allocates, initializes, and returns a new + object. Otherwise, the object is initialized and the address of + the object is returned. If `trie_init()' allocates a new object, + it will be freed when `trie_free()' is called. + + RETURN VALUE + An initialized `TRIE*' object. `NULL' if there was insufficient + memory to allocate a new object. +*/ + +TRIE *trie_init (TRIE *trie, CHARSET_INFO *charset) +{ + MEM_ROOT mem_root; + DBUG_ENTER("trie_init"); + DBUG_ASSERT(charset); + init_alloc_root(&mem_root, + (sizeof(TRIE_NODE) * 128) + ALLOC_ROOT_MIN_BLOCK_SIZE, + sizeof(TRIE_NODE) * 128); + if (! trie) + { + if (! (trie= (TRIE *)alloc_root(&mem_root, sizeof(TRIE)))) + { + free_root(&mem_root, MYF(0)); + DBUG_RETURN(NULL); + } + } + + memcpy(&trie->mem_root, &mem_root, sizeof(MEM_ROOT)); + trie->root.leaf= 0; + trie->root.c= 0; + trie->root.next= NULL; + trie->root.links= NULL; + trie->root.fail= NULL; + trie->charset= charset; + trie->nnodes= 0; + trie->nwords= 0; + DBUG_RETURN(trie); +} + + +/* + SYNOPSIS + void trie_free (TRIE *trie); + trie - valid pointer to `TRIE' + + DESCRIPTION + Frees the memory allocated for a `trie'. + + RETURN VALUE + None. +*/ + +void trie_free (TRIE *trie) +{ + MEM_ROOT mem_root; + DBUG_ENTER("trie_free"); + DBUG_ASSERT(trie); + memcpy(&mem_root, &trie->mem_root, sizeof(MEM_ROOT)); + free_root(&mem_root, MYF(0)); + DBUG_VOID_RETURN; +} + + +/* + SYNOPSIS + my_bool trie_insert (TRIE *trie, const byte *key, uint keylen); + trie - valid pointer to `TRIE' + key - valid pointer to key to insert + keylen - non-0 key length + + DESCRIPTION + Inserts new key into trie. + + RETURN VALUE + Upon successful completion, `trie_insert' returns `FALSE'. Otherwise + `TRUE' is returned. + + NOTES + If this function fails you must assume `trie' is broken. + However it can be freed with trie_free(). +*/ + +my_bool trie_insert (TRIE *trie, const byte *key, uint keylen) +{ + TRIE_NODE *node; + TRIE_NODE *next; + byte p; + uint k; + DBUG_ENTER("trie_insert"); + DBUG_ASSERT(trie && key && keylen); + node= &trie->root; + trie->root.fail= NULL; + for (k= 0; k < keylen; k++) + { + p= key[k]; + for (next= node->links; next; next= next->next) + if (next->c == p) + break; + + if (! next) + { + TRIE_NODE *tmp= (TRIE_NODE *)alloc_root(&trie->mem_root, + sizeof(TRIE_NODE)); + if (! tmp) + DBUG_RETURN(TRUE); + tmp->leaf= 0; + tmp->c= p; + tmp->links= tmp->fail= tmp->next= NULL; + trie->nnodes++; + if (! node->links) + { + node->links= tmp; + } + else + { + for (next= node->links; next->next; next= next->next) /* no-op */; + next->next= tmp; + } + node= tmp; + } + else + { + node= next; + } + } + node->leaf= keylen; + trie->nwords++; + DBUG_RETURN(FALSE); +} + + +/* + SYNOPSIS + my_bool trie_prepare (TRIE *trie); + trie - valid pointer to `TRIE' + + DESCRIPTION + Constructs Aho-Corasick automaton. + + RETURN VALUE + Upon successful completion, `trie_prepare' returns `FALSE'. Otherwise + `TRUE' is returned. +*/ + +my_bool ac_trie_prepare (TRIE *trie) +{ + TRIE_NODE **tmp_nodes; + TRIE_NODE *node; + uint32 fnode= 0; + uint32 lnode= 0; + DBUG_ENTER("trie_prepare"); + DBUG_ASSERT(trie); + + tmp_nodes= (TRIE_NODE **)my_malloc(trie->nnodes * sizeof(TRIE_NODE *), MYF(0)); + if (! tmp_nodes) + DBUG_RETURN(TRUE); + + trie->root.fail= &trie->root; + for (node= trie->root.links; node; node= node->next) + { + node->fail= &trie->root; + tmp_nodes[lnode++]= node; + } + + while (fnode < lnode) + { + TRIE_NODE *current= (TRIE_NODE *)tmp_nodes[fnode++]; + for (node= current->links; node; node= node->next) + { + TRIE_NODE *fail= current->fail; + tmp_nodes[lnode++]= node; + while (! (node->fail= trie_goto(&trie->root, fail, node->c))) + fail= fail->fail; + } + } + my_free((gptr)tmp_nodes, MYF(0)); + DBUG_RETURN(FALSE); +} + + +/* + SYNOPSIS + void ac_trie_init (TRIE *trie, AC_TRIE_STATE *state); + trie - valid pointer to `TRIE' + state - value pointer to `AC_TRIE_STATE' + + DESCRIPTION + Initializes `AC_TRIE_STATE' object. +*/ + +void ac_trie_init (TRIE *trie, AC_TRIE_STATE *state) +{ + DBUG_ENTER("ac_trie_init"); + DBUG_ASSERT(trie && state); + state->trie= trie; + state->node= &trie->root; + DBUG_VOID_RETURN; +} |