diff options
author | Dom Lachowicz <domlachowicz@gmail.com> | 2006-05-06 16:02:16 +0000 |
---|---|---|
committer | Dom Lachowicz <domlachowicz@gmail.com> | 2006-05-06 16:02:16 +0000 |
commit | 81037f9142c4375cb868cd459d58508aed486656 (patch) | |
tree | ef30023e9c224e1945e8eb9049d06f5889514cad | |
parent | f0a2f193b8cb6299971964c19234a332083be16b (diff) | |
download | enchant-81037f9142c4375cb868cd459d58508aed486656.tar.gz |
bug 10212 - implement a personal word list capable of returning suggestions. thanks in large part to ryan kelly
git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@21132 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
-rw-r--r-- | ChangeLog | 9 | ||||
-rw-r--r-- | configure.in | 4 | ||||
-rw-r--r-- | src/Makefile.am | 2 | ||||
-rw-r--r-- | src/enchant.c | 179 | ||||
-rw-r--r-- | src/pwl.c | 705 | ||||
-rw-r--r-- | src/pwl.h | 57 | ||||
-rw-r--r-- | tests/test-enchant.c | 24 |
7 files changed, 854 insertions, 126 deletions
@@ -1,4 +1,11 @@ -2006-04-32 Dom Lachowicz <cinamod@hotmail.com> +2006-05-06 Dom Lachowicz <cinamod@hotmail.com> + + * configure.in: Bump version number to indicate an unstable series + * src/pwl.[ch]: New personal wordlist implementation, based on Trie. + Thanks to Ryan Kelly <ryan-bugs@rfk.id.au> (bug 10212). + * src/enchant.c: Ditto + +2006-04-23 Dom Lachowicz <cinamod@hotmail.com> * configure.in: --with-aspell-prefix didn't work diff --git a/configure.in b/configure.in index 3175e0e..5e14efc 100644 --- a/configure.in +++ b/configure.in @@ -7,11 +7,11 @@ dnl 4a) Increment when removing or changing interfaces. ENCHANT_MAJOR_VERSION=1 dnl 4a) 5) Increment when adding interfaces. dnl 6) Set to zero when removing or changing interfaces. -ENCHANT_MINOR_VERSION=2 +ENCHANT_MINOR_VERSION=3 dnl 3) Increment when interfaces not changed at all, dnl only bug fixes or internal changes made. dnl 4b) Set to zero when adding, removing or changing interfaces. -ENCHANT_MICRO_VERSION=6 +ENCHANT_MICRO_VERSION=0 dnl dnl Set this too MAJOR_VERSION_PLUS_MINOR_VERSION=`expr $ENCHANT_MAJOR_VERSION + $ENCHANT_MINOR_VERSION` diff --git a/src/Makefile.am b/src/Makefile.am index fcec507..0b4756e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -10,7 +10,7 @@ else libenchant_la_LIBADD= $(ENCHANT_LIBS) endif libenchant_la_LDFLAGS = -version-info $(VERSION_INFO) -no-undefined -libenchant_la_SOURCES = prefix.c enchant.c enchant.h prefix.h +libenchant_la_SOURCES = prefix.c enchant.c pwl.c enchant.h prefix.h pwl.h libenchant_includedir = $(includedir)/enchant/ libenchant_include_HEADERS = enchant.h enchant-provider.h enchant++.h diff --git a/src/enchant.c b/src/enchant.c index 47afb4c..158cabe 100644 --- a/src/enchant.c +++ b/src/enchant.c @@ -32,17 +32,13 @@ #include <stdlib.h> #include <string.h> -#if defined(HAVE_FLOCK) || defined(HAVE_LOCKF) -#include <unistd.h> -#include <sys/file.h> -#endif /* HAVE_FLOCK || HAVE_LOCKF */ - #include <glib.h> #include <gmodule.h> #include <locale.h> #include "enchant.h" #include "enchant-provider.h" +#include "pwl.h" #ifdef XP_TARGET_COCOA #import "enchant_cocoa.h" @@ -75,7 +71,7 @@ struct str_enchant_broker typedef struct str_enchant_session { GHashTable *session; - GHashTable *personal; + EnchantPWL *personal; char * personal_filename; char * language_tag; @@ -90,37 +86,9 @@ typedef struct str_enchant_session typedef EnchantProvider *(*EnchantProviderInitFunc) (void); typedef void (*EnchantPreConfigureFunc) (EnchantProvider * provider, const char * module_dir); -#ifndef BUFSIZ -#define BUFSIZ 1024 -#endif - /********************************************************************************/ /********************************************************************************/ -static void -enchant_lock_file (FILE * f) -{ -#if defined(HAVE_FLOCK) - flock (fileno (f), LOCK_EX); -#elif defined(HAVE_LOCKF) - lockf (fileno (f), F_LOCK, 0); -#else - /* TODO: win32, UNIX fcntl. This race condition probably isn't too bad. */ -#endif /* HAVE_FLOCK */ -} - -static void -enchant_unlock_file (FILE * f) -{ -#if defined(HAVE_FLOCK) - flock (fileno (f), LOCK_UN); -#elif defined(HAVE_LOCKF) - lockf (fileno (f), F_ULOCK, 0); -#else - /* TODO: win32, UNIX fcntl. This race condition probably isn't too bad. */ -#endif /* HAVE_FLOCK */ -} - static char * enchant_get_module_dir (void) { @@ -288,7 +256,7 @@ static void enchant_session_destroy (EnchantSession * session) { g_hash_table_destroy (session->session); - g_hash_table_destroy (session->personal); + enchant_pwl_free (session->personal); g_free (session->personal_filename); g_free (session->language_tag); @@ -303,48 +271,25 @@ enchant_session_new_with_pwl (EnchantProvider * provider, const char * const pwl gboolean fail_if_no_pwl) { EnchantSession * session; - FILE * f; - char line[BUFSIZ]; + EnchantPWL *personal = NULL; + + if (pwl) + personal = enchant_pwl_init_with_file (pwl); + + if (personal == NULL) { + if (fail_if_no_pwl) + return NULL; + else + personal = enchant_pwl_init (); + } + session = g_new0 (EnchantSession, 1); session->session = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); - session->personal = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); + session->personal = personal; session->provider = provider; session->language_tag = g_strdup (lang); - - if (pwl) - { - session->personal_filename = g_strdup (pwl); - - /* populate personal filename */ - f = fopen (pwl, "r"); - if (f) - { - enchant_lock_file (f); - - while (NULL != (fgets (line, sizeof (line), f))) - { - size_t l = strlen(line)-1; - if (line[l]=='\n') - line[l] = '\0'; - - g_hash_table_insert (session->personal, g_strdup (line), GINT_TO_POINTER(TRUE)); - } - - enchant_unlock_file (f); - fclose (f); - } - else if (fail_if_no_pwl) - { - enchant_session_destroy (session); - return NULL; - } - } - else if (fail_if_no_pwl) - { - enchant_session_destroy (session); - return NULL; - } + session->personal_filename = g_strdup (pwl); return session; } @@ -384,23 +329,7 @@ enchant_session_add (EnchantSession * session, const char * const word, size_t l static void enchant_session_add_personal (EnchantSession * session, const char * const word, size_t len) { - FILE * f; - - if (session->personal_filename) - { - f = fopen (session->personal_filename, "a"); - - if (f) - { - enchant_lock_file (f); - - fwrite (word, sizeof(char), len, f); - fwrite ("\n", sizeof(char), 1, f); - fclose (f); - - enchant_unlock_file (f); - } - } + enchant_pwl_add(session->personal, word, len); } static gboolean @@ -411,7 +340,7 @@ enchant_session_contains (EnchantSession * session, const char * const word, siz char * utf = g_strndup (word, len); if (g_hash_table_lookup (session->session, utf) || - g_hash_table_lookup (session->personal, utf)) + enchant_pwl_check (session->personal, utf, len) == 0) result = TRUE; g_free (utf); @@ -525,6 +454,11 @@ enchant_dict_check (EnchantDict * dict, const char *const word, ssize_t len) if (enchant_session_contains (session, word, len)) return 0; + if (session->personal) { + if (enchant_pwl_check (session->personal, word, len) == 0) + return 0; + } + if (dict->check) return (*dict->check) (dict, word, len); else if (session->is_pwl) @@ -549,41 +483,64 @@ ENCHANT_MODULE_EXPORT (char **) enchant_dict_suggest (EnchantDict * dict, const char *const word, ssize_t len, size_t * out_n_suggs) { - size_t n_suggs; - char ** suggs; + EnchantSession * session; + size_t n_suggs = 0, n_dict_suggs = 0, n_pwl_suggs = 0; + char **suggs, **dict_suggs = NULL, **pwl_suggs = NULL; g_return_val_if_fail (dict, NULL); g_return_val_if_fail (word, NULL); + session = (EnchantSession*)dict->enchant_private_data; + if (len < 0) len = strlen (word); - + + /* Check for suggestions from personal dictionary */ + if(session->personal) + pwl_suggs = enchant_pwl_suggest(session->personal, word, len, &n_pwl_suggs); + + /* Check for suggestions from provider dictionary */ if (dict->suggest) { - char ** tmp_suggs; - - tmp_suggs = (*dict->suggest) (dict, word, len, &n_suggs); + dict_suggs = (*dict->suggest) (dict, word, len, + &n_dict_suggs); + } + + /* Clone suggestions if there are any */ + n_suggs = n_pwl_suggs + n_dict_suggs; + if (n_suggs > 0) + { + size_t i, j, k; - /* clone the suggestion array */ - if (tmp_suggs) - { - size_t i; - - suggs = g_new0 (char *, n_suggs + 1); - for (i = 0; i < n_suggs; i++) - suggs[i] = g_strdup (tmp_suggs[i]); - - enchant_dict_free_string_list_impl (dict, tmp_suggs); - } - else - { - suggs = NULL; + suggs = g_new0 (char *, n_suggs + 1); + + /* Copy over suggestions from dict */ + for(i = 0; i < n_dict_suggs; i++) + suggs[i] = g_strdup (dict_suggs[i]); + if(dict_suggs) + enchant_dict_free_string_list_impl (dict, dict_suggs); + + /* Copy over suggestions from pwl, except dupes */ + for(j = 0; j < n_pwl_suggs; j++) { + int dupe = 0; + for(k = 0; k < n_dict_suggs; k++) { + if(strcmp(suggs[k],pwl_suggs[j])==0) { + dupe = 1; + break; + } } + if(!dupe) { + suggs[i] = g_strdup (pwl_suggs[j]); + i++; + } + } + + if(pwl_suggs) + enchant_pwl_free_string_list(session->personal,pwl_suggs); } else { suggs = NULL; - n_suggs = 0; } if (out_n_suggs) diff --git a/src/pwl.c b/src/pwl.c new file mode 100644 index 0000000..73299c7 --- /dev/null +++ b/src/pwl.c @@ -0,0 +1,705 @@ +/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* enchant + * Copyright (C) 2003, 2004 Dom Lachowicz + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * In addition, as a special exception, Dom Lachowicz + * gives permission to link the code of this program with + * non-LGPL Spelling Provider libraries (eg: a MSFT Office + * spell checker backend) and distribute linked combinations including + * the two. You must obey the GNU Lesser General Public License in all + * respects for all of the code used other than said providers. If you modify + * this file, you may extend this exception to your version of the + * file, but you are not obligated to do so. If you do not wish to + * do so, delete this exception statement from your version. + */ + +/** + * + * This file implements personal word list (PWL) dictionaries in the + * type EnchantPWL. + * + * Under the hood, a PWL is stored as a Trie. Checking strings for + * correctness and making suggestions is done by traversing the Trie + * while allowing a certain number of miss-steps. Due to the prefix + * compression of the Trie, this allows all strings in the PWL within + * a given edit distance of the target word to be enumerated quite + * efficiently. + * + * Ideas for the future: + * + * - Order the suggestions first by edit distance, then by + * soundex key, to put the most similar sounding suggestions + * at the front of the list. Would need a "soundex" that is + * general enough to handle languages other than English. + * + * - iterative deepending to find suggestions, rather than a straight + * search to depth three. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <glib.h> + +#include "pwl.h" + +#if defined(HAVE_FLOCK) || defined(HAVE_LOCKF) +#include <unistd.h> +#include <sys/file.h> +#endif /* HAVE_FLOCK || HAVE_LOCKF */ + +#define ENCHANT_PWL_MAX_ERRORS 3 +#define ENCHANT_PWL_MAX_SUGGS 15 + +/* A PWL dictionary is stored as a Trie-like data structure EnchantTrie. + * The EnchantTrie datatype is completely recursive - all child nodes + * are simply EnchantTrie pointers. This means that all functions + * that potentially modify a trie need to return the modified trie, + * as additional memory may have been allocated. + * + * The empty trie is simply the null pointer. If a trie contains + * a single string, it is recorded in the "value" attribute and + * "subtries" is set to NULL. When two or more strings are contained, + * "value" is NULL and "subtries" is a GHashTable mapping the first + * character of each string to the subtrie containing the remainder of + * that string. + * + * All strings stored in the Trie are assumed to be in UTF format. + * Branching is done on unicode characters, not individual bytes. + */ +typedef struct str_enchant_trie EnchantTrie; +struct str_enchant_trie +{ + char* value; /* final string found under this node */ + GHashTable* subtries; /* Maps characters to subtries */ +}; + +struct str_enchant_pwl +{ + EnchantTrie* trie; + char * filename; + GHashTable *words_in_trie; +}; + +/* Special Trie node indicating the end of a string */ +static EnchantTrie n_EOSTrie; +static EnchantTrie* EOSTrie = &n_EOSTrie; + +/* The EnchantTrieMatcher structure maintains the state necessary to + * search for matching strings within an EnchantTrie. It includes a + * callback function which will be called with each matching string + * as it is found. The arguments to this function are: + * + * - a freshly-allocated copy of the matching string, which must + * be freed by external code + * - the EnchantTrieMatcher object, giving the context of the match + * (e.g. number of errors) + */ +typedef struct str_enchant_trie_matcher EnchantTrieMatcher; +struct str_enchant_trie_matcher +{ + int num_errors; /* Num errors encountered so far. */ + int max_errors; /* Max errors before search should terminate */ + + const char* word; /* Word being searched for */ + ssize_t word_pos; /* Current position in the word */ + + char* path; /* Path taken through the trie so far */ + ssize_t path_len; /* Length of allocated path string */ + ssize_t path_pos; /* Current end pos of path string */ + + void (*cbfunc)(char*,EnchantTrieMatcher*); /* callback func */ + void* cbdata; /* Private data for use by callback func */ +}; + + +/* To allow the list of suggestions to be built up an item at a time, + * its state is maintained in an EnchantSuggList object. + */ +typedef struct str_enchant_sugg_list +{ + char** suggs; + int* sugg_errs; + size_t n_suggs; +} EnchantSuggList; + + +/* + * Function Prototypes + */ + +static void enchant_pwl_add_to_trie(EnchantPWL *pwl, + const char *const word, size_t len, + gboolean add_to_file); + +static void enchant_pwl_check_cb(char* match,EnchantTrieMatcher* matcher); +static void enchant_pwl_suggest_cb(char* match,EnchantTrieMatcher* matcher); +static EnchantTrie* enchant_trie_init(void); +static void enchant_trie_free(EnchantTrie* trie); +static void enchant_trie_free_cb(void*,void*,void*); +static EnchantTrie* enchant_trie_insert(EnchantTrie* trie,const char *const word); +static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher); +static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV); +static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word, + int maxerrs, + void(*cbfunc)(char*,EnchantTrieMatcher*), + void* cbdata); +static void enchant_trie_matcher_free(EnchantTrieMatcher* matcher); +static void enchant_trie_matcher_pushpath(EnchantTrieMatcher* matcher,char* newchars); +static void enchant_trie_matcher_poppath(EnchantTrieMatcher* matcher,int num); + +static int edit_dist(const char* word1, const char* word2); +static char* soundex(const char* word); + +static void +enchant_lock_file (FILE * f) +{ +#if defined(HAVE_FLOCK) + flock (fileno (f), LOCK_EX); +#elif defined(HAVE_LOCKF) + lockf (fileno (f), F_LOCK, 0); +#else + /* TODO: win32, UNIX fcntl. This race condition probably isn't too bad. */ +#endif /* HAVE_FLOCK */ +} + +static void +enchant_unlock_file (FILE * f) +{ +#if defined(HAVE_FLOCK) + flock (fileno (f), LOCK_UN); +#elif defined(HAVE_LOCKF) + lockf (fileno (f), F_ULOCK, 0); +#else + /* TODO: win32, UNIX fcntl. This race condition probably isn't too bad. */ +#endif /* HAVE_FLOCK */ +} + +/** + * enchant_pwl_init + * + * Returns: a new PWL object used to store/check/suggest words. + */ +EnchantPWL* enchant_pwl_init(void) +{ + EnchantPWL *pwl; + + pwl = g_new0(EnchantPWL, 1); + pwl->words_in_trie = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); + + return pwl; +} + +#ifndef BUFSIZ +#define BUFSIZ 1024 +#endif + +/** + * enchant_pwl_init + * + * Returns: a new PWL object used to store/check/suggest words. + */ +EnchantPWL* enchant_pwl_init_with_file(const char * file) +{ + FILE *f; + + f = fopen (file, "r"); + if (f) + { + EnchantPWL *pwl; + char line[BUFSIZ]; + + pwl = enchant_pwl_init(); + pwl->filename = g_strdup(file); + + enchant_lock_file (f); + + while (NULL != (fgets (line, sizeof (line), f))) + { + size_t l = strlen(line)-1; + if (line[l]=='\n') + line[l] = '\0'; + + enchant_pwl_add_to_trie(pwl, line, strlen(line), FALSE); + } + + enchant_unlock_file (f); + fclose (f); + + return pwl; + } + + return NULL; +} + +void enchant_pwl_free(EnchantPWL *pwl) +{ + enchant_trie_free(pwl->trie); + g_free(pwl->filename); + g_hash_table_destroy (pwl->words_in_trie); + g_free(pwl); +} + +static void enchant_pwl_add_to_trie(EnchantPWL *pwl, + const char *const word, size_t len, + gboolean add_to_file) +{ + char * case_folded_word; + + case_folded_word = g_utf8_casefold (word, len); + if(NULL != g_hash_table_lookup (pwl->words_in_trie, case_folded_word)) { + g_free (case_folded_word); + return; + } + + g_hash_table_insert (pwl->words_in_trie, case_folded_word, GINT_TO_POINTER(1)); + + pwl->trie = enchant_trie_insert(pwl->trie, case_folded_word); + + if (add_to_file && (pwl->filename != NULL)) + { + FILE *f; + + f = fopen(pwl->filename, "a"); + if (f) + { + enchant_lock_file (f); + fwrite (word, sizeof(char), len, f); + fwrite ("\n", sizeof(char), 1, f); + enchant_unlock_file (f); + fclose (f); + } + } +} + +void enchant_pwl_add(EnchantPWL *pwl, + const char *const word, size_t len) +{ + enchant_pwl_add_to_trie(pwl, word, len, TRUE); +} + +int enchant_pwl_check(EnchantPWL *pwl, const char *const word, size_t len) +{ + EnchantTrieMatcher* matcher; + char * case_folded_word; + int count = 0; + + case_folded_word = g_utf8_casefold (word, len); + + matcher = enchant_trie_matcher_init(case_folded_word,0,enchant_pwl_check_cb, + &count); + enchant_trie_find_matches(pwl->trie,matcher); + enchant_trie_matcher_free(matcher); + g_free(case_folded_word); + + return (count != 0 ? 0 : 1); +} + +static void enchant_pwl_check_cb(char* match,EnchantTrieMatcher* matcher) +{ + g_free(match); + (*((int*)(matcher->cbdata)))++; +} + +char** enchant_pwl_suggest(EnchantPWL *pwl,const char *const word, + size_t len, size_t* out_n_suggs) +{ + EnchantTrieMatcher* matcher; + EnchantSuggList sugg_list; + char *case_folded_word; + + sugg_list.suggs = g_new0(char*,ENCHANT_PWL_MAX_SUGGS+1); + sugg_list.sugg_errs = g_new0(int,ENCHANT_PWL_MAX_SUGGS); + sugg_list.n_suggs = 0; + + case_folded_word = g_utf8_casefold (word, len); + matcher = enchant_trie_matcher_init(case_folded_word,ENCHANT_PWL_MAX_ERRORS, + enchant_pwl_suggest_cb, + &sugg_list); + enchant_trie_find_matches(pwl->trie,matcher); + enchant_trie_matcher_free(matcher); + g_free(case_folded_word); + + g_free(sugg_list.sugg_errs); + sugg_list.suggs[sugg_list.n_suggs] = NULL; + (*out_n_suggs) = sugg_list.n_suggs; + + return sugg_list.suggs; +} + +static void enchant_pwl_suggest_cb(char* match,EnchantTrieMatcher* matcher) +{ + EnchantSuggList* sugg_list; + size_t loc, i, shuffleTo; + int changes = 0; /* num words added to list */ + + sugg_list = (EnchantSuggList*)(matcher->cbdata); + + /* Find appropriate location in the array, if any */ + /* In future, this could be done using binary search... */ + for(loc=0; loc < sugg_list->n_suggs; loc++) { + /* Better than an existing suggestion, so stop */ + if(sugg_list->sugg_errs[loc] > matcher->num_errors) { + break; + } + /* Already in the list with better score, just return */ + if(strcmp(match,sugg_list->suggs[loc])==0) { + g_free(match); + return; + } + } + /* If it's not going to fit, just throw it away */ + if(loc >= ENCHANT_PWL_MAX_SUGGS) { + g_free(match); + return; + } + + changes++; + + /* Find the location to shuffle other elements up to. + * If the new word already exists, delete it and stuffle up to there + * Otherwise, if we reach max suggs, delete last one + * Otherwise, shuffle up to end of list + */ + for(i=loc; i < sugg_list->n_suggs; i++){ + if(strcmp(match,sugg_list->suggs[i]) == 0) { + g_free(sugg_list->suggs[i]); + changes--; + break; + } + } + if(i == ENCHANT_PWL_MAX_SUGGS) { + i--; + changes--; + g_free(sugg_list->suggs[i]); + } + shuffleTo = i; + + /* Shuffle other entries along to make space for new one */ + for(i=shuffleTo; i > loc; i--) { + sugg_list->suggs[i] = sugg_list->suggs[i-1]; + sugg_list->sugg_errs[i] = sugg_list->sugg_errs[i-1]; + } + + sugg_list->suggs[loc] = match; + sugg_list->sugg_errs[loc] = matcher->num_errors; + sugg_list->n_suggs = sugg_list->n_suggs + changes; + +} + +void enchant_pwl_free_string_list(EnchantPWL *pwl, char** string_list) +{ + g_strfreev(string_list); +} + +static EnchantTrie* enchant_trie_init(void) +{ + EnchantTrie* trie; + + trie = g_new(EnchantTrie,1); + trie->value = NULL; + trie->subtries = NULL; + + return trie; +} + +static void enchant_trie_free(EnchantTrie* trie) +{ + /* Dont ever free NULL, or the EOSTrie pointer */ + if(trie == NULL || trie == EOSTrie) { + return; + } + + /* Because we have not set a destroy function for the hashtable + * (to make code cleaner below), we need to explicitly free all + * subtries with a callback function. + */ + if (trie->subtries != NULL) { + g_hash_table_foreach(trie->subtries,enchant_trie_free_cb,NULL); + g_hash_table_destroy(trie->subtries); + } + + if (trie->value != NULL) { + g_free(trie->value); + } + + g_free(trie); +} + +static void enchant_trie_free_cb(void* key, void* value, void* data) +{ + enchant_trie_free((EnchantTrie*) value); +} + +static EnchantTrie* enchant_trie_insert(EnchantTrie* trie,const char *const word) +{ + char *tmpWord; + ssize_t nxtCh = 0; + EnchantTrie* subtrie; + + if (trie == NULL) { + trie = enchant_trie_init(); + } + + if (trie->value == NULL) { + if (trie->subtries == NULL) { + /* When single word, store in trie->value */ + trie->value = g_strdup(word); + } else { + /* Store multiple words in subtries */ + if (word[0] == '\0') { + /* Mark end-of-string with special node */ + tmpWord = g_strdup(""); + g_hash_table_insert(trie->subtries, + tmpWord,EOSTrie); + } else { + nxtCh = (ssize_t)(g_utf8_next_char(word)-word); + tmpWord = g_strndup(word,nxtCh); + subtrie = g_hash_table_lookup(trie->subtries, + tmpWord); + subtrie = enchant_trie_insert(subtrie, + (word+nxtCh)); + g_hash_table_insert(trie->subtries, + tmpWord,subtrie); + } + } + } else { + /* Create new hastable for subtries, and reinsert */ + trie->subtries = g_hash_table_new_full(g_str_hash, + g_str_equal,g_free, NULL); + tmpWord = trie->value; + trie->value = NULL; + enchant_trie_insert(trie,tmpWord); + enchant_trie_insert(trie,word); + g_free(tmpWord); + } + + return trie; +} + +static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher) +{ + int errs = 0; + ssize_t nxtChI = 0, oldPos = 0; + char* nxtChS = NULL; + EnchantTrie* subtrie = NULL; + + g_return_if_fail(matcher); + + /* Cant match in the empty trie */ + if(trie == NULL) { + return; + } + + /* Bail out if over the error limits */ + if(matcher->num_errors > matcher->max_errors){ + return; + } + + /* If the end of a string has been reached, no point recursing */ + if (trie == EOSTrie) { + errs = matcher->num_errors; + matcher->num_errors = errs + strlen(matcher->word) \ + - matcher->word_pos; + if (matcher->num_errors <= matcher->max_errors) { + matcher->cbfunc(g_strdup(matcher->path),matcher); + } + matcher->num_errors = errs; + return; + } + + /* If there is a value, just check it, no recursion */ + if (trie->value != NULL) { + errs = matcher->num_errors; + matcher->num_errors = errs + edit_dist(trie->value, + &(matcher->word[matcher->word_pos])); + if (matcher->num_errors <= matcher->max_errors) { + matcher->cbfunc(g_strconcat(matcher->path, + trie->value,NULL), + matcher); + } + matcher->num_errors = errs; + return; + } + + nxtChI = (ssize_t)(g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word); + nxtChS = g_strndup(&matcher->word[matcher->word_pos], + (nxtChI - matcher->word_pos)); + + /* Precisely match the first character, and recurse */ + subtrie = g_hash_table_lookup(trie->subtries,nxtChS); + if (subtrie != NULL) { + enchant_trie_matcher_pushpath(matcher,nxtChS); + oldPos = matcher->word_pos; + matcher->word_pos = nxtChI; + enchant_trie_find_matches(subtrie,matcher); + matcher->word_pos = oldPos; + enchant_trie_matcher_poppath(matcher,strlen(nxtChS)); + } + + g_free(nxtChS); + + if (matcher->word[matcher->word_pos] != '\0') { + matcher->num_errors++; + /* Match on inserting word[0] */ + oldPos = matcher->word_pos; + matcher->word_pos = nxtChI; + enchant_trie_find_matches(trie,matcher); + matcher->word_pos = oldPos; + /* for each subtrie, match on delete or swap word[0] */ + g_hash_table_foreach(trie->subtries, + enchant_trie_find_matches_cb, + matcher); + matcher->num_errors--; + } +} + +static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV) +{ + char* key; + EnchantTrie* subtrie; + EnchantTrieMatcher* matcher; + ssize_t nxtChI, oldPos; + + key = (char*) keyV; + subtrie = (EnchantTrie*) subtrieV; + matcher = (EnchantTrieMatcher*) matcherV; + + nxtChI = (ssize_t) (g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word); + + /* Dont handle actual matches, that's already done */ + if (strncmp(key,&matcher->word[matcher->word_pos],nxtChI-matcher->word_pos) == 0) { + return; + } + + enchant_trie_matcher_pushpath(matcher,key); + + /* Match on deleting word[0] */ + enchant_trie_find_matches(subtrie,matcher); + /* Match on transposing word[0] */ + oldPos = matcher->word_pos; + matcher->word_pos = nxtChI; + enchant_trie_find_matches(subtrie,matcher); + matcher->word_pos = oldPos; + + enchant_trie_matcher_poppath(matcher,strlen(key)); +} + +static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word, + int maxerrs, + void(*cbfunc)(char*,EnchantTrieMatcher*), + void* cbdata) +{ + EnchantTrieMatcher* matcher; + + matcher = g_new(EnchantTrieMatcher,1); + matcher->num_errors = 0; + matcher->max_errors = maxerrs; + matcher->word = word; + matcher->word_pos = 0; + matcher->path = g_new0(char,10); + matcher->path[0] = '\0'; + matcher->path_len = 10; + matcher->path_pos = 0; + matcher->cbfunc = cbfunc; + matcher->cbdata = cbdata; + + return matcher; +} + +static void enchant_trie_matcher_free(EnchantTrieMatcher* matcher) +{ + g_free(matcher->path); + g_free(matcher); +} + +static void enchant_trie_matcher_pushpath(EnchantTrieMatcher* matcher,char* newchars) +{ + ssize_t len, i; + + len = strlen(newchars); + if(matcher->path_pos + len >= matcher->path_len) { + matcher->path_len = matcher->path_len + len + 10; + matcher->path = g_renew(char,matcher->path,matcher->path_len); + } + + for(i = 0; i < len; i++) { + matcher->path[matcher->path_pos + i] = newchars[i]; + } + matcher->path_pos = matcher->path_pos + len; + matcher->path[matcher->path_pos] = '\0'; +} + +static void enchant_trie_matcher_poppath(EnchantTrieMatcher* matcher,int num) +{ + g_return_if_fail(matcher->path_pos >= 0); + matcher->path_pos = matcher->path_pos - num; + if(matcher->path_pos < 0) { + matcher->path_pos = 0; + } + matcher->path[matcher->path_pos] = '\0'; +} + +static int edit_dist(const char* word1, const char* word2) +{ + int len1, len2, cost, i, j; + int v1, v2, v3; + int* table; + + len1 = strlen(word1); + len2 = strlen(word2); + + table = g_new0(int,(len1+1)*(len2+1)); + + /* Initialise outer rows of table */ + for (i=0; i < len1 + 1; i++) { + table[i*(len2+1)] = i; + } + for (j=0; j < len2 + 1; j++) { + table[j] = j; + } + + /* Fill in table in dynamic programming style */ + for (i=1; i < len1+1; i++){ + for(j=1; j < len2+1; j++) { + if(word1[i-1] == word2[j-1]) { + cost = 0; + } else { + cost = 1; + } + v1 = table[(i-1)*(len2+1)+j] + 1; + v2 = table[i*(len2+1)+(j-1)] + 1; + v3 = table[(i-1)*(len2+1)+(j-1)] + cost; + if (v1 < v2 && v1 < v3) { + cost = v1; + } else if (v2 < v3) { + cost = v2; + } else { + cost = v3; + } + table[i*(len2+1)+j] = cost; + } + } + + cost = table[len1*(len2+1) + len2]; + g_free(table); + return cost; +} + diff --git a/src/pwl.h b/src/pwl.h new file mode 100644 index 0000000..1aa9d15 --- /dev/null +++ b/src/pwl.h @@ -0,0 +1,57 @@ +/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* enchant + * Copyright (C) 2003 Dom Lachowicz + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * In addition, as a special exception, Dom Lachowicz + * gives permission to link the code of this program with + * non-LGPL Spelling Provider libraries (eg: a MSFT Office + * spell checker backend) and distribute linked combinations including + * the two. You must obey the GNU Lesser General Public License in all + * respects for all of the code used other than said providers. If you modify + * this file, you may extend this exception to your version of the + * file, but you are not obligated to do so. If you do not wish to + * do so, delete this exception statement from your version. + */ + +#ifndef PWL_H +#define PWL_H + +#include "enchant.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct str_enchant_pwl EnchantPWL; + +/* Create and initialise a new, empty PWL */ +EnchantPWL* enchant_pwl_init(void); +EnchantPWL* enchant_pwl_init_with_file(const char * file); + +void enchant_pwl_add(EnchantPWL * me, const char *const word, size_t len); +int enchant_pwl_check(EnchantPWL * me,const char *const word, size_t len); +char** enchant_pwl_suggest(EnchantPWL *me,const char *const word, + size_t len, size_t* out_n_suggs); +void enchant_pwl_free(EnchantPWL* me); +void enchant_pwl_free_string_list(EnchantPWL* me, char** string_list); + +#ifdef __cplusplus +} +#endif + +#endif /* PWL_H */ diff --git a/tests/test-enchant.c b/tests/test-enchant.c index 5d08f3b..b463449 100644 --- a/tests/test-enchant.c +++ b/tests/test-enchant.c @@ -52,6 +52,13 @@ describe_dict_fn (const char * const lang, printf ("%s: %s '%s' (%s)\n", lang, name, desc, file); } +static char * print_found(EnchantDict *dict, const char * word) +{ + if (enchant_dict_check (dict, word, -1) == 0) + return "found"; + return "not found"; +} + static void run_dict_tests (EnchantDict * dict) { @@ -64,9 +71,8 @@ run_dict_tests (EnchantDict * dict) for (i = 0; i < (sizeof (check_checks) / sizeof (check_checks[0])); i++) { - printf ("enchant_dict_check (%s): %d\n", check_checks[i], - enchant_dict_check (dict, check_checks[i], - strlen (check_checks[i]))); + printf ("enchant_dict_check (%s): %s\n", check_checks[i], + print_found(dict, check_checks[i])); } for (i = 0; i < (sizeof (sugg_checks) / sizeof (sugg_checks[0])); i++) @@ -89,21 +95,17 @@ run_dict_tests (EnchantDict * dict) enchant_dict_add_to_session (dict, "helllo", 6); for (i = 0; i < (sizeof (check_checks) / sizeof (check_checks[0])); i++) { - printf ("enchant_dict_check (%s): %d\n", check_checks[i], - enchant_dict_check (dict, check_checks[i], - strlen (check_checks[i]))); + printf ("enchant_dict_check (%s): %s\n", check_checks[i], + print_found(dict, check_checks[i])); } -#if 0 printf ("Adding 'helllo' to personal\n"); enchant_dict_add_to_pwl (dict, "helllo", 6); for (i = 0; i < (sizeof (check_checks) / sizeof (check_checks[0])); i++) { - printf ("enchant_dict_check (%s): %d\n", check_checks[i], - enchant_dict_check (dict, check_checks[i], - strlen (check_checks[i]))); + printf ("enchant_dict_check (%s): %s\n", check_checks[i], + print_found(dict, check_checks[i])); } -#endif } int |