summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDom Lachowicz <domlachowicz@gmail.com>2006-05-06 16:02:16 +0000
committerDom Lachowicz <domlachowicz@gmail.com>2006-05-06 16:02:16 +0000
commit81037f9142c4375cb868cd459d58508aed486656 (patch)
treeef30023e9c224e1945e8eb9049d06f5889514cad
parentf0a2f193b8cb6299971964c19234a332083be16b (diff)
downloadenchant-81037f9142c4375cb868cd459d58508aed486656.tar.gz
bug 10212 - implement a personal word list capable of returning suggestions. thanks in large part to ryan kelly
git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@21132 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
-rw-r--r--ChangeLog9
-rw-r--r--configure.in4
-rw-r--r--src/Makefile.am2
-rw-r--r--src/enchant.c179
-rw-r--r--src/pwl.c705
-rw-r--r--src/pwl.h57
-rw-r--r--tests/test-enchant.c24
7 files changed, 854 insertions, 126 deletions
diff --git a/ChangeLog b/ChangeLog
index 7fbc359..4c961ff 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,11 @@
-2006-04-32 Dom Lachowicz <cinamod@hotmail.com>
+2006-05-06 Dom Lachowicz <cinamod@hotmail.com>
+
+ * configure.in: Bump version number to indicate an unstable series
+ * src/pwl.[ch]: New personal wordlist implementation, based on Trie.
+ Thanks to Ryan Kelly <ryan-bugs@rfk.id.au> (bug 10212).
+ * src/enchant.c: Ditto
+
+2006-04-23 Dom Lachowicz <cinamod@hotmail.com>
* configure.in: --with-aspell-prefix didn't work
diff --git a/configure.in b/configure.in
index 3175e0e..5e14efc 100644
--- a/configure.in
+++ b/configure.in
@@ -7,11 +7,11 @@ dnl 4a) Increment when removing or changing interfaces.
ENCHANT_MAJOR_VERSION=1
dnl 4a) 5) Increment when adding interfaces.
dnl 6) Set to zero when removing or changing interfaces.
-ENCHANT_MINOR_VERSION=2
+ENCHANT_MINOR_VERSION=3
dnl 3) Increment when interfaces not changed at all,
dnl only bug fixes or internal changes made.
dnl 4b) Set to zero when adding, removing or changing interfaces.
-ENCHANT_MICRO_VERSION=6
+ENCHANT_MICRO_VERSION=0
dnl
dnl Set this too
MAJOR_VERSION_PLUS_MINOR_VERSION=`expr $ENCHANT_MAJOR_VERSION + $ENCHANT_MINOR_VERSION`
diff --git a/src/Makefile.am b/src/Makefile.am
index fcec507..0b4756e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -10,7 +10,7 @@ else
libenchant_la_LIBADD= $(ENCHANT_LIBS)
endif
libenchant_la_LDFLAGS = -version-info $(VERSION_INFO) -no-undefined
-libenchant_la_SOURCES = prefix.c enchant.c enchant.h prefix.h
+libenchant_la_SOURCES = prefix.c enchant.c pwl.c enchant.h prefix.h pwl.h
libenchant_includedir = $(includedir)/enchant/
libenchant_include_HEADERS = enchant.h enchant-provider.h enchant++.h
diff --git a/src/enchant.c b/src/enchant.c
index 47afb4c..158cabe 100644
--- a/src/enchant.c
+++ b/src/enchant.c
@@ -32,17 +32,13 @@
#include <stdlib.h>
#include <string.h>
-#if defined(HAVE_FLOCK) || defined(HAVE_LOCKF)
-#include <unistd.h>
-#include <sys/file.h>
-#endif /* HAVE_FLOCK || HAVE_LOCKF */
-
#include <glib.h>
#include <gmodule.h>
#include <locale.h>
#include "enchant.h"
#include "enchant-provider.h"
+#include "pwl.h"
#ifdef XP_TARGET_COCOA
#import "enchant_cocoa.h"
@@ -75,7 +71,7 @@ struct str_enchant_broker
typedef struct str_enchant_session
{
GHashTable *session;
- GHashTable *personal;
+ EnchantPWL *personal;
char * personal_filename;
char * language_tag;
@@ -90,37 +86,9 @@ typedef struct str_enchant_session
typedef EnchantProvider *(*EnchantProviderInitFunc) (void);
typedef void (*EnchantPreConfigureFunc) (EnchantProvider * provider, const char * module_dir);
-#ifndef BUFSIZ
-#define BUFSIZ 1024
-#endif
-
/********************************************************************************/
/********************************************************************************/
-static void
-enchant_lock_file (FILE * f)
-{
-#if defined(HAVE_FLOCK)
- flock (fileno (f), LOCK_EX);
-#elif defined(HAVE_LOCKF)
- lockf (fileno (f), F_LOCK, 0);
-#else
- /* TODO: win32, UNIX fcntl. This race condition probably isn't too bad. */
-#endif /* HAVE_FLOCK */
-}
-
-static void
-enchant_unlock_file (FILE * f)
-{
-#if defined(HAVE_FLOCK)
- flock (fileno (f), LOCK_UN);
-#elif defined(HAVE_LOCKF)
- lockf (fileno (f), F_ULOCK, 0);
-#else
- /* TODO: win32, UNIX fcntl. This race condition probably isn't too bad. */
-#endif /* HAVE_FLOCK */
-}
-
static char *
enchant_get_module_dir (void)
{
@@ -288,7 +256,7 @@ static void
enchant_session_destroy (EnchantSession * session)
{
g_hash_table_destroy (session->session);
- g_hash_table_destroy (session->personal);
+ enchant_pwl_free (session->personal);
g_free (session->personal_filename);
g_free (session->language_tag);
@@ -303,48 +271,25 @@ enchant_session_new_with_pwl (EnchantProvider * provider, const char * const pwl
gboolean fail_if_no_pwl)
{
EnchantSession * session;
- FILE * f;
- char line[BUFSIZ];
+ EnchantPWL *personal = NULL;
+
+ if (pwl)
+ personal = enchant_pwl_init_with_file (pwl);
+
+ if (personal == NULL) {
+ if (fail_if_no_pwl)
+ return NULL;
+ else
+ personal = enchant_pwl_init ();
+ }
+
session = g_new0 (EnchantSession, 1);
session->session = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
- session->personal = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
+ session->personal = personal;
session->provider = provider;
session->language_tag = g_strdup (lang);
-
- if (pwl)
- {
- session->personal_filename = g_strdup (pwl);
-
- /* populate personal filename */
- f = fopen (pwl, "r");
- if (f)
- {
- enchant_lock_file (f);
-
- while (NULL != (fgets (line, sizeof (line), f)))
- {
- size_t l = strlen(line)-1;
- if (line[l]=='\n')
- line[l] = '\0';
-
- g_hash_table_insert (session->personal, g_strdup (line), GINT_TO_POINTER(TRUE));
- }
-
- enchant_unlock_file (f);
- fclose (f);
- }
- else if (fail_if_no_pwl)
- {
- enchant_session_destroy (session);
- return NULL;
- }
- }
- else if (fail_if_no_pwl)
- {
- enchant_session_destroy (session);
- return NULL;
- }
+ session->personal_filename = g_strdup (pwl);
return session;
}
@@ -384,23 +329,7 @@ enchant_session_add (EnchantSession * session, const char * const word, size_t l
static void
enchant_session_add_personal (EnchantSession * session, const char * const word, size_t len)
{
- FILE * f;
-
- if (session->personal_filename)
- {
- f = fopen (session->personal_filename, "a");
-
- if (f)
- {
- enchant_lock_file (f);
-
- fwrite (word, sizeof(char), len, f);
- fwrite ("\n", sizeof(char), 1, f);
- fclose (f);
-
- enchant_unlock_file (f);
- }
- }
+ enchant_pwl_add(session->personal, word, len);
}
static gboolean
@@ -411,7 +340,7 @@ enchant_session_contains (EnchantSession * session, const char * const word, siz
char * utf = g_strndup (word, len);
if (g_hash_table_lookup (session->session, utf) ||
- g_hash_table_lookup (session->personal, utf))
+ enchant_pwl_check (session->personal, utf, len) == 0)
result = TRUE;
g_free (utf);
@@ -525,6 +454,11 @@ enchant_dict_check (EnchantDict * dict, const char *const word, ssize_t len)
if (enchant_session_contains (session, word, len))
return 0;
+ if (session->personal) {
+ if (enchant_pwl_check (session->personal, word, len) == 0)
+ return 0;
+ }
+
if (dict->check)
return (*dict->check) (dict, word, len);
else if (session->is_pwl)
@@ -549,41 +483,64 @@ ENCHANT_MODULE_EXPORT (char **)
enchant_dict_suggest (EnchantDict * dict, const char *const word,
ssize_t len, size_t * out_n_suggs)
{
- size_t n_suggs;
- char ** suggs;
+ EnchantSession * session;
+ size_t n_suggs = 0, n_dict_suggs = 0, n_pwl_suggs = 0;
+ char **suggs, **dict_suggs = NULL, **pwl_suggs = NULL;
g_return_val_if_fail (dict, NULL);
g_return_val_if_fail (word, NULL);
+ session = (EnchantSession*)dict->enchant_private_data;
+
if (len < 0)
len = strlen (word);
-
+
+ /* Check for suggestions from personal dictionary */
+ if(session->personal)
+ pwl_suggs = enchant_pwl_suggest(session->personal, word, len, &n_pwl_suggs);
+
+ /* Check for suggestions from provider dictionary */
if (dict->suggest)
{
- char ** tmp_suggs;
-
- tmp_suggs = (*dict->suggest) (dict, word, len, &n_suggs);
+ dict_suggs = (*dict->suggest) (dict, word, len,
+ &n_dict_suggs);
+ }
+
+ /* Clone suggestions if there are any */
+ n_suggs = n_pwl_suggs + n_dict_suggs;
+ if (n_suggs > 0)
+ {
+ size_t i, j, k;
- /* clone the suggestion array */
- if (tmp_suggs)
- {
- size_t i;
-
- suggs = g_new0 (char *, n_suggs + 1);
- for (i = 0; i < n_suggs; i++)
- suggs[i] = g_strdup (tmp_suggs[i]);
-
- enchant_dict_free_string_list_impl (dict, tmp_suggs);
- }
- else
- {
- suggs = NULL;
+ suggs = g_new0 (char *, n_suggs + 1);
+
+ /* Copy over suggestions from dict */
+ for(i = 0; i < n_dict_suggs; i++)
+ suggs[i] = g_strdup (dict_suggs[i]);
+ if(dict_suggs)
+ enchant_dict_free_string_list_impl (dict, dict_suggs);
+
+ /* Copy over suggestions from pwl, except dupes */
+ for(j = 0; j < n_pwl_suggs; j++) {
+ int dupe = 0;
+ for(k = 0; k < n_dict_suggs; k++) {
+ if(strcmp(suggs[k],pwl_suggs[j])==0) {
+ dupe = 1;
+ break;
+ }
}
+ if(!dupe) {
+ suggs[i] = g_strdup (pwl_suggs[j]);
+ i++;
+ }
+ }
+
+ if(pwl_suggs)
+ enchant_pwl_free_string_list(session->personal,pwl_suggs);
}
else
{
suggs = NULL;
- n_suggs = 0;
}
if (out_n_suggs)
diff --git a/src/pwl.c b/src/pwl.c
new file mode 100644
index 0000000..73299c7
--- /dev/null
+++ b/src/pwl.c
@@ -0,0 +1,705 @@
+/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/* enchant
+ * Copyright (C) 2003, 2004 Dom Lachowicz
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * In addition, as a special exception, Dom Lachowicz
+ * gives permission to link the code of this program with
+ * non-LGPL Spelling Provider libraries (eg: a MSFT Office
+ * spell checker backend) and distribute linked combinations including
+ * the two. You must obey the GNU Lesser General Public License in all
+ * respects for all of the code used other than said providers. If you modify
+ * this file, you may extend this exception to your version of the
+ * file, but you are not obligated to do so. If you do not wish to
+ * do so, delete this exception statement from your version.
+ */
+
+/**
+ *
+ * This file implements personal word list (PWL) dictionaries in the
+ * type EnchantPWL.
+ *
+ * Under the hood, a PWL is stored as a Trie. Checking strings for
+ * correctness and making suggestions is done by traversing the Trie
+ * while allowing a certain number of miss-steps. Due to the prefix
+ * compression of the Trie, this allows all strings in the PWL within
+ * a given edit distance of the target word to be enumerated quite
+ * efficiently.
+ *
+ * Ideas for the future:
+ *
+ * - Order the suggestions first by edit distance, then by
+ * soundex key, to put the most similar sounding suggestions
+ * at the front of the list. Would need a "soundex" that is
+ * general enough to handle languages other than English.
+ *
+ * - iterative deepending to find suggestions, rather than a straight
+ * search to depth three.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <glib.h>
+
+#include "pwl.h"
+
+#if defined(HAVE_FLOCK) || defined(HAVE_LOCKF)
+#include <unistd.h>
+#include <sys/file.h>
+#endif /* HAVE_FLOCK || HAVE_LOCKF */
+
+#define ENCHANT_PWL_MAX_ERRORS 3
+#define ENCHANT_PWL_MAX_SUGGS 15
+
+/* A PWL dictionary is stored as a Trie-like data structure EnchantTrie.
+ * The EnchantTrie datatype is completely recursive - all child nodes
+ * are simply EnchantTrie pointers. This means that all functions
+ * that potentially modify a trie need to return the modified trie,
+ * as additional memory may have been allocated.
+ *
+ * The empty trie is simply the null pointer. If a trie contains
+ * a single string, it is recorded in the "value" attribute and
+ * "subtries" is set to NULL. When two or more strings are contained,
+ * "value" is NULL and "subtries" is a GHashTable mapping the first
+ * character of each string to the subtrie containing the remainder of
+ * that string.
+ *
+ * All strings stored in the Trie are assumed to be in UTF format.
+ * Branching is done on unicode characters, not individual bytes.
+ */
+typedef struct str_enchant_trie EnchantTrie;
+struct str_enchant_trie
+{
+ char* value; /* final string found under this node */
+ GHashTable* subtries; /* Maps characters to subtries */
+};
+
+struct str_enchant_pwl
+{
+ EnchantTrie* trie;
+ char * filename;
+ GHashTable *words_in_trie;
+};
+
+/* Special Trie node indicating the end of a string */
+static EnchantTrie n_EOSTrie;
+static EnchantTrie* EOSTrie = &n_EOSTrie;
+
+/* The EnchantTrieMatcher structure maintains the state necessary to
+ * search for matching strings within an EnchantTrie. It includes a
+ * callback function which will be called with each matching string
+ * as it is found. The arguments to this function are:
+ *
+ * - a freshly-allocated copy of the matching string, which must
+ * be freed by external code
+ * - the EnchantTrieMatcher object, giving the context of the match
+ * (e.g. number of errors)
+ */
+typedef struct str_enchant_trie_matcher EnchantTrieMatcher;
+struct str_enchant_trie_matcher
+{
+ int num_errors; /* Num errors encountered so far. */
+ int max_errors; /* Max errors before search should terminate */
+
+ const char* word; /* Word being searched for */
+ ssize_t word_pos; /* Current position in the word */
+
+ char* path; /* Path taken through the trie so far */
+ ssize_t path_len; /* Length of allocated path string */
+ ssize_t path_pos; /* Current end pos of path string */
+
+ void (*cbfunc)(char*,EnchantTrieMatcher*); /* callback func */
+ void* cbdata; /* Private data for use by callback func */
+};
+
+
+/* To allow the list of suggestions to be built up an item at a time,
+ * its state is maintained in an EnchantSuggList object.
+ */
+typedef struct str_enchant_sugg_list
+{
+ char** suggs;
+ int* sugg_errs;
+ size_t n_suggs;
+} EnchantSuggList;
+
+
+/*
+ * Function Prototypes
+ */
+
+static void enchant_pwl_add_to_trie(EnchantPWL *pwl,
+ const char *const word, size_t len,
+ gboolean add_to_file);
+
+static void enchant_pwl_check_cb(char* match,EnchantTrieMatcher* matcher);
+static void enchant_pwl_suggest_cb(char* match,EnchantTrieMatcher* matcher);
+static EnchantTrie* enchant_trie_init(void);
+static void enchant_trie_free(EnchantTrie* trie);
+static void enchant_trie_free_cb(void*,void*,void*);
+static EnchantTrie* enchant_trie_insert(EnchantTrie* trie,const char *const word);
+static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher);
+static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV);
+static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word,
+ int maxerrs,
+ void(*cbfunc)(char*,EnchantTrieMatcher*),
+ void* cbdata);
+static void enchant_trie_matcher_free(EnchantTrieMatcher* matcher);
+static void enchant_trie_matcher_pushpath(EnchantTrieMatcher* matcher,char* newchars);
+static void enchant_trie_matcher_poppath(EnchantTrieMatcher* matcher,int num);
+
+static int edit_dist(const char* word1, const char* word2);
+static char* soundex(const char* word);
+
+static void
+enchant_lock_file (FILE * f)
+{
+#if defined(HAVE_FLOCK)
+ flock (fileno (f), LOCK_EX);
+#elif defined(HAVE_LOCKF)
+ lockf (fileno (f), F_LOCK, 0);
+#else
+ /* TODO: win32, UNIX fcntl. This race condition probably isn't too bad. */
+#endif /* HAVE_FLOCK */
+}
+
+static void
+enchant_unlock_file (FILE * f)
+{
+#if defined(HAVE_FLOCK)
+ flock (fileno (f), LOCK_UN);
+#elif defined(HAVE_LOCKF)
+ lockf (fileno (f), F_ULOCK, 0);
+#else
+ /* TODO: win32, UNIX fcntl. This race condition probably isn't too bad. */
+#endif /* HAVE_FLOCK */
+}
+
+/**
+ * enchant_pwl_init
+ *
+ * Returns: a new PWL object used to store/check/suggest words.
+ */
+EnchantPWL* enchant_pwl_init(void)
+{
+ EnchantPWL *pwl;
+
+ pwl = g_new0(EnchantPWL, 1);
+ pwl->words_in_trie = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
+
+ return pwl;
+}
+
+#ifndef BUFSIZ
+#define BUFSIZ 1024
+#endif
+
+/**
+ * enchant_pwl_init
+ *
+ * Returns: a new PWL object used to store/check/suggest words.
+ */
+EnchantPWL* enchant_pwl_init_with_file(const char * file)
+{
+ FILE *f;
+
+ f = fopen (file, "r");
+ if (f)
+ {
+ EnchantPWL *pwl;
+ char line[BUFSIZ];
+
+ pwl = enchant_pwl_init();
+ pwl->filename = g_strdup(file);
+
+ enchant_lock_file (f);
+
+ while (NULL != (fgets (line, sizeof (line), f)))
+ {
+ size_t l = strlen(line)-1;
+ if (line[l]=='\n')
+ line[l] = '\0';
+
+ enchant_pwl_add_to_trie(pwl, line, strlen(line), FALSE);
+ }
+
+ enchant_unlock_file (f);
+ fclose (f);
+
+ return pwl;
+ }
+
+ return NULL;
+}
+
+void enchant_pwl_free(EnchantPWL *pwl)
+{
+ enchant_trie_free(pwl->trie);
+ g_free(pwl->filename);
+ g_hash_table_destroy (pwl->words_in_trie);
+ g_free(pwl);
+}
+
+static void enchant_pwl_add_to_trie(EnchantPWL *pwl,
+ const char *const word, size_t len,
+ gboolean add_to_file)
+{
+ char * case_folded_word;
+
+ case_folded_word = g_utf8_casefold (word, len);
+ if(NULL != g_hash_table_lookup (pwl->words_in_trie, case_folded_word)) {
+ g_free (case_folded_word);
+ return;
+ }
+
+ g_hash_table_insert (pwl->words_in_trie, case_folded_word, GINT_TO_POINTER(1));
+
+ pwl->trie = enchant_trie_insert(pwl->trie, case_folded_word);
+
+ if (add_to_file && (pwl->filename != NULL))
+ {
+ FILE *f;
+
+ f = fopen(pwl->filename, "a");
+ if (f)
+ {
+ enchant_lock_file (f);
+ fwrite (word, sizeof(char), len, f);
+ fwrite ("\n", sizeof(char), 1, f);
+ enchant_unlock_file (f);
+ fclose (f);
+ }
+ }
+}
+
+void enchant_pwl_add(EnchantPWL *pwl,
+ const char *const word, size_t len)
+{
+ enchant_pwl_add_to_trie(pwl, word, len, TRUE);
+}
+
+int enchant_pwl_check(EnchantPWL *pwl, const char *const word, size_t len)
+{
+ EnchantTrieMatcher* matcher;
+ char * case_folded_word;
+ int count = 0;
+
+ case_folded_word = g_utf8_casefold (word, len);
+
+ matcher = enchant_trie_matcher_init(case_folded_word,0,enchant_pwl_check_cb,
+ &count);
+ enchant_trie_find_matches(pwl->trie,matcher);
+ enchant_trie_matcher_free(matcher);
+ g_free(case_folded_word);
+
+ return (count != 0 ? 0 : 1);
+}
+
+static void enchant_pwl_check_cb(char* match,EnchantTrieMatcher* matcher)
+{
+ g_free(match);
+ (*((int*)(matcher->cbdata)))++;
+}
+
+char** enchant_pwl_suggest(EnchantPWL *pwl,const char *const word,
+ size_t len, size_t* out_n_suggs)
+{
+ EnchantTrieMatcher* matcher;
+ EnchantSuggList sugg_list;
+ char *case_folded_word;
+
+ sugg_list.suggs = g_new0(char*,ENCHANT_PWL_MAX_SUGGS+1);
+ sugg_list.sugg_errs = g_new0(int,ENCHANT_PWL_MAX_SUGGS);
+ sugg_list.n_suggs = 0;
+
+ case_folded_word = g_utf8_casefold (word, len);
+ matcher = enchant_trie_matcher_init(case_folded_word,ENCHANT_PWL_MAX_ERRORS,
+ enchant_pwl_suggest_cb,
+ &sugg_list);
+ enchant_trie_find_matches(pwl->trie,matcher);
+ enchant_trie_matcher_free(matcher);
+ g_free(case_folded_word);
+
+ g_free(sugg_list.sugg_errs);
+ sugg_list.suggs[sugg_list.n_suggs] = NULL;
+ (*out_n_suggs) = sugg_list.n_suggs;
+
+ return sugg_list.suggs;
+}
+
+static void enchant_pwl_suggest_cb(char* match,EnchantTrieMatcher* matcher)
+{
+ EnchantSuggList* sugg_list;
+ size_t loc, i, shuffleTo;
+ int changes = 0; /* num words added to list */
+
+ sugg_list = (EnchantSuggList*)(matcher->cbdata);
+
+ /* Find appropriate location in the array, if any */
+ /* In future, this could be done using binary search... */
+ for(loc=0; loc < sugg_list->n_suggs; loc++) {
+ /* Better than an existing suggestion, so stop */
+ if(sugg_list->sugg_errs[loc] > matcher->num_errors) {
+ break;
+ }
+ /* Already in the list with better score, just return */
+ if(strcmp(match,sugg_list->suggs[loc])==0) {
+ g_free(match);
+ return;
+ }
+ }
+ /* If it's not going to fit, just throw it away */
+ if(loc >= ENCHANT_PWL_MAX_SUGGS) {
+ g_free(match);
+ return;
+ }
+
+ changes++;
+
+ /* Find the location to shuffle other elements up to.
+ * If the new word already exists, delete it and stuffle up to there
+ * Otherwise, if we reach max suggs, delete last one
+ * Otherwise, shuffle up to end of list
+ */
+ for(i=loc; i < sugg_list->n_suggs; i++){
+ if(strcmp(match,sugg_list->suggs[i]) == 0) {
+ g_free(sugg_list->suggs[i]);
+ changes--;
+ break;
+ }
+ }
+ if(i == ENCHANT_PWL_MAX_SUGGS) {
+ i--;
+ changes--;
+ g_free(sugg_list->suggs[i]);
+ }
+ shuffleTo = i;
+
+ /* Shuffle other entries along to make space for new one */
+ for(i=shuffleTo; i > loc; i--) {
+ sugg_list->suggs[i] = sugg_list->suggs[i-1];
+ sugg_list->sugg_errs[i] = sugg_list->sugg_errs[i-1];
+ }
+
+ sugg_list->suggs[loc] = match;
+ sugg_list->sugg_errs[loc] = matcher->num_errors;
+ sugg_list->n_suggs = sugg_list->n_suggs + changes;
+
+}
+
+void enchant_pwl_free_string_list(EnchantPWL *pwl, char** string_list)
+{
+ g_strfreev(string_list);
+}
+
+static EnchantTrie* enchant_trie_init(void)
+{
+ EnchantTrie* trie;
+
+ trie = g_new(EnchantTrie,1);
+ trie->value = NULL;
+ trie->subtries = NULL;
+
+ return trie;
+}
+
+static void enchant_trie_free(EnchantTrie* trie)
+{
+ /* Dont ever free NULL, or the EOSTrie pointer */
+ if(trie == NULL || trie == EOSTrie) {
+ return;
+ }
+
+ /* Because we have not set a destroy function for the hashtable
+ * (to make code cleaner below), we need to explicitly free all
+ * subtries with a callback function.
+ */
+ if (trie->subtries != NULL) {
+ g_hash_table_foreach(trie->subtries,enchant_trie_free_cb,NULL);
+ g_hash_table_destroy(trie->subtries);
+ }
+
+ if (trie->value != NULL) {
+ g_free(trie->value);
+ }
+
+ g_free(trie);
+}
+
+static void enchant_trie_free_cb(void* key, void* value, void* data)
+{
+ enchant_trie_free((EnchantTrie*) value);
+}
+
+static EnchantTrie* enchant_trie_insert(EnchantTrie* trie,const char *const word)
+{
+ char *tmpWord;
+ ssize_t nxtCh = 0;
+ EnchantTrie* subtrie;
+
+ if (trie == NULL) {
+ trie = enchant_trie_init();
+ }
+
+ if (trie->value == NULL) {
+ if (trie->subtries == NULL) {
+ /* When single word, store in trie->value */
+ trie->value = g_strdup(word);
+ } else {
+ /* Store multiple words in subtries */
+ if (word[0] == '\0') {
+ /* Mark end-of-string with special node */
+ tmpWord = g_strdup("");
+ g_hash_table_insert(trie->subtries,
+ tmpWord,EOSTrie);
+ } else {
+ nxtCh = (ssize_t)(g_utf8_next_char(word)-word);
+ tmpWord = g_strndup(word,nxtCh);
+ subtrie = g_hash_table_lookup(trie->subtries,
+ tmpWord);
+ subtrie = enchant_trie_insert(subtrie,
+ (word+nxtCh));
+ g_hash_table_insert(trie->subtries,
+ tmpWord,subtrie);
+ }
+ }
+ } else {
+ /* Create new hastable for subtries, and reinsert */
+ trie->subtries = g_hash_table_new_full(g_str_hash,
+ g_str_equal,g_free, NULL);
+ tmpWord = trie->value;
+ trie->value = NULL;
+ enchant_trie_insert(trie,tmpWord);
+ enchant_trie_insert(trie,word);
+ g_free(tmpWord);
+ }
+
+ return trie;
+}
+
+static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher)
+{
+ int errs = 0;
+ ssize_t nxtChI = 0, oldPos = 0;
+ char* nxtChS = NULL;
+ EnchantTrie* subtrie = NULL;
+
+ g_return_if_fail(matcher);
+
+ /* Cant match in the empty trie */
+ if(trie == NULL) {
+ return;
+ }
+
+ /* Bail out if over the error limits */
+ if(matcher->num_errors > matcher->max_errors){
+ return;
+ }
+
+ /* If the end of a string has been reached, no point recursing */
+ if (trie == EOSTrie) {
+ errs = matcher->num_errors;
+ matcher->num_errors = errs + strlen(matcher->word) \
+ - matcher->word_pos;
+ if (matcher->num_errors <= matcher->max_errors) {
+ matcher->cbfunc(g_strdup(matcher->path),matcher);
+ }
+ matcher->num_errors = errs;
+ return;
+ }
+
+ /* If there is a value, just check it, no recursion */
+ if (trie->value != NULL) {
+ errs = matcher->num_errors;
+ matcher->num_errors = errs + edit_dist(trie->value,
+ &(matcher->word[matcher->word_pos]));
+ if (matcher->num_errors <= matcher->max_errors) {
+ matcher->cbfunc(g_strconcat(matcher->path,
+ trie->value,NULL),
+ matcher);
+ }
+ matcher->num_errors = errs;
+ return;
+ }
+
+ nxtChI = (ssize_t)(g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word);
+ nxtChS = g_strndup(&matcher->word[matcher->word_pos],
+ (nxtChI - matcher->word_pos));
+
+ /* Precisely match the first character, and recurse */
+ subtrie = g_hash_table_lookup(trie->subtries,nxtChS);
+ if (subtrie != NULL) {
+ enchant_trie_matcher_pushpath(matcher,nxtChS);
+ oldPos = matcher->word_pos;
+ matcher->word_pos = nxtChI;
+ enchant_trie_find_matches(subtrie,matcher);
+ matcher->word_pos = oldPos;
+ enchant_trie_matcher_poppath(matcher,strlen(nxtChS));
+ }
+
+ g_free(nxtChS);
+
+ if (matcher->word[matcher->word_pos] != '\0') {
+ matcher->num_errors++;
+ /* Match on inserting word[0] */
+ oldPos = matcher->word_pos;
+ matcher->word_pos = nxtChI;
+ enchant_trie_find_matches(trie,matcher);
+ matcher->word_pos = oldPos;
+ /* for each subtrie, match on delete or swap word[0] */
+ g_hash_table_foreach(trie->subtries,
+ enchant_trie_find_matches_cb,
+ matcher);
+ matcher->num_errors--;
+ }
+}
+
+static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV)
+{
+ char* key;
+ EnchantTrie* subtrie;
+ EnchantTrieMatcher* matcher;
+ ssize_t nxtChI, oldPos;
+
+ key = (char*) keyV;
+ subtrie = (EnchantTrie*) subtrieV;
+ matcher = (EnchantTrieMatcher*) matcherV;
+
+ nxtChI = (ssize_t) (g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word);
+
+ /* Dont handle actual matches, that's already done */
+ if (strncmp(key,&matcher->word[matcher->word_pos],nxtChI-matcher->word_pos) == 0) {
+ return;
+ }
+
+ enchant_trie_matcher_pushpath(matcher,key);
+
+ /* Match on deleting word[0] */
+ enchant_trie_find_matches(subtrie,matcher);
+ /* Match on transposing word[0] */
+ oldPos = matcher->word_pos;
+ matcher->word_pos = nxtChI;
+ enchant_trie_find_matches(subtrie,matcher);
+ matcher->word_pos = oldPos;
+
+ enchant_trie_matcher_poppath(matcher,strlen(key));
+}
+
+static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word,
+ int maxerrs,
+ void(*cbfunc)(char*,EnchantTrieMatcher*),
+ void* cbdata)
+{
+ EnchantTrieMatcher* matcher;
+
+ matcher = g_new(EnchantTrieMatcher,1);
+ matcher->num_errors = 0;
+ matcher->max_errors = maxerrs;
+ matcher->word = word;
+ matcher->word_pos = 0;
+ matcher->path = g_new0(char,10);
+ matcher->path[0] = '\0';
+ matcher->path_len = 10;
+ matcher->path_pos = 0;
+ matcher->cbfunc = cbfunc;
+ matcher->cbdata = cbdata;
+
+ return matcher;
+}
+
+static void enchant_trie_matcher_free(EnchantTrieMatcher* matcher)
+{
+ g_free(matcher->path);
+ g_free(matcher);
+}
+
+static void enchant_trie_matcher_pushpath(EnchantTrieMatcher* matcher,char* newchars)
+{
+ ssize_t len, i;
+
+ len = strlen(newchars);
+ if(matcher->path_pos + len >= matcher->path_len) {
+ matcher->path_len = matcher->path_len + len + 10;
+ matcher->path = g_renew(char,matcher->path,matcher->path_len);
+ }
+
+ for(i = 0; i < len; i++) {
+ matcher->path[matcher->path_pos + i] = newchars[i];
+ }
+ matcher->path_pos = matcher->path_pos + len;
+ matcher->path[matcher->path_pos] = '\0';
+}
+
+static void enchant_trie_matcher_poppath(EnchantTrieMatcher* matcher,int num)
+{
+ g_return_if_fail(matcher->path_pos >= 0);
+ matcher->path_pos = matcher->path_pos - num;
+ if(matcher->path_pos < 0) {
+ matcher->path_pos = 0;
+ }
+ matcher->path[matcher->path_pos] = '\0';
+}
+
+static int edit_dist(const char* word1, const char* word2)
+{
+ int len1, len2, cost, i, j;
+ int v1, v2, v3;
+ int* table;
+
+ len1 = strlen(word1);
+ len2 = strlen(word2);
+
+ table = g_new0(int,(len1+1)*(len2+1));
+
+ /* Initialise outer rows of table */
+ for (i=0; i < len1 + 1; i++) {
+ table[i*(len2+1)] = i;
+ }
+ for (j=0; j < len2 + 1; j++) {
+ table[j] = j;
+ }
+
+ /* Fill in table in dynamic programming style */
+ for (i=1; i < len1+1; i++){
+ for(j=1; j < len2+1; j++) {
+ if(word1[i-1] == word2[j-1]) {
+ cost = 0;
+ } else {
+ cost = 1;
+ }
+ v1 = table[(i-1)*(len2+1)+j] + 1;
+ v2 = table[i*(len2+1)+(j-1)] + 1;
+ v3 = table[(i-1)*(len2+1)+(j-1)] + cost;
+ if (v1 < v2 && v1 < v3) {
+ cost = v1;
+ } else if (v2 < v3) {
+ cost = v2;
+ } else {
+ cost = v3;
+ }
+ table[i*(len2+1)+j] = cost;
+ }
+ }
+
+ cost = table[len1*(len2+1) + len2];
+ g_free(table);
+ return cost;
+}
+
diff --git a/src/pwl.h b/src/pwl.h
new file mode 100644
index 0000000..1aa9d15
--- /dev/null
+++ b/src/pwl.h
@@ -0,0 +1,57 @@
+/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/* enchant
+ * Copyright (C) 2003 Dom Lachowicz
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * In addition, as a special exception, Dom Lachowicz
+ * gives permission to link the code of this program with
+ * non-LGPL Spelling Provider libraries (eg: a MSFT Office
+ * spell checker backend) and distribute linked combinations including
+ * the two. You must obey the GNU Lesser General Public License in all
+ * respects for all of the code used other than said providers. If you modify
+ * this file, you may extend this exception to your version of the
+ * file, but you are not obligated to do so. If you do not wish to
+ * do so, delete this exception statement from your version.
+ */
+
+#ifndef PWL_H
+#define PWL_H
+
+#include "enchant.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct str_enchant_pwl EnchantPWL;
+
+/* Create and initialise a new, empty PWL */
+EnchantPWL* enchant_pwl_init(void);
+EnchantPWL* enchant_pwl_init_with_file(const char * file);
+
+void enchant_pwl_add(EnchantPWL * me, const char *const word, size_t len);
+int enchant_pwl_check(EnchantPWL * me,const char *const word, size_t len);
+char** enchant_pwl_suggest(EnchantPWL *me,const char *const word,
+ size_t len, size_t* out_n_suggs);
+void enchant_pwl_free(EnchantPWL* me);
+void enchant_pwl_free_string_list(EnchantPWL* me, char** string_list);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PWL_H */
diff --git a/tests/test-enchant.c b/tests/test-enchant.c
index 5d08f3b..b463449 100644
--- a/tests/test-enchant.c
+++ b/tests/test-enchant.c
@@ -52,6 +52,13 @@ describe_dict_fn (const char * const lang,
printf ("%s: %s '%s' (%s)\n", lang, name, desc, file);
}
+static char * print_found(EnchantDict *dict, const char * word)
+{
+ if (enchant_dict_check (dict, word, -1) == 0)
+ return "found";
+ return "not found";
+}
+
static void
run_dict_tests (EnchantDict * dict)
{
@@ -64,9 +71,8 @@ run_dict_tests (EnchantDict * dict)
for (i = 0; i < (sizeof (check_checks) / sizeof (check_checks[0])); i++)
{
- printf ("enchant_dict_check (%s): %d\n", check_checks[i],
- enchant_dict_check (dict, check_checks[i],
- strlen (check_checks[i])));
+ printf ("enchant_dict_check (%s): %s\n", check_checks[i],
+ print_found(dict, check_checks[i]));
}
for (i = 0; i < (sizeof (sugg_checks) / sizeof (sugg_checks[0])); i++)
@@ -89,21 +95,17 @@ run_dict_tests (EnchantDict * dict)
enchant_dict_add_to_session (dict, "helllo", 6);
for (i = 0; i < (sizeof (check_checks) / sizeof (check_checks[0])); i++)
{
- printf ("enchant_dict_check (%s): %d\n", check_checks[i],
- enchant_dict_check (dict, check_checks[i],
- strlen (check_checks[i])));
+ printf ("enchant_dict_check (%s): %s\n", check_checks[i],
+ print_found(dict, check_checks[i]));
}
-#if 0
printf ("Adding 'helllo' to personal\n");
enchant_dict_add_to_pwl (dict, "helllo", 6);
for (i = 0; i < (sizeof (check_checks) / sizeof (check_checks[0])); i++)
{
- printf ("enchant_dict_check (%s): %d\n", check_checks[i],
- enchant_dict_check (dict, check_checks[i],
- strlen (check_checks[i])));
+ printf ("enchant_dict_check (%s): %s\n", check_checks[i],
+ print_found(dict, check_checks[i]));
}
-#endif
}
int