summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Albright <eric_albright@sil.org>2007-12-11 02:45:26 +0000
committerEric Albright <eric_albright@sil.org>2007-12-11 02:45:26 +0000
commita48e108a6cd3349904b91cba9cfb0bc20edb9ebb (patch)
tree0bda2e4097af42261448f33aff03bc8fbd4f06c5
parent32791d6f1414a1bcc3bd99a1816ce629b43790db (diff)
downloadenchant-a48e108a6cd3349904b91cba9cfb0bc20edb9ebb.tar.gz
Use Damerau-Levenshtein edit distance algorithm instead of Levenshtein.
git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@22369 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
-rw-r--r--src/pwl.c67
-rw-r--r--unittests/pwl/enchant_pwl_tests.cpp97
2 files changed, 151 insertions, 13 deletions
diff --git a/src/pwl.c b/src/pwl.c
index e3dc39c..3a95d46 100644
--- a/src/pwl.c
+++ b/src/pwl.c
@@ -902,6 +902,25 @@ static void enchant_trie_remove(EnchantTrie* trie,const char *const word)
}
}
+static EnchantTrie* enchant_trie_get_subtrie(EnchantTrie* trie,
+ EnchantTrieMatcher* matcher,
+ char** nxtChS)
+{
+ EnchantTrie* subtrie;
+
+ if(trie->subtries == NULL || *nxtChS == NULL)
+ return NULL;
+
+ subtrie = g_hash_table_lookup(trie->subtries,*nxtChS);
+ if(subtrie == NULL && matcher->mode == case_insensitive) {
+ char* nxtChSUp = g_utf8_strup(*nxtChS, -1); /* we ignore the title case scenario since that will give us an edit_distance of one which is acceptable since this mode is used for suggestions*/
+ g_free(*nxtChS);
+ *nxtChS = nxtChSUp;
+ subtrie = g_hash_table_lookup(trie->subtries,nxtChSUp);
+ }
+ return subtrie;
+}
+
static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher)
{
int errs = 0;
@@ -965,13 +984,7 @@ static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matc
(nxtChI - matcher->word_pos));
/* Precisely match the first character, and recurse */
- subtrie = g_hash_table_lookup(trie->subtries,nxtChS);
- if(subtrie == NULL && matcher->mode == case_insensitive) {
- char* nxtChSUp = g_utf8_strup(nxtChS, nxtChI - matcher->word_pos); /* we ignore the title case scenario since that will give us an edit_distance of one which is acceptable since this mode is used for suggestions*/
- g_free(nxtChS);
- nxtChS = nxtChSUp;
- subtrie = g_hash_table_lookup(trie->subtries,nxtChSUp);
- }
+ subtrie = enchant_trie_get_subtrie(trie, matcher, &nxtChS);
if (subtrie != NULL) {
enchant_trie_matcher_pushpath(matcher,nxtChS);
@@ -992,7 +1005,7 @@ static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matc
enchant_trie_find_matches(trie,matcher);
matcher->word_pos = oldPos;
}
- /* for each subtrie, match on delete or substitute word[0] */
+ /* for each subtrie, match on delete or substitute word[0] or transpose word[0] and word[1] */
g_hash_table_foreach(trie->subtries,
enchant_trie_find_matches_cb,
matcher);
@@ -1001,8 +1014,8 @@ static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matc
static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV)
{
- char* key;
- EnchantTrie* subtrie;
+ char* key, *key2;
+ EnchantTrie* subtrie, *subtrie2;
EnchantTrieMatcher* matcher;
ssize_t nxtChI, oldPos;
@@ -1021,13 +1034,33 @@ static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcher
/* Match on deleting word[0] */
enchant_trie_find_matches(subtrie,matcher);
- /* Match on transposing word[0] */
+ /* Match on substituting word[0] */
oldPos = matcher->word_pos;
matcher->word_pos = nxtChI;
enchant_trie_find_matches(subtrie,matcher);
- matcher->word_pos = oldPos;
enchant_trie_matcher_poppath(matcher,strlen(key));
+
+ /* Match on transposing word[0] and word[1] */
+ key2 = g_strndup(&matcher->word[oldPos],nxtChI-oldPos);
+ subtrie2 = enchant_trie_get_subtrie(subtrie, matcher, &key2);
+
+ if(subtrie2 != NULL) {
+ nxtChI = (ssize_t) (g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word);
+ if (strncmp(key,&matcher->word[matcher->word_pos],nxtChI-matcher->word_pos) == 0) {
+ matcher->word_pos = nxtChI;
+ enchant_trie_matcher_pushpath(matcher,key);
+ enchant_trie_matcher_pushpath(matcher,key2);
+
+ enchant_trie_find_matches(subtrie2,matcher);
+ enchant_trie_matcher_poppath(matcher,strlen(key2));
+ enchant_trie_matcher_poppath(matcher,strlen(key));
+}
+ }
+
+ g_free(key2);
+
+ matcher->word_pos = oldPos;
}
static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word,
@@ -1104,7 +1137,7 @@ static int edit_dist(const char* utf8word1, const char* utf8word2)
{
gunichar * word1, * word2;
glong len1, len2, cost, i, j;
- int v1, v2, v3;
+ int v1, v2, v3, v4;
int* table;
word1 = g_utf8_to_ucs4_fast(utf8word1, -1, &len1);
@@ -1131,6 +1164,13 @@ static int edit_dist(const char* utf8word1, const char* utf8word2)
v1 = table[(i-1)*(len2+1)+j] + 1;
v2 = table[i*(len2+1)+(j-1)] + 1;
v3 = table[(i-1)*(len2+1)+(j-1)] + cost;
+
+ if(i > 1 && j > 1 && word1[i-1] == word2[j-2] && word1[i-2] == word2[j-1]) {
+ v4 = table[(i-2)*(len2+1)+(j-2)] + cost;
+ if(v4 < v1)
+ v1 = v4;
+ }
+
if (v1 < v2 && v1 < v3) {
cost = v1;
} else if (v2 < v3) {
@@ -1147,3 +1187,4 @@ static int edit_dist(const char* utf8word1, const char* utf8word2)
return cost;
}
+
diff --git a/unittests/pwl/enchant_pwl_tests.cpp b/unittests/pwl/enchant_pwl_tests.cpp
index 50d343f..24fcc35 100644
--- a/unittests/pwl/enchant_pwl_tests.cpp
+++ b/unittests/pwl/enchant_pwl_tests.cpp
@@ -1392,3 +1392,100 @@ TEST_FIXTURE(EnchantPwl_TestFixture,
CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_HasProperSubset_Transpose1)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("small"); //1
+
+ AddWordsToDictionary(sWords);
+
+ AddWordToDictionary("smallest"); //4
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("smlal");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_HasProperSubset_Transpose2)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("catch"); //2
+
+ AddWordsToDictionary(sWords);
+
+ AddWordToDictionary("catcher"); //4
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("acthc");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_HasProperSubset_Transpose3)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("hasten"); //3
+
+ AddWordsToDictionary(sWords);
+
+ AddWordToDictionary("hastens"); //4
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("ahtsne");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_Transpose1Insert2)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("catch"); //3
+
+ AddWordsToDictionary(sWords);
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("act");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_Transpose2)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("catch"); //2
+
+ AddWordsToDictionary(sWords);
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("acthc");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+