diff options
author | Eric Albright <eric_albright@sil.org> | 2007-12-11 02:45:26 +0000 |
---|---|---|
committer | Eric Albright <eric_albright@sil.org> | 2007-12-11 02:45:26 +0000 |
commit | a48e108a6cd3349904b91cba9cfb0bc20edb9ebb (patch) | |
tree | 0bda2e4097af42261448f33aff03bc8fbd4f06c5 | |
parent | 32791d6f1414a1bcc3bd99a1816ce629b43790db (diff) | |
download | enchant-a48e108a6cd3349904b91cba9cfb0bc20edb9ebb.tar.gz |
Use Damerau-Levenshtein edit distance algorithm instead of Levenshtein.
git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@22369 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
-rw-r--r-- | src/pwl.c | 67 | ||||
-rw-r--r-- | unittests/pwl/enchant_pwl_tests.cpp | 97 |
2 files changed, 151 insertions, 13 deletions
@@ -902,6 +902,25 @@ static void enchant_trie_remove(EnchantTrie* trie,const char *const word) } } +static EnchantTrie* enchant_trie_get_subtrie(EnchantTrie* trie, + EnchantTrieMatcher* matcher, + char** nxtChS) +{ + EnchantTrie* subtrie; + + if(trie->subtries == NULL || *nxtChS == NULL) + return NULL; + + subtrie = g_hash_table_lookup(trie->subtries,*nxtChS); + if(subtrie == NULL && matcher->mode == case_insensitive) { + char* nxtChSUp = g_utf8_strup(*nxtChS, -1); /* we ignore the title case scenario since that will give us an edit_distance of one which is acceptable since this mode is used for suggestions*/ + g_free(*nxtChS); + *nxtChS = nxtChSUp; + subtrie = g_hash_table_lookup(trie->subtries,nxtChSUp); + } + return subtrie; +} + static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher) { int errs = 0; @@ -965,13 +984,7 @@ static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matc (nxtChI - matcher->word_pos)); /* Precisely match the first character, and recurse */ - subtrie = g_hash_table_lookup(trie->subtries,nxtChS); - if(subtrie == NULL && matcher->mode == case_insensitive) { - char* nxtChSUp = g_utf8_strup(nxtChS, nxtChI - matcher->word_pos); /* we ignore the title case scenario since that will give us an edit_distance of one which is acceptable since this mode is used for suggestions*/ - g_free(nxtChS); - nxtChS = nxtChSUp; - subtrie = g_hash_table_lookup(trie->subtries,nxtChSUp); - } + subtrie = enchant_trie_get_subtrie(trie, matcher, &nxtChS); if (subtrie != NULL) { enchant_trie_matcher_pushpath(matcher,nxtChS); @@ -992,7 +1005,7 @@ static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matc enchant_trie_find_matches(trie,matcher); matcher->word_pos = oldPos; } - /* for each subtrie, match on delete or substitute word[0] */ + /* for each subtrie, match on delete or substitute word[0] or transpose word[0] and word[1] */ g_hash_table_foreach(trie->subtries, enchant_trie_find_matches_cb, matcher); @@ -1001,8 +1014,8 @@ static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matc static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV) { - char* key; - EnchantTrie* subtrie; + char* key, *key2; + EnchantTrie* subtrie, *subtrie2; EnchantTrieMatcher* matcher; ssize_t nxtChI, oldPos; @@ -1021,13 +1034,33 @@ static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcher /* Match on deleting word[0] */ enchant_trie_find_matches(subtrie,matcher); - /* Match on transposing word[0] */ + /* Match on substituting word[0] */ oldPos = matcher->word_pos; matcher->word_pos = nxtChI; enchant_trie_find_matches(subtrie,matcher); - matcher->word_pos = oldPos; enchant_trie_matcher_poppath(matcher,strlen(key)); + + /* Match on transposing word[0] and word[1] */ + key2 = g_strndup(&matcher->word[oldPos],nxtChI-oldPos); + subtrie2 = enchant_trie_get_subtrie(subtrie, matcher, &key2); + + if(subtrie2 != NULL) { + nxtChI = (ssize_t) (g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word); + if (strncmp(key,&matcher->word[matcher->word_pos],nxtChI-matcher->word_pos) == 0) { + matcher->word_pos = nxtChI; + enchant_trie_matcher_pushpath(matcher,key); + enchant_trie_matcher_pushpath(matcher,key2); + + enchant_trie_find_matches(subtrie2,matcher); + enchant_trie_matcher_poppath(matcher,strlen(key2)); + enchant_trie_matcher_poppath(matcher,strlen(key)); +} + } + + g_free(key2); + + matcher->word_pos = oldPos; } static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word, @@ -1104,7 +1137,7 @@ static int edit_dist(const char* utf8word1, const char* utf8word2) { gunichar * word1, * word2; glong len1, len2, cost, i, j; - int v1, v2, v3; + int v1, v2, v3, v4; int* table; word1 = g_utf8_to_ucs4_fast(utf8word1, -1, &len1); @@ -1131,6 +1164,13 @@ static int edit_dist(const char* utf8word1, const char* utf8word2) v1 = table[(i-1)*(len2+1)+j] + 1; v2 = table[i*(len2+1)+(j-1)] + 1; v3 = table[(i-1)*(len2+1)+(j-1)] + cost; + + if(i > 1 && j > 1 && word1[i-1] == word2[j-2] && word1[i-2] == word2[j-1]) { + v4 = table[(i-2)*(len2+1)+(j-2)] + cost; + if(v4 < v1) + v1 = v4; + } + if (v1 < v2 && v1 < v3) { cost = v1; } else if (v2 < v3) { @@ -1147,3 +1187,4 @@ static int edit_dist(const char* utf8word1, const char* utf8word2) return cost; } + diff --git a/unittests/pwl/enchant_pwl_tests.cpp b/unittests/pwl/enchant_pwl_tests.cpp index 50d343f..24fcc35 100644 --- a/unittests/pwl/enchant_pwl_tests.cpp +++ b/unittests/pwl/enchant_pwl_tests.cpp @@ -1392,3 +1392,100 @@ TEST_FIXTURE(EnchantPwl_TestFixture, CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_HasProperSubset_Transpose1)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("small"); //1
+
+ AddWordsToDictionary(sWords);
+
+ AddWordToDictionary("smallest"); //4
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("smlal");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_HasProperSubset_Transpose2)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("catch"); //2
+
+ AddWordsToDictionary(sWords);
+
+ AddWordToDictionary("catcher"); //4
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("acthc");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_HasProperSubset_Transpose3)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("hasten"); //3
+
+ AddWordsToDictionary(sWords);
+
+ AddWordToDictionary("hastens"); //4
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("ahtsne");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_Transpose1Insert2)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("catch"); //3
+
+ AddWordsToDictionary(sWords);
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("act");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+
+TEST_FIXTURE(EnchantPwl_TestFixture,
+ PwlSuggest_Transpose2)
+{
+ std::vector<const std::string> sWords;
+ sWords.push_back("catch"); //2
+
+ AddWordsToDictionary(sWords);
+
+ std::vector<const std::string> suggestions = GetSuggestionsFromWord("acthc");
+
+ CHECK_EQUAL(sWords.size(), suggestions.size());
+
+ std::sort(sWords.begin(), sWords.end());
+ std::sort(suggestions.begin(), suggestions.end());
+
+ CHECK_ARRAY_EQUAL(sWords, suggestions, std::min(sWords.size(), suggestions.size()));
+}
+
|