author    Peng Wu <alexepico@gmail.com>    2022-09-20 17:24:23 +0800
committer Peng Wu <alexepico@gmail.com>    2022-09-21 15:10:34 +0800
commit    698c01539cc8636dd79bd12a49e2d9a960209f98 (patch)
tree      6b469c2eddee8f4802ce8ea0546e63c9fc4830fb
parent    baedddb15b41d7d6dbe1030195c0b343159a3b6c (diff)
download  libpinyin-698c01539cc8636dd79bd12a49e2d9a960209f98.tar.gz
Support longer candidate
-rw-r--r--  src/pinyin.cpp                                 86
-rw-r--r--  src/pinyin.h                                    1
-rw-r--r--  src/storage/chewing_large_table2_kyotodb.cpp    2
3 files changed, 86 insertions, 3 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 59e424b..68e237c 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -1662,6 +1662,70 @@ static void _compute_frequency_of_items(pinyin_context_t * context,
}
}
+static bool _prepend_longer_candidates(pinyin_instance_t * instance,
+                                       CandidateVector candidates) {
+
+    pinyin_context_t * & context = instance->m_context;
+    FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+    PhoneticKeyMatrix & matrix = instance->m_matrix;
+    size_t prefix_len = instance->m_parsed_key_len;
+
+    GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(tokens));
+    phrase_index->prepare_tokens(tokens);
+    int result = search_suggestion_with_matrix
+        (context->m_pinyin_table, &matrix, prefix_len, tokens);
+    int num = reduce_tokens(tokens, tokenarray, false);
+    phrase_index->destroy_tokens(tokens);
+
+    phrase_token_t longer_token = null_token;
+    PhraseItem longer_item, item;
+    for (int i = 0; i < tokenarray->len; ++i) {
+        phrase_token_t token = g_array_index(tokenarray, phrase_token_t, i);
+
+        if (ERROR_OK != phrase_index->get_phrase_item(token, item))
+            continue;
+
+        /* skip phrases longer than prefix_len * 2 + 1 */
+        if (item.get_phrase_length() > (prefix_len * 2 + 1))
+            continue;
+
+        if (longer_token == null_token) {
+            longer_token = token;
+            phrase_index->get_phrase_item(longer_token, longer_item);
+            continue;
+        }
+
+        if (item.get_unigram_frequency() >
+            longer_item.get_unigram_frequency()) {
+            longer_token = token;
+            phrase_index->get_phrase_item(longer_token, longer_item);
+        }
+    }
+
+    if (longer_token == null_token) {
+        g_array_free(tokenarray, TRUE);
+        return false;
+    }
+
+    /* compute the unigram frequency. */
+    gfloat lambda = context->m_system_table_info.get_lambda();
+    guint32 total_freq = phrase_index->get_phrase_index_total_freq();
+    guint32 freq = ((1 - lambda) *
+                    longer_item.get_unigram_frequency() /
+                    (gfloat) total_freq) * 256 * 256 * 256;
+
+    /* prepend longer candidate to candidates. */
+    lookup_candidate_t candidate;
+    candidate.m_candidate_type = LONGER_CANDIDATE;
+    candidate.m_token = longer_token;
+    candidate.m_freq = freq;
+    g_array_prepend_val(candidates, candidate);
+
+    g_array_free(tokenarray, TRUE);
+    return true;
+}
+
static bool _prepend_sentence_candidates(pinyin_instance_t * instance,
CandidateVector candidates) {
const size_t size = instance->m_nbest_results.size();
@@ -1737,6 +1801,7 @@ static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
break;
}
case NORMAL_CANDIDATE:
+ case LONGER_CANDIDATE:
case PREDICTED_BIGRAM_CANDIDATE:
_token_get_phrase
(instance->m_context->m_phrase_index,
@@ -1781,8 +1846,7 @@ static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
static bool _remove_duplicated_items_by_phrase_string
-(pinyin_instance_t * instance,
- CandidateVector candidates) {
+(pinyin_instance_t * instance, CandidateVector candidates) {
size_t i;
/* create the GArray of indexed item */
GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
@@ -1809,6 +1873,22 @@ static bool _remove_duplicated_items_by_phrase_string
cur_item->m_phrase_string)) {
/* found duplicated candidates */
+            /* as the longer candidates are longer than the pinyin input,
+               only longer candidates can be equal to each other here. */
+
+            if (LONGER_CANDIDATE == saved_item->m_candidate_type &&
+                LONGER_CANDIDATE == cur_item->m_candidate_type) {
+                /* keep the higher probability one */
+                if (saved_item->m_freq < cur_item->m_freq) {
+                    cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
+                } else {
+                    saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
+                    saved_item = cur_item;
+                }
+
+                continue;
+            }
+
/* both are nbest match candidate */
if (NBEST_MATCH_CANDIDATE == saved_item->m_candidate_type &&
NBEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
@@ -2002,6 +2082,8 @@ bool pinyin_guess_candidates(pinyin_instance_t * instance,
/* post process to remove duplicated candidates */
+ _prepend_longer_candidates(instance, instance->m_candidates);
+
_prepend_sentence_candidates(instance, instance->m_candidates);
_compute_phrase_strings_of_items(instance, instance->m_candidates);
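
Note on the candidate weight assigned in _prepend_longer_candidates above: the phrase's unigram count is weighted by (1 - lambda), normalized by the total frequency of the phrase index, and scaled by 256 * 256 * 256 so the fractional probability fits a guint32 and can be compared with the m_freq of the other candidates. A minimal standalone sketch of that arithmetic follows; the helper name and the sample numbers are illustrative only, not taken from the source:

#include <glib.h>
#include <stdio.h>

/* Hypothetical helper mirroring the scaling in _prepend_longer_candidates:
 * weight the phrase's unigram count by (1 - lambda), normalize it by the
 * total frequency of the phrase index, then multiply by 256^3 to map the
 * probability into the guint32 range used for sorting candidates. */
static guint32 scale_unigram_freq(gfloat lambda, guint32 unigram_freq,
                                  guint32 total_freq) {
    return ((1 - lambda) * unigram_freq / (gfloat) total_freq)
        * 256 * 256 * 256;
}

int main(void) {
    /* illustrative numbers only */
    gfloat lambda = 0.5;
    guint32 unigram_freq = 1200;
    guint32 total_freq = 5000000;
    printf("scaled freq: %u\n",
           scale_unigram_freq(lambda, unigram_freq, total_freq));
    return 0;
}
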
diff --git a/src/pinyin.h b/src/pinyin.h
index 4b631fe..cc1a5cb 100644
--- a/src/pinyin.h
+++ b/src/pinyin.h
@@ -46,6 +46,7 @@ typedef enum _lookup_candidate_type_t{
PREDICTED_BIGRAM_CANDIDATE,
PREDICTED_PREFIX_CANDIDATE,
ADDON_CANDIDATE,
+ LONGER_CANDIDATE,
} lookup_candidate_type_t;
typedef enum _sort_option_t{
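
With LONGER_CANDIDATE exposed in the public enum, a front end can tell the prepended longer candidate apart from ordinary ones when it enumerates the lookup results. A rough usage sketch, assuming the usual libpinyin accessors (pinyin_get_n_candidate, pinyin_get_candidate, pinyin_get_candidate_type, pinyin_get_candidate_string); exact signatures may vary between releases and error handling is omitted:

#include <pinyin.h>

/* Sketch: walk the candidates produced by pinyin_guess_candidates() and
 * flag the ones whose type is LONGER_CANDIDATE, i.e. phrases longer than
 * the parsed pinyin prefix.  `instance` is assumed to be a fully
 * initialized pinyin_instance_t with the pinyin input already parsed. */
static void list_candidates(pinyin_instance_t * instance) {
    guint num = 0;
    pinyin_get_n_candidate(instance, &num);

    for (guint i = 0; i < num; ++i) {
        lookup_candidate_t * candidate = NULL;
        pinyin_get_candidate(instance, i, &candidate);

        lookup_candidate_type_t type;
        pinyin_get_candidate_type(instance, candidate, &type);

        const gchar * phrase = NULL;
        pinyin_get_candidate_string(instance, candidate, &phrase);

        if (LONGER_CANDIDATE == type)
            g_print("%s\t(longer than the typed prefix)\n", phrase);
        else
            g_print("%s\n", phrase);
    }
}
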
diff --git a/src/storage/chewing_large_table2_kyotodb.cpp b/src/storage/chewing_large_table2_kyotodb.cpp
index b05f100..d5aeeb9 100644
--- a/src/storage/chewing_large_table2_kyotodb.cpp
+++ b/src/storage/chewing_large_table2_kyotodb.cpp
@@ -228,7 +228,7 @@ int ChewingLargeTable2::search_suggestion_internal
entry->m_chunk.set_chunk(chunk.begin(), chunk.size(), NULL);
- result = entry->search(prefix_keys, tokens) | result;
+ result = entry->search_suggestion(prefix_len, prefix_keys, tokens) | result;
entry->m_chunk.set_size(0);