author    Peng Wu <alexepico@gmail.com>    2022-09-20 17:24:23 +0800
committer Peng Wu <alexepico@gmail.com>    2022-09-21 15:10:34 +0800
commit    698c01539cc8636dd79bd12a49e2d9a960209f98 (patch)
tree      6b469c2eddee8f4802ce8ea0546e63c9fc4830fb
parent    baedddb15b41d7d6dbe1030195c0b343159a3b6c (diff)
download  libpinyin-698c01539cc8636dd79bd12a49e2d9a960209f98.tar.gz
Support longer candidate
-rw-r--r--  src/pinyin.cpp                                 86
-rw-r--r--  src/pinyin.h                                    1
-rw-r--r--  src/storage/chewing_large_table2_kyotodb.cpp    2
3 files changed, 86 insertions, 3 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 59e424b..68e237c 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -1662,6 +1662,70 @@ static void _compute_frequency_of_items(pinyin_context_t * context,
}
}
+static bool _prepend_longer_candidates(pinyin_instance_t * instance,
+                                       CandidateVector candidates) {
+
+    pinyin_context_t * & context = instance->m_context;
+    FacadePhraseIndex * & phrase_index = context->m_phrase_index;
+    PhoneticKeyMatrix & matrix = instance->m_matrix;
+    size_t prefix_len = instance->m_parsed_key_len;
+
+    GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
+
+    PhraseTokens tokens;
+    memset(tokens, 0, sizeof(tokens));
+    phrase_index->prepare_tokens(tokens);
+    int result = search_suggestion_with_matrix
+        (context->m_pinyin_table, &matrix, prefix_len, tokens);
+    int num = reduce_tokens(tokens, tokenarray, false);
+    phrase_index->destroy_tokens(tokens);
+
+    phrase_token_t longer_token = null_token;
+    PhraseItem longer_item, item;
+    for (int i = 0; i < tokenarray->len; ++i) {
+        phrase_token_t token = g_array_index(tokenarray, phrase_token_t, i);
+
+        if (ERROR_OK != phrase_index->get_phrase_item(token, item))
+            continue;
+
+        /* skip phrases longer than prefix_len * 2 + 1 */
+        if (item.get_phrase_length() > (prefix_len * 2 + 1))
+            continue;
+
+        if (longer_token == null_token) {
+            longer_token = token;
+            phrase_index->get_phrase_item(longer_token, longer_item);
+            continue;
+        }
+
+        if (item.get_unigram_frequency() >
+            longer_item.get_unigram_frequency()) {
+            longer_token = token;
+            phrase_index->get_phrase_item(longer_token, longer_item);
+        }
+    }
+
+    if (longer_token == null_token) {
+        g_array_free(tokenarray, TRUE);
+        return false;
+    }
+
+    /* compute the unigram frequency. */
+    gfloat lambda = context->m_system_table_info.get_lambda();
+    guint32 total_freq = phrase_index->get_phrase_index_total_freq();
+    guint32 freq = ((1 - lambda) *
+                    longer_item.get_unigram_frequency() /
+                    (gfloat) total_freq) * 256 * 256 * 256;
+
+    /* prepend longer candidate to candidates. */
+    lookup_candidate_t candidate;
+    candidate.m_candidate_type = LONGER_CANDIDATE;
+    candidate.m_token = longer_token;
+    candidate.m_freq = freq;
+    g_array_prepend_val(candidates, candidate);
+
+    g_array_free(tokenarray, TRUE);
+    return true;
+}
+
static bool _prepend_sentence_candidates(pinyin_instance_t * instance,
CandidateVector candidates) {
const size_t size = instance->m_nbest_results.size();
@@ -1737,6 +1801,7 @@ static bool _compute_phrase_strings_of_items(pinyin_instance_t * instance,
break;
}
case NORMAL_CANDIDATE:
+ case LONGER_CANDIDATE:
case PREDICTED_BIGRAM_CANDIDATE:
_token_get_phrase
(instance->m_context->m_phrase_index,
@@ -1781,8 +1846,7 @@ static gint compare_indexed_item_with_phrase_string(gconstpointer lhs,
static bool _remove_duplicated_items_by_phrase_string
-(pinyin_instance_t * instance,
- CandidateVector candidates) {
+(pinyin_instance_t * instance, CandidateVector candidates) {
size_t i;
/* create the GArray of indexed item */
GArray * indices = g_array_new(FALSE, FALSE, sizeof(size_t));
@@ -1809,6 +1873,22 @@ static bool _remove_duplicated_items_by_phrase_string
cur_item->m_phrase_string)) {
/* found duplicated candidates */
+            /* as the longer candidates are longer than the pinyin input,
+               only longer candidates can be equal to each other here. */
+
+            if (LONGER_CANDIDATE == saved_item->m_candidate_type &&
+                LONGER_CANDIDATE == cur_item->m_candidate_type) {
+                /* keep the higher probability one */
+                if (saved_item->m_freq < cur_item->m_freq) {
+                    cur_item->m_candidate_type = ZOMBIE_CANDIDATE;
+                } else {
+                    saved_item->m_candidate_type = ZOMBIE_CANDIDATE;
+                    saved_item = cur_item;
+                }
+
+                continue;
+            }
+
/* both are nbest match candidate */
if (NBEST_MATCH_CANDIDATE == saved_item->m_candidate_type &&
NBEST_MATCH_CANDIDATE == cur_item->m_candidate_type) {
@@ -2002,6 +2082,8 @@ bool pinyin_guess_candidates(pinyin_instance_t * instance,
/* post process to remove duplicated candidates */
+ _prepend_longer_candidates(instance, instance->m_candidates);
+
_prepend_sentence_candidates(instance, instance->m_candidates);
_compute_phrase_strings_of_items(instance, instance->m_candidates);
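
Note on the candidate weight assigned in _prepend_longer_candidates above: the phrase's unigram count is weighted by (1 - lambda), normalized by the total frequency of the phrase index, and scaled by 256 * 256 * 256 so the fractional probability fits a guint32 and can be compared with the m_freq of the other candidates. A minimal standalone sketch of that arithmetic follows; the helper name and the sample numbers are illustrative only, not taken from the source:

#include <glib.h>
#include <stdio.h>

/* Hypothetical helper mirroring the scaling in _prepend_longer_candidates:
 * weight the phrase's unigram count by (1 - lambda), normalize it by the
 * total frequency of the phrase index, then multiply by 256^3 to map the
 * probability into the guint32 range used for sorting candidates. */
static guint32 scale_unigram_freq(gfloat lambda, guint32 unigram_freq,
                                  guint32 total_freq) {
    return ((1 - lambda) * unigram_freq / (gfloat) total_freq)
        * 256 * 256 * 256;
}

int main(void) {
    /* illustrative numbers only */
    gfloat lambda = 0.5;
    guint32 unigram_freq = 1200;
    guint32 total_freq = 5000000;
    printf("scaled freq: %u\n",
           scale_unigram_freq(lambda, unigram_freq, total_freq));
    return 0;
}
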
diff --git a/src/pinyin.h b/src/pinyin.h
index 4b631fe..cc1a5cb 100644
--- a/src/pinyin.h
+++ b/src/pinyin.h
@@ -46,6 +46,7 @@ typedef enum _lookup_candidate_type_t{
PREDICTED_BIGRAM_CANDIDATE,
PREDICTED_PREFIX_CANDIDATE,
ADDON_CANDIDATE,
+ LONGER_CANDIDATE,
} lookup_candidate_type_t;
typedef enum _sort_option_t{
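
With LONGER_CANDIDATE exposed in the public enum, a front end can tell the prepended longer candidate apart from ordinary ones when it enumerates the lookup results. A rough usage sketch, assuming the usual libpinyin accessors (pinyin_get_n_candidate, pinyin_get_candidate, pinyin_get_candidate_type, pinyin_get_candidate_string); exact signatures may vary between releases and error handling is omitted:

#include <pinyin.h>

/* Sketch: walk the candidates produced by pinyin_guess_candidates() and
 * flag the ones whose type is LONGER_CANDIDATE, i.e. phrases longer than
 * the parsed pinyin prefix.  `instance` is assumed to be a fully
 * initialized pinyin_instance_t with the pinyin input already parsed. */
static void list_candidates(pinyin_instance_t * instance) {
    guint num = 0;
    pinyin_get_n_candidate(instance, &num);

    for (guint i = 0; i < num; ++i) {
        lookup_candidate_t * candidate = NULL;
        pinyin_get_candidate(instance, i, &candidate);

        lookup_candidate_type_t type;
        pinyin_get_candidate_type(instance, candidate, &type);

        const gchar * phrase = NULL;
        pinyin_get_candidate_string(instance, candidate, &phrase);

        if (LONGER_CANDIDATE == type)
            g_print("%s\t(longer than the typed prefix)\n", phrase);
        else
            g_print("%s\n", phrase);
    }
}
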
diff --git a/src/storage/chewing_large_table2_kyotodb.cpp b/src/storage/chewing_large_table2_kyotodb.cpp
index b05f100..d5aeeb9 100644
--- a/src/storage/chewing_large_table2_kyotodb.cpp
+++ b/src/storage/chewing_large_table2_kyotodb.cpp
@@ -228,7 +228,7 @@ int ChewingLargeTable2::search_suggestion_internal
entry->m_chunk.set_chunk(chunk.begin(), chunk.size(), NULL);
- result = entry->search(prefix_keys, tokens) | result;
+ result = entry->search_suggestion(prefix_len, prefix_keys, tokens) | result;
entry->m_chunk.set_size(0);