/*
* libpinyin
* Library to deal with pinyin.
*
* Copyright (C) 2010,2013 Peng Wu
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include
#include
#include
#include
#include "pinyin_internal.h"
#include "utils_helper.h"
void print_help(){
printf("Usage: spseg [--generate-extra-enter] [-o outputfile] [inputfile]\n");
}
static gboolean gen_extra_enter = FALSE;
static gchar * outputfile = NULL;
static GOptionEntry entries[] =
{
{"outputfile", 'o', 0, G_OPTION_ARG_FILENAME, &outputfile, "output", "filename"},
{"generate-extra-enter", 0, 0, G_OPTION_ARG_NONE, &gen_extra_enter, "generate ", NULL},
{NULL}
};
/* graph shortest path sentence segment. */
/* Note:
* Currently libpinyin only supports ucs4 characters, as this is a
* pre-processor tool for raw corpus, it will skip all sentences
* which contains non-ucs4 characters.
*/
enum CONTEXT_STATE{
CONTEXT_INIT,
CONTEXT_SEGMENTABLE,
CONTEXT_UNKNOWN
};
struct SegmentStep{
phrase_token_t m_handle;
ucs4_t * m_phrase;
size_t m_phrase_len;
//use formula W = number of words. Zero handle means one word.
guint m_nword;
//backtrace information, -1 one step backward.
gint m_backward_nstep;
public:
SegmentStep(){
m_handle = null_token;
m_phrase = NULL;
m_phrase_len = 0;
m_nword = UINT_MAX;
m_backward_nstep = -0;
}
};
bool backtrace(GArray * steps, glong phrase_len, GArray * strings);
/* Note: do not free phrase, as it is used by strings (array of segment). */
bool segment(FacadePhraseTable3 * phrase_table,
FacadePhraseIndex * phrase_index,
GArray * current_ucs4,
GArray * strings /* Array of SegmentStep. */){
ucs4_t * phrase = (ucs4_t *)current_ucs4->data;
guint phrase_len = current_ucs4->len;
/* Prepare for shortest path segment dynamic programming. */
GArray * steps = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
SegmentStep step;
for ( glong i = 0; i < phrase_len + 1; ++i ){
g_array_append_val(steps, step);
}
SegmentStep * first_step = &g_array_index(steps, SegmentStep, 0);
first_step->m_nword = 0;
PhraseTokens tokens;
memset(tokens, 0, sizeof(PhraseTokens));
phrase_index->prepare_tokens(tokens);
for ( glong i = 0; i < phrase_len + 1; ++i ) {
SegmentStep * step_begin = &g_array_index(steps, SegmentStep, i);
size_t nword = step_begin->m_nword;
for ( glong k = i + 1; k < phrase_len + 1; ++k ) {
size_t len = k - i;
ucs4_t * cur_phrase = phrase + i;
phrase_token_t token = null_token;
phrase_index->clear_tokens(tokens);
int result = phrase_table->search(len, cur_phrase, tokens);
int num = get_first_token(tokens, token);
if ( !(result & SEARCH_OK) ){
token = null_token;
if ( 1 != len )
continue;
}
++nword;
SegmentStep * step_end = &g_array_index(steps, SegmentStep, k);
if ( nword < step_end->m_nword ) {
step_end->m_handle = token;
step_end->m_phrase = cur_phrase;
step_end->m_phrase_len = len;
step_end->m_nword = nword;
step_end->m_backward_nstep = i - k;
}
if ( !(result & SEARCH_CONTINUED) )
break;
}
}
phrase_index->destroy_tokens(tokens);
return backtrace(steps, phrase_len, strings);
}
bool backtrace(GArray * steps, glong phrase_len, GArray * strings){
/* backtracing to get the result. */
size_t cur_step = phrase_len;
g_array_set_size(strings, 0);
while ( cur_step ){
SegmentStep * step = &g_array_index(steps, SegmentStep, cur_step);
g_array_append_val(strings, *step);
cur_step = cur_step + step->m_backward_nstep;
/* intended to avoid leaking internal informations. */
step->m_nword = 0; step->m_backward_nstep = 0;
}
/* reverse the strings. */
for ( size_t i = 0; i < strings->len / 2; ++i ) {
SegmentStep * head, * tail;
head = &g_array_index(strings, SegmentStep, i);
tail = &g_array_index(strings, SegmentStep, strings->len - 1 - i );
SegmentStep tmp;
tmp = *head;
*head = *tail;
*tail = tmp;
}
g_array_free(steps, TRUE);
return true;
}
bool deal_with_segmentable(FacadePhraseTable3 * phrase_table,
FacadePhraseIndex * phrase_index,
GArray * current_ucs4,
FILE * output){
/* do segment stuff. */
GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
segment(phrase_table, phrase_index, current_ucs4, strings);
/* print out the split phrase. */
for ( glong i = 0; i < strings->len; ++i ) {
SegmentStep * step = &g_array_index(strings, SegmentStep, i);
char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
fprintf(output, "%d %s\n", step->m_handle, string);
g_free(string);
}
g_array_free(strings, TRUE);
return true;
}
bool deal_with_unknown(GArray * current_ucs4, FILE * output){
char * result_string = g_ucs4_to_utf8
( (ucs4_t *) current_ucs4->data, current_ucs4->len,
NULL, NULL, NULL);
fprintf(output, "%d %s\n", null_token, result_string);
g_free(result_string);
return true;
}
int main(int argc, char * argv[]){
FILE * input = stdin;
FILE * output = stdout;
setlocale(LC_ALL, "");
GError * error = NULL;
GOptionContext * context;
context = g_option_context_new("- shortest path segment");
g_option_context_add_main_entries(context, entries, NULL);
if (!g_option_context_parse(context, &argc, &argv, &error)) {
g_print("option parsing failed:%s\n", error->message);
exit(EINVAL);
}
if (outputfile) {
output = fopen(outputfile, "w");
if (NULL == output) {
perror("open file failed");
exit(EINVAL);
}
}
if (argc > 2) {
fprintf(stderr, "too many arguments.\n");
exit(EINVAL);
}
if (2 == argc) {
input = fopen(argv[1], "r");
if (NULL == input) {
perror("open file failed");
exit(EINVAL);
}
}
SystemTableInfo2 system_table_info;
bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
if (!retval) {
fprintf(stderr, "load table.conf failed.\n");
exit(ENOENT);
}
/* init phrase table */
FacadePhraseTable3 phrase_table;
phrase_table.load(SYSTEM_PHRASE_INDEX, NULL);
/* init phrase index */
FacadePhraseIndex phrase_index;
const pinyin_table_info_t * phrase_files =
system_table_info.get_default_tables();
if (!load_phrase_index(phrase_files, &phrase_index))
exit(ENOENT);
CONTEXT_STATE state, next_state;
GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));
PhraseTokens tokens;
memset(tokens, 0, sizeof(PhraseTokens));
phrase_index.prepare_tokens(tokens);
char * linebuf = NULL; size_t size = 0; ssize_t read;
while( (read = getline(&linebuf, &size, input)) != -1 ){
if ( '\n' == linebuf[strlen(linebuf) - 1] ) {
linebuf[strlen(linebuf) - 1] = '\0';
}
/* check non-ucs4 characters. */
const glong num_of_chars = g_utf8_strlen(linebuf, -1);
glong len = 0;
ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
if ( len != num_of_chars ) {
fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
fprintf(output, "%d \n", null_token);
continue;
}
/* only new-line persists. */
if ( 0 == num_of_chars ) {
fprintf(output, "%d \n", null_token);
continue;
}
state = CONTEXT_INIT;
int result = phrase_table.search( 1, sentence, tokens);
g_array_append_val( current_ucs4, sentence[0]);
if ( result & SEARCH_OK )
state = CONTEXT_SEGMENTABLE;
else
state = CONTEXT_UNKNOWN;
for ( int i = 1; i < num_of_chars; ++i) {
int result = phrase_table.search( 1, sentence + i, tokens);
if ( result & SEARCH_OK )
next_state = CONTEXT_SEGMENTABLE;
else
next_state = CONTEXT_UNKNOWN;
if ( state == next_state ){
g_array_append_val(current_ucs4, sentence[i]);
continue;
}
assert ( state != next_state );
if ( state == CONTEXT_SEGMENTABLE )
deal_with_segmentable(&phrase_table, &phrase_index,
current_ucs4, output);
if ( state == CONTEXT_UNKNOWN )
deal_with_unknown(current_ucs4, output);
/* save the current character */
g_array_set_size(current_ucs4, 0);
g_array_append_val(current_ucs4, sentence[i]);
state = next_state;
}
if ( current_ucs4->len ) {
/* this seems always true. */
if ( state == CONTEXT_SEGMENTABLE )
deal_with_segmentable(&phrase_table, &phrase_index,
current_ucs4, output);
if ( state == CONTEXT_UNKNOWN )
deal_with_unknown(current_ucs4, output);
g_array_set_size(current_ucs4, 0);
}
/* print extra enter */
if ( gen_extra_enter )
fprintf(output, "%d \n", null_token);
g_free(sentence);
}
phrase_index.destroy_tokens(tokens);
/* print enter at file tail */
fprintf(output, "%d \n", null_token);
g_array_free(current_ucs4, TRUE);
free(linebuf);
fclose(input);
fclose(output);
return 0;
}