diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2009-02-07 16:32:56 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2009-02-07 16:32:56 +0000 |
commit | a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 (patch) | |
tree | a966aeee62e69ae3ad13275d07ddb15049b14e0e /mkworddic/mkdic.c | |
download | anthy-a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0.tar.gz |
anthy-9100hHEADanthy-9100hmaster
Diffstat (limited to 'mkworddic/mkdic.c')
-rw-r--r-- | mkworddic/mkdic.c | 1198 |
1 files changed, 1198 insertions, 0 deletions
diff --git a/mkworddic/mkdic.c b/mkworddic/mkdic.c new file mode 100644 index 0000000..7c99fe0 --- /dev/null +++ b/mkworddic/mkdic.c @@ -0,0 +1,1198 @@ +/* + * cannadic形式のファイルから辞書ファイルを作る + * + * Funded by IPA未踏ソフトウェア創造事業 2002 1/1 + * + * Copyright (C) 2000-2007 TABATA Yusuke + * Copyright (C) 2005 YOSHIDA Yuichi + * Copyright (C) 2001-2002 TAKAI Kousuke + */ +/* + * 辞書は読みをindexとし、品詞や変換後の単語(=entry)を検索 + * する構造になっている。 + * + * 読み -> 単語、単語、、 + * + * 辞書ファイルはネットワークバイトオーダーを用いる。 + * + * 辞書ファイルは複数のセクションから構成されている + * 0 ヘッダ 16*4 bytes + * 2 読みのインデックス (読み512個ごと) + * 3 読み + * 4 ページ + * 5 ページのインデックス + * 6 用例辞書(?) + * 7 読み hash + * + * source 元の辞書ファイル + * file_dic 生成するファイル + * + * yomi_hash 辞書ファイルに出力されるhashのbitmap + * index_hash このソース中でstruct yomi_entryを検索するためのhash + * + */ + +#include <sys/types.h> +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <ctype.h> + +#include <config.h> + +#include <anthy/anthy.h> +#include <anthy/xstr.h> +#include <anthy/wtype.h> +#include <anthy/ruleparser.h> +#include <anthy/word_dic.h> +#include <anthy/diclib.h> +#include "mkdic.h" + +#define MAX_LINE_LEN 10240 +#define NR_HEADER_SECTIONS 16 +#define SECTION_ALIGNMENT 8 +#define MAX_WTYPE_LEN 20 + +#define DEFAULT_FN "anthy.wdic" + +static const char *progname; + +/* writewords.cからアクセスするために、global変数 */ +FILE *yomi_entry_index_out, *yomi_entry_out; +FILE *page_out, *page_index_out; +/**/ +static FILE *uc_out; +static FILE *yomi_hash_out; +/* ハッシュの衝突の数、統計情報 */ +static int yomi_hash_collision; + +/* ファイル中の順序に従って並べる */ +struct file_section { + FILE **fpp; + char *fn; +} file_array[] = { + {&yomi_entry_index_out, NULL}, + {&yomi_entry_out, NULL}, + {&page_out, NULL}, + {&page_index_out, NULL}, + {&uc_out, NULL}, + {&yomi_hash_out, NULL}, + {NULL, NULL}, +}; + +/* 辞書生成の状態 */ +struct mkdic_stat { + /* 単語のリスト */ + struct yomi_entry_list yl; + /**/ + struct adjust_command ac_list; + /* 用例辞書 */ + struct uc_dict *ud; + /**/ + const char *output_fn; + /**/ + int input_encoding; + /**/ + int nr_excluded; + char **excluded_wtypes; +}; + +/* 辞書の出力先のファイルをオープンする */ +static void +open_output_files(void) +{ + struct file_section *fs; + for (fs = file_array; fs->fpp; fs ++) { + char *tmpdir = getenv("TMPDIR"); + fs->fn = NULL; + if (tmpdir) { + /* tmpfile()がTMPDIRを見ないため、TMPDIRを指定された場合mkstempを使う。*/ + char buf[256]; + int fd = -1; + snprintf(buf, sizeof(buf), "%s/mkanthydic.XXXXXX", tmpdir); + fd = mkstemp(buf); + if (fd == -1) { + *(fs->fpp) = NULL; + } else { + *(fs->fpp) = fdopen(fd, "w+"); + fs->fn = strdup(buf); + } + } else { + *(fs->fpp) = tmpfile(); + } + /**/ + if (!(*(fs->fpp))) { + fprintf (stderr, "%s: cannot open temporary file: %s\n", + progname, strerror (errno)); + exit (2); + } + } +} + +/* fflushする */ +static void +flush_output_files (void) +{ + struct file_section *fs; + for (fs = file_array; fs->fpp; fs ++) { + if (ferror(*(fs->fpp))) { + fprintf (stderr, "%s: write error\n", progname); + exit (1); + } + } + for (fs = file_array; fs->fpp; fs ++) { + if (fflush(*(fs->fpp))) { + fprintf (stderr, "%s: write error: %s\n", progname, strerror (errno)); + exit (1); + } + } +} + +/* ネットワークbyteorderで4bytes書き出す */ +void +write_nl(FILE *fp, int i) +{ + i = anthy_dic_htonl(i); + fwrite(&i, sizeof(int), 1, fp); +} + +static void +print_usage(void) +{ + printf("please do not use mkanthydic command directly.\n"); + exit(0); +} + +static char * +read_line(FILE *fp, char *buf) +{ + /* 長すぎる行を無視する */ + int toolong = 0; + + while (fgets(buf, MAX_LINE_LEN, fp)) { + int len = strlen(buf); + if (buf[0] == '#') { + continue ; + } + if (buf[len - 1] != '\n') { + toolong = 1; + continue ; + } + + buf[len - 1] = 0; + if (toolong) { + toolong = 0; + } else { + return buf; + } + } + return NULL; +} + +/** cannadic形式の辞書の行からindexとなる部分を取り出す */ +static xstr * +get_index_from_line(struct mkdic_stat *mds, char *buf) +{ + char *sp; + xstr *xs; + sp = strchr(buf, ' '); + if (!sp) { + /* 辞書のフォーマットがおかしい */ + return NULL; + } + *sp = 0; + xs = anthy_cstr_to_xstr(buf, mds->input_encoding); + *sp = ' '; + return xs; +} + +/** cannadic形式の辞書の行からindex以外の部分を取り出す */ +static char * +get_entry_from_line(char *buf) +{ + char *sp; + sp = strchr(buf, ' '); + while(*sp == ' ') { + sp ++; + } + return strdup(sp); +} + +static int +index_hash(xstr *xs) +{ + int i; + unsigned int h = 0; + for (i = 0; i < xs->len; i++) { + h += xs->str[i] * 11; + } + return (int)(h % YOMI_HASH); +} + +const char * +get_wt_name(const char *name) +{ + wtype_t dummy; + const char *res; + if (!strcmp(name, "#T35")) { + return "#T"; + } + res = anthy_type_to_wtype(name, &dummy); + if (!res) { + return "unknown"; + } + return res; +} + +/** 読みに対して、単語を一つを追加する */ +static void +push_back_word_entry(struct mkdic_stat *mds, + struct yomi_entry *ye, const char *wt_name, + int freq, const char *word, int order) +{ + wtype_t wt; + char *s; + if (freq == 0) { + return ; + } + if (!anthy_type_to_wtype(wt_name, &wt)) { + /* anthyの知らない品詞 */ + return ; + } + ye->entries = realloc(ye->entries, + sizeof(struct word_entry) * + (ye->nr_entries + 1)); + ye->entries[ye->nr_entries].ye = ye; + ye->entries[ye->nr_entries].wt_name = get_wt_name(wt_name); + ye->entries[ye->nr_entries].raw_freq = freq; + ye->entries[ye->nr_entries].feature = 0; + ye->entries[ye->nr_entries].source_order = order; + if (mds->input_encoding == ANTHY_EUC_JP_ENCODING) { + s = anthy_conv_euc_to_utf8(word); + } else { + s = strdup(word); + } + ye->entries[ye->nr_entries].word_utf8 = s; + ye->nr_entries ++; +} + +static int +parse_wtype(char *wtbuf, char *cur) +{ + /* 品詞 */ + char *t; + int freq; + if (strlen(cur) >= MAX_WTYPE_LEN) { + return 0; + } + strcpy(wtbuf, cur); + /* 頻度 */ + t = strchr(wtbuf, '*'); + freq = 1; + if (t) { + int tmp_freq; + *t = 0; + t++; + tmp_freq = atoi(t); + if (tmp_freq) { + freq = tmp_freq; + } + } + return freq; +} + +/* 複合語の要素の長さは 1,2,3, ... 9,a,b,c */ +static int +get_element_len(xchar xc) +{ + if (xc > '0' && xc <= '9') { + return xc - '0'; + } + if (xc >= 'a' && xc <= 'z') { + return xc - 'a' + 10; + } + return 0; +} + +/** 複合候補の形式チェック */ +static int +check_compound_candidate(struct mkdic_stat *mds, xstr *index, const char *cur) +{ + /* 読みの文字数の合計を数える */ + xstr *xs = anthy_cstr_to_xstr(cur, mds->input_encoding); + int i, total = 0; + for (i = 0; i < xs->len - 1; i++) { + if (xs->str[i] == '_') { + total += get_element_len(xs->str[i+1]); + } + } + anthy_free_xstr(xs); + /* 比較する */ + if (total != index->len) { + fprintf(stderr, "Invalid compound candidate (%s, length = %d).\n", + cur, total); + return 0; + } + return 1; +} + +static int +is_excluded_wtype(struct mkdic_stat *mds, char *wt) +{ + int i; + for (i = 0; i < mds->nr_excluded; i++) { + if (!strcmp(mds->excluded_wtypes[i], wt)) { + return 1; + } + } + return 0; +} + +static char * +find_token_end(char *cur) +{ + char *n; + for (n = cur; *n != ' ' && *n != '\0'; n++) { + if (*n == '\\') { + if (!n[1]) { + return NULL; + } + n++; + } + } + return n; +} + +/** 読みに対応する行を分割して、配列を構成する */ +static void +push_back_word_entry_line(struct mkdic_stat *mds, struct yomi_entry *ye, + const char *ent) +{ + char *buf = alloca(strlen(ent) + 1); + char *cur = buf; + char *n; + char wtbuf[MAX_WTYPE_LEN]; + int freq = 0; + int order = 0; + + strcpy(buf, ent); + wtbuf[0] = 0; + + while (1) { + /* トークンを\0で切る。curの後の空白か\0を探す */ + n = find_token_end(cur); + if (!n) { + fprintf(stderr, "invalid \\ at the end of line (%s).\n", + ent); + return ; + } + if (*n) { + *n = 0; + } else { + n = NULL; + } + /**/ + if (cur[0] == '#') { + if (isalpha((unsigned char)cur[1])) { + /* #XX*?? をパース */ + freq = parse_wtype(wtbuf, cur); + } else { + if (cur[1] == '_' && + check_compound_candidate(mds, ye->index_xstr, &cur[1])) { + /* #_ 複合候補 */ + push_back_word_entry(mds, ye, wtbuf, freq, cur, order); + order ++; + } + } + } else { + /* 品詞が除去リストに入っているかをチェック */ + if (!is_excluded_wtype(mds, wtbuf)) { + /* 単語を追加 */ + push_back_word_entry(mds, ye, wtbuf, freq, cur, order); + order ++; + }/* :to extract excluded words + else { + anthy_putxstr(ye->index_xstr); + printf(" %s*%d %s\n", wtbuf, freq, cur); + }*/ + } + if (!n) { + /* 行末 */ + return ; + } + cur = n; + cur ++; + } +} + +/** 同じ単語が無いかチェック */ +static int +check_same_word(struct yomi_entry *ye, int idx) +{ + struct word_entry *base = &ye->entries[idx]; + int i; + for (i = idx -1; i >= 0; i--) { + struct word_entry *cur = &ye->entries[i]; + if (base->raw_freq != cur->raw_freq) { + return 0; + } + if (strcmp(base->wt_name, cur->wt_name)) { + return 0; + } + if (strcmp(base->word_utf8, cur->word_utf8)) { + return 0; + } + /* 同じだった */ + return 1; + } + return 0; +} + +/** qsort用の比較関数 */ +static int +compare_word_entry_by_freq(const void *p1, const void *p2) +{ + const struct word_entry *e1 = p1; + const struct word_entry *e2 = p2; + return e2->raw_freq - e1->raw_freq; +} + +/** qsort用の比較関数 */ +static int +compare_word_entry_by_wtype(const void *p1, const void *p2) +{ + const struct word_entry *e1 = p1; + const struct word_entry *e2 = p2; + int ret = strcmp(e1->wt_name, e2->wt_name); + if (ret != 0) { + return ret; + } else { + return compare_word_entry_by_freq(p1, p2); + } +} + +/** 読みに対する単語を頻度順に並べ、いらない単語を消す */ +static int +normalize_word_entry(struct yomi_entry *ye) +{ + int i, nr_dup = 0; + if (!ye) { + return 0; + } + /* 単語を並べる */ + qsort(ye->entries, ye->nr_entries, + sizeof(struct word_entry), + compare_word_entry_by_freq); + /* ダブったら、0点 */ + for (i = 0; i < ye->nr_entries; i++) { + if (check_same_word(ye, i)) { + ye->entries[i].raw_freq = 0; + nr_dup ++; + } + } + /* 再びソート */ + qsort(ye->entries, ye->nr_entries, + sizeof(struct word_entry), + compare_word_entry_by_wtype); + return ye->nr_entries - nr_dup; +} + +/*その読みに対応するyomi_entryを返す +**/ +struct yomi_entry * +find_yomi_entry(struct yomi_entry_list *yl, xstr *index, int create) +{ + struct yomi_entry *ye; + int hash = index_hash(index); + int search = 0; + /* hash chainから探す */ + for (ye = yl->hash[hash];ye ; ye = ye->hash_next) { + search ++; + if (!anthy_xstrcmp(ye->index_xstr, index)) { + return ye; + } + } + if (!create) { + return NULL; + } + + /* 無いので確保 */ + ye = malloc(sizeof(struct yomi_entry)); + ye->nr_entries = 0; + ye->entries = 0; + ye->next = NULL; + ye->index_xstr = anthy_xstr_dup(index); + ye->index_str = NULL; + + /* hash chainにつなぐ */ + ye->hash_next = yl->hash[hash]; + yl->hash[hash] = ye; + + /* リストにつなぐ */ + + ye->next = yl->head; + yl->head = ye; + + yl->nr_entries ++; + + return ye; +} + +/* 辞書ファイル中のhash bitmapにマークを付ける */ +static void +mark_hash_array(unsigned char *hash_array, xstr *xs) +{ + int val, idx, bit, mask; + val = anthy_xstr_hash(xs); + val &= (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1); + idx=(val>>YOMI_HASH_ARRAY_SHIFT)&(YOMI_HASH_ARRAY_SIZE-1); + bit= val & ((1<<YOMI_HASH_ARRAY_SHIFT)-1); + mask = (1<<bit); + if (hash_array[idx] & mask) { + yomi_hash_collision ++; + } + hash_array[idx] |= mask; +} + +/* 読みhashのビットマップを作る */ +static void +mk_yomi_hash(FILE *yomi_hash_out, struct yomi_entry_list *yl) +{ + unsigned char *hash_array; + int i; + struct yomi_entry *ye; + hash_array = (unsigned char *)malloc(YOMI_HASH_ARRAY_SIZE); + for (i = 0; i < YOMI_HASH_ARRAY_SIZE; i++) { + hash_array[i] = 0; + } + for (i = 0; i < yl->nr_valid_entries; i++) { + ye = yl->ye_array[i]; + mark_hash_array(hash_array, ye->index_xstr); + } + fwrite(hash_array, YOMI_HASH_ARRAY_SIZE, 1, yomi_hash_out); + printf("generated yomi hash bitmap (%d collisions/%d entries)\n", + yomi_hash_collision, yl->nr_valid_entries); + +} + +static struct adjust_command * +parse_modify_freq_command(const char *buf) +{ + char *line = alloca(strlen(buf) + 1); + char *yomi, *wt, *word, *type_str; + struct adjust_command *cmd; + int type = 0; + strcpy(line, buf); + yomi = strtok(line, " "); + wt = strtok(NULL, " "); + word = strtok(NULL, " "); + type_str = strtok(NULL, " "); + if (!yomi || !wt || !word || !type_str) { + return NULL; + } + if (!strcmp(type_str, "up")) { + type = ADJUST_FREQ_UP; + } + if (!strcmp(type_str, "down")) { + type = ADJUST_FREQ_DOWN; + } + if (!strcmp(type_str, "kill")) { + type = ADJUST_FREQ_KILL; + } + if (!type) { + return NULL; + } + cmd = malloc(sizeof(struct adjust_command)); + cmd->type = type; + cmd->yomi = anthy_cstr_to_xstr(yomi, ANTHY_EUC_JP_ENCODING); + cmd->wt = get_wt_name(wt); + cmd->word = anthy_conv_euc_to_utf8(word); + return cmd; +} + +static void +parse_adjust_command(const char *buf, struct adjust_command *ac_list) +{ + struct adjust_command *cmd = NULL; + if (!strncmp("\\modify_freq ", buf, 13)) { + cmd = parse_modify_freq_command(&buf[13]); + } + if (cmd) { + cmd->next = ac_list->next; + ac_list->next = cmd; + } +} + +/** 辞書を一行ずつ読み込んでリストを作る + * このコマンドのコア */ +static void +parse_dict_file(FILE *fin, struct mkdic_stat *mds) +{ + xstr *index_xs; + char buf[MAX_LINE_LEN]; + char *ent; + struct yomi_entry *ye = NULL; + + /* 1行ずつ処理 */ + while (read_line(fin, buf)) { + if (buf[0] == '\\' && buf[1] != ' ') { + parse_adjust_command(buf, &mds->ac_list); + continue ; + } + index_xs = get_index_from_line(mds, buf); + if (!index_xs) { + break; + } + ent = get_entry_from_line(buf); + + /* 読みが30文字を越える場合は無視 */ + if (index_xs->len < 31) { + ye = find_yomi_entry(&mds->yl, index_xs, 1); + push_back_word_entry_line(mds, ye, ent); + } + + free(ent); + anthy_free_xstr(index_xs); + } +} + +/* 読み、品詞、単語の三つ組から単語の構造体を取得する */ +static struct word_entry * +find_word_entry(struct yomi_entry_list *yl, xstr *yomi, + const char *wt, char *word) +{ + struct yomi_entry *ye = find_yomi_entry(yl, yomi, 0); + int i; + if (!ye) { + return NULL; + } + for (i = 0; i < ye->nr_entries; i++) { + struct word_entry *we = &ye->entries[i]; + if (!strcmp(we->wt_name, wt) && + !strcmp(we->word_utf8, word)) { + return we; + } + } + return NULL; +} + +/* 頻度調整のコマンドを適用する */ +static void +apply_adjust_command(struct yomi_entry_list *yl, + struct adjust_command *ac_list) +{ + struct adjust_command *cmd; + for (cmd = ac_list->next; cmd; cmd = cmd->next) { + struct word_entry *we = find_word_entry(yl, cmd->yomi, + cmd->wt, cmd->word); + if (!we) { + char *yomi = anthy_xstr_to_cstr(cmd->yomi, ANTHY_UTF8_ENCODING); + printf("failed to find target of adjust command (%s, %s, %s)\n", + yomi, cmd->wt, cmd->word); + free(yomi); + continue; + } + if (cmd->type == ADJUST_FREQ_UP) { + we->raw_freq *= 4; + } + if (cmd->type == ADJUST_FREQ_DOWN) { + we->raw_freq /= 4; + if (we->raw_freq == 0) { + we->raw_freq = 1; + } + } + if (cmd->type == ADJUST_FREQ_KILL) { + we->raw_freq = 0; + } + } +} + +/* qsort用の比較関数 */ +static int +compare_yomi_entry(const void *p1, const void *p2) +{ + const struct yomi_entry *const *y1 = p1; + const struct yomi_entry *const *y2 = p2; + return strcmp((*y1)->index_str, (*y2)->index_str); +} + +/* yomi_entryでsortする */ +static void +sort_word_dict(struct yomi_entry_list *yl) +{ + int i; + struct yomi_entry *ye; + yl->nr_valid_entries = 0; + /* 単語を持つ読みだけを yl->ye_arrayに詰め直す */ + yl->ye_array = malloc(sizeof(struct yomi_entry *) * yl->nr_entries); + for (i = 0, ye = yl->head; i < yl->nr_entries; i++, ye = ye->next) { + if (ye->nr_entries > 0) { + yl->ye_array[yl->nr_valid_entries] = ye; + yl->nr_valid_entries ++; + } + } + /**/ + for (i = 0; i < yl->nr_valid_entries; i++) { + struct yomi_entry *ye = yl->ye_array[i]; + ye->index_str = anthy_xstr_to_cstr(ye->index_xstr, yl->index_encoding); + } + /* ソートする */ + qsort(yl->ye_array, yl->nr_valid_entries, + sizeof(struct yomi_entry *), + compare_yomi_entry); + /* 不要な単語を消す */ + yl->nr_words = 0; + for (i = 0; i < yl->nr_valid_entries; i++) { + struct yomi_entry *ye = yl->ye_array[i]; + yl->nr_words += normalize_word_entry(ye); + } +} + +/** ファイルのサイズを取得する */ +static int +get_file_size(FILE *fp) +{ + if (!fp) { + return 0; + } + return (ftell (fp) + SECTION_ALIGNMENT - 1) & (-SECTION_ALIGNMENT); +} + +static void +copy_file(struct mkdic_stat *mds, FILE *in, FILE *out) +{ + int i; + size_t nread; + char buf[BUFSIZ]; + + /* Pad OUT to the next aligned offset. */ + for (i = ftell (out); i & (SECTION_ALIGNMENT - 1); i++) { + fputc (0, out); + } + + /* Copy the contents. */ + rewind (in); + while ((nread = fread (buf, 1, sizeof buf, in)) > 0) { + if (fwrite (buf, 1, nread, out) < nread) { + /* Handle short write (maybe disk full). */ + fprintf (stderr, "%s: %s: write error: %s\n", + progname, mds->output_fn, strerror (errno)); + exit (1); + } + } +} + +static void +generate_header(FILE *fp) +{ + int buf[NR_HEADER_SECTIONS]; + int i; + struct file_section *fs; + int off; + + /* 初期化 */ + for (i = 0; i < NR_HEADER_SECTIONS; i++) { + buf[i] = 0; + } + + /* ヘッダ */ + buf[0] = NR_HEADER_SECTIONS * sizeof(int); + buf[1] = 0; + + /* 各セクションのオフセット */ + off = buf[0]; + for (i = 2, fs = file_array; fs->fpp; fs ++, i++) { + buf[i] = off; + off += get_file_size(*(fs->fpp)); + } + + /* ファイルへ出力する */ + for (i = 0; i < NR_HEADER_SECTIONS; i++) { + write_nl(fp, buf[i]); + } +} + +/* 各セクションのファイルをマージして、ひとつの辞書ファイルを作る */ +static void +link_dics(struct mkdic_stat *mds) +{ + FILE *fp; + struct file_section *fs; + + fp = fopen (mds->output_fn, "w"); + if (!fp) { + fprintf (stderr, "%s: %s: cannot create: %s\n", + progname, mds->output_fn, strerror (errno)); + exit (1); + } + + /* ヘッダを出力する */ + generate_header(fp); + + for (fs = file_array; fs->fpp; fs ++) { + /* 各セクションのファイルを結合する */ + copy_file(mds, *(fs->fpp), fp); + if (fs->fn) { + unlink(fs->fn); + } + } + + if (fclose (fp)) { + fprintf (stderr, "%s: %s: write error: %s\n", + progname, mds->output_fn, strerror (errno)); + exit (1); + } +} + +static void +read_dict_file(struct mkdic_stat *mds, const char *fn) +{ + FILE *fp; + /* ファイル名が指定されたので読み込む */ + fp = fopen(fn, "r"); + if (fp) { + printf("file = %s\n", fn); + parse_dict_file(fp, mds); + fclose(fp); + } else { + printf("failed file = %s\n", fn); + } +} + +static void +complete_words(struct mkdic_stat *mds) +{ + /* 頻度補正を適用する */ + apply_adjust_command(&mds->yl, &mds->ac_list); + + /**/ + calc_freq(&mds->yl); + + /* 読みで並び替える */ + sort_word_dict(&mds->yl); + + /* ファイルを準備する */ + open_output_files(); + /* 単語辞書を出力する */ + output_word_dict(&mds->yl); + + /* 読みハッシュを作る */ + mk_yomi_hash(yomi_hash_out, &mds->yl); +} + +static void +read_udict_file(struct mkdic_stat *mds, const char *fn) +{ + if (!mds->ud) { + mds->ud = create_uc_dict(); + complete_words(mds); + } + read_uc_file(mds->ud, fn); + printf("uc = %s\n", fn); +} + +static xstr* +xstr_strncat(xstr* xs, xchar* src, int n) +{ + int i; + xs->str = realloc(xs->str, sizeof(xchar) * (xs->len + n + 1)); + + for (i = 0; i < n; ++i) { + xs->str[xs->len + i] = src[i]; + } + xs->len += n; + return xs; +} + +static void +reverse_multi_segment_word(struct mkdic_stat *mds, struct word_entry *we) +{ + /* + 「かなかんじへんかんえんじん #T35 #_2仮名_3漢字_4変換_4エンジン」 + から + 「仮名漢字変換エンジン #T35 #_2かな_2かんじ_2へんかん_4えんじん」 + を作る + */ + int j; + /* yomiは仮名漢字混じり wordは平仮名のみからなる */ + int yomi_seg_start = 0; + int word_seg_start = 0; + int word_seg_len = 0; + xstr *yomibuf = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING); + xstr *wordbuf = we->ye->index_xstr; + xstr *yomi_xs = anthy_cstr_to_xstr("", 0); + xstr *word_xs = anthy_cstr_to_xstr("#", 0); + char *word; + char ch[256]; + struct yomi_entry *target_ye; + + for (j = 0; j <= yomibuf->len; ++j) { + if (j == yomibuf->len || yomibuf->str[j] == '_') { + if (yomi_seg_start != 0) { + anthy_xstrappend(word_xs, '_'); + snprintf(ch, 256, "%x", j - yomi_seg_start); + anthy_xstrappend(word_xs, (xchar)ch[0]); + xstr_strncat(word_xs, &wordbuf->str[word_seg_start], word_seg_len); + xstr_strncat(yomi_xs, &yomibuf->str[yomi_seg_start], j - yomi_seg_start); + } + if (j == yomibuf->len) { + break; + } + yomi_seg_start = j + 2; + word_seg_start += word_seg_len; + word_seg_len = get_element_len(yomibuf->str[j + 1]); + } + } + + target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1); + word = anthy_xstr_to_cstr(word_xs, mds->input_encoding); + + /* 逆変換用の辞書はfreqが負 */ + push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq, + word, we->source_order); + + free(word); + anthy_free_xstr(yomibuf); + anthy_free_xstr(yomi_xs); + anthy_free_xstr(word_xs); +} + +/* 逆変換用の辞書を作る */ +static void +build_reverse_dict(struct mkdic_stat *mds) +{ + struct yomi_entry *ye; + int i, n; + struct word_entry *we_array; + printf("building reverse index\n"); + + /* 単語の数を数える */ + n = 0; + for (ye = mds->yl.head; ye; ye = ye->next) { + for (i = 0; i < ye->nr_entries; i++) { + n++; + } + } + /* コピーする + * (元の辞書中のポインタはreallocで動くのでコピーが必要) + */ + we_array = malloc(sizeof(struct word_entry )* n); + n = 0; + for (ye = mds->yl.head; ye; ye = ye->next) { + for (i = 0; i < ye->nr_entries; i++) { + we_array[n] = ye->entries[i]; + n++; + } + } + + /* 辞書に追加していく */ + for (i = 0; i < n; i++) { + struct word_entry *we; + struct yomi_entry *target_ye; + + we = &we_array[i]; + if (we->word_utf8[0] == '#') { + if (we->word_utf8[1] == '_') { + reverse_multi_segment_word(mds, we); + } + } else { + /* yomiは仮名漢字混じり wordは平仮名のみからなる */ + xstr *yomi_xs; + char *word; + + yomi_xs = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING); + target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1); + word = anthy_xstr_to_cstr(we->ye->index_xstr, mds->input_encoding); + + /* 逆変換用の辞書はfreqが負 */ + push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq, + word, we->source_order); + + anthy_free_xstr(yomi_xs); + free(word); + } + } + /**/ + free(we_array); +} + +static void +clear_exclude_wtypes(struct mkdic_stat *mds) +{ + int i; + for (i = 0; i < mds->nr_excluded; i++) { + free(mds->excluded_wtypes[i]); + } + free(mds->excluded_wtypes); + /**/ + mds->excluded_wtypes = NULL; + mds->nr_excluded = 0; +} + +static void +set_exclude_wtypes(struct mkdic_stat *mds, int nr, char **tokens) +{ + int i; + mds->nr_excluded = nr - 1; + mds->excluded_wtypes = malloc(sizeof(char *) * (nr - 1)); + /**/ + for (i = 1; i < nr; i++) { + mds->excluded_wtypes[i - 1] = strdup(tokens[i]); + } +} + +static void +set_dict_encoding(struct mkdic_stat *mds, const char *enc) +{ + if (!strcmp(enc, "utf8")) { + mds->yl.body_encoding = ANTHY_UTF8_ENCODING; + } +} + +static void +set_input_encoding(struct mkdic_stat *mds, const char *enc) +{ + if (!strcmp(enc, "utf8")) { + mds->input_encoding = ANTHY_UTF8_ENCODING; + } + if (!strcmp(enc, "eucjp")) { + mds->input_encoding = ANTHY_EUC_JP_ENCODING; + } +} + +static void +write_dict_file(struct mkdic_stat *mds) +{ + if (!mds->ud) { + printf("can not build without use case dict\n"); + exit(1); + } + + /* 用例辞書を作る */ + make_ucdict(uc_out, mds->ud); + + /* 辞書ファイルにまとめる */ + flush_output_files(); + link_dics(mds); +} + +static void +show_command(char **tokens, int nr) +{ + int i; + printf("cmd:"); + for (i = 0; i < nr; i++) { + printf(" %s", tokens[i]); + } + printf("\n"); +} + +static int +execute_batch(struct mkdic_stat *mds, const char *fn) +{ + int nr; + char **tokens; + if (anthy_open_file(fn)) { + printf("mkanthydic: failed to open %s\n", fn); + return 1; + } + while (!anthy_read_line(&tokens, &nr)) { + char *cmd = tokens[0]; + show_command(tokens, nr); + if (!strcmp(cmd, "read") && nr == 2) { + read_dict_file(mds, tokens[1]); + } else if (!strcmp(cmd, "read_uc") && nr == 2) { + read_udict_file(mds, tokens[1]); + } else if (!strcmp(cmd, "build_reverse_dict")) { + build_reverse_dict(mds); + } else if (!strcmp(cmd, "write")) { + write_dict_file(mds); + } else if (!strcmp(cmd, "set_exclude_wtypes")) { + set_exclude_wtypes(mds, nr, tokens); + } else if (!strcmp(cmd, "clear_exclude_wtypes")) { + clear_exclude_wtypes(mds); + } else if (!strcmp(cmd, "set_dict_encoding") && nr == 2) { + set_dict_encoding(mds, tokens[1]); + } else if (!strcmp(cmd, "set_input_encoding") && nr == 2) { + set_input_encoding(mds, tokens[1]); + } else if (!strcmp(cmd, "done")) { + anthy_free_line(); + break; + } else { + printf("Unknown command(%s).\n", cmd); + } + anthy_free_line(); + } + anthy_close_file(); + return 0; +} + +/* 辞書生成のための変数の初期化 */ +static void +init_mds(struct mkdic_stat *mds) +{ + int i; + mds->output_fn = DEFAULT_FN; + mds->ud = NULL; + + /* 単語辞書を初期化する */ + mds->yl.head = NULL; + mds->yl.nr_entries = 0; + for (i = 0; i < YOMI_HASH; i++) { + mds->yl.hash[i] = NULL; + } + mds->yl.index_encoding = ANTHY_UTF8_ENCODING; + mds->yl.body_encoding = ANTHY_EUC_JP_ENCODING; + /**/ + mds->ac_list.next = NULL; + /**/ + mds->input_encoding = ANTHY_EUC_JP_ENCODING; + /**/ + mds->nr_excluded = 0; + mds->excluded_wtypes = NULL; +} + +/* libanthyの使用する部分だけを初期化する */ +static void +init_libs(void) +{ + int res; + res = anthy_init_xstr(); + if (res == -1) { + fprintf (stderr, "failed to init dic lib\n"); + exit(1); + } +} + +/**/ +int +main(int argc, char **argv) +{ + struct mkdic_stat mds; + int i; + char *script_fn = NULL; + int help_mode = 0; + + anthy_init_wtypes(); + init_libs(); + init_mds(&mds); + + for (i = 1; i < argc; i++) { + char *arg = argv[i]; + char *prev_arg = argv[i-1]; + if (!strcmp(arg, "--help")) { + help_mode = 1; + } + if (!strcmp(prev_arg, "-f")) { + script_fn = arg; + } + } + + if (help_mode || !script_fn) { + print_usage(); + } + + return execute_batch(&mds, script_fn); +} |