path: root/mkworddic/mkdic.c
author     Lorry Tar Creator <lorry-tar-importer@lorry>  2009-02-07 16:32:56 +0000
committer  Lorry Tar Creator <lorry-tar-importer@lorry>  2009-02-07 16:32:56 +0000
commit     a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 (patch)
tree       a966aeee62e69ae3ad13275d07ddb15049b14e0e /mkworddic/mkdic.c
Diffstat (limited to 'mkworddic/mkdic.c')
-rw-r--r--   mkworddic/mkdic.c   1198
1 file changed, 1198 insertions, 0 deletions
diff --git a/mkworddic/mkdic.c b/mkworddic/mkdic.c
new file mode 100644
index 0000000..7c99fe0
--- /dev/null
+++ b/mkworddic/mkdic.c
@@ -0,0 +1,1198 @@
+/*
+ * Build a dictionary file from files in cannadic format.
+ *
+ * Funded by the IPA Mitou Software Creation Project
+ * (IPA未踏ソフトウェア創造事業) 2002 1/1
+ *
+ * Copyright (C) 2000-2007 TABATA Yusuke
+ * Copyright (C) 2005 YOSHIDA Yuichi
+ * Copyright (C) 2001-2002 TAKAI Kousuke
+ */
+/*
+ * The dictionary maps each reading (yomi), used as the index, to the
+ * parts of speech and converted words (= entries) looked up from it.
+ *
+ * reading -> word, word, ...
+ *
+ * The dictionary file uses network byte order.
+ *
+ * The dictionary file consists of several sections:
+ * 0 header                  16*4 bytes
+ * 2 index of the readings   (one entry per 512 readings)
+ * 3 readings
+ * 4 pages
+ * 5 index of the pages
+ * 6 use-case dictionary (?)
+ * 7 hash of the readings
+ *
+ * source    the original dictionary file
+ * file_dic  the file to be generated
+ *
+ * yomi_hash  bitmap of the hash written into the dictionary file
+ * index_hash hash used within this source to look up struct yomi_entry
+ *
+ */
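+
+/*
+ * Illustrative sketch of the on-disk header, not authoritative: the real
+ * offsets depend on the section sizes and are computed by generate_header()
+ * below.  Assuming a 4-byte int, the header is 16 network-byte-order ints
+ * (64 bytes):
+ *
+ *   buf[0] = 64            size of the header itself
+ *   buf[1] = 0             reserved
+ *   buf[2] = 64            offset of the first section in file_array[]
+ *                          (the yomi entry index)
+ *   buf[3] = 64 + size of the first section, rounded up to 8 bytes
+ *   ...                    one offset per entry of file_array[]
+ *   remaining slots = 0
+ */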
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <ctype.h>
+
+#include <config.h>
+
+#include <anthy/anthy.h>
+#include <anthy/xstr.h>
+#include <anthy/wtype.h>
+#include <anthy/ruleparser.h>
+#include <anthy/word_dic.h>
+#include <anthy/diclib.h>
+#include "mkdic.h"
+
+#define MAX_LINE_LEN 10240
+#define NR_HEADER_SECTIONS 16
+#define SECTION_ALIGNMENT 8
+#define MAX_WTYPE_LEN 20
+
+#define DEFAULT_FN "anthy.wdic"
+
+static const char *progname;
+
+/* Global variables so that writewords.c can access them */
+FILE *yomi_entry_index_out, *yomi_entry_out;
+FILE *page_out, *page_index_out;
+/**/
+static FILE *uc_out;
+static FILE *yomi_hash_out;
+/* number of hash collisions, for statistics */
+static int yomi_hash_collision;
+
+/* Kept in the order the sections appear in the output file */
+struct file_section {
+ FILE **fpp;
+ char *fn;
+} file_array[] = {
+ {&yomi_entry_index_out, NULL},
+ {&yomi_entry_out, NULL},
+ {&page_out, NULL},
+ {&page_index_out, NULL},
+ {&uc_out, NULL},
+ {&yomi_hash_out, NULL},
+ {NULL, NULL},
+};
+
+/* State of the dictionary build */
+struct mkdic_stat {
+ /* list of words */
+ struct yomi_entry_list yl;
+ /**/
+ struct adjust_command ac_list;
+ /* use-case dictionary */
+ struct uc_dict *ud;
+ /**/
+ const char *output_fn;
+ /**/
+ int input_encoding;
+ /**/
+ int nr_excluded;
+ char **excluded_wtypes;
+};
+
+/* Open the output files for the dictionary */
+static void
+open_output_files(void)
+{
+ struct file_section *fs;
+ for (fs = file_array; fs->fpp; fs ++) {
+ char *tmpdir = getenv("TMPDIR");
+ fs->fn = NULL;
+ if (tmpdir) {
+ /* tmpfile() does not honor TMPDIR, so use mkstemp() when TMPDIR is set. */
+ char buf[256];
+ int fd = -1;
+ snprintf(buf, sizeof(buf), "%s/mkanthydic.XXXXXX", tmpdir);
+ fd = mkstemp(buf);
+ if (fd == -1) {
+ *(fs->fpp) = NULL;
+ } else {
+ *(fs->fpp) = fdopen(fd, "w+");
+ fs->fn = strdup(buf);
+ }
+ } else {
+ *(fs->fpp) = tmpfile();
+ }
+ /**/
+ if (!(*(fs->fpp))) {
+ fprintf (stderr, "%s: cannot open temporary file: %s\n",
+ progname, strerror (errno));
+ exit (2);
+ }
+ }
+}
+
+/* fflush the output files */
+static void
+flush_output_files (void)
+{
+ struct file_section *fs;
+ for (fs = file_array; fs->fpp; fs ++) {
+ if (ferror(*(fs->fpp))) {
+ fprintf (stderr, "%s: write error\n", progname);
+ exit (1);
+ }
+ }
+ for (fs = file_array; fs->fpp; fs ++) {
+ if (fflush(*(fs->fpp))) {
+ fprintf (stderr, "%s: write error: %s\n", progname, strerror (errno));
+ exit (1);
+ }
+ }
+}
+
+/* Write 4 bytes in network byte order */
+void
+write_nl(FILE *fp, int i)
+{
+ i = anthy_dic_htonl(i);
+ fwrite(&i, sizeof(int), 1, fp);
+}
+
+static void
+print_usage(void)
+{
+ printf("please do not use mkanthydic command directly.\n");
+ exit(0);
+}
+
+static char *
+read_line(FILE *fp, char *buf)
+{
+ /* ignore lines that are too long */
+ int toolong = 0;
+
+ while (fgets(buf, MAX_LINE_LEN, fp)) {
+ int len = strlen(buf);
+ if (buf[0] == '#') {
+ continue ;
+ }
+ if (buf[len - 1] != '\n') {
+ toolong = 1;
+ continue ;
+ }
+
+ buf[len - 1] = 0;
+ if (toolong) {
+ toolong = 0;
+ } else {
+ return buf;
+ }
+ }
+ return NULL;
+}
+
+/** Extract the index (reading) part from a cannadic-format dictionary line */
+static xstr *
+get_index_from_line(struct mkdic_stat *mds, char *buf)
+{
+ char *sp;
+ xstr *xs;
+ sp = strchr(buf, ' ');
+ if (!sp) {
+ /* the dictionary format is broken */
+ return NULL;
+ }
+ *sp = 0;
+ xs = anthy_cstr_to_xstr(buf, mds->input_encoding);
+ *sp = ' ';
+ return xs;
+}
+
+/** Extract the part other than the index from a cannadic-format dictionary line */
+static char *
+get_entry_from_line(char *buf)
+{
+ char *sp;
+ sp = strchr(buf, ' ');
+ while(*sp == ' ') {
+ sp ++;
+ }
+ return strdup(sp);
+}
+
+static int
+index_hash(xstr *xs)
+{
+ int i;
+ unsigned int h = 0;
+ for (i = 0; i < xs->len; i++) {
+ h += xs->str[i] * 11;
+ }
+ return (int)(h % YOMI_HASH);
+}
+
+const char *
+get_wt_name(const char *name)
+{
+ wtype_t dummy;
+ const char *res;
+ if (!strcmp(name, "#T35")) {
+ return "#T";
+ }
+ res = anthy_type_to_wtype(name, &dummy);
+ if (!res) {
+ return "unknown";
+ }
+ return res;
+}
+
+/** Add a single word for a reading */
+static void
+push_back_word_entry(struct mkdic_stat *mds,
+ struct yomi_entry *ye, const char *wt_name,
+ int freq, const char *word, int order)
+{
+ wtype_t wt;
+ char *s;
+ if (freq == 0) {
+ return ;
+ }
+ if (!anthy_type_to_wtype(wt_name, &wt)) {
+ /* a part of speech unknown to anthy */
+ return ;
+ }
+ ye->entries = realloc(ye->entries,
+ sizeof(struct word_entry) *
+ (ye->nr_entries + 1));
+ ye->entries[ye->nr_entries].ye = ye;
+ ye->entries[ye->nr_entries].wt_name = get_wt_name(wt_name);
+ ye->entries[ye->nr_entries].raw_freq = freq;
+ ye->entries[ye->nr_entries].feature = 0;
+ ye->entries[ye->nr_entries].source_order = order;
+ if (mds->input_encoding == ANTHY_EUC_JP_ENCODING) {
+ s = anthy_conv_euc_to_utf8(word);
+ } else {
+ s = strdup(word);
+ }
+ ye->entries[ye->nr_entries].word_utf8 = s;
+ ye->nr_entries ++;
+}
+
+static int
+parse_wtype(char *wtbuf, char *cur)
+{
+ /* part of speech */
+ char *t;
+ int freq;
+ if (strlen(cur) >= MAX_WTYPE_LEN) {
+ return 0;
+ }
+ strcpy(wtbuf, cur);
+ /* frequency */
+ t = strchr(wtbuf, '*');
+ freq = 1;
+ if (t) {
+ int tmp_freq;
+ *t = 0;
+ t++;
+ tmp_freq = atoi(t);
+ if (tmp_freq) {
+ freq = tmp_freq;
+ }
+ }
+ return freq;
+}
+
+/* The lengths of compound-word elements are written as 1,2,3, ... 9,a,b,c */
+static int
+get_element_len(xchar xc)
+{
+ if (xc > '0' && xc <= '9') {
+ return xc - '0';
+ }
+ if (xc >= 'a' && xc <= 'z') {
+ return xc - 'a' + 10;
+ }
+ return 0;
+}
+
+/** Check the format of a compound candidate */
+static int
+check_compound_candidate(struct mkdic_stat *mds, xstr *index, const char *cur)
+{
+ /* count the total number of reading characters */
+ xstr *xs = anthy_cstr_to_xstr(cur, mds->input_encoding);
+ int i, total = 0;
+ for (i = 0; i < xs->len - 1; i++) {
+ if (xs->str[i] == '_') {
+ total += get_element_len(xs->str[i+1]);
+ }
+ }
+ anthy_free_xstr(xs);
+ /* compare */
+ if (total != index->len) {
+ fprintf(stderr, "Invalid compound candidate (%s, length = %d).\n",
+ cur, total);
+ return 0;
+ }
+ return 1;
+}
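+
+/*
+ * Worked example (the same one used in the comment of
+ * reverse_multi_segment_word() below): for the reading
+ * 「かなかんじへんかんえんじん」 (13 characters) and the candidate
+ * "_2仮名_3漢字_4変換_4エンジン", the element lengths following each '_'
+ * add up to 2+3+4+4 = 13 == index->len, so the candidate is accepted.
+ */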
+
+static int
+is_excluded_wtype(struct mkdic_stat *mds, char *wt)
+{
+ int i;
+ for (i = 0; i < mds->nr_excluded; i++) {
+ if (!strcmp(mds->excluded_wtypes[i], wt)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static char *
+find_token_end(char *cur)
+{
+ char *n;
+ for (n = cur; *n != ' ' && *n != '\0'; n++) {
+ if (*n == '\\') {
+ if (!n[1]) {
+ return NULL;
+ }
+ n++;
+ }
+ }
+ return n;
+}
+
+/** Split the line for a reading and build the array of word entries */
+static void
+push_back_word_entry_line(struct mkdic_stat *mds, struct yomi_entry *ye,
+ const char *ent)
+{
+ char *buf = alloca(strlen(ent) + 1);
+ char *cur = buf;
+ char *n;
+ char wtbuf[MAX_WTYPE_LEN];
+ int freq = 0;
+ int order = 0;
+
+ strcpy(buf, ent);
+ wtbuf[0] = 0;
+
+ while (1) {
+ /* terminate the token with \0: look for the space or \0 after cur */
+ n = find_token_end(cur);
+ if (!n) {
+ fprintf(stderr, "invalid \\ at the end of line (%s).\n",
+ ent);
+ return ;
+ }
+ if (*n) {
+ *n = 0;
+ } else {
+ n = NULL;
+ }
+ /**/
+ if (cur[0] == '#') {
+ if (isalpha((unsigned char)cur[1])) {
+ /* parse #XX*?? */
+ freq = parse_wtype(wtbuf, cur);
+ } else {
+ if (cur[1] == '_' &&
+ check_compound_candidate(mds, ye->index_xstr, &cur[1])) {
+ /* #_ compound candidate */
+ push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
+ order ++;
+ }
+ }
+ } else {
+ /* check whether the part of speech is on the exclusion list */
+ if (!is_excluded_wtype(mds, wtbuf)) {
+ /* add the word */
+ push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
+ order ++;
+ }/* :to extract excluded words
+ else {
+ anthy_putxstr(ye->index_xstr);
+ printf(" %s*%d %s\n", wtbuf, freq, cur);
+ }*/
+ }
+ if (!n) {
+ /* end of line */
+ return ;
+ }
+ cur = n;
+ cur ++;
+ }
+}
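+
+/*
+ * Illustrative example (hypothetical entry line, for explanation only):
+ * for the reading あい, the entry string "#T35*100 愛 #T35*50 藍" is split
+ * into four tokens; "#T35*100" and "#T35*50" update (wtbuf, freq) through
+ * parse_wtype(), and 愛 and 藍 are each registered with the current
+ * (wtbuf, freq) by push_back_word_entry().
+ */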
+
+/** Check whether an identical word already exists */
+static int
+check_same_word(struct yomi_entry *ye, int idx)
+{
+ struct word_entry *base = &ye->entries[idx];
+ int i;
+ for (i = idx -1; i >= 0; i--) {
+ struct word_entry *cur = &ye->entries[i];
+ if (base->raw_freq != cur->raw_freq) {
+ return 0;
+ }
+ if (strcmp(base->wt_name, cur->wt_name)) {
+ return 0;
+ }
+ if (strcmp(base->word_utf8, cur->word_utf8)) {
+ return 0;
+ }
+ /* they are the same */
+ return 1;
+ }
+ return 0;
+}
+
+/** comparison function for qsort */
+static int
+compare_word_entry_by_freq(const void *p1, const void *p2)
+{
+ const struct word_entry *e1 = p1;
+ const struct word_entry *e2 = p2;
+ return e2->raw_freq - e1->raw_freq;
+}
+
+/** comparison function for qsort */
+static int
+compare_word_entry_by_wtype(const void *p1, const void *p2)
+{
+ const struct word_entry *e1 = p1;
+ const struct word_entry *e2 = p2;
+ int ret = strcmp(e1->wt_name, e2->wt_name);
+ if (ret != 0) {
+ return ret;
+ } else {
+ return compare_word_entry_by_freq(p1, p2);
+ }
+}
+
+/** Order the words for a reading by frequency and remove unneeded ones */
+static int
+normalize_word_entry(struct yomi_entry *ye)
+{
+ int i, nr_dup = 0;
+ if (!ye) {
+ return 0;
+ }
+ /* order the words */
+ qsort(ye->entries, ye->nr_entries,
+ sizeof(struct word_entry),
+ compare_word_entry_by_freq);
+ /* duplicates get frequency 0 */
+ for (i = 0; i < ye->nr_entries; i++) {
+ if (check_same_word(ye, i)) {
+ ye->entries[i].raw_freq = 0;
+ nr_dup ++;
+ }
+ }
+ /* sort again */
+ qsort(ye->entries, ye->nr_entries,
+ sizeof(struct word_entry),
+ compare_word_entry_by_wtype);
+ return ye->nr_entries - nr_dup;
+}
+
+/* Return the yomi_entry corresponding to the given reading */
+struct yomi_entry *
+find_yomi_entry(struct yomi_entry_list *yl, xstr *index, int create)
+{
+ struct yomi_entry *ye;
+ int hash = index_hash(index);
+ int search = 0;
+ /* search the hash chain */
+ for (ye = yl->hash[hash];ye ; ye = ye->hash_next) {
+ search ++;
+ if (!anthy_xstrcmp(ye->index_xstr, index)) {
+ return ye;
+ }
+ }
+ if (!create) {
+ return NULL;
+ }
+
+ /* not found, so allocate one */
+ ye = malloc(sizeof(struct yomi_entry));
+ ye->nr_entries = 0;
+ ye->entries = 0;
+ ye->next = NULL;
+ ye->index_xstr = anthy_xstr_dup(index);
+ ye->index_str = NULL;
+
+ /* link into the hash chain */
+ ye->hash_next = yl->hash[hash];
+ yl->hash[hash] = ye;
+
+ /* link into the list */
+
+ ye->next = yl->head;
+ yl->head = ye;
+
+ yl->nr_entries ++;
+
+ return ye;
+}
+
+/* Set a mark in the hash bitmap of the dictionary file */
+static void
+mark_hash_array(unsigned char *hash_array, xstr *xs)
+{
+ int val, idx, bit, mask;
+ val = anthy_xstr_hash(xs);
+ val &= (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1);
+ idx=(val>>YOMI_HASH_ARRAY_SHIFT)&(YOMI_HASH_ARRAY_SIZE-1);
+ bit= val & ((1<<YOMI_HASH_ARRAY_SHIFT)-1);
+ mask = (1<<bit);
+ if (hash_array[idx] & mask) {
+ yomi_hash_collision ++;
+ }
+ hash_array[idx] |= mask;
+}
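+
+/*
+ * Example of the bit arithmetic above, under the assumption that
+ * YOMI_HASH_ARRAY_SHIFT is 3 (8 bits per bitmap byte; the real constants
+ * are defined in the headers) and that the array is large enough that the
+ * masks do not truncate: a hash value of 0x1237 gives
+ * idx = 0x1237 >> 3 = 0x246 and bit = 0x1237 & 7 = 7, so the code sets
+ * bit 7 of hash_array[0x246].
+ */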
+
+/* Build the bitmap for the reading hash */
+static void
+mk_yomi_hash(FILE *yomi_hash_out, struct yomi_entry_list *yl)
+{
+ unsigned char *hash_array;
+ int i;
+ struct yomi_entry *ye;
+ hash_array = (unsigned char *)malloc(YOMI_HASH_ARRAY_SIZE);
+ for (i = 0; i < YOMI_HASH_ARRAY_SIZE; i++) {
+ hash_array[i] = 0;
+ }
+ for (i = 0; i < yl->nr_valid_entries; i++) {
+ ye = yl->ye_array[i];
+ mark_hash_array(hash_array, ye->index_xstr);
+ }
+ fwrite(hash_array, YOMI_HASH_ARRAY_SIZE, 1, yomi_hash_out);
+ printf("generated yomi hash bitmap (%d collisions/%d entries)\n",
+ yomi_hash_collision, yl->nr_valid_entries);
+
+}
+
+static struct adjust_command *
+parse_modify_freq_command(const char *buf)
+{
+ char *line = alloca(strlen(buf) + 1);
+ char *yomi, *wt, *word, *type_str;
+ struct adjust_command *cmd;
+ int type = 0;
+ strcpy(line, buf);
+ yomi = strtok(line, " ");
+ wt = strtok(NULL, " ");
+ word = strtok(NULL, " ");
+ type_str = strtok(NULL, " ");
+ if (!yomi || !wt || !word || !type_str) {
+ return NULL;
+ }
+ if (!strcmp(type_str, "up")) {
+ type = ADJUST_FREQ_UP;
+ }
+ if (!strcmp(type_str, "down")) {
+ type = ADJUST_FREQ_DOWN;
+ }
+ if (!strcmp(type_str, "kill")) {
+ type = ADJUST_FREQ_KILL;
+ }
+ if (!type) {
+ return NULL;
+ }
+ cmd = malloc(sizeof(struct adjust_command));
+ cmd->type = type;
+ cmd->yomi = anthy_cstr_to_xstr(yomi, ANTHY_EUC_JP_ENCODING);
+ cmd->wt = get_wt_name(wt);
+ cmd->word = anthy_conv_euc_to_utf8(word);
+ return cmd;
+}
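+
+/*
+ * Example of an adjustment line as handled above (hypothetical words, for
+ * illustration only):
+ *   \modify_freq あい #T35 愛 up
+ * parse_modify_freq_command() receives the part after "\modify_freq " and
+ * produces cmd->yomi = あい, cmd->wt = get_wt_name("#T35") = "#T",
+ * cmd->word = 愛, cmd->type = ADJUST_FREQ_UP.
+ */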
+
+static void
+parse_adjust_command(const char *buf, struct adjust_command *ac_list)
+{
+ struct adjust_command *cmd = NULL;
+ if (!strncmp("\\modify_freq ", buf, 13)) {
+ cmd = parse_modify_freq_command(&buf[13]);
+ }
+ if (cmd) {
+ cmd->next = ac_list->next;
+ ac_list->next = cmd;
+ }
+}
+
+/** Read the dictionary one line at a time and build the list.
+ * This is the core of this command. */
+static void
+parse_dict_file(FILE *fin, struct mkdic_stat *mds)
+{
+ xstr *index_xs;
+ char buf[MAX_LINE_LEN];
+ char *ent;
+ struct yomi_entry *ye = NULL;
+
+ /* process one line at a time */
+ while (read_line(fin, buf)) {
+ if (buf[0] == '\\' && buf[1] != ' ') {
+ parse_adjust_command(buf, &mds->ac_list);
+ continue ;
+ }
+ index_xs = get_index_from_line(mds, buf);
+ if (!index_xs) {
+ break;
+ }
+ ent = get_entry_from_line(buf);
+
+ /* ignore readings longer than 30 characters */
+ if (index_xs->len < 31) {
+ ye = find_yomi_entry(&mds->yl, index_xs, 1);
+ push_back_word_entry_line(mds, ye, ent);
+ }
+
+ free(ent);
+ anthy_free_xstr(index_xs);
+ }
+}
+
+/* Look up the word structure from a (reading, part of speech, word) triple */
+static struct word_entry *
+find_word_entry(struct yomi_entry_list *yl, xstr *yomi,
+ const char *wt, char *word)
+{
+ struct yomi_entry *ye = find_yomi_entry(yl, yomi, 0);
+ int i;
+ if (!ye) {
+ return NULL;
+ }
+ for (i = 0; i < ye->nr_entries; i++) {
+ struct word_entry *we = &ye->entries[i];
+ if (!strcmp(we->wt_name, wt) &&
+ !strcmp(we->word_utf8, word)) {
+ return we;
+ }
+ }
+ return NULL;
+}
+
+/* Apply the frequency adjustment commands */
+static void
+apply_adjust_command(struct yomi_entry_list *yl,
+ struct adjust_command *ac_list)
+{
+ struct adjust_command *cmd;
+ for (cmd = ac_list->next; cmd; cmd = cmd->next) {
+ struct word_entry *we = find_word_entry(yl, cmd->yomi,
+ cmd->wt, cmd->word);
+ if (!we) {
+ char *yomi = anthy_xstr_to_cstr(cmd->yomi, ANTHY_UTF8_ENCODING);
+ printf("failed to find target of adjust command (%s, %s, %s)\n",
+ yomi, cmd->wt, cmd->word);
+ free(yomi);
+ continue;
+ }
+ if (cmd->type == ADJUST_FREQ_UP) {
+ we->raw_freq *= 4;
+ }
+ if (cmd->type == ADJUST_FREQ_DOWN) {
+ we->raw_freq /= 4;
+ if (we->raw_freq == 0) {
+ we->raw_freq = 1;
+ }
+ }
+ if (cmd->type == ADJUST_FREQ_KILL) {
+ we->raw_freq = 0;
+ }
+ }
+}
+
+/* comparison function for qsort */
+static int
+compare_yomi_entry(const void *p1, const void *p2)
+{
+ const struct yomi_entry *const *y1 = p1;
+ const struct yomi_entry *const *y2 = p2;
+ return strcmp((*y1)->index_str, (*y2)->index_str);
+}
+
+/* Sort the yomi_entry array */
+static void
+sort_word_dict(struct yomi_entry_list *yl)
+{
+ int i;
+ struct yomi_entry *ye;
+ yl->nr_valid_entries = 0;
+ /* repack only the readings that have words into yl->ye_array */
+ yl->ye_array = malloc(sizeof(struct yomi_entry *) * yl->nr_entries);
+ for (i = 0, ye = yl->head; i < yl->nr_entries; i++, ye = ye->next) {
+ if (ye->nr_entries > 0) {
+ yl->ye_array[yl->nr_valid_entries] = ye;
+ yl->nr_valid_entries ++;
+ }
+ }
+ /**/
+ for (i = 0; i < yl->nr_valid_entries; i++) {
+ struct yomi_entry *ye = yl->ye_array[i];
+ ye->index_str = anthy_xstr_to_cstr(ye->index_xstr, yl->index_encoding);
+ }
+ /* sort */
+ qsort(yl->ye_array, yl->nr_valid_entries,
+ sizeof(struct yomi_entry *),
+ compare_yomi_entry);
+ /* remove unneeded words */
+ yl->nr_words = 0;
+ for (i = 0; i < yl->nr_valid_entries; i++) {
+ struct yomi_entry *ye = yl->ye_array[i];
+ yl->nr_words += normalize_word_entry(ye);
+ }
+}
+
+/** Get the size of a file, rounded up to the section alignment */
+static int
+get_file_size(FILE *fp)
+{
+ if (!fp) {
+ return 0;
+ }
+ return (ftell (fp) + SECTION_ALIGNMENT - 1) & (-SECTION_ALIGNMENT);
+}
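+
+/* For example, with SECTION_ALIGNMENT == 8 an ftell() of 13 is rounded up
+ * to 16, so every offset recorded in the header is 8-byte aligned;
+ * copy_file() below pads the output with zero bytes to match. */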
+
+static void
+copy_file(struct mkdic_stat *mds, FILE *in, FILE *out)
+{
+ int i;
+ size_t nread;
+ char buf[BUFSIZ];
+
+ /* Pad OUT to the next aligned offset. */
+ for (i = ftell (out); i & (SECTION_ALIGNMENT - 1); i++) {
+ fputc (0, out);
+ }
+
+ /* Copy the contents. */
+ rewind (in);
+ while ((nread = fread (buf, 1, sizeof buf, in)) > 0) {
+ if (fwrite (buf, 1, nread, out) < nread) {
+ /* Handle short write (maybe disk full). */
+ fprintf (stderr, "%s: %s: write error: %s\n",
+ progname, mds->output_fn, strerror (errno));
+ exit (1);
+ }
+ }
+}
+
+static void
+generate_header(FILE *fp)
+{
+ int buf[NR_HEADER_SECTIONS];
+ int i;
+ struct file_section *fs;
+ int off;
+
+ /* initialize */
+ for (i = 0; i < NR_HEADER_SECTIONS; i++) {
+ buf[i] = 0;
+ }
+
+ /* header */
+ buf[0] = NR_HEADER_SECTIONS * sizeof(int);
+ buf[1] = 0;
+
+ /* offset of each section */
+ off = buf[0];
+ for (i = 2, fs = file_array; fs->fpp; fs ++, i++) {
+ buf[i] = off;
+ off += get_file_size(*(fs->fpp));
+ }
+
+ /* write out to the file */
+ for (i = 0; i < NR_HEADER_SECTIONS; i++) {
+ write_nl(fp, buf[i]);
+ }
+}
+
+/* Merge the per-section files into a single dictionary file */
+static void
+link_dics(struct mkdic_stat *mds)
+{
+ FILE *fp;
+ struct file_section *fs;
+
+ fp = fopen (mds->output_fn, "w");
+ if (!fp) {
+ fprintf (stderr, "%s: %s: cannot create: %s\n",
+ progname, mds->output_fn, strerror (errno));
+ exit (1);
+ }
+
+ /* write the header */
+ generate_header(fp);
+
+ for (fs = file_array; fs->fpp; fs ++) {
+ /* concatenate the file of each section */
+ copy_file(mds, *(fs->fpp), fp);
+ if (fs->fn) {
+ unlink(fs->fn);
+ }
+ }
+
+ if (fclose (fp)) {
+ fprintf (stderr, "%s: %s: write error: %s\n",
+ progname, mds->output_fn, strerror (errno));
+ exit (1);
+ }
+}
+
+static void
+read_dict_file(struct mkdic_stat *mds, const char *fn)
+{
+ FILE *fp;
+ /* a file name was given, so read it */
+ fp = fopen(fn, "r");
+ if (fp) {
+ printf("file = %s\n", fn);
+ parse_dict_file(fp, mds);
+ fclose(fp);
+ } else {
+ printf("failed file = %s\n", fn);
+ }
+}
+
+static void
+complete_words(struct mkdic_stat *mds)
+{
+ /* apply the frequency adjustments */
+ apply_adjust_command(&mds->yl, &mds->ac_list);
+
+ /**/
+ calc_freq(&mds->yl);
+
+ /* sort by reading */
+ sort_word_dict(&mds->yl);
+
+ /* prepare the output files */
+ open_output_files();
+ /* write out the word dictionary */
+ output_word_dict(&mds->yl);
+
+ /* build the reading hash */
+ mk_yomi_hash(yomi_hash_out, &mds->yl);
+}
+
+static void
+read_udict_file(struct mkdic_stat *mds, const char *fn)
+{
+ if (!mds->ud) {
+ mds->ud = create_uc_dict();
+ complete_words(mds);
+ }
+ read_uc_file(mds->ud, fn);
+ printf("uc = %s\n", fn);
+}
+
+static xstr*
+xstr_strncat(xstr* xs, xchar* src, int n)
+{
+ int i;
+ xs->str = realloc(xs->str, sizeof(xchar) * (xs->len + n + 1));
+
+ for (i = 0; i < n; ++i) {
+ xs->str[xs->len + i] = src[i];
+ }
+ xs->len += n;
+ return xs;
+}
+
+static void
+reverse_multi_segment_word(struct mkdic_stat *mds, struct word_entry *we)
+{
+ /*
+  * From
+  *   「かなかんじへんかんえんじん #T35 #_2仮名_3漢字_4変換_4エンジン」
+  * build
+  *   「仮名漢字変換エンジン #T35 #_2かな_2かんじ_2へんかん_4えんじん」
+  */
+ int j;
+ /* the yomi is mixed kana and kanji; the word consists only of hiragana */
+ int yomi_seg_start = 0;
+ int word_seg_start = 0;
+ int word_seg_len = 0;
+ xstr *yomibuf = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
+ xstr *wordbuf = we->ye->index_xstr;
+ xstr *yomi_xs = anthy_cstr_to_xstr("", 0);
+ xstr *word_xs = anthy_cstr_to_xstr("#", 0);
+ char *word;
+ char ch[256];
+ struct yomi_entry *target_ye;
+
+ for (j = 0; j <= yomibuf->len; ++j) {
+ if (j == yomibuf->len || yomibuf->str[j] == '_') {
+ if (yomi_seg_start != 0) {
+ anthy_xstrappend(word_xs, '_');
+ snprintf(ch, 256, "%x", j - yomi_seg_start);
+ anthy_xstrappend(word_xs, (xchar)ch[0]);
+ xstr_strncat(word_xs, &wordbuf->str[word_seg_start], word_seg_len);
+ xstr_strncat(yomi_xs, &yomibuf->str[yomi_seg_start], j - yomi_seg_start);
+ }
+ if (j == yomibuf->len) {
+ break;
+ }
+ yomi_seg_start = j + 2;
+ word_seg_start += word_seg_len;
+ word_seg_len = get_element_len(yomibuf->str[j + 1]);
+ }
+ }
+
+ target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
+ word = anthy_xstr_to_cstr(word_xs, mds->input_encoding);
+
+ /* the reverse-conversion dictionary uses negative freq */
+ push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
+ word, we->source_order);
+
+ free(word);
+ anthy_free_xstr(yomibuf);
+ anthy_free_xstr(yomi_xs);
+ anthy_free_xstr(word_xs);
+}
+
+/* Build the dictionary for reverse conversion */
+static void
+build_reverse_dict(struct mkdic_stat *mds)
+{
+ struct yomi_entry *ye;
+ int i, n;
+ struct word_entry *we_array;
+ printf("building reverse index\n");
+
+ /* count the number of words */
+ n = 0;
+ for (ye = mds->yl.head; ye; ye = ye->next) {
+ for (i = 0; i < ye->nr_entries; i++) {
+ n++;
+ }
+ }
+ /* copy the entries
+ * (pointers into the original dictionary move when realloc()ed,
+ * so a copy is needed)
+ */
+ we_array = malloc(sizeof(struct word_entry )* n);
+ n = 0;
+ for (ye = mds->yl.head; ye; ye = ye->next) {
+ for (i = 0; i < ye->nr_entries; i++) {
+ we_array[n] = ye->entries[i];
+ n++;
+ }
+ }
+
+ /* add them to the dictionary */
+ for (i = 0; i < n; i++) {
+ struct word_entry *we;
+ struct yomi_entry *target_ye;
+
+ we = &we_array[i];
+ if (we->word_utf8[0] == '#') {
+ if (we->word_utf8[1] == '_') {
+ reverse_multi_segment_word(mds, we);
+ }
+ } else {
+ /* the yomi is mixed kana and kanji; the word consists only of hiragana */
+ xstr *yomi_xs;
+ char *word;
+
+ yomi_xs = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
+ target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
+ word = anthy_xstr_to_cstr(we->ye->index_xstr, mds->input_encoding);
+
+ /* the reverse-conversion dictionary uses negative freq */
+ push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
+ word, we->source_order);
+
+ anthy_free_xstr(yomi_xs);
+ free(word);
+ }
+ }
+ /**/
+ free(we_array);
+}
+
+static void
+clear_exclude_wtypes(struct mkdic_stat *mds)
+{
+ int i;
+ for (i = 0; i < mds->nr_excluded; i++) {
+ free(mds->excluded_wtypes[i]);
+ }
+ free(mds->excluded_wtypes);
+ /**/
+ mds->excluded_wtypes = NULL;
+ mds->nr_excluded = 0;
+}
+
+static void
+set_exclude_wtypes(struct mkdic_stat *mds, int nr, char **tokens)
+{
+ int i;
+ mds->nr_excluded = nr - 1;
+ mds->excluded_wtypes = malloc(sizeof(char *) * (nr - 1));
+ /**/
+ for (i = 1; i < nr; i++) {
+ mds->excluded_wtypes[i - 1] = strdup(tokens[i]);
+ }
+}
+
+static void
+set_dict_encoding(struct mkdic_stat *mds, const char *enc)
+{
+ if (!strcmp(enc, "utf8")) {
+ mds->yl.body_encoding = ANTHY_UTF8_ENCODING;
+ }
+}
+
+static void
+set_input_encoding(struct mkdic_stat *mds, const char *enc)
+{
+ if (!strcmp(enc, "utf8")) {
+ mds->input_encoding = ANTHY_UTF8_ENCODING;
+ }
+ if (!strcmp(enc, "eucjp")) {
+ mds->input_encoding = ANTHY_EUC_JP_ENCODING;
+ }
+}
+
+static void
+write_dict_file(struct mkdic_stat *mds)
+{
+ if (!mds->ud) {
+ printf("can not build without use case dict\n");
+ exit(1);
+ }
+
+ /* build the use-case dictionary */
+ make_ucdict(uc_out, mds->ud);
+
+ /* assemble everything into the dictionary file */
+ flush_output_files();
+ link_dics(mds);
+}
+
+static void
+show_command(char **tokens, int nr)
+{
+ int i;
+ printf("cmd:");
+ for (i = 0; i < nr; i++) {
+ printf(" %s", tokens[i]);
+ }
+ printf("\n");
+}
+
+static int
+execute_batch(struct mkdic_stat *mds, const char *fn)
+{
+ int nr;
+ char **tokens;
+ if (anthy_open_file(fn)) {
+ printf("mkanthydic: failed to open %s\n", fn);
+ return 1;
+ }
+ while (!anthy_read_line(&tokens, &nr)) {
+ char *cmd = tokens[0];
+ show_command(tokens, nr);
+ if (!strcmp(cmd, "read") && nr == 2) {
+ read_dict_file(mds, tokens[1]);
+ } else if (!strcmp(cmd, "read_uc") && nr == 2) {
+ read_udict_file(mds, tokens[1]);
+ } else if (!strcmp(cmd, "build_reverse_dict")) {
+ build_reverse_dict(mds);
+ } else if (!strcmp(cmd, "write")) {
+ write_dict_file(mds);
+ } else if (!strcmp(cmd, "set_exclude_wtypes")) {
+ set_exclude_wtypes(mds, nr, tokens);
+ } else if (!strcmp(cmd, "clear_exclude_wtypes")) {
+ clear_exclude_wtypes(mds);
+ } else if (!strcmp(cmd, "set_dict_encoding") && nr == 2) {
+ set_dict_encoding(mds, tokens[1]);
+ } else if (!strcmp(cmd, "set_input_encoding") && nr == 2) {
+ set_input_encoding(mds, tokens[1]);
+ } else if (!strcmp(cmd, "done")) {
+ anthy_free_line();
+ break;
+ } else {
+ printf("Unknown command(%s).\n", cmd);
+ }
+ anthy_free_line();
+ }
+ anthy_close_file();
+ return 0;
+}
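+
+/*
+ * A minimal batch script accepted by execute_batch() could look like the
+ * following (file names are placeholders, not the ones used by the real
+ * build scripts):
+ *
+ *   set_input_encoding eucjp
+ *   read dict.txt
+ *   read_uc udict.txt
+ *   write
+ *   done
+ */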
+
+/* Initialize the variables used for dictionary generation */
+static void
+init_mds(struct mkdic_stat *mds)
+{
+ int i;
+ mds->output_fn = DEFAULT_FN;
+ mds->ud = NULL;
+
+ /* initialize the word dictionary */
+ mds->yl.head = NULL;
+ mds->yl.nr_entries = 0;
+ for (i = 0; i < YOMI_HASH; i++) {
+ mds->yl.hash[i] = NULL;
+ }
+ mds->yl.index_encoding = ANTHY_UTF8_ENCODING;
+ mds->yl.body_encoding = ANTHY_EUC_JP_ENCODING;
+ /**/
+ mds->ac_list.next = NULL;
+ /**/
+ mds->input_encoding = ANTHY_EUC_JP_ENCODING;
+ /**/
+ mds->nr_excluded = 0;
+ mds->excluded_wtypes = NULL;
+}
+
+/* Initialize only the parts of libanthy that are used */
+static void
+init_libs(void)
+{
+ int res;
+ res = anthy_init_xstr();
+ if (res == -1) {
+ fprintf (stderr, "failed to init dic lib\n");
+ exit(1);
+ }
+}
+
+/**/
+int
+main(int argc, char **argv)
+{
+ struct mkdic_stat mds;
+ int i;
+ char *script_fn = NULL;
+ int help_mode = 0;
+
+ anthy_init_wtypes();
+ init_libs();
+ init_mds(&mds);
+
+ for (i = 1; i < argc; i++) {
+ char *arg = argv[i];
+ char *prev_arg = argv[i-1];
+ if (!strcmp(arg, "--help")) {
+ help_mode = 1;
+ }
+ if (!strcmp(prev_arg, "-f")) {
+ script_fn = arg;
+ }
+ }
+
+ if (help_mode || !script_fn) {
+ print_usage();
+ }
+
+ return execute_batch(&mds, script_fn);
+}