anthy-9100hHEAD anthy-9100h master

author: Lorry Tar Creator <lorry-tar-importer@lorry> 2009-02-07 16:32:56 +0000
committer: Lorry Tar Creator <lorry-tar-importer@lorry> 2009-02-07 16:32:56 +0000
commit: a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 (patch)
tree: a966aeee62e69ae3ad13275d07ddb15049b14e0e /mkworddic/mkdic.c
download: anthy-a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0.tar.gz
1 files changed, 1198 insertions, 0 deletions
diff --git a/mkworddic/mkdic.c b/mkworddic/mkdic.c
new file mode 100644
index 0000000..7c99fe0
--- /dev/null
+++ b/mkworddic/mkdic.c
@@ -0,0 +1,1198 @@
+/*
+ * cannadic形式のファイルから辞書ファイルを作る
+ *
+ * Funded by IPA未踏ソフトウェア創造事業 2002 1/1
+ *
+ * Copyright (C) 2000-2007 TABATA Yusuke
+ * Copyright (C) 2005 YOSHIDA Yuichi
+ * Copyright (C) 2001-2002 TAKAI Kousuke
+ */
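+/*
+ * The dictionary is indexed by reading (yomi); a lookup yields the
+ * parts of speech and the converted words (= entries) for that reading.
+ *
+ * reading -> word, word, ...
+ *
+ * The dictionary file uses network byte order.
+ *
+ * The dictionary file consists of several sections:
+ *  0 header, 16*4 bytes
+ *  2 index of readings (one per 512 readings)
+ *  3 readings
+ *  4 pages
+ *  5 index of pages
+ *  6 use-case dictionary (?)
+ *  7 reading hash
+ *
+ * source: the original dictionary files
+ * file_dic: the file to be generated
+ *
+ * yomi_hash: hash bitmap written into the dictionary file
+ * index_hash: hash used within this source to look up struct yomi_entry
+ *
+ */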
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <ctype.h>
+
+#include <config.h>
+
+#include <anthy/anthy.h>
+#include <anthy/xstr.h>
+#include <anthy/wtype.h>
+#include <anthy/ruleparser.h>
+#include <anthy/word_dic.h>
+#include <anthy/diclib.h>
+#include "mkdic.h"
+
+#define MAX_LINE_LEN 10240
+#define NR_HEADER_SECTIONS 16
+#define SECTION_ALIGNMENT 8
+#define MAX_WTYPE_LEN 20
+
+#define DEFAULT_FN "anthy.wdic"
+
+static const char *progname;
+
+/* Global (non-static) variables so that writewords.c can access them. */
+FILE *yomi_entry_index_out, *yomi_entry_out;
+FILE *page_out, *page_index_out;
+/* Streams used only from this file. */
+static FILE *uc_out;
+static FILE *yomi_hash_out;
+/* Number of hash collisions; statistics only. */
+static int yomi_hash_collision;
+
+/* Section streams, listed in the order they appear in the output file.
+ * fn is the path of the temporary file when it was created with
+ * mkstemp(), otherwise NULL.  The {NULL, NULL} entry ends the table. */
+struct file_section {
+  FILE **fpp;
+  char *fn;
+} file_array[] = {
+  {&yomi_entry_index_out, NULL},
+  {&yomi_entry_out, NULL},
+  {&page_out, NULL},
+  {&page_index_out, NULL},
+  {&uc_out, NULL},
+  {&yomi_hash_out, NULL},
+  {NULL, NULL},
+};
+
+/* State of the dictionary build. */
+struct mkdic_stat {
+  /* List of words. */
+  struct yomi_entry_list yl;
+  /* Frequency-adjustment commands; the list hangs off ac_list.next. */
+  struct adjust_command ac_list;
+  /* Use-case dictionary. */
+  struct uc_dict *ud;
+  /* Path of the dictionary file to write. */
+  const char *output_fn;
+  /* Encoding of the input dictionary files. */
+  int input_encoding;
+  /* Word types to skip while reading: count and names. */
+  int nr_excluded;
+  char **excluded_wtypes;
+};
+
+/*
+ * Open one temporary stream for every entry of file_array.
+ *
+ * When TMPDIR is set the file is created there with mkstemp(), because
+ * tmpfile() does not look at TMPDIR; its path is kept in fs->fn so it can
+ * be unlinked later.  Otherwise tmpfile() is used and fs->fn stays NULL.
+ * Any failure prints a message and exits with status 2.
+ */
+static void
+open_output_files(void)
+{
+  const char *tmpdir = getenv("TMPDIR");
+  struct file_section *fs;
+
+  for (fs = file_array; fs->fpp; fs ++) {
+    FILE *fp = NULL;
+
+    fs->fn = NULL;
+    if (tmpdir) {
+      char buf[256];
+      int len = snprintf(buf, sizeof(buf), "%s/mkanthydic.XXXXXX", tmpdir);
+
+      if (len < 0 || (size_t)len >= sizeof(buf)) {
+        /* A truncated template would lose the XXXXXX suffix. */
+        errno = ENAMETOOLONG;
+      } else {
+        int fd = mkstemp(buf);
+        if (fd != -1) {
+          fp = fdopen(fd, "w+");
+          if (!fp) {
+            /* Do not leak the descriptor or leave the file behind. */
+            int saved_errno = errno;
+            close(fd);
+            unlink(buf);
+            errno = saved_errno;
+          } else {
+            fs->fn = strdup(buf);
+            if (!fs->fn) {
+              /* The name cannot be remembered for the later unlink, so
+               * remove it now; the open stream stays usable. */
+              unlink(buf);
+            }
+          }
+        }
+      }
+    } else {
+      fp = tmpfile();
+    }
+    *(fs->fpp) = fp;
+    if (!fp) {
+      fprintf (stderr, "%s: cannot open temporary file: %s\n",
+               progname, strerror (errno));
+      exit (2);
+    }
+  }
+}
+
+/*
+ * Check every section stream for a recorded error, then fflush() them
+ * all.  Exits with status 1 on the first problem found.
+ */
+static void
+flush_output_files (void)
+{
+  int i;
+
+  /* First pass: errors left behind by earlier writes. */
+  for (i = 0; file_array[i].fpp != NULL; i++) {
+    FILE *stream = *file_array[i].fpp;
+    if (ferror(stream) != 0) {
+      fprintf (stderr, "%s: write error\n", progname);
+      exit (1);
+    }
+  }
+  /* Second pass: push out whatever is still buffered. */
+  for (i = 0; file_array[i].fpp != NULL; i++) {
+    FILE *stream = *file_array[i].fpp;
+    if (fflush(stream) != 0) {
+      fprintf (stderr, "%s: write error: %s\n", progname, strerror (errno));
+      exit (1);
+    }
+  }
+}
+
+/* Convert i with anthy_dic_htonl() (network byte order) and write the
+ * resulting int to fp. */
+void
+write_nl(FILE *fp, int i)
+{
+  int be = anthy_dic_htonl(i);
+
+  fwrite(&be, sizeof(int), 1, fp);
+}
+
+/* Print a notice on stdout and exit with status 0. */
+static void
+print_usage(void)
+{
+  fputs("please do not use mkanthydic command directly.\n", stdout);
+  exit(0);
+}
+
+/*
+ * Read the next usable line from fp into buf (MAX_LINE_LEN bytes).
+ *
+ * The trailing newline is removed.  Lines starting with '#' are skipped,
+ * and physical lines that do not fit into the buffer are discarded as a
+ * whole.  A final line without a newline is returned as well, provided
+ * fgets() stopped because it reached end of file.
+ *
+ * Returns buf, or NULL at end of input.
+ */
+static char *
+read_line(FILE *fp, char *buf)
+{
+  /* Non-zero while we are in the middle of an over-long line. */
+  int toolong = 0;
+
+  while (fgets(buf, MAX_LINE_LEN, fp)) {
+    size_t len = strlen(buf);
+    int has_nl;
+
+    if (len == 0) {
+      /* The chunk starts with a NUL byte; buf[len - 1] must not be read. */
+      continue ;
+    }
+    has_nl = (buf[len - 1] == '\n');
+    if (!has_nl && !feof(fp)) {
+      /* Buffer filled up before the end of the line. */
+      toolong = 1;
+      continue ;
+    }
+    if (has_nl) {
+      buf[len - 1] = 0;
+    }
+    if (toolong) {
+      /* This was the tail of an over-long line: drop it, too. */
+      toolong = 0;
+      continue ;
+    }
+    /* Test for a comment only on the start of a complete line, so the
+     * too-long state is never left dangling by a '#' chunk. */
+    if (buf[0] == '#') {
+      continue ;
+    }
+    return buf;
+  }
+  return NULL;
+}
+
+/** Extract the index (the text before the first space) of a
+ * cannadic-format line as an xstr; NULL when the line has no space.
+ * buf is modified temporarily and restored before returning. */
+static xstr *
+get_index_from_line(struct mkdic_stat *mds, char *buf)
+{
+  xstr *index;
+  char *delim = strchr(buf, ' ');
+
+  if (delim == NULL) {
+    /* malformed dictionary line */
+    return NULL;
+  }
+  *delim = '\0';
+  index = anthy_cstr_to_xstr(buf, mds->input_encoding);
+  *delim = ' ';
+  return index;
+}
+
+/** Return a malloc()ed copy of everything after the index of a
+ * cannadic-format line, i.e. the text following the first run of
+ * spaces.  A line without any space yields an empty string instead of
+ * dereferencing NULL.  The caller frees the result. */
+static char *
+get_entry_from_line(char *buf)
+{
+  char *sp = strchr(buf, ' ');
+
+  if (!sp) {
+    return strdup("");
+  }
+  while (*sp == ' ') {
+    sp ++;
+  }
+  return strdup(sp);
+}
+
+/* Bucket number for xs in the in-memory hash: the sum of every
+ * character times 11, reduced modulo YOMI_HASH. */
+static int
+index_hash(xstr *xs)
+{
+  unsigned int sum = 0;
+  int pos = 0;
+
+  while (pos < xs->len) {
+    sum += xs->str[pos] * 11;
+    pos++;
+  }
+  return (int)(sum % YOMI_HASH);
+}
+
+/* Map a word-type name to the name stored with an entry: "#T35" becomes
+ * "#T", a name rejected by anthy_type_to_wtype() becomes "unknown", and
+ * anything else is whatever anthy_type_to_wtype() returns. */
+const char *
+get_wt_name(const char *name)
+{
+  wtype_t unused;
+  const char *canonical;
+
+  if (strcmp(name, "#T35") == 0) {
+    return "#T";
+  }
+  canonical = anthy_type_to_wtype(name, &unused);
+  return canonical ? canonical : "unknown";
+}
+
+/** Append one word to the reading ye.
+ *
+ * Nothing is added when freq is 0 or when wt_name is not a word type
+ * known to anthy.  The word is stored as UTF-8: it is converted when the
+ * input encoding is EUC-JP and copied otherwise.  Running out of memory
+ * prints a message and exits with status 1.
+ */
+static void
+push_back_word_entry(struct mkdic_stat *mds,
+                     struct yomi_entry *ye, const char *wt_name,
+                     int freq, const char *word, int order)
+{
+  wtype_t wt;
+  struct word_entry *grown;
+  struct word_entry *we;
+  char *s;
+
+  if (freq == 0) {
+    return ;
+  }
+  if (!anthy_type_to_wtype(wt_name, &wt)) {
+    /* word type unknown to anthy */
+    return ;
+  }
+  if (mds->input_encoding == ANTHY_EUC_JP_ENCODING) {
+    s = anthy_conv_euc_to_utf8(word);
+  } else {
+    s = strdup(word);
+  }
+  /* Keep the old array reachable until the new one is known to exist. */
+  grown = realloc(ye->entries,
+                  sizeof(struct word_entry) * (ye->nr_entries + 1));
+  if (!s || !grown) {
+    fprintf(stderr, "mkanthydic: out of memory\n");
+    exit(1);
+  }
+  ye->entries = grown;
+
+  we = &ye->entries[ye->nr_entries];
+  we->ye = ye;
+  we->wt_name = get_wt_name(wt_name);
+  we->raw_freq = freq;
+  we->feature = 0;
+  we->source_order = order;
+  we->word_utf8 = s;
+  ye->nr_entries ++;
+}
+
+/* Split a "#XX*freq" token: the word-type part is copied into wtbuf and
+ * the frequency is returned.  The frequency is 1 when there is no '*' or
+ * when atoi() gives 0.  A token of MAX_WTYPE_LEN characters or more
+ * returns 0 and leaves wtbuf untouched. */
+static int
+parse_wtype(char *wtbuf, char *cur)
+{
+  char *star;
+  int parsed;
+
+  if (strlen(cur) >= MAX_WTYPE_LEN) {
+    return 0;
+  }
+  strcpy(wtbuf, cur);
+
+  star = strchr(wtbuf, '*');
+  if (star == NULL) {
+    return 1;
+  }
+  /* cut the word type off at the '*' and read the number after it */
+  *star = 0;
+  parsed = atoi(star + 1);
+  return parsed ? parsed : 1;
+}
+
+/* Length of a compound-word element encoded as one character:
+ * '1'..'9' give 1..9, 'a'..'z' give 10 and up, anything else gives 0. */
+static int
+get_element_len(xchar xc)
+{
+  if (xc >= 'a' && xc <= 'z') {
+    return 10 + (xc - 'a');
+  }
+  if (xc <= '9' && xc > '0') {
+    return xc - '0';
+  }
+  return 0;
+}
+
+/** Format check for a compound candidate: the element lengths that
+ * follow each '_' in cur must add up to the length of index.
+ * Returns 1 when they do; otherwise reports on stderr and returns 0. */
+static int
+check_compound_candidate(struct mkdic_stat *mds, xstr *index, const char *cur)
+{
+  xstr *cand = anthy_cstr_to_xstr(cur, mds->input_encoding);
+  int sum = 0;
+  int pos = 0;
+
+  /* add up the length digit found right after every '_' */
+  while (pos + 1 < cand->len) {
+    if (cand->str[pos] == '_') {
+      sum += get_element_len(cand->str[pos + 1]);
+    }
+    pos++;
+  }
+  anthy_free_xstr(cand);
+
+  if (sum == index->len) {
+    return 1;
+  }
+  fprintf(stderr, "Invalid compound candidate (%s, length = %d).\n",
+          cur, sum);
+  return 0;
+}
+
+/* Return 1 when wt is one of mds->excluded_wtypes, else 0. */
+static int
+is_excluded_wtype(struct mkdic_stat *mds, char *wt)
+{
+  int k = 0;
+
+  while (k < mds->nr_excluded) {
+    if (strcmp(mds->excluded_wtypes[k], wt) == 0) {
+      return 1;
+    }
+    k++;
+  }
+  return 0;
+}
+
+/* Return a pointer to the space or NUL that ends the token starting at
+ * cur.  A backslash escapes the character after it; a backslash that is
+ * the last character of the string makes the function return NULL. */
+static char *
+find_token_end(char *cur)
+{
+  char *p = cur;
+
+  while (*p != '\0' && *p != ' ') {
+    if (*p != '\\') {
+      p += 1;
+      continue;
+    }
+    if (p[1] == '\0') {
+      return NULL;
+    }
+    /* step over the backslash and the escaped character */
+    p += 2;
+  }
+  return p;
+}
+
+/** Split the entry part of a dictionary line into tokens and add the
+ * words it contains to ye.
+ *
+ * A "#" token followed by a letter sets the current word type and
+ * frequency.  A "#_" token is added as a compound candidate when it
+ * passes check_compound_candidate().  Every other token is added as a
+ * word unless the current word type is on the exclusion list.
+ */
+static void
+push_back_word_entry_line(struct mkdic_stat *mds, struct yomi_entry *ye,
+                          const char *ent)
+{
+  char wtbuf[MAX_WTYPE_LEN];
+  char *work = alloca(strlen(ent) + 1);
+  char *tok = work;
+  int freq = 0;
+  int order = 0;
+  int last;
+
+  strcpy(work, ent);
+  wtbuf[0] = 0;
+
+  do {
+    /* find the space or NUL after tok and terminate the token there */
+    char *end = find_token_end(tok);
+    if (end == NULL) {
+      fprintf(stderr, "invalid \\ at the end of line (%s).\n",
+              ent);
+      return ;
+    }
+    last = (*end == '\0');
+    *end = 0;
+
+    if (tok[0] != '#') {
+      /* plain word: add it unless its word type is excluded */
+      if (!is_excluded_wtype(mds, wtbuf)) {
+        push_back_word_entry(mds, ye, wtbuf, freq, tok, order);
+        order ++;
+      }
+    } else if (isalpha((unsigned char)tok[1])) {
+      /* parse #XX*?? */
+      freq = parse_wtype(wtbuf, tok);
+    } else if (tok[1] == '_' &&
+               check_compound_candidate(mds, ye->index_xstr, &tok[1])) {
+      /* #_ compound candidate */
+      push_back_word_entry(mds, ye, wtbuf, freq, tok, order);
+      order ++;
+    }
+
+    tok = end + 1;
+  } while (!last);
+}
+
+/** Return 1 when an entry before ye->entries[idx] has the same
+ * frequency, word type and word, else 0.
+ *
+ * The entries are expected to be sorted by descending frequency, so the
+ * backward scan stops at the first entry with a different frequency.
+ * The caller sets the frequency of a detected duplicate to 0; such
+ * entries are stepped over so they do not hide later duplicates.
+ * All entries of equal frequency are examined, not only the previous one.
+ */
+static int
+check_same_word(struct yomi_entry *ye, int idx)
+{
+  struct word_entry *base = &ye->entries[idx];
+  int i;
+  for (i = idx -1; i >= 0; i--) {
+    struct word_entry *cur = &ye->entries[i];
+    if (cur->raw_freq == 0 && base->raw_freq != 0) {
+      /* a duplicate zeroed earlier in this pass (or a killed word) */
+      continue;
+    }
+    if (base->raw_freq != cur->raw_freq) {
+      return 0;
+    }
+    if (!strcmp(base->wt_name, cur->wt_name) &&
+        !strcmp(base->word_utf8, cur->word_utf8)) {
+      /* identical */
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/** qsort() comparison function: higher raw_freq sorts first.
+ * Uses comparisons rather than a subtraction so that a large positive
+ * and a large negative frequency cannot overflow. */
+static int
+compare_word_entry_by_freq(const void *p1, const void *p2)
+{
+  const struct word_entry *e1 = p1;
+  const struct word_entry *e2 = p2;
+  return (e2->raw_freq > e1->raw_freq) - (e2->raw_freq < e1->raw_freq);
+}
+
+/** qsort() comparison function: order by word-type name, and by
+ * compare_word_entry_by_freq() among equal names. */
+static int
+compare_word_entry_by_wtype(const void *p1, const void *p2)
+{
+  const struct word_entry *lhs = p1;
+  const struct word_entry *rhs = p2;
+  int by_name = strcmp(lhs->wt_name, rhs->wt_name);
+
+  return by_name ? by_name : compare_word_entry_by_freq(p1, p2);
+}
+
+/** Sort the words of a reading and disable duplicates.
+ * The entries are sorted by frequency, duplicates found by
+ * check_same_word() get frequency 0, and the array is then sorted again
+ * by word type.  Returns the number of entries minus the duplicates,
+ * or 0 when ye is NULL. */
+static int
+normalize_word_entry(struct yomi_entry *ye)
+{
+  int dup_count = 0;
+  int k;
+
+  if (ye == NULL) {
+    return 0;
+  }
+  /* order the words by frequency */
+  qsort(ye->entries, ye->nr_entries,
+        sizeof(struct word_entry), compare_word_entry_by_freq);
+  /* a duplicate scores zero */
+  for (k = 0; k < ye->nr_entries; k++) {
+    if (!check_same_word(ye, k)) {
+      continue;
+    }
+    ye->entries[k].raw_freq = 0;
+    dup_count++;
+  }
+  /* sort once more, this time by word type */
+  qsort(ye->entries, ye->nr_entries,
+        sizeof(struct word_entry), compare_word_entry_by_wtype);
+  return ye->nr_entries - dup_count;
+}
+
+/* Return the yomi_entry for the reading index.
+ *
+ * The entry is looked up in the hash chain of yl.  When it is missing
+ * and create is zero, NULL is returned; otherwise a new empty entry is
+ * allocated, linked at the head of both the hash chain and the list,
+ * and returned.  Running out of memory prints a message and exits with
+ * status 1.
+ */
+struct yomi_entry *
+find_yomi_entry(struct yomi_entry_list *yl, xstr *index, int create)
+{
+  struct yomi_entry *ye;
+  int hash = index_hash(index);
+  /* search the hash chain */
+  for (ye = yl->hash[hash];ye ; ye = ye->hash_next) {
+    if (!anthy_xstrcmp(ye->index_xstr, index)) {
+      return ye;
+    }
+  }
+  if (!create) {
+    return NULL;
+  }
+
+  /* not found, so allocate one */
+  ye = malloc(sizeof(struct yomi_entry));
+  if (!ye) {
+    fprintf(stderr, "mkanthydic: out of memory\n");
+    exit(1);
+  }
+  ye->nr_entries = 0;
+  ye->entries = 0;
+  ye->next = NULL;
+  ye->index_xstr = anthy_xstr_dup(index);
+  if (!ye->index_xstr) {
+    fprintf(stderr, "mkanthydic: out of memory\n");
+    exit(1);
+  }
+  ye->index_str = NULL;
+
+  /* link into the hash chain */
+  ye->hash_next = yl->hash[hash];
+  yl->hash[hash] = ye;
+
+  /* link into the list */
+  ye->next = yl->head;
+  yl->head = ye;
+
+  yl->nr_entries ++;
+
+  return ye;
+}
+
+/* Set the bit for xs in the hash bitmap that goes into the dictionary
+ * file, counting a collision when the bit was already set. */
+static void
+mark_hash_array(unsigned char *hash_array, xstr *xs)
+{
+  int val = anthy_xstr_hash(xs);
+  int idx, bit, mask;
+
+  val &= (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1);
+  /* upper bits select the byte, lower bits the bit inside it */
+  idx = (val >> YOMI_HASH_ARRAY_SHIFT) & (YOMI_HASH_ARRAY_SIZE-1);
+  bit = val & ((1 << YOMI_HASH_ARRAY_SHIFT) - 1);
+  mask = (1 << bit);
+
+  if ((hash_array[idx] & mask) != 0) {
+    yomi_hash_collision ++;
+  }
+  hash_array[idx] |= mask;
+}
+
+/* Build the bitmap of reading hashes and write it to yomi_hash_out.
+ *
+ * Every valid reading in yl->ye_array is marked in a zeroed bitmap of
+ * YOMI_HASH_ARRAY_SIZE bytes; collision statistics are printed on
+ * stdout.  The bitmap is freed again after it has been written.
+ */
+static void
+mk_yomi_hash(FILE *yomi_hash_out, struct yomi_entry_list *yl)
+{
+  unsigned char *hash_array;
+  int i;
+
+  hash_array = calloc(YOMI_HASH_ARRAY_SIZE, 1);
+  if (!hash_array) {
+    fprintf(stderr, "mkanthydic: out of memory\n");
+    exit(1);
+  }
+  for (i = 0; i < yl->nr_valid_entries; i++) {
+    struct yomi_entry *ye = yl->ye_array[i];
+    mark_hash_array(hash_array, ye->index_xstr);
+  }
+  fwrite(hash_array, YOMI_HASH_ARRAY_SIZE, 1, yomi_hash_out);
+  free(hash_array);
+  printf("generated yomi hash bitmap (%d collisions/%d entries)\n",
+         yomi_hash_collision, yl->nr_valid_entries);
+}
+
+/* Parse the arguments of a \modify_freq command:
+ *   <reading> <word type> <word> up|down|kill
+ * Returns a newly allocated command with next set to NULL, or NULL when
+ * a field is missing, the action is not recognised, or memory runs out.
+ * The reading and the word are converted from EUC-JP. */
+static struct adjust_command *
+parse_modify_freq_command(const char *buf)
+{
+  char *line = alloca(strlen(buf) + 1);
+  char *yomi, *wt, *word, *type_str;
+  struct adjust_command *cmd;
+  int type = 0;
+  strcpy(line, buf);
+  yomi = strtok(line, " ");
+  wt = strtok(NULL, " ");
+  word = strtok(NULL, " ");
+  type_str = strtok(NULL, " ");
+  if (!yomi || !wt || !word || !type_str) {
+    return NULL;
+  }
+  if (!strcmp(type_str, "up")) {
+    type = ADJUST_FREQ_UP;
+  } else if (!strcmp(type_str, "down")) {
+    type = ADJUST_FREQ_DOWN;
+  } else if (!strcmp(type_str, "kill")) {
+    type = ADJUST_FREQ_KILL;
+  }
+  if (!type) {
+    return NULL;
+  }
+  cmd = malloc(sizeof(struct adjust_command));
+  if (!cmd) {
+    return NULL;
+  }
+  /* Never hand out a node with an indeterminate link. */
+  cmd->next = NULL;
+  cmd->type = type;
+  cmd->yomi = anthy_cstr_to_xstr(yomi, ANTHY_EUC_JP_ENCODING);
+  cmd->wt = get_wt_name(wt);
+  cmd->word = anthy_conv_euc_to_utf8(word);
+  return cmd;
+}
+
+/* Parse a line that starts with a backslash command.  Only
+ * "\modify_freq " is recognised; a successfully parsed command is
+ * inserted right after the list head ac_list. */
+static void
+parse_adjust_command(const char *buf, struct adjust_command *ac_list)
+{
+  struct adjust_command *parsed;
+
+  if (strncmp("\\modify_freq ", buf, 13) != 0) {
+    return;
+  }
+  parsed = parse_modify_freq_command(&buf[13]);
+  if (parsed == NULL) {
+    return;
+  }
+  parsed->next = ac_list->next;
+  ac_list->next = parsed;
+}
+
+/** Read a dictionary line by line and build the word list; this is the
+ * core of the command.
+ *
+ * Lines starting with a backslash not followed by a space are
+ * adjustment commands.  Readings longer than 30 characters are ignored.
+ * NOTE(review): the first line that has no space ends the parsing of
+ * the whole file -- confirm that this is intended. */
+static void
+parse_dict_file(FILE *fin, struct mkdic_stat *mds)
+{
+  char line[MAX_LINE_LEN];
+
+  while (read_line(fin, line)) {
+    xstr *yomi;
+    char *body;
+
+    if (line[0] == '\\' && line[1] != ' ') {
+      parse_adjust_command(line, &mds->ac_list);
+      continue ;
+    }
+
+    yomi = get_index_from_line(mds, line);
+    if (yomi == NULL) {
+      return ;
+    }
+    body = get_entry_from_line(line);
+
+    if (yomi->len <= 30) {
+      struct yomi_entry *ye = find_yomi_entry(&mds->yl, yomi, 1);
+      push_back_word_entry_line(mds, ye, body);
+    }
+
+    free(body);
+    anthy_free_xstr(yomi);
+  }
+}
+
+/* Look up a word by the triple (reading, word type, word).
+ * Returns the matching entry or NULL. */
+static struct word_entry *
+find_word_entry(struct yomi_entry_list *yl, xstr *yomi,
+                const char *wt, char *word)
+{
+  struct yomi_entry *ye = find_yomi_entry(yl, yomi, 0);
+  struct word_entry *we;
+  struct word_entry *end;
+
+  if (ye == NULL) {
+    return NULL;
+  }
+  end = ye->entries + ye->nr_entries;
+  for (we = ye->entries; we < end; we++) {
+    if (strcmp(we->wt_name, wt) == 0 &&
+        strcmp(we->word_utf8, word) == 0) {
+      return we;
+    }
+  }
+  return NULL;
+}
+		
+/* Apply the frequency-adjustment commands to the word list.
+ * "up" multiplies the frequency by 4, "down" divides it by 4 (but never
+ * down to 0) and "kill" sets it to 0.  A command whose target word is
+ * not found is reported on stdout and skipped. */
+static void
+apply_adjust_command(struct yomi_entry_list *yl,
+                     struct adjust_command *ac_list)
+{
+  struct adjust_command *cmd = ac_list->next;
+
+  for (; cmd != NULL; cmd = cmd->next) {
+    struct word_entry *target = find_word_entry(yl, cmd->yomi,
+                                                cmd->wt, cmd->word);
+    if (target == NULL) {
+      char *yomi = anthy_xstr_to_cstr(cmd->yomi, ANTHY_UTF8_ENCODING);
+      printf("failed to find target of adjust command (%s, %s, %s)\n",
+             yomi, cmd->wt, cmd->word);
+      free(yomi);
+      continue;
+    }
+
+    if (cmd->type == ADJUST_FREQ_UP) {
+      target->raw_freq *= 4;
+    } else if (cmd->type == ADJUST_FREQ_DOWN) {
+      target->raw_freq /= 4;
+      if (target->raw_freq == 0) {
+        target->raw_freq = 1;
+      }
+    } else if (cmd->type == ADJUST_FREQ_KILL) {
+      target->raw_freq = 0;
+    }
+  }
+}
+
+/* qsort() comparison function for an array of yomi_entry pointers:
+ * strcmp() order of index_str. */
+static int
+compare_yomi_entry(const void *p1, const void *p2)
+{
+  const struct yomi_entry *lhs = *(const struct yomi_entry *const *)p1;
+  const struct yomi_entry *rhs = *(const struct yomi_entry *const *)p2;
+
+  return strcmp(lhs->index_str, rhs->index_str);
+}
+
+/* Collect the readings that have words and sort them.
+ *
+ * Readings with at least one word are gathered into yl->ye_array, their
+ * index is converted to a C string in yl->index_encoding, and the array
+ * is sorted on that string.  The words of every reading are then
+ * normalized and counted in yl->nr_words.  Running out of memory prints
+ * a message and exits with status 1.
+ */
+static void
+sort_word_dict(struct yomi_entry_list *yl)
+{
+  int i;
+  struct yomi_entry *ye;
+  yl->nr_valid_entries = 0;
+  /* repack only the readings that have words into yl->ye_array */
+  yl->ye_array = malloc(sizeof(struct yomi_entry *) * yl->nr_entries);
+  if (!yl->ye_array && yl->nr_entries > 0) {
+    fprintf(stderr, "mkanthydic: out of memory\n");
+    exit(1);
+  }
+  /* The NULL test keeps the walk safe should the list ever be shorter
+   * than nr_entries claims. */
+  for (i = 0, ye = yl->head; ye && i < yl->nr_entries; i++, ye = ye->next) {
+    if (ye->nr_entries > 0) {
+      ye->index_str = anthy_xstr_to_cstr(ye->index_xstr, yl->index_encoding);
+      yl->ye_array[yl->nr_valid_entries] = ye;
+      yl->nr_valid_entries ++;
+    }
+  }
+  /* sort */
+  qsort(yl->ye_array, yl->nr_valid_entries,
+        sizeof(struct yomi_entry *),
+        compare_yomi_entry);
+  /* drop unneeded words */
+  yl->nr_words = 0;
+  for (i = 0; i < yl->nr_valid_entries; i++) {
+    yl->nr_words += normalize_word_entry(yl->ye_array[i]);
+  }
+}
+
+/** Current position of fp rounded up to a multiple of
+ * SECTION_ALIGNMENT; 0 when fp is NULL. */
+static int
+get_file_size(FILE *fp)
+{
+  if (fp == NULL) {
+    return 0;
+  }
+  return (ftell (fp) + SECTION_ALIGNMENT - 1) & (-SECTION_ALIGNMENT);
+}
+
+/* Append the whole contents of in to out, first padding out with zero
+ * bytes up to the next SECTION_ALIGNMENT boundary.  A short write is
+ * reported with the output file name and ends the program (status 1). */
+static void
+copy_file(struct mkdic_stat *mds, FILE *in, FILE *out)
+{
+  char chunk[BUFSIZ];
+  size_t got;
+  int pos = ftell (out);
+
+  /* zero padding up to the alignment boundary */
+  while (pos & (SECTION_ALIGNMENT - 1)) {
+    fputc (0, out);
+    pos++;
+  }
+
+  /* copy from the beginning of in */
+  rewind (in);
+  for (;;) {
+    got = fread (chunk, 1, sizeof chunk, in);
+    if (got == 0) {
+      break;
+    }
+    if (fwrite (chunk, 1, got, out) < got) {
+      /* short write, maybe the disk is full */
+      fprintf (stderr, "%s: %s: write error: %s\n",
+               progname, mds->output_fn, strerror (errno));
+      exit (1);
+    }
+  }
+}
+
+/* Write the header: NR_HEADER_SECTIONS ints in network byte order.
+ * Slot 0 is the header size, slot 1 is 0, and the slots from 2 on hold
+ * the offset of each section of file_array; unused slots are 0. */
+static void
+generate_header(FILE *fp)
+{
+  int table[NR_HEADER_SECTIONS];
+  struct file_section *fs = file_array;
+  int slot;
+  int offset;
+
+  /* clear everything */
+  for (slot = 0; slot < NR_HEADER_SECTIONS; slot++) {
+    table[slot] = 0;
+  }
+
+  /* the header itself */
+  table[0] = NR_HEADER_SECTIONS * sizeof(int);
+  table[1] = 0;
+
+  /* every section starts where the previous one ends */
+  offset = table[0];
+  slot = 2;
+  while (fs->fpp) {
+    table[slot] = offset;
+    offset += get_file_size(*(fs->fpp));
+    fs ++;
+    slot++;
+  }
+
+  /* write it out */
+  for (slot = 0; slot < NR_HEADER_SECTIONS; slot++) {
+    write_nl(fp, table[slot]);
+  }
+}
+
+/* Merge the section files into the single dictionary file
+ * mds->output_fn: the header first, then every section in file_array
+ * order.  Temporary files that have a name are unlinked after being
+ * copied.  Failing to create or to close the file ends the program
+ * with status 1. */
+static void
+link_dics(struct mkdic_stat *mds)
+{
+  struct file_section *fs;
+  FILE *out = fopen (mds->output_fn, "w");
+
+  if (out == NULL) {
+      fprintf (stderr, "%s: %s: cannot create: %s\n",
+               progname, mds->output_fn, strerror (errno));
+      exit (1);
+  }
+
+  /* header */
+  generate_header(out);
+
+  /* sections, one after another */
+  fs = file_array;
+  while (fs->fpp) {
+    copy_file(mds, *(fs->fpp), out);
+    if (fs->fn != NULL) {
+      unlink(fs->fn);
+    }
+    fs ++;
+  }
+
+  if (fclose (out) != 0) {
+    fprintf (stderr, "%s: %s: write error: %s\n",
+             progname, mds->output_fn, strerror (errno));
+    exit (1);
+  }
+}
+
+/* Read the dictionary source file fn into mds.  A file that cannot be
+ * opened is only reported on stdout. */
+static void
+read_dict_file(struct mkdic_stat *mds, const char *fn)
+{
+  FILE *src = fopen(fn, "r");
+
+  if (src == NULL) {
+    printf("failed file = %s\n", fn);
+    return;
+  }
+  printf("file = %s\n", fn);
+  parse_dict_file(src, mds);
+  fclose(src);
+}
+
+static void
+complete_words(struct mkdic_stat *mds)
+{
+  /* Finish the word list and write the word sections to the temporary
+   * output streams.  The steps below run in this fixed order.
+   *
+   * Apply the frequency adjustments. */
+  apply_adjust_command(&mds->yl, &mds->ac_list);
+
+  /* Presumably computes the final frequencies; calc_freq() is defined
+   * outside this file -- TODO confirm. */
+  calc_freq(&mds->yl);
+
+  /* Sort by reading. */
+  sort_word_dict(&mds->yl);
+
+  /* Prepare the output files. */
+  open_output_files();
+  /* Write the word dictionary. */
+  output_word_dict(&mds->yl);
+
+  /* Build the reading hash. */
+  mk_yomi_hash(yomi_hash_out, &mds->yl);
+}
+
+/* Read the use-case file fn.  On the first call the use-case dictionary
+ * is created and the word list is completed and written out. */
+static void
+read_udict_file(struct mkdic_stat *mds, const char *fn)
+{
+  if (mds->ud == NULL) {
+    mds->ud = create_uc_dict();
+    complete_words(mds);
+  }
+
+  read_uc_file(mds->ud, fn);
+  printf("uc = %s\n", fn);
+}
+
+/* Append the first n characters of src to xs and return xs.
+ * The buffer is grown to xs->len + n + 1 characters first; when that
+ * fails a message is printed and the program exits with status 1
+ * instead of writing through a NULL pointer. */
+static xstr*
+xstr_strncat(xstr* xs, xchar* src, int n)
+{
+  int i;
+  size_t want = sizeof(xchar) * (xs->len + n + 1);
+  xchar *grown = realloc(xs->str, want);
+
+  if (!grown && want != 0) {
+    fprintf(stderr, "mkanthydic: out of memory\n");
+    exit(1);
+  }
+  xs->str = grown;
+
+  for (i = 0; i < n; ++i) {
+    xs->str[xs->len + i] = src[i];
+  }
+  xs->len += n;
+  return xs;
+}
+
+static void
+reverse_multi_segment_word(struct mkdic_stat *mds, struct word_entry *we)
+{
+  /*
+    Build the reverse-conversion entry of a compound word, i.e. from
+    「かなかんじへんかんえんじん #T35 #_2仮名_3漢字_4変換_4エンジン」
+    make
+    「仮名漢字変換エンジン #T35 #_2かな_2かんじ_2へんかん_4えんじん」
+
+    The new reading (yomi_xs) is the concatenation of the segment texts
+    of we->word_utf8.  The new word (word_xs) is "#" followed, for every
+    segment, by '_', the first character of the segment text length
+    printed with "%x", and the matching slice of the original reading.
+    Nothing is emitted until the first '_' has been seen, because
+    yomi_seg_start is 0 until then.
+
+    NOTE(review): only ch[0] is used, so a segment text of 16 or more
+    characters would get a wrong length digit -- confirm.
+    NOTE(review): when '_' is the last character, str[j + 1] looks like
+    it reads past the end of the text -- confirm against the xstr layout.
+  */
+  int j;
+  /* yomi is mixed kana/kanji; word consists of hiragana only */
+  int yomi_seg_start = 0;
+  int word_seg_start = 0;
+  int word_seg_len = 0;
+  xstr *yomibuf = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
+  xstr *wordbuf = we->ye->index_xstr;
+  xstr *yomi_xs = anthy_cstr_to_xstr("", 0);
+  xstr *word_xs = anthy_cstr_to_xstr("#", 0);
+  char *word;
+  char ch[256];
+  struct yomi_entry *target_ye;
+
+  for (j = 0; j <= yomibuf->len; ++j) {
+    if (j == yomibuf->len || yomibuf->str[j] == '_') {
+      if (yomi_seg_start != 0) {
+	anthy_xstrappend(word_xs, '_');
+	snprintf(ch, 256, "%x", j - yomi_seg_start);
+	anthy_xstrappend(word_xs, (xchar)ch[0]);
+	xstr_strncat(word_xs, &wordbuf->str[word_seg_start], word_seg_len);
+	xstr_strncat(yomi_xs, &yomibuf->str[yomi_seg_start], j - yomi_seg_start);
+      }
+      if (j == yomibuf->len) {
+	break;
+      }
+      yomi_seg_start = j + 2;
+      word_seg_start += word_seg_len;
+      word_seg_len = get_element_len(yomibuf->str[j + 1]);
+    }
+  }
+
+  target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
+  word = anthy_xstr_to_cstr(word_xs, mds->input_encoding);
+
+  /* entries of the reverse-conversion dictionary have a negative freq */
+  push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
+		       word, we->source_order);
+
+  free(word);
+  anthy_free_xstr(yomibuf);
+  anthy_free_xstr(yomi_xs);
+  anthy_free_xstr(word_xs);
+}
+
+/* Build the dictionary for reverse conversion.
+ *
+ * Every word present when the function starts gets a counterpart entry
+ * whose reading is the word and whose word is the original reading,
+ * stored with a negated frequency.  Compound words ("#_...") are
+ * handled by reverse_multi_segment_word(); other "#" words are skipped.
+ * Running out of memory prints a message and exits with status 1.
+ */
+static void
+build_reverse_dict(struct mkdic_stat *mds)
+{
+  struct yomi_entry *ye;
+  int i, n;
+  struct word_entry *we_array;
+  printf("building reverse index\n");
+
+  /* count the words */
+  n = 0;
+  for (ye = mds->yl.head; ye; ye = ye->next) {
+    n += ye->nr_entries;
+  }
+  if (n == 0) {
+    /* nothing to reverse; also avoids malloc(0) */
+    return ;
+  }
+  /* Take a copy: pointers into the dictionary move when realloc() is
+   * called while entries are being added below. */
+  we_array = malloc(sizeof(struct word_entry )* n);
+  if (!we_array) {
+    fprintf(stderr, "mkanthydic: out of memory\n");
+    exit(1);
+  }
+  n = 0;
+  for (ye = mds->yl.head; ye; ye = ye->next) {
+    for (i = 0; i < ye->nr_entries; i++) {
+      we_array[n] = ye->entries[i];
+      n++;
+    }
+  }
+
+  /* add the reversed entries */
+  for (i = 0; i < n; i++) {
+    struct word_entry *we = &we_array[i];
+
+    if (we->word_utf8[0] == '#') {
+      if (we->word_utf8[1] == '_') {
+        reverse_multi_segment_word(mds, we);
+      }
+    } else {
+      /* yomi is mixed kana/kanji; word consists of hiragana only */
+      xstr *yomi_xs = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
+      struct yomi_entry *target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
+      char *word = anthy_xstr_to_cstr(we->ye->index_xstr,
+                                      mds->input_encoding);
+
+      /* entries of the reverse-conversion dictionary have a negative freq */
+      push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
+                           word, we->source_order);
+
+      anthy_free_xstr(yomi_xs);
+      free(word);
+    }
+  }
+  free(we_array);
+}
+
+/* Free the list of excluded word types and leave it empty. */
+static void
+clear_exclude_wtypes(struct mkdic_stat *mds)
+{
+  int k = 0;
+
+  while (k < mds->nr_excluded) {
+    free(mds->excluded_wtypes[k]);
+    k++;
+  }
+  free(mds->excluded_wtypes);
+
+  mds->nr_excluded = 0;
+  mds->excluded_wtypes = NULL;
+}
+
+/* Replace the list of excluded word types with tokens[1..nr-1]
+ * (tokens[0] is the command name).  A previously set list is freed
+ * first, so repeating the command does not leak.  Running out of memory
+ * prints a message and exits with status 1. */
+static void
+set_exclude_wtypes(struct mkdic_stat *mds, int nr, char **tokens)
+{
+  int i;
+  int count = nr - 1;
+  char **list;
+
+  clear_exclude_wtypes(mds);
+  if (count <= 0) {
+    return ;
+  }
+  list = malloc(sizeof(char *) * count);
+  if (!list) {
+    fprintf(stderr, "mkanthydic: out of memory\n");
+    exit(1);
+  }
+  for (i = 0; i < count; i++) {
+    list[i] = strdup(tokens[i + 1]);
+    if (!list[i]) {
+      fprintf(stderr, "mkanthydic: out of memory\n");
+      exit(1);
+    }
+  }
+  mds->excluded_wtypes = list;
+  mds->nr_excluded = count;
+}
+
+/* Select the body encoding of the generated dictionary.  Only "utf8" is
+ * recognised; any other name leaves the setting unchanged. */
+static void
+set_dict_encoding(struct mkdic_stat *mds, const char *enc)
+{
+  if (strcmp(enc, "utf8") != 0) {
+    return;
+  }
+  mds->yl.body_encoding = ANTHY_UTF8_ENCODING;
+}
+
+/* Select the encoding of the input dictionary files: "utf8" or "eucjp".
+ * Any other name leaves the setting unchanged. */
+static void
+set_input_encoding(struct mkdic_stat *mds, const char *enc)
+{
+  if (strcmp(enc, "utf8") == 0) {
+    mds->input_encoding = ANTHY_UTF8_ENCODING;
+  } else if (strcmp(enc, "eucjp") == 0) {
+    mds->input_encoding = ANTHY_EUC_JP_ENCODING;
+  }
+}
+
+/* Write the final dictionary file.  A use-case dictionary must have
+ * been read before; without one the program exits with status 1. */
+static void
+write_dict_file(struct mkdic_stat *mds)
+{
+  if (mds->ud == NULL) {
+    printf("can not build without use case dict\n");
+    exit(1);
+  }
+
+  /* build the use-case dictionary section */
+  make_ucdict(uc_out, mds->ud);
+
+  /* flush the sections and merge them into the dictionary file */
+  flush_output_files();
+  link_dics(mds);
+}
+
+/* Echo a script command and its arguments on stdout as "cmd: a b c". */
+static void
+show_command(char **tokens, int nr)
+{
+  char **tok = tokens;
+  char **end = tokens + nr;
+
+  printf("cmd:");
+  while (tok < end) {
+    printf(" %s", *tok);
+    tok++;
+  }
+  printf("\n");
+}
+
+/* Run the commands of the script file fn, one per line, until "done" or
+ * the end of the file.  Every command is echoed first; unknown commands
+ * are reported on stdout and skipped.
+ * Returns 1 when the script cannot be opened, 0 otherwise. */
+static int
+execute_batch(struct mkdic_stat *mds, const char *fn)
+{
+  char **tokens;
+  int nr;
+  int finished = 0;
+
+  if (anthy_open_file(fn)) {
+    printf("mkanthydic: failed to open %s\n", fn);
+    return 1;
+  }
+  while (!finished && !anthy_read_line(&tokens, &nr)) {
+    char *cmd = tokens[0];
+    int one_arg = (nr == 2);
+
+    show_command(tokens, nr);
+    if (one_arg && !strcmp(cmd, "read")) {
+      read_dict_file(mds, tokens[1]);
+    } else if (one_arg && !strcmp(cmd, "read_uc")) {
+      read_udict_file(mds, tokens[1]);
+    } else if (!strcmp(cmd, "build_reverse_dict")) {
+      build_reverse_dict(mds);
+    } else if (!strcmp(cmd, "write")) {
+      write_dict_file(mds);
+    } else if (!strcmp(cmd, "set_exclude_wtypes")) {
+      set_exclude_wtypes(mds, nr, tokens);
+    } else if (!strcmp(cmd, "clear_exclude_wtypes")) {
+      clear_exclude_wtypes(mds);
+    } else if (one_arg && !strcmp(cmd, "set_dict_encoding")) {
+      set_dict_encoding(mds, tokens[1]);
+    } else if (one_arg && !strcmp(cmd, "set_input_encoding")) {
+      set_input_encoding(mds, tokens[1]);
+    } else if (!strcmp(cmd, "done")) {
+      /* the line is released below, then the loop ends */
+      finished = 1;
+    } else {
+      printf("Unknown command(%s).\n", cmd);
+    }
+    anthy_free_line();
+  }
+  anthy_close_file();
+  return 0;
+}
+
+/* Initialise the state used to build the dictionary: default output
+ * name, empty word list and hash table, UTF-8 index and EUC-JP body
+ * encoding, EUC-JP input, no adjustment commands and no excluded word
+ * types. */
+static void
+init_mds(struct mkdic_stat *mds)
+{
+  int bucket;
+
+  mds->output_fn = DEFAULT_FN;
+  mds->ud = NULL;
+  mds->input_encoding = ANTHY_EUC_JP_ENCODING;
+
+  /* word dictionary */
+  mds->yl.head = NULL;
+  mds->yl.nr_entries = 0;
+  mds->yl.index_encoding = ANTHY_UTF8_ENCODING;
+  mds->yl.body_encoding = ANTHY_EUC_JP_ENCODING;
+  for (bucket = 0; bucket < YOMI_HASH; bucket++) {
+    mds->yl.hash[bucket] = NULL;
+  }
+
+  /* adjustment commands and exclusion list start out empty */
+  mds->ac_list.next = NULL;
+  mds->excluded_wtypes = NULL;
+  mds->nr_excluded = 0;
+}
+
+/* Initialise only the part of libanthy that is used here.
+ * Exits with status 1 when anthy_init_xstr() reports -1. */
+static void
+init_libs(void)
+{
+  if (anthy_init_xstr() == -1) {
+    fprintf (stderr, "failed to init dic lib\n");
+    exit(1);
+  }
+}
+
+/* Entry point.  The script to run is the argument that follows "-f".
+ * With "--help", or without a script, print_usage() is called, which
+ * exits.  progname is set here so that the diagnostics printed with it
+ * elsewhere in this file no longer receive a NULL pointer. */
+int
+main(int argc, char **argv)
+{
+  struct mkdic_stat mds;
+  int i;
+  char *script_fn = NULL;
+  int help_mode = 0;
+
+  progname = (argc > 0 && argv[0]) ? argv[0] : "mkanthydic";
+
+  anthy_init_wtypes();
+  init_libs();
+  init_mds(&mds);
+
+  for (i = 1; i < argc; i++) {
+    char *arg = argv[i];
+    char *prev_arg = argv[i-1];
+    if (!strcmp(arg, "--help")) {
+      help_mode = 1;
+    }
+    if (!strcmp(prev_arg, "-f")) {
+      script_fn = arg;
+    }
+  }
+
+  if (help_mode || !script_fn) {
+    print_usage();
+  }
+
+  return execute_batch(&mds, script_fn);
+}
author	Lorry Tar Creator <lorry-tar-importer@lorry>	2009-02-07 16:32:56 +0000
committer	Lorry Tar Creator <lorry-tar-importer@lorry>	2009-02-07 16:32:56 +0000
commit	a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 (patch)
tree	a966aeee62e69ae3ad13275d07ddb15049b14e0e /mkworddic/mkdic.c
download	anthy-a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0.tar.gz