mkworddic/mkudic.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

/*
 * 用例辞書を作る
 *
 * Copyright (C) 2003-2005 TABATA Yusuke
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#include <anthy/matrix.h>
#include "mkdic.h"

#define LINE_LEN 256

/* 用例 */
struct use_case {
  int id[2];
  struct use_case *next;
};

/* 用例辞書 */
struct uc_dict {
  /* 用例リスト */
  struct use_case uc_head;
  int nr_ucs;
};

/* 用例定義の行から単語のidを求める
 */
static int
get_id_from_word_line(char *buf)
{
  char yomi[LINE_LEN];
  char okuri[LINE_LEN];
  char wt[LINE_LEN];
  char kanji[LINE_LEN];
  int res, id;
  xstr *xs;

  res = sscanf(buf, "%s %s %s %s", yomi, okuri, wt, kanji);
  if (res != 4) {
    return -1;
  }
  xs = anthy_cstr_to_xstr(kanji, 0);
  id = anthy_xstr_hash(xs);
  anthy_free_xstr(xs);
  return id;
}

static void
commit_uc(struct uc_dict *dict, int x, int y)
{
  struct use_case *uc;
  if (x < 0 || y < 0) {
    return ;
  }
  uc = malloc(sizeof(struct use_case));
  uc->id[0] = x;
  uc->id[1] = y;
  /**/
  uc->next = dict->uc_head.next;
  dict->uc_head.next = uc;
  dict->nr_ucs ++;
}

/* 用例データベースを作る */
struct uc_dict *
create_uc_dict(void)
{
  struct uc_dict *dict = malloc(sizeof(struct uc_dict));

  dict->uc_head.next = NULL;
  dict->nr_ucs = 0;

  return dict;
}

/* 用例ファイルを読み込む */
void
read_uc_file(struct uc_dict *dict, const char *fn)
{
  char buf[LINE_LEN];
  FILE *uc_file;
  int off, base = 0, cur;
  int line_number = 0;

  uc_file = fopen(fn, "r");
  if (!uc_file) {
    return ;
  }

  /* off=0      : 最初の単語
   * off=1,2..n : それと関係ある単語
   */
  off = 0;
  while (fgets(buf, LINE_LEN, uc_file)) {
    /**/
    line_number ++;
    /**/
    if (buf[0] == '#') {
      /* コメント */
      continue;
    }
    if (buf[0] == '-') {
      /* 区切り記号 */
      off = 0;
      continue;
    }
    cur = get_id_from_word_line(buf);
    if (cur == -1) {
      fprintf(stderr, "Invalid line(%d):%s\n", line_number, buf);
    }
    /**/
    if (off == 0) {
      /* 一つめの項目 */
      base = cur;
    } else {
      /* 二つめ以降の項目 */
      commit_uc(dict, cur, base);
    }
    off ++;
  }
}

/* 用例辞書をファイルに書き出す */
void
make_ucdict(FILE *uc_out, struct uc_dict *dict)
{
  struct use_case *uc;
  struct sparse_matrix *sm;
  struct matrix_image *mi;
  int i;
  /* 疎行列に詰め込む */
  sm = anthy_sparse_matrix_new();
  if (dict) {
    for (uc = dict->uc_head.next; uc; uc = uc->next) {
      anthy_sparse_matrix_set(sm, uc->id[0], uc->id[1], 1, NULL);
    }
  }
  anthy_sparse_matrix_make_matrix(sm);
  /* 疎行列のイメージを作成してファイルに書き出す */
  mi = anthy_matrix_image_new(sm);
  for (i = 0; i < mi->size; i++) {
    write_nl(uc_out, mi->image[i]);
  }
  if (dict) {
    printf("udic: %d use examples.\n", dict->nr_ucs);
  } else {
    printf("udic: no use examples.\n");
  }

}