storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/generate_uca_table.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286

#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013  Kouhei Sutou <kou@clear-code.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Library General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Library General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

require "optparse"

$LOAD_PATH.unshift(File.dirname(__FILE__))
require "parser"

# Defaults for the command line options. The three split flags are read
# later, when characters that share a weight are grouped.
@suffix = ""
@split_small_kana_p = false
@split_kana_with_voiced_sound_mark_p = false
@split_kana_with_semi_voiced_sound_mark_p = false

option_parser = OptionParser.new
# Document the single positional argument in the usage banner.
option_parser.banner += " MYSQL_SOURCE/strings/ctype-uca.c"

# --suffix is inserted into the generated C identifiers and include guard.
option_parser.on("--suffix=SUFFIX", "Add SUFFIX to names") do |suffix|
  @suffix = suffix
end

# For each --[no-]split-* option the last description line shows the
# default value in --help. It must interpolate the "_p" variable that
# actually holds the default; an unset variable would render as "()".
option_parser.on("--[no-]split-small-kana",
                 "Split small hiragana (katakana) and " +
                   "large hiragana (katakana)",
                 "(#{@split_small_kana_p})") do |boolean|
  @split_small_kana_p = boolean
end

option_parser.on("--[no-]split-kana-with-voiced-sound-mark",
                 "Split hiragana (katakana) with voiced sound mark",
                 "(#{@split_kana_with_voiced_sound_mark_p})") do |boolean|
  @split_kana_with_voiced_sound_mark_p = boolean
end

option_parser.on("--[no-]split-kana-with-semi-voiced-sound-mark",
                 "Split hiragana (katakana) with semi-voiced sound mark",
                 "(#{@split_kana_with_semi_voiced_sound_mark_p})") do |boolean|
  @split_kana_with_semi_voiced_sound_mark_p = boolean
end

# Parse the options in place; on an invalid option print the error
# message and exit with a failure status.
begin
  option_parser.parse!(ARGV)
rescue OptionParser::Error => error
  puts(error)
  exit(false)
end

# Exactly one positional argument (the ctype-uca.c path) must remain;
# otherwise show the usage text and exit with a failure status.
unless ARGV.size == 1
  puts(option_parser)
  exit(false)
end

ctype_uca_c_path = ARGV.first

# Feed the opened source file to the parser; the block form of File.open
# closes the file afterwards.
parser = CTypeUCAParser.new
File.open(ctype_uca_c_path) { |ctype_uca_c| parser.parse(ctype_uca_c) }

# Small-sized kana: hiragana, full-width katakana and half-width katakana.
SMALL_KANAS = %w[
  ぁ ぃ ぅ ぇ ぉ
  っ
  ゃ ゅ ょ
  ゎ
  ァ ィ ゥ ェ ォ
  ッ
  ャ ュ ョ
  ヮ
  ｧ ｨ ｩ ｪ ｫ
  ｯ
  ｬ ｭ ｮ
]

# Returns true when the :utf8 entry of +character+ is listed in
# SMALL_KANAS, false otherwise.
def small_kana?(character)
  utf8 = character[:utf8]
  SMALL_KANAS.include?(utf8)
end

# Hiragana and katakana that carry a voiced sound mark.
KANA_WITH_VOICED_SOUND_MARKS = %w[
  が ぎ ぐ げ ご
  ざ じ ず ぜ ぞ
  だ ぢ づ で ど
  ば び ぶ べ ぼ
  ガ ギ グ ゲ ゴ
  ザ ジ ズ ゼ ゾ
  ダ ヂ ヅ デ ド
  バ ビ ブ ベ ボ
]

# Returns true when the :utf8 entry of +character+ is listed in
# KANA_WITH_VOICED_SOUND_MARKS, false otherwise.
def kana_with_voiced_sound_mark?(character)
  utf8 = character[:utf8]
  KANA_WITH_VOICED_SOUND_MARKS.include?(utf8)
end

# Hiragana and katakana that carry a semi-voiced sound mark.
KANA_WITH_SEMI_VOICED_SOUND_MARKS = %w[
  ぱ ぴ ぷ ぺ ぽ
  パ ピ プ ペ ポ
]

# Returns true when the :utf8 entry of +character+ is listed in
# KANA_WITH_SEMI_VOICED_SOUND_MARKS, false otherwise.
def kana_with_semi_voiced_sound_mark?(character)
  utf8 = character[:utf8]
  KANA_WITH_SEMI_VOICED_SOUND_MARKS.include?(utf8)
end

# Partitions +characters+ into sub-groups according to the enabled
# --split-* options and returns the sub-groups as an array of arrays.
#
# A character is placed in the first category whose option is enabled and
# whose predicate matches (small kana, then voiced, then semi-voiced);
# everything else lands in a shared :other group. Groups come back in the
# order in which their category was first encountered.
def split_characters(characters)
  buckets = characters.group_by do |character|
    next :small_kana if @split_small_kana_p && small_kana?(character)

    if @split_kana_with_voiced_sound_mark_p &&
        kana_with_voiced_sound_mark?(character)
      next :kana_with_voiced_sound_mark
    end

    if @split_kana_with_semi_voiced_sound_mark_p &&
        kana_with_semi_voiced_sound_mark?(character)
      next :kana_with_semi_voiced_sound_mark
    end

    :other
  end
  buckets.values
end

# Flatten the parser's per-weight character lists into one list of groups,
# splitting each list further according to the --split-* options.
grouped_characters = []
parser.weight_based_characters.each do |_weight, same_weight_characters|
  grouped_characters.push(*split_characters(same_weight_characters))
end

# Range spanning Unicode.from_utf8("Α") through Unicode.from_utf8("Ω").
# NOTE(review): Unicode is defined outside this file (presumably by the
# required "parser"); the values are assumed to be comparable with the
# :code_point entries of characters - confirm.
GREEK_CAPITAL_UNICODE_RANGE =
  (Unicode.from_utf8("Α")..Unicode.from_utf8("Ω"))

# Returns the first entry of +characters+ whose :code_point is covered by
# GREEK_CAPITAL_UNICODE_RANGE, or nil when there is none.
def find_greek_capital_character(characters)
  characters.detect do |candidate|
    GREEK_CAPITAL_UNICODE_RANGE.cover?(candidate[:code_point])
  end
end

# Chooses the character that the other members of +characters+ are
# normalized to.
#
# The choice is driven by the :utf8 value of the first character:
# specific leading characters select the last or the second member, a
# leading small hiragana selects the second member unless small kana are
# being split, and any other group prefers a Greek capital letter when one
# is present. Whenever the selection yields nothing, the first character
# is used.
def find_representative_character(characters)
  preferred =
    case characters.first[:utf8]
    when "⺄", "⺇", "⺈", "⺊", "⺌", "⺗"
      characters.last
    when "⺜", "⺝", "⺧", "⺫", "⺬", "⺮", "⺶", "⺻", "⺼", "⺽"
      characters[1]
    when "⻆", "⻊", "⻏", "⻑", "⻕", "⻗", "⻝", "⻡", "⻣", "⻤"
      characters.last
    when "⻱", "⼀", "⼆", "⼈"
      characters[1]
    when "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "っ", "ゃ", "ゅ", "ょ", "ゎ"
      # With small kana split into their own group there is no large
      # counterpart to prefer, so fall through to the first character.
      @split_small_kana_p ? nil : characters[1]
    else
      find_greek_capital_character(characters)
    end
  preferred || characters.first
end

# Build the normalization pages. A page is keyed by the upper bits of a
# code point (code_point >> 8) and holds 256 slots indexed by the low byte;
# a slot stores the code point of the group's representative character and
# stays nil for characters that are not normalized.
target_pages = {}
grouped_characters.each do |characters|
  # A group with a single member has nothing to normalize.
  next if characters.size == 1

  representative = find_representative_character(characters)
  normalized_code_point = representative[:code_point]
  characters.each do |character|
    # The representative itself is left unmapped.
    next if character == representative

    code_point = character[:code_point]
    slots = (target_pages[code_point >> 8] ||= Array.new(256))
    slots[code_point & 0xff] = normalized_code_point
  end
end

# Array of [page, slots] pairs in ascending page order.
sorted_target_pages = target_pages.sort_by { |page, _code_points| page }


# Reduce the given path to "<directory>/strings/ctype-uca.c", i.e. drop
# everything in front of the directory that directly contains strings/.
# A path that does not match the pattern is left unchanged.
normalized_ctype_uca_c_path =
  ctype_uca_c_path.sub(/\A.*\/([^\/]+\/strings\/ctype-uca\.c)\z/, "\\1")

# Upper-cased suffix used in the include guard macro name.
@suffix_upper_case = @suffix.upcase

# Emit the license comment, the include guard and the <stdint.h> include
# of the generated C header on standard output.
puts(<<-HEADER)
/*
  Copyright(C) 2013  Kouhei Sutou <kou@clear-code.com>

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Library General Public
  License as published by the Free Software Foundation; version 2
  of the License.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Library General Public License for more details.

  You should have received a copy of the GNU Library General Public
  License along with this library; if not, write to the Free
  Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
  MA 02110-1301, USA

  This file uses normalization table defined in
  #{normalized_ctype_uca_c_path}.
  The following is the header of the file:

    Copyright (c) 2004, 2011, Oracle and/or its affiliates. All rights reserved.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; version 2
    of the License.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
    MA 02110-1301, USA

    UCA (Unicode Collation Algorithm) support.
    Written by Alexander Barkov <bar@mysql.com>
*/

#ifndef MYSQL_UCA#{@suffix_upper_case}_H
#define MYSQL_UCA#{@suffix_upper_case}_H

#include <stdint.h>
HEADER

# Returns the C identifier of the table for +page+: "unicode_ci", the
# configured suffix, "_page_" and the page number as at least two
# lower-case hexadecimal digits.
def page_name(page)
  template = "unicode_ci#{@suffix}_page_%02x"
  format(template, page)
end

# Emit one C array per page. Every slot is printed as a hexadecimal code
# point, eight per line; a slot without a normalization target is printed
# as its own code point (page base plus slot index).
sorted_target_pages.each do |page, characters|
  puts(<<-PAGE_HEADER)

static uint32_t #{page_name(page)}[] = {
PAGE_HEADER
  page_base = page << 8
  formatted_code_points = characters.each_with_index.map do |normalized, low_code|
    "0x%05x" % (normalized || (page_base + low_code))
  end
  lines = formatted_code_points.each_slice(8).map do |row|
    "  " + row.join(", ")
  end
  puts(lines.join(",\n"))
  puts(<<-PAGE_FOOTER)
};
PAGE_FOOTER
end

# Emit the top-level table that maps a page number to its page array.
# Pages without any normalized character are written as NULL.
puts(<<-PAGES_HEADER)

static uint32_t *unicode_ci#{@suffix}_table[256] = {
PAGES_HEADER

pages = Array.new(256, "NULL")
sorted_target_pages.each do |page, _characters|
  pages[page] = page_name(page)
end

# Two right-aligned entries per output line.
lines = pages.each_slice(2).map do |pair|
  "  " + pair.map { |entry| "%19s" % entry }.join(", ")
end
puts(lines.join(",\n"))

# Close the table, then close the include guard after a blank line.
puts(<<-PAGES_AND_FILE_FOOTER)
};

#endif
PAGES_AND_FILE_FOOTER