1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
|
%# -*- mode: ruby; coding: utf-8 -*-
<%
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# Script to generate Ruby data structures used in implementing
# String#unicode_normalize,...
# Constant for the input data directory: first command line argument if
# given, otherwise the default location of the Unicode data files.
InputDataDir = ARGV.first || 'enc/unicode/data'
# Unicode version taken from a path component of the form x.y.z (the last
# such component wins because of the greedy prefix); nil when the path has
# no such component.
unicode_version = InputDataDir.slice(/.*\/(\d+\.\d+\.\d+)(?=\/|\z)/, 1)
# convenience methods
class Integer
  # Convert this code point to the text that represents it inside a
  # double-quoted string of the generated file, taking legibility into account:
  # * above U+FFFF: a \u{...} escape with upper-case hex digits
  # * above U+007F: a \uXXXX escape, zero-padded to four upper-case hex digits
  # * otherwise:    the character itself, with a backslash put in front of
  #                 a backslash or a double quote
  def to_UTF8
    hex_digits = to_s(16).upcase
    return "\\u{#{hex_digits}}" if self > 0xFFFF
    return "\\u#{hex_digits.rjust(4, '0')}" if self > 0x7f

    chr.sub(/[\\\"]/, "\\\\\\\&")
  end
end
module Enumerable
  # Define each_slice only when the running Ruby does not already provide it.
  unless method_defined?(:each_slice)
    # Yield the elements in arrays of n items each; the final array may be
    # shorter. Returns the receiver.
    def each_slice(n)
      chunk = []
      each do |item|
        chunk << item
        next if chunk.size < n
        yield chunk
        chunk = []
      end
      yield chunk unless chunk.empty?
      self
    end
  end
end
class Array
  # Concatenate the to_UTF8 text of every element.
  def to_UTF8
    map { |c| c.to_UTF8 }.join('')
  end

  # Convert an array of Integers (code points) to the contents of a regexp
  # character class and yield it in pieces of n ranges each.
  #
  # The values are sorted and merged into runs of equal or consecutive
  # numbers. A run of one value is written as that character, a run of two
  # as both characters, and a longer run as "first-last".
  def each_regexp_chars(n = 1)
    runs = []
    sort.each do |code|
      tail = runs.last
      if tail && tail[1] + 1 >= code
        tail[1] = code # duplicate or directly adjacent: extend the current run
      else
        runs << [code, code]
      end
    end

    pieces = runs.map do |low, high|
      if high == low
        low.to_UTF8
      elsif high - low == 1
        low.to_UTF8 + high.to_UTF8
      else
        low.to_UTF8 + '-' + high.to_UTF8
      end
    end

    pieces.each_slice(n) { |group| yield group.join('') }
  end
end
# read the file 'CompositionExclusions.txt'
# The first line has to name the file and its version (matched below as
# "# <basename>-<version><ext>"). That version must agree with the one taken
# from the data directory path; when the path contained no version, the
# file's version is adopted as unicode_version.
# The result is the array of code points (Integers) from the data lines.
# NOTE: vpath is not defined in this file; it is assumed to be supplied by
# the code that evaluates this template.
composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt", 'rb') {|f|
  base = Regexp.quote(File.basename(f.path, '.*'))
  ext = Regexp.quote(File.extname(f.path))
  line = f.gets # nil for an empty file; guarded so that the abort below is reached
  version = (line && line[/^# *#{base}-([\d.]+)#{ext}\s*$/, 1]) or
    abort "No file version in #{f.path}: #{line}"
  (unicode_version ||= version) == version or
    abort "Unicode version of directory (#{unicode_version}) and file (#{version}) mismatch"
  # keep only the lines starting with a 4 to 5 character code, as Integers
  f.grep(/^[A-Z0-9]{4,5}/) {|code| code.hex}
}
# Tables filled from 'UnicodeData.txt'; all keys are code points (Integers).
decomposition_table = {} # canonical decompositions (arrays of code points)
kompatible_table = {}    # compatibility decompositions, without the leading <tag>
combining_class = {}     # combining classes, only for classes other than 0
# read the file 'UnicodeData.txt'
vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
  codepoint, name, _, char_class, _, decomposition, *_rest = line.split(";")
  cp = codepoint.hex
  if /^[0-9A-F]/ =~ decomposition
    # the field starts directly with a code point: canonical decomposition
    decomposition_table[cp] = decomposition.split(' ').map { |w| w.hex }
  elsif /^</ =~ decomposition
    # the field starts with a <tag>: compatibility decomposition, tag dropped
    kompatible_table[cp] = decomposition.split(' ')[1..-1].map { |w| w.hex }
  end
  combining_class[cp] = char_class.to_i unless char_class == "0"
  # Entries whose name ends in First> or Last> stand for whole ranges; the
  # tables above cannot represent range data, so only warn about it.
  if name =~ /(First|Last)>$/ and (char_class != "0" or decomposition != "")
    warn "Unexpected: Character range with data relevant to normalization!"
  end
end
# calculate compositions from decompositions
# Start from the canonical decompositions as read from the file (they are
# expanded only further below), drop every entry that must not be recomposed,
# and invert the remainder so that each decomposition array maps back to the
# code point it was read for.
composition_table = decomposition_table.reject do |character, decomposition|
composition_exclusions.member? character or # predefined composition exclusion
decomposition.length<=1 or # Singleton Decomposition
combining_class[character] or # character is not a Starter
combining_class[decomposition.first] # decomposition begins with a character that is not a Starter
end.invert
# recalculate composition_exclusions
# From here on the name stands for every code point with a canonical
# decomposition that did not become a value of composition_table, not only
# for the entries read from CompositionExclusions.txt.
composition_exclusions = decomposition_table.keys - composition_table.values
# code points with a non-zero combining class, plus the last element of
# every decomposition kept in composition_table
accent_array = combining_class.keys + composition_table.keys.collect {|key| key.last}
# first element of every decomposition kept in composition_table
composition_starters = composition_table.keys.collect {|key| key.first}
# every 28th code point from U+AC00 up to U+D7A3; used in the output part for
# the "Hangul syllables with separate trailer" alternative
hangul_no_trailing = []
0xAC00.step(0xD7A3, 28) {|c| hangul_no_trailing << c}
# expand decomposition table values
# Whenever an element has a canonical decomposition of its own, splice that
# decomposition in and look at the same position again, so that nested
# decompositions end up fully expanded.
decomposition_table.each do |key, value|
position = 0
while position < value.length
if decomposition = decomposition_table[value[position]]
# The array may also serve as a key of composition_table (invert above reuses
# the value objects), so the splice is done on a copy.
decomposition_table[key] = value = value.dup # avoid overwriting composition_table key
value[position, 1] = decomposition
else
position += 1
end
end
end
# deal with relationship between canonical and kompatibility decompositions
# For each (now expanded) canonical decomposition, substitute the
# compatibility decompositions of its elements on a copy; if at least one
# substitution took place, store the copy in kompatible_table under the same key.
decomposition_table.each do |key, value|
value = value.dup
expanded = false
position = 0
while position < value.length
if decomposition = kompatible_table[value[position]]
value[position, 1] = decomposition
expanded = true
else
position += 1
end
end
kompatible_table[key] = value if expanded
end
# Repeat until a full pass changes nothing: elements of a value that have a
# kompatible_table entry themselves are replaced by that entry. The block
# evaluates to the new array (truthy) when it rewrote an entry, which makes
# any? stop right there and the while loop start another pass; it evaluates
# to nil when the entry was already fully expanded.
while kompatible_table.any? {|key, value|
expanded = value.map {|v| kompatible_table[v] || v}.flatten
kompatible_table[key] = expanded unless value == expanded
}
end
# generate normalization tables file
%># coding: us-ascii
# frozen_string_literal: true
%# >
# automatically generated by template/unicode_norm_gen.tmpl
module UnicodeNormalize # :nodoc:
accents = "" \
"[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]"
ACCENTS = accents
REGEXP_D_STRING = "#{'' # composition starters and composition exclusions
}" \
"[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]#{accents}*" \
"|#{'' # characters that can be the result of a composition, except composition starters
}" \
"[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]?#{accents}+" \
"|#{'' # precomposed Hangul syllables
}" \
"[\u{AC00}-\u{D7A4}]"
REGEXP_C_STRING = "#{'' # composition exclusions
}" \
"[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]#{accents}*" \
"|#{'' # composition starters and characters that can be the result of a composition
}" \
"[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]?#{accents}+" \
"|#{'' # Hangul syllables with separate trailer
}" \
"[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>][\u11A8-\u11C2]" \
"|#{'' # decomposed Hangul syllables
}" \
"[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?"
REGEXP_K_STRING = "" \
"[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
"<%end%>]"
class_table = {
% combining_class.each do |key, value|
"<%=key.to_UTF8%>"=><%=value%><%=%>,
% end
}
class_table.default = 0
CLASS_TABLE = class_table.freeze
DECOMPOSITION_TABLE = {
% decomposition_table.each do |key, value|
"<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,
% end
}.freeze
KOMPATIBLE_TABLE = {
% kompatible_table.each do |key, value|
"<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,
% end
}.freeze
COMPOSITION_TABLE = {
% composition_table.each do |key, value|
"<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,
% end
}.freeze
end
|