1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
# utf8pat.rb
#
# Generate utf8-encoded ragel or colm patterns for unicode code character sets.
#
# The dict structure is a hash mapping upper ends of ranges to a hash that
# contains:
# 1. The lower end.
# 2. A dict for the tail of the pattern.
#
# We index by the upper end of the range because we assume all unicode points
# to be read in increasing order and we check for extension as we add points by
# looking up the upper end.
#
# dict: { upper => { :lower => lower, :dict => dict } }
#
#
# utf8pat.rb <category-regex> <indentation-level> <unicode-data-file>
#
#
# ruby utf8pat.rb 'L[lutmo]' 1 UnicodeData.txt
target_category = Regexp.new( ARGV[0] )
indentation_level = ARGV[1].to_i
unicode_data = ARGV[2]
def utf8_enc( n )
if n <= 0x7F
return [ n ]
elsif n <= 0x7FF
return [
0xC0 | (n >> 6),
0x80 | (n & 0x3F)
]
elsif n <= 0xFFFF
return [
0xE0 | (n >> 12),
0x80 | (n >> 6) & 0x3F,
0x80 | n & 0x3F
]
elsif n <= 0x10ffff
return [
0xF0 | (n >> 18),
0x80 | (n >> 12) & 0x3F,
0x80 | (n >> 6) & 0x3F,
0x80 | n & 0x3F
]
end
end
def add_to_dict( dict, utf8val )
return if utf8val.size == 0
nk = utf8val[0]
if utf8val.size == 1 && nk > 0 && dict.key?( nk - 1 )
dict[nk] = dict[nk - 1]
dict.delete( nk - 1 )
else
if ! dict.key?( utf8val[0] )
dict[nk] = { :lower => nk, :dict => {} }
end
end
add_to_dict( dict[nk][:dict], utf8val[1..-1] )
end
def compare( dict1, dict2 )
# First check if we have equal size. If so, iterate dict1 and endsure key
# is present in dict2. Then check lower end of the range matches and
# recurse on the tails.
return false if dict1.size != dict2.size
dict1.each do |key, value|
return false if !dict2.key?( key )
return false if value[:lower] != dict2[key][:lower]
return false if !compare( value[:dict], dict2[key][:dict] )
end
return true
end
def merge( dict )
previous = nil
dict.each do |key, value|
# First recurse, ensuring dict is merged.
merge( value[:dict] )
if !previous.nil? && ( previous + 1 ) == value[:lower] &&
compare( dict[previous][:dict], value[:dict] )
# The previous and cur entries make a contiguous range AND the two
# tails are identical patterns.
value[:lower] = dict[previous][:lower]
dict[previous][:lower] = -1
dict.delete( previous )
end
previous = key
end
end
def indent( level )
for l in 1..level
print "\t"
end
end
def print_level( level, dict )
first = true
dict.each do |key, value|
print " |\n" if !first
indent( level )
if value[:lower] != key
print "0x%02X .. " % value[:lower]
end
print "0x%02X" % key
if value[:dict].size > 0
print " (\n"
print_level( level + 1, value[:dict] )
indent( level )
print ")"
end
first = false
end
print "\n"
end
file = open( unicode_data )
dict = {}
file.each_line do |line|
next if line =~ /^[ \t\v]*#/;
next if line =~ /^[ \t\v]*$/;
range, description, category = line.split(/;/)
if category =~ target_category
add_to_dict( dict, utf8_enc( range.hex ) )
end
end
merge( dict )
print_level( indentation_level, dict )
|