diff options
Diffstat (limited to 'storage/mroonga/vendor/groonga/lib/nfkc.rb')
-rwxr-xr-x | storage/mroonga/vendor/groonga/lib/nfkc.rb | 989 |
1 files changed, 734 insertions, 255 deletions
diff --git a/storage/mroonga/vendor/groonga/lib/nfkc.rb b/storage/mroonga/vendor/groonga/lib/nfkc.rb index 1a134384c80..9ad25bba9ff 100755 --- a/storage/mroonga/vendor/groonga/lib/nfkc.rb +++ b/storage/mroonga/vendor/groonga/lib/nfkc.rb @@ -1,7 +1,7 @@ #!/usr/bin/env ruby # -*- coding: utf-8 -*- # -# Copyright(C) 2010 Brazil +# Copyright(C) 2010-2016 Brazil # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -16,80 +16,701 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -$KCODE = 'u' - CUSTOM_RULE_PATH = 'nfkc-custom-rules.txt' -def gen_bc(file, hash, level) - bl = ' ' * (level * 2) - h2 = {} - hash.each{|key,val| - head = key[0] - rest = key[1..-1] - if h2[head] - h2[head][rest] = val +class SwitchGenerator + def initialize(unicode_version, output) + @unicode_version = unicode_version + @output = output + end + + def generate(bc, decompose_map, compose_map) + STDERR.puts('generating char type code..') + generate_blockcode_char_type(bc) + STDERR.puts('generating decompose code..') + generate_decompose(decompose_map) + STDERR.puts('generating compose code..') + generate_compose(compose_map) + end + + private + def generate_blockcode_char_type(bc) + @output.puts(<<-HEADER) + +grn_char_type +grn_nfkc#{@unicode_version}_char_type(const unsigned char *str) +{ + HEADER + + @lv = 0 + gen_bc(bc, 0) + + @output.puts(<<-FOOTER) + return -1; +} + FOOTER + end + + def gen_bc(hash, level) + bl = ' ' * (level * 2) + h2 = {} + hash.each{|key,val| + key = key.dup + key.force_encoding("ASCII-8BIT") + head = key.bytes[0] + rest = key[1..-1] + if h2[head] + h2[head][rest] = val + else + h2[head] = {rest => val} + end + } + if h2.size < 3 + h2.keys.sort.each{|k| + if (0x80 < k) + @output.printf("#{bl}if (str[#{level}] < 0x%02X) { return #{@lv}; }\n", k) + end + h = h2[k] + if h.keys.join =~ /^\x80*$/n + @lv, = h.values + else + @output.printf("#{bl}if (str[#{level}] == 0x%02X) {\n", k) + gen_bc(h, level + 1) + @output.puts bl + '}' + end + } + @output.puts bl + "return #{@lv};" else - h2[head] = {rest => val} + @output.puts bl + "switch (str[#{level}]) {" + lk = 0x80 + br = true + h2.keys.sort.each{|k| + if (lk < k) + for j in lk..k-1 + @output.printf("#{bl}case 0x%02X :\n", j) + end + br = false + end + unless br + @output.puts bl + " return #{@lv};" + @output.puts bl + ' break;' + end + h = h2[k] + @output.printf("#{bl}case 0x%02X :\n", k) + if h.keys.join =~ /^\x80*$/n + @lv, = h.values + br = false + else + gen_bc(h, level + 1) + @output.puts bl + ' break;' + br = true + end + lk = k + 1 + } + @output.puts bl + 'default :' + @output.puts bl + " return #{@lv};" + @output.puts bl + ' break;' + @output.puts bl + '}' end - } - if h2.size < 3 - h2.keys.sort.each{|k| - if (0x80 < k) - file.printf("#{bl}if (str[#{level}] < 0x%02X) { return #{$lv}; }\n", k) + end + + def generate_decompose(hash) + @output.puts(<<-HEADER) + +const char * +grn_nfkc#{@unicode_version}_decompose(const unsigned char *str) +{ + HEADER + + gen_decompose(hash, 0) + + @output.puts(<<-FOOTER) + return 0; +} + FOOTER + end + + def gen_decompose(hash, level) + bl = ' ' * ((level + 0) * 2) + if hash[''] + dst = '' + hash[''].each_byte{|b| dst << format('\x%02X', b)} + @output.puts "#{bl}return \"#{dst}\";" + hash.delete('') + end + return if hash.empty? + h2 = {} + hash.each{|key,val| + key = key.dup + key.force_encoding("ASCII-8BIT") + head = key.bytes[0] + rest = key[1..-1] + if h2[head] + h2[head][rest] = val + else + h2[head] = {rest => val} end - h = h2[k] - if h.keys.join =~ /^\x80*$/ - $lv, = h.values + } + if h2.size == 1 + h2.each{|key,val| + @output.printf("#{bl}if (str[#{level}] == 0x%02X) {\n", key) + gen_decompose(val, level + 1) + @output.puts bl + '}' + } + else + @output.puts "#{bl}switch (str[#{level}]) {" + h2.keys.sort.each{|k| + @output.printf("#{bl}case 0x%02X :\n", k) + gen_decompose(h2[k], level + 1) + @output.puts("#{bl} break;") + } + @output.puts bl + '}' + end + end + + def generate_compose(compose_map) + @output.puts(<<-HEADER) + +const char * +grn_nfkc#{@unicode_version}_compose(const unsigned char *prefix, const unsigned char *suffix) +{ + HEADER + suffix = {} + compose_map.each{|src,dst| + chars = src.chars + if chars.size != 2 + STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}" + end + s = chars.pop + if suffix[s] + suffix[s][chars.join] = dst else - file.printf("#{bl}if (str[#{level}] == 0x%02X) {\n", k) - gen_bc(file, h, level + 1) - file.puts bl + '}' + suffix[s] = {chars.join=>dst} end } - file.puts bl + "return #{$lv};" - else - file.puts bl + "switch (str[#{level}]) {" - lk = 0x80 - br = true - h2.keys.sort.each{|k| - if (lk < k) - for j in lk..k-1 - file.printf("#{bl}case 0x%02X :\n", j) + gen_compose_sub(suffix, 0) + @output.puts(<<-FOOTER) + return 0; +} + FOOTER + end + + def gen_compose_sub2(hash, level, indent) + bl = ' ' * ((level + indent + 0) * 2) + if hash[''] + @output.print "#{bl}return \"" + hash[''].each_byte{|b| @output.printf('\x%02X', b)} + @output.puts "\";" + hash.delete('') + end + return if hash.empty? + + h2 = {} + hash.each{|key,val| + key = key.dup + key.force_encoding("ASCII-8BIT") + head = key.bytes[0] + rest = key[1..-1] + if h2[head] + h2[head][rest] = val + else + h2[head] = {rest => val} + end + } + + if h2.size == 1 + h2.each{|key,val| + @output.printf("#{bl}if (prefix[#{level}] == 0x%02X) {\n", key) + gen_compose_sub2(val, level + 1, indent) + @output.puts bl + '}' + } + else + @output.puts "#{bl}switch (prefix[#{level}]) {" + h2.keys.sort.each{|k| + @output.printf("#{bl}case 0x%02X :\n", k) + gen_compose_sub2(h2[k], level + 1, indent) + @output.puts("#{bl} break;") + } + @output.puts bl + '}' + end + end + + def gen_compose_sub(hash, level) + bl = ' ' * ((level + 0) * 2) + if hash[''] + gen_compose_sub2(hash[''], 0, level) + hash.delete('') + end + return if hash.empty? + h2 = {} + hash.each{|key,val| + key = key.dup + key.force_encoding("ASCII-8BIT") + head = key.bytes[0] + rest = key[1..-1] + if h2[head] + h2[head][rest] = val + else + h2[head] = {rest => val} + end + } + if h2.size == 1 + h2.each{|key,val| + @output.printf("#{bl}if (suffix[#{level}] == 0x%02X) {\n", key) + gen_compose_sub(val, level + 1) + @output.puts bl + '}' + } + else + @output.puts "#{bl}switch (suffix[#{level}]) {" + h2.keys.sort.each{|k| + @output.printf("#{bl}case 0x%02X :\n", k) + gen_compose_sub(h2[k], level + 1) + @output.puts("#{bl} break;") + } + @output.puts bl + '}' + end + end +end + +class TableGenerator < SwitchGenerator + private + def name_prefix + "grn_nfkc#{@unicode_version}_" + end + + def table_name(type, common_bytes) + suffix = common_bytes.collect {|byte| "%02x" % byte}.join("") + "#{name_prefix}#{type}_table_#{suffix}" + end + + def function_name(type) + "#{name_prefix}#{type}" + end + + def generate_char_convert_tables(type, return_type, byte_size_groups) + if return_type.end_with?("*") + space = "" + else + space = " " + end + byte_size_groups.keys.sort.each do |common_bytes| + chars = byte_size_groups[common_bytes] + lines = [] + all_values = [] + last_bytes = chars.collect {|char| char.bytes.last} + last_bytes.min.step(last_bytes.max).each_slice(8) do |slice| + values = slice.collect do |last_byte| + char = (common_bytes + [last_byte]).pack("c*") + char.force_encoding("UTF-8") + yield(char) + end + all_values.concat(values) + lines << (" " + values.join(", ")) + end + + next if all_values.uniq.size == 1 + + @output.puts(<<-TABLE_HEADER) + +static #{return_type}#{space}#{table_name(type, common_bytes)}[] = { + TABLE_HEADER + @output.puts(lines.join(",\n")) + @output.puts(<<-TABLE_FOOTER) +}; + TABLE_FOOTER + end + end + + def generate_char_convert_function(type, + argument_list, + char_variable, + default, + return_type, + byte_size_groups, + options={}) + modifier = options[:internal] ? "static inline " : "" + @output.puts(<<-HEADER) + +#{modifier}#{return_type} +#{function_name(type)}(#{argument_list}) +{ + HEADER + + prev_common_bytes = [] + prev_n_common_bytes = 0 + first_group = true + byte_size_groups.keys.sort.each do |common_bytes| + chars = byte_size_groups[common_bytes] + chars_bytes = chars.collect(&:bytes).sort + min = chars_bytes.first.last + max = chars_bytes.last.last + n_common_bytes = 0 + if common_bytes.empty? + indent = " " + yield(:no_common_bytes, indent, chars, chars_bytes) + else + if first_group + @output.puts(<<-BODY) + { + BODY + end + + found_different_byte = false + common_bytes.each_with_index do |common_byte, i| + unless found_different_byte + if prev_common_bytes[i] == common_byte + n_common_bytes += 1 + next + end + found_different_byte = true + end + indent = " " * i + # p [i, prev_common_bytes.collect{|x| "%#04x" % x}, common_bytes.collect{|x| "%#04x" % x}, "%#04x" % common_byte, n_common_bytes, prev_n_common_bytes] + # TODO: The following code may be able to be simplified. + if prev_common_bytes[i].nil? + # p nil + @output.puts(<<-BODY) + #{indent}switch (#{char_variable}[#{i}]) { + BODY + elsif i < prev_n_common_bytes + # p :prev + @output.puts(<<-BODY) + #{indent} default : + #{indent} break; + #{indent} } + #{indent} break; + BODY + elsif n_common_bytes < prev_n_common_bytes + # p :common_prev + @output.puts(<<-BODY) + #{indent}switch (#{char_variable}[#{i}]) { + BODY + else + # p :else + prev_common_bytes.size.downto(common_bytes.size + 1) do |j| + sub_indent = " " * (j - 1) + @output.puts(<<-BODY) + #{indent}#{sub_indent}default : + #{indent}#{sub_indent} break; + #{indent}#{sub_indent}} + #{indent}#{sub_indent}break; + BODY + end + end + @output.puts(<<-BODY) + #{indent}case #{"%#04x" % common_byte} : + BODY + end + + n = chars_bytes.first.size - 1 + indent = " " + (" " * common_bytes.size) + yield(:have_common_bytes, indent, chars, chars_bytes, n, common_bytes) + end + + prev_common_bytes = common_bytes + prev_n_common_bytes = n_common_bytes + first_group = false + end + + # p [prev_common_bytes.collect{|x| "%#04x" % x}, prev_n_common_bytes] + + (prev_common_bytes.size - 1).step(0, -1) do |i| + indent = " " * i + @output.puts(<<-BODY) + #{indent}default : + #{indent} break; + #{indent}} + BODY + if i > 0 + @output.puts(<<-BODY) + #{indent}break; + BODY + end + end + + @output.puts(<<-FOOTER) + } + + return #{default}; +} + FOOTER + end + + def generate_char_converter(type, + function_type, + char_map, + default, + return_type, + options={}, + &converter) + byte_size_groups = char_map.keys.group_by do |from| + bytes = from.bytes + bytes[0..-2] + end + + generate_char_convert_tables(type, + return_type, + byte_size_groups, + &converter) + + char_variable = "utf8" + generate_char_convert_function(function_type, + "const unsigned char *#{char_variable}", + char_variable, + default, + return_type, + byte_size_groups, + options) do |state, *args| + case state + when :no_common_bytes + indent, chars, chars_bytes = args + if chars.size == 1 + char = chars[0] + char_byte = chars_bytes.first.first + value = yield(char) + @output.puts(<<-BODY) +#{indent}if (#{char_variable}[0] < 0x80) { +#{indent} if (#{char_variable}[0] == #{"%#04x" % char_byte}) { +#{indent} return #{value}; +#{indent} } else { +#{indent} return #{default}; +#{indent} } +#{indent}} else { + BODY + else + min = chars_bytes.first.first + max = chars_bytes.last.first + @output.puts(<<-BODY) +#{indent}if (#{char_variable}[0] < 0x80) { +#{indent} if (#{char_variable}[0] >= #{"%#04x" % min} && +#{indent} #{char_variable}[0] <= #{"%#04x" % max}) { +#{indent} return #{table_name(type, [])}[#{char_variable}[0] - #{"%#04x" % min}]; +#{indent} } else { +#{indent} return #{default}; +#{indent} } +#{indent}} else { + BODY + end + when :have_common_bytes + indent, chars, chars_bytes, n, common_bytes = args + if chars.size == 1 + char = chars[0] + char_byte = chars_bytes.first.last + value = yield(char) + @output.puts(<<-BODY) +#{indent}if (#{char_variable}[#{n}] == #{"%#04x" % char_byte}) { +#{indent} return #{value}; +#{indent}} +#{indent}break; + BODY + else + sorted_chars = chars.sort + min = chars_bytes.first.last + max = chars_bytes.last.last + all_values = (min..max).collect do |last_byte| + char = (common_bytes + [last_byte]).pack("c*") + char.force_encoding("UTF-8") + yield(char) + end + if all_values.uniq.size == 1 + value = all_values.first + else + value = "#{table_name(type, common_bytes)}[#{char_variable}[#{n}] - #{"%#04x" % min}]" + end + last_n_bits_for_char_in_utf8 = 6 + max_n_chars_in_byte = 2 ** last_n_bits_for_char_in_utf8 + if all_values.size == max_n_chars_in_byte + @output.puts(<<-BODY) +#{indent}return #{value}; + BODY + else + @output.puts(<<-BODY) +#{indent}if (#{char_variable}[#{n}] >= #{"%#04x" % min} && +#{indent} #{char_variable}[#{n}] <= #{"%#04x" % max}) { +#{indent} return #{value}; +#{indent}} +#{indent}break; + BODY + end end - br = false end - unless br - file.puts bl + " return #{$lv};" - file.puts bl + ' break;' + end + end + + def generate_blockcode_char_type(block_codes) + default = "GRN_CHAR_OTHERS" + + char_types = {} + current_type = default + prev_char = nil + block_codes.keys.sort.each do |char| + type = block_codes[char] + if current_type != default + prev_code_point = prev_char.codepoints[0] + code_point = char.codepoints[0] + (prev_code_point...code_point).each do |target_code_point| + target_char = [target_code_point].pack("U*") + char_types[target_char] = current_type + end end - h = h2[k] - file.printf("#{bl}case 0x%02X :\n", k) - if h.keys.join =~ /^\x80*$/ - $lv, = h.values - br = false + current_type = type + prev_char = char + end + unless current_type == default + raise "TODO: Consider the max unicode character" + max_unicode_char = "\u{10ffff}" + (prev_char..max_unicode_char).each do |target_char| + char_types[target_char] = current_type + end + end + + generate_char_converter("char_type", + "char_type", + char_types, + default, + "grn_char_type") do |char| + char_types[char] || default + end + end + + def generate_decompose(decompose_map) + default = "NULL" + generate_char_converter("decompose", + "decompose", + decompose_map, + default, + "const char *") do |from| + to = decompose_map[from] + if to + escaped_value = to.bytes.collect {|char| "\\x%02x" % char}.join("") + "\"#{escaped_value}\"" else - gen_bc(file, h, level + 1) - file.puts bl + ' break;' - br = true + default end - lk = k + 1 - } - file.puts bl + 'default :' - file.puts bl + " return #{$lv};" - file.puts bl + ' break;' - file.puts bl + '}' + end + end + + def generate_compose(compose_map) + # require "pp" + # p compose_map.size + # pp compose_map.keys.group_by {|x| x.chars[1]}.size + # pp compose_map.keys.group_by {|x| x.chars[1]}.collect {|k, vs| [k, k.codepoints, vs.size, vs.group_by {|x| x.chars[0].bytesize}.collect {|k2, vs2| [k2, vs2.size]}]} + # pp compose_map.keys.group_by {|x| x.chars[0].bytesize}.collect {|k, vs| [k, vs.size]} + # pp compose_map + + suffix_char_map = {} + compose_map.each do |source, destination| + chars = source.chars + if chars.size != 2 + STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}" + return + end + prefix, suffix = chars + suffix_char_map[suffix] ||= {} + suffix_char_map[suffix][prefix] = destination + end + + suffix_char_map.each do |suffix, prefix_char_map| + suffix_bytes = suffix.bytes.collect {|byte| "%02x" % byte}.join("") + default = "NULL" + generate_char_converter("compose_prefix_#{suffix_bytes}", + "compose_prefix_#{suffix_bytes}", + prefix_char_map, + default, + "const char *", + :internal => true) do |prefix| + to = prefix_char_map[prefix] + if to + escaped_value = to.bytes.collect {|char| "\\x%02x" % char}.join("") + "\"#{escaped_value}\"" + else + default + end + end + end + + + char_variable = "suffix_utf8" + argument_list = + "const unsigned char *prefix_utf8, " + + "const unsigned char *#{char_variable}" + default = "NULL" + byte_size_groups = suffix_char_map.keys.group_by do |from| + bytes = from.bytes + bytes[0..-2] + end + generate_char_convert_function("compose", + argument_list, + char_variable, + default, + "const char *", + byte_size_groups) do |type, *args| + case type + when :no_common_bytes + indent, chars, chars_bytes = args + @output.puts(<<-BODY) +#{indent}switch (#{char_variable}[0]) { + BODY + chars.each do |char| + suffix_bytes = char.bytes.collect {|byte| "%02x" % byte}.join("") + type = "compose_prefix_#{suffix_bytes}" + @output.puts(<<-BODY) +#{indent}case #{"%#04x" % char.bytes.last} : +#{indent} return #{function_name(type)}(prefix_utf8); + BODY + end + @output.puts(<<-BODY) +#{indent}default : +#{indent} return #{default}; +#{indent}} +#{indent}break; + BODY + when :have_common_bytes + indent, chars, chars_bytes, n, common_bytes = args + @output.puts(<<-BODY) +#{indent}switch (#{char_variable}[#{n}]) { + BODY + chars.each do |char| + suffix_bytes = char.bytes.collect {|byte| "%02x" % byte}.join("") + type = "compose_prefix_#{suffix_bytes}" + @output.puts(<<-BODY) +#{indent}case #{"%#04x" % char.bytes.last} : +#{indent} return #{function_name(type)}(prefix_utf8); + BODY + end + @output.puts(<<-BODY) +#{indent}default : +#{indent} return #{default}; +#{indent}} +#{indent}break; + BODY + end + end + end + + def to_bytes_map(char_map) + bytes_map = {} + char_map.each_key do |from| + parent = bytes_map + from.bytes[0..-2].each do |byte| + parent[byte] ||= {} + parent = parent[byte] + end + parent[from.bytes.last] = char_map[from] + end + bytes_map end end -def generate_blockcode_char_type(file, option) +def create_bc(option) bc = {} open("|./icudump --#{option}").each{|l| src,_,code = l.chomp.split("\t") - str = src.split(':').collect{|c| format("%c", c.hex)}.join + str = src.split(':').collect(&:hex).pack("c*") + str.force_encoding("UTF-8") bc[str] = code } - $lv = 0 - gen_bc(file, bc, 0) + bc end def ccpush(hash, src, dst) @@ -104,7 +725,7 @@ end def subst(hash, str) cand = nil - src = str.split(//) + src = str.chars for i in 0..src.size-1 h = hash for j in i..src.size-1 @@ -112,7 +733,7 @@ def subst(hash, str) h = h[head] break unless h if h[nil] - cand = src[0,i].to_s + h[nil] + src[j + 1..-1].to_s + cand = src[0,i].join("") + h[nil] + src[j + 1..-1].join("") end end return cand if cand @@ -120,7 +741,7 @@ def subst(hash, str) return str end -def map_entry(map1, cc, src, dst) +def map_entry(decompose, cc, src, dst) dst.downcase! unless $case_sensitive loop { dst2 = subst(cc, dst) @@ -130,43 +751,43 @@ def map_entry(map1, cc, src, dst) unless $keep_space dst = $1 if dst =~ /^ +([^ ].*)$/ end - map1[src] = dst if src != dst + decompose[src] = dst if src != dst end -def create_map1() +def create_decompose_map() cc = {} open('|./icudump --cc').each{|l| _,src,dst = l.chomp.split("\t") if cc[src] STDERR.puts "caution: ambiguous mapping #{src}|#{cc[src]}|#{dst}" if cc[src] != dst end - ccpush(cc, src.split(//), dst) + ccpush(cc, src.chars, dst) } - map1 = {} + decompose_map = {} open('|./icudump --nfkd').each{|l| n,src,dst = l.chomp.split("\t") - map_entry(map1, cc, src, dst) + map_entry(decompose_map, cc, src, dst) } if File.readable?(CUSTOM_RULE_PATH) open(CUSTOM_RULE_PATH).each{|l| src,dst = l.chomp.split("\t") - map_entry(map1, cc, src, dst) + map_entry(decompose_map, cc, src, dst) } end unless $case_sensitive for c in 'A'..'Z' - map1[c] = c.downcase + decompose_map[c] = c.downcase end end - return map1 + return decompose_map end -def create_map2(map1) +def create_compose_map(decompose_map) cc = {} open('|./icudump --cc').each{|l| _,src,dst = l.chomp.split("\t") - src = src.split(//).collect{|c| map1[c] || c}.join - dst = map1[dst] || dst + src = src.chars.collect{|c| decompose_map[c] || c}.join + dst = decompose_map[dst] || dst if cc[src] && cc[src] != dst STDERR.puts("caution: inconsitent mapping '#{src}' => '#{cc[src]}'|'#{dst}'") end @@ -177,14 +798,15 @@ def create_map2(map1) cc2 = {} cc.each {|src,dst| src2 = src - chars = src.split(//) + chars = src.chars l = chars.size - 1 for i in 0..l for j in i..l next if i == 0 && j == l str = chars[i..j].join - if map1[str] - STDERR.printf("caution: recursive mapping '%s'=>'%s'\n", str, map1[str]) + if decompose_map[str] + STDERR.printf("caution: recursive mapping '%s'=>'%s'\n", + str, decompose_map[str]) end if cc[str] src2 = (i > 0 ? chars[0..i-1].join : '') + cc[str] + (j < l ? chars[j+1..l].join : '') @@ -202,134 +824,42 @@ def create_map2(map1) return cc end -def generate_map1(file, hash, level) - bl = ' ' * ((level + 0) * 2) - if hash[''] - dst = '' - hash[''].each_byte{|b| dst << format('\x%02X', b)} - file.puts "#{bl}return \"#{dst}\";" - hash.delete('') - end - return if hash.empty? - h2 = {} - hash.each{|key,val| - head = key[0] - rest = key[1..-1] - if h2[head] - h2[head][rest] = val - else - h2[head] = {rest => val} - end - } - if h2.size == 1 - h2.each{|key,val| - file.printf("#{bl}if (str[#{level}] == 0x%02X) {\n", key) - generate_map1(file, val, level + 1) - file.puts bl + '}' - } - else - file.puts "#{bl}switch (str[#{level}]) {" - h2.keys.sort.each{|k| - file.printf("#{bl}case 0x%02X :\n", k) - generate_map1(file, h2[k], level + 1) - file.puts("#{bl} break;") - } - file.puts bl + '}' - end -end +######## main ####### -def gen_map2_sub2(file, hash, level, indent) - bl = ' ' * ((level + indent + 0) * 2) - if hash[''] - file.print "#{bl}return \"" - hash[''].each_byte{|b| file.printf('\x%02X', b)} - file.puts "\";" - hash.delete('') +generator_class = SwitchGenerator +ARGV.each{|arg| + case arg + when /-*c/i + $case_sensitive = true + when /-*s/i + $keep_space = true + when "--impl=switch" + generator_class = SwitchGenerator + when "--impl=table" + generator_class = TableGenerator end - return if hash.empty? - - h2 = {} - hash.each{|key,val| - head = key[0] - rest = key[1..-1] - if h2[head] - h2[head][rest] = val - else - h2[head] = {rest => val} - end - } +} - if h2.size == 1 - h2.each{|key,val| - file.printf("#{bl}if (prefix[#{level}] == 0x%02X) {\n", key) - gen_map2_sub2(file, val, level + 1, indent) - file.puts bl + '}' - } - else - file.puts "#{bl}switch (prefix[#{level}]) {" - h2.keys.sort.each{|k| - file.printf("#{bl}case 0x%02X :\n", k) - gen_map2_sub2(file, h2[k], level + 1, indent) - file.puts("#{bl} break;") - } - file.puts bl + '}' - end -end +STDERR.puts('compiling icudump') +system('cc -Wall -O3 -o icudump -I/tmp/local/include -L/tmp/local/lib icudump.c -licuuc -licui18n') -def gen_map2_sub(file, hash, level) - bl = ' ' * ((level + 0) * 2) - if hash[''] - gen_map2_sub2(file, hash[''], 0, level) - hash.delete('') - end - return if hash.empty? - h2 = {} - hash.each{|key,val| - head = key[0] - rest = key[1..-1] - if h2[head] - h2[head][rest] = val - else - h2[head] = {rest => val} - end - } - if h2.size == 1 - h2.each{|key,val| - file.printf("#{bl}if (suffix[#{level}] == 0x%02X) {\n", key) - gen_map2_sub(file, val, level + 1) - file.puts bl + '}' - } - else - file.puts "#{bl}switch (suffix[#{level}]) {" - h2.keys.sort.each{|k| - file.printf("#{bl}case 0x%02X :\n", k) - gen_map2_sub(file, h2[k], level + 1) - file.puts("#{bl} break;") - } - file.puts bl + '}' - end -end +STDERR.puts('getting Unicode version') +unicode_version = `./icudump --version`.strip.gsub(".", "") -def generate_map2(file, map2) - suffix = {} - map2.each{|src,dst| - chars = src.split(//) - if chars.size != 2 - STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}" - end - s = chars.pop - if suffix[s] - suffix[s][chars.join] = dst - else - suffix[s] = {chars.join=>dst} - end - } - gen_map2_sub(file, suffix, 0) -end +STDERR.puts('creating bc..') +bc = create_bc("gc") -template = <<END +STDERR.puts('creating decompose map..') +decompose_map = create_decompose_map() + +STDERR.puts('creating compose map..') +compose_map = create_compose_map(decompose_map) + +File.open("nfkc#{unicode_version}.c", "w") do |output| + output.puts(<<-HEADER) /* -*- c-basic-offset: 2 -*- */ -/* Copyright(C) 2010 Brazil +/* + Copyright(C) 2010-2016 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -343,76 +873,25 @@ template = <<END You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ -don't edit this file by hand. it generated automatically by nfkc.rb +/* + Don't edit this file by hand. it generated automatically by nfkc.rb. */ -#include "nfkc.h" +#include "grn.h" +#include "grn_nfkc.h" +#include <groonga/nfkc.h> #ifdef GRN_WITH_NFKC + HEADER -unsigned char -grn_nfkc_char_type(const unsigned char *str) -{ -% return -1; -} + generator = generator_class.new(unicode_version, output) + generator.generate(bc, decompose_map, compose_map) -const char * -grn_nfkc_map1(const unsigned char *str) -{ -% return 0; -} - -const char * -grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix) -{ -% return 0; -} + output.puts(<<-FOOTER) #endif /* GRN_WITH_NFKC */ -END - -######## main ####### - -ARGV.each{|arg| - case arg - when /-*c/i - $case_sensitive = true - when /-*s/i - $keep_space = true - end -} - -STDERR.puts('compiling icudump') -system('cc -Wall -O3 -o icudump icudump.c -licui18n') - -STDERR.puts('creating map1..') -map1 = create_map1() - -STDERR.puts('creating map2..') -map2 = create_map2(map1) - -outf = open('nfkc.c', 'w') - -tmps = template.split(/%/) - -#STDERR.puts('generating block code..') -#outf.print(tmps.shift) -#generate_blockcode_char_type(outf, 'bc') - -STDERR.puts('generating char type code..') -outf.print(tmps.shift) -generate_blockcode_char_type(outf, 'gc') - -STDERR.puts('generating map1 code..') -outf.print(tmps.shift) -generate_map1(outf, map1, 0) - -STDERR.puts('generating map2 code..') -outf.print(tmps.shift) -generate_map2(outf, map2) - -outf.print(tmps.shift) -outf.close -STDERR.puts('done.') + FOOTER +end |