From ca4cbe59eda77a3855094c843486759868794e85 Mon Sep 17 00:00:00 2001 From: Nobuyoshi Nakada Date: Wed, 14 Sep 2022 19:15:45 +0900 Subject: Move case-folding.rb to tooldir with enc-prefix --- enc/unicode/case-folding.rb | 418 -------------------------------------------- 1 file changed, 418 deletions(-) delete mode 100644 enc/unicode/case-folding.rb (limited to 'enc') diff --git a/enc/unicode/case-folding.rb b/enc/unicode/case-folding.rb deleted file mode 100644 index 4a29fdebf7..0000000000 --- a/enc/unicode/case-folding.rb +++ /dev/null @@ -1,418 +0,0 @@ -#!/usr/bin/ruby -require 'stringio' - -# Usage (for case folding only): -# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt -# $ ruby case-folding.rb CaseFolding.txt -o casefold.h -# or (for case folding and case mapping): -# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt -# $ wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt -# $ wget http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt -# $ ruby case-folding.rb -m . -o casefold.h -# using -d or --debug will include UTF-8 characters in comments for debugging - -class CaseFolding - module Util - module_function - - def hex_seq(v) - v.map { |i| "0x%04x" % i }.join(", ") - end - - def print_table_1(dest, type, mapping_data, data) - for k, v in data = data.sort - sk = (Array === k and k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k) - if type=='CaseUnfold_11' and v.length>1 - # reorder CaseUnfold_11 entries to avoid special treatment for U+03B9/U+03BC/U+A64B - item = mapping_data.map("%04X" % k[0]) - upper = item.upper if item - v = v.sort_by { |i| ("%04X"%i) == upper ? 0 : 1 } - end - ck = @debug ? ' /* ' + Array(k).pack("U*") + ' */' : '' - cv = @debug ? ' /* ' + Array(v).map{|c|[c].pack("U*")}.join(", ") + ' */' : '' - dest.print(" {#{sk}#{ck}, {#{v.length}#{mapping_data.flags(k, type, v)}, {#{hex_seq(v)}#{cv}}}},\n") - end - data - end - - def print_table(dest, type, mapping_data, data) - dest.print("static const #{type}_Type #{type}_Table[] = {\n") - i = 0 - ret = data.inject([]) do |a, (n, d)| - dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n") - i += d.size - a.concat(print_table_1(dest, type, mapping_data, d)) - end - dest.print("};\n\n") - ret - end - end - - include Util - - attr_reader :fold, :fold_locale, :unfold, :unfold_locale, :version - - def load(filename) - pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/ - - @fold = fold = {} - @unfold = unfold = [{}, {}, {}] - @debug = false - @version = nil - turkic = [] - - IO.foreach(filename, mode: "rb") do |line| - @version ||= line[/-([0-9.]+).txt/, 1] - next unless res = pattern.match(line) - ch_from = res[1].to_i(16) - - if res[2] == 'T' - # Turkic case folding - turkic << ch_from - next - end - - # store folding data - ch_to = res[3..6].inject([]) do |a, i| - break a unless i - a << i.to_i(16) - end - fold[ch_from] = ch_to - - # store unfolding data - i = ch_to.length - 1 - (unfold[i][ch_to] ||= []) << ch_from - end - - # move locale dependent data to (un)fold_locale - @fold_locale = fold_locale = {} - @unfold_locale = unfold_locale = [{}, {}] - for ch_from in turkic - key = fold[ch_from] - i = key.length - 1 - unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key) - fold_locale[ch_from] = fold.delete(ch_from) - end - self - end - - def range_check(code) - "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE" - end - - def lookup_hash(key, type, data) - hash = "onigenc_unicode_#{key}_hash" - lookup = "onigenc_unicode_#{key}_lookup" - arity = Array(data[0][0]).size - gperf = %W"gperf -7 -k#{[*1..(arity*3)].join(',')} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup} -n" - argname = arity > 1 ? "codes" : "code" - argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}" - n = 7 - m = (1 << n) - 1 - min, max = data.map {|c, *|c}.flatten.minmax - src = IO.popen(gperf, "r+") {|f| - f << "short\n%%\n" - data.each_with_index {|(k, _), i| - k = Array(k) - ks = k.map {|j| [(j >> n*2) & m, (j >> n) & m, (j) & m]}.flatten.map {|c| "\\x%.2x" % c}.join("") - f.printf "\"%s\", ::::/*%s*/ %d\n", ks, k.map {|c| "0x%.4x" % c}.join(","), i - } - f << "%%\n" - f.close_write - f.read - } - src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { - name = $1 - body = $2 - body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)") - "#{name}(#{argdecl})\n{\n#{body}}" - } - src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { - name = $1 - body = $2 - body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1") - body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})") - body.gsub!(/\{"",-1}/, "-1") - body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1') - body.sub!(/(\s+if\s)\(len\b.*\)/) do - "#$1(" << - (arity > 1 ? (0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n ") : range_check(argname)) << - ")" - end - v = nil - body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) { - pre = $1 - indent = $2 - s = $3 - s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]') - v = $1 - s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))") - "#{pre}{#{s}\n#{indent}}" - } - body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;") - "static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}" - } - src - end - - def display(dest, mapping_data) - # print the header - dest.print("/* DO NOT EDIT THIS FILE. */\n") - dest.print("/* Generated by enc/unicode/case-folding.rb */\n\n") - - versions = version.scan(/\d+/) - dest.print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n") - %w[MAJOR MINOR TEENY].zip(versions) do |n, v| - dest.print(" ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n") - end - dest.print(" 1)\n") - dest.print("# error ONIG_UNICODE_VERSION_STRING mismatch\n") - dest.print("#endif\n") - dest.print("#define ONIG_UNICODE_VERSION_STRING #{version.dump}\n") - %w[MAJOR MINOR TEENY].zip(versions) do |n, v| - dest.print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n") - end - dest.print("\n") - - # print folding data - - # CaseFold + CaseFold_Locale - name = "CaseFold_11" - data = print_table(dest, name, mapping_data, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale) - dest.print lookup_hash(name, "CodePointList3", data) - - # print unfolding data - - # CaseUnfold_11 + CaseUnfold_11_Locale - name = "CaseUnfold_11" - data = print_table(dest, name, mapping_data, name=>unfold[0], "#{name}_Locale"=>unfold_locale[0]) - dest.print lookup_hash(name, "CodePointList3", data) - - # CaseUnfold_12 + CaseUnfold_12_Locale - name = "CaseUnfold_12" - data = print_table(dest, name, mapping_data, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1]) - dest.print lookup_hash(name, "CodePointList2", data) - - # CaseUnfold_13 - name = "CaseUnfold_13" - data = print_table(dest, name, mapping_data, name=>unfold[2]) - dest.print lookup_hash(name, "CodePointList2", data) - - # TitleCase - dest.print mapping_data.specials_output - end - - def debug! - @debug = true - end - - def self.load(*args) - new.load(*args) - end -end - -class MapItem - attr_accessor :upper, :lower, :title, :code - - def initialize(code, upper, lower, title) - @code = code - @upper = upper unless upper == '' - @lower = lower unless lower == '' - @title = title unless title == '' - end -end - -class CaseMapping - attr_reader :filename, :version - - def initialize(mapping_directory) - @mappings = {} - @specials = [] - @specials_length = 0 - @version = nil - IO.foreach(File.join(mapping_directory, 'UnicodeData.txt'), mode: "rb") do |line| - next if line =~ /^ 1 - end - if mapping_directory - if ARGV[0] - warn "Either specify directory or individual file, but not both." - exit - end - filename = File.join(mapping_directory, 'CaseFolding.txt') - mapping_data = CaseMapping.load(mapping_directory) - end - filename ||= ARGV[0] || 'CaseFolding.txt' - data = CaseFolding.load(filename) - if mapping_data and data.version != mapping_data.version - abort "Unicode data version mismatch\n" \ - " #{filename} = #{data.version}\n" \ - " #{mapping_data.filename} = #{mapping_data.version}" - end - mapping_data ||= CaseMappingDummy.new - - if debug - data.debug! - mapping_data.debug! - end - f = StringIO.new - begin - data.display(f, mapping_data) - rescue Errno::ENOENT => e - raise unless /gperf/ =~ e.message - warn e.message - abort unless dest - File.utime(nil, nil, dest) # assume existing file is OK - exit - else - s = f.string - end - if dest - open(dest, "wb") do |file| - file.print(s) - end - else - STDOUT.print(s) - end -end -- cgit v1.2.1