diff options
Diffstat (limited to 'storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/parser.rb')
-rw-r--r-- | storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/parser.rb | 161 |
1 files changed, 161 insertions, 0 deletions
# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Library General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Library General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

require "English"

# Conversions between Unicode code points and UTF-8 encoded strings.
module Unicode
  module_function

  # Returns the UTF-8 string that encodes the single code point
  # +code_point+ (an Integer).
  def to_utf8(code_point)
    [code_point].pack("U")
  end

  # Returns the code point (Integer) of the first character of +utf8+,
  # or nil when +utf8+ is empty.
  def from_utf8(utf8)
    utf8.unpack("U")[0]
  end
end

# Parses C source text that declares "planeXX[]=" tables whose rows hold
# two {upper, lower, sort} hexadecimal triples each (presumably MySQL's
# ctype-utf8.c -- confirm against the caller of this tool).
#
# After #parse, each page number (the two hex digits of the table name)
# maps to an Array of Hashes with the keys :upper, :lower, :sort and
# :base, all one-character UTF-8 strings.
class CTypeUTF8Parser
  def initialize
    @pages = {}
  end

  # Reads +input+ line by line (anything responding to #each_line) and
  # fills the page table. Returns the page Hash (page number => records).
  def parse(input)
    parse_ctype_utf8(input)
    normalize_pages
  end

  # Returns [page, characters] pairs ordered by ascending page number.
  def sorted_pages
    @pages.sort_by do |page, _characters|
      page
    end
  end

  private
  # Collects the raw records. Rows seen outside of a "planeXX[]=" ...
  # "};" section are ignored.
  def parse_ctype_utf8(input)
    current_page = nil
    input.each_line do |line|
      case line
      when / plane([\da-fA-F]{2})\[\]=/
        current_page = $LAST_MATCH_INFO[1].to_i(16)
        @pages[current_page] = []
      when /^\s*
            \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},
            \s*
            \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},?$/ix
        next if current_page.nil?

        parsed_characters = $LAST_MATCH_INFO.captures.collect do |value|
          Unicode.to_utf8(value.to_i(16))
        end
        # Six captures per row: two consecutive (upper, lower, sort) triples.
        parsed_characters.each_slice(3) do |upper, lower, sort|
          @pages[current_page] << {
            :upper => upper,
            :lower => lower,
            :sort  => sort,
          }
        end
      when /^\};$/
        current_page = nil
      end
    end
  end

  # Adds :base to every record: the character whose code point is
  # (page << 8) + index of the record within its page.
  def normalize_pages
    @pages.each do |page, characters|
      characters.each_with_index do |character, i|
        character[:base] = Unicode.to_utf8((page << 8) + i)
      end
    end
  end
end

# Parses C source text that declares "pageXXXdata[]=" weight tables and a
# "uca_length[256]=" table (presumably MySQL's ctype-uca.c -- confirm
# against the caller of this tool).
#
# After #parse, each page number maps to an Array of Hashes with the keys
# :weight (Array of Integers with trailing zeros removed, or [0] when every
# weight is zero), :code_point (Integer) and :utf8 (String).
#
# NOTE: parsed lengths are appended to the previous ones and normalized
# pages replace the raw weights, so an instance is meant to parse a single
# input.
class CTypeUCAParser
  attr_reader :pages

  def initialize
    @pages = {}
    @lengths = []
  end

  # Reads +input+ line by line (anything responding to #each_line) and
  # builds the per-page character records. Returns the page Hash.
  #
  # Raises ArgumentError when a parsed page has no positive entry in the
  # "uca_length" table, because its flat weight list cannot be split into
  # characters without it.
  def parse(input)
    parse_ctype_uca(input)
    normalize_pages
  end

  # Groups every character record by its :weight Array. Records are
  # visited in ascending page order, then in code point order within the
  # page. Returns a Hash of weight => Array of records.
  def weight_based_characters
    grouped = {}
    sorted_pages.each do |_page, characters|
      characters.each do |character|
        (grouped[character[:weight]] ||= []) << character
      end
    end
    grouped
  end

  # Returns [page, characters] pairs ordered by ascending page number.
  def sorted_pages
    @pages.sort_by do |page, _characters|
      page
    end
  end

  private
  # Collects the flat weight list of every page and the per-page weight
  # counts. Weight rows outside of a page section and decimal rows outside
  # of the "uca_length" section are ignored.
  def parse_ctype_uca(input)
    current_page = nil
    in_length = false
    input.each_line do |line|
      case line
      when / page([\da-fA-F]{3})data\[\]=/
        current_page = $LAST_MATCH_INFO[1].to_i(16)
        @pages[current_page] = []
      when /^\s*0x(?:[\da-z]+)(?:,\s*0x(?:[\da-z]+))*,?$/i
        next if current_page.nil?

        # Integer() understands the "0x" prefix and tolerates the
        # surrounding whitespace left over by the split.
        weights = line.chomp.split(/,\s*/).collect do |component|
          Integer(component)
        end
        @pages[current_page].concat(weights)
      when / uca_length\[256\]=/
        in_length = true
      when /^\d+(?:,\d+)*,?$/
        next unless in_length

        current_lengths = line.chomp.split(/,/).collect do |length|
          Integer(length)
        end
        @lengths.concat(current_lengths)
      when /^\};$/
        current_page = nil
        in_length = false
      end
    end
  end

  # Replaces the flat weight list of each page with one record per
  # character, using @lengths[page] as the number of weights per character.
  def normalize_pages
    @pages.each do |page, flatten_weights|
      length = @lengths[page]
      if length.nil? || length <= 0
        # Fail with the offending page instead of the bare TypeError /
        # "invalid slice size" that each_slice would raise.
        raise ArgumentError,
              format("no usable uca_length entry for page 0x%03X: %p",
                     page, length)
      end

      weights = flatten_weights.each_slice(length)
      @pages[page] = weights.with_index.collect do |weight, i|
        if weight.all?(&:zero?)
          weight = [0]
        else
          # At least one weight is non-zero here, so this stops before the
          # Array becomes empty.
          weight.pop while weight.last.zero?
        end
        code_point = (page << 8) + i
        {
          :weight     => weight,
          :code_point => code_point,
          :utf8       => Unicode.to_utf8(code_point),
        }
      end
    end
  end
end