summary refs log tree commit diff
path: root/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/parser.rb
diff options
context:
space:
mode:
Diffstat (limited to 'storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/parser.rb')
-rw-r--r-- storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/parser.rb 161
1 files changed, 161 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/parser.rb b/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/parser.rb
new file mode 100644
index 00000000000..b8652825f80
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/parser.rb
@@ -0,0 +1,161 @@
+# Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Library General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Library General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+require "English"
+
+# Conversions between Unicode code points (Integers) and UTF-8 strings.
+module Unicode
+  # Encodes +code_point+ as a one-character UTF-8 string.
+  def to_utf8(code_point)
+    code_points = [code_point]
+    code_points.pack("U")
+  end
+
+  # Returns the code point of the first character of +utf8+
+  # (nil when +utf8+ is empty).
+  def from_utf8(utf8)
+    utf8.unpack("U").first
+  end
+
+  # Make both helpers callable as Unicode.to_utf8 / Unicode.from_utf8.
+  module_function :to_utf8, :from_utf8
+end
+
+# Parses C source text that contains "planeXX[]=" tables made of
+# {upper, lower, sort} hexadecimal triples, two triples per line, and
+# collects them per page (XX is the page number in hexadecimal).
+# NOTE(review): the table layout looks like MySQL's ctype-utf8 source --
+# confirm against the caller that feeds this parser.
+class CTypeUTF8Parser
+  # Hash of page number (Integer) => Array of character Hashes with the
+  # keys :upper, :lower, :sort and, once #parse has finished, :base.
+  # Exposed the same way CTypeUCAParser exposes its pages.
+  attr_reader :pages
+
+  def initialize
+    @pages = {}
+  end
+
+  # Reads +input+ line by line (it must respond to each_line), records
+  # every table found and then adds the :base character to each entry.
+  #
+  # Raises ArgumentError when a table entry is not valid hexadecimal.
+  def parse(input)
+    parse_ctype_utf8(input)
+    normalize_pages
+  end
+
+  # Returns [page, characters] pairs ordered by ascending page number.
+  def sorted_pages
+    @pages.sort_by do |page, _characters|
+      page
+    end
+  end
+
+  private
+  def parse_ctype_utf8(input)
+    current_page = nil
+    input.each_line do |line|
+      case line
+      when / plane([\da-fA-F]{2})\[\]=/
+        # Start of a table: following data lines belong to this page.
+        current_page = $LAST_MATCH_INFO[1].to_i(16)
+        @pages[current_page] = []
+      when /^\s*
+            \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},
+            \s*
+            \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},?$/ix
+        # Data lines outside of any table are ignored.
+        next if current_page.nil?
+        parsed_characters = $LAST_MATCH_INFO.captures.collect do |value|
+          # Integer() rejects non-hexadecimal digits with ArgumentError,
+          # whereas String#to_i(16) would silently truncate the value.
+          Unicode.to_utf8(Integer(value, 16))
+        end
+        upper1, lower1, sort1, upper2, lower2, sort2 = parsed_characters
+        characters = @pages[current_page]
+        characters << {:upper => upper1, :lower => lower1, :sort => sort1}
+        characters << {:upper => upper2, :lower => lower2, :sort => sort2}
+      when /^\};$/
+        # End of the current table.
+        current_page = nil
+      end
+    end
+  end
+
+  # Adds :base to every entry. The base code point is derived from the
+  # entry's position: page number in the high byte, index in the low byte.
+  def normalize_pages
+    @pages.each do |page, characters|
+      characters.each_with_index do |character, i|
+        character[:base] = Unicode.to_utf8((page << 8) + i)
+      end
+    end
+  end
+end
+
+# Parses C source text that contains "pageXXXdata[]=" tables of
+# hexadecimal weights and a "uca_length[256]=" table of decimal lengths,
+# then groups each page's weights into one entry per character.
+# NOTE(review): the table layout looks like MySQL's ctype-uca source --
+# confirm against the caller that feeds this parser.
+class CTypeUCAParser
+  # Hash of page number (Integer) => Array. After #parse each element is
+  # a Hash with the keys :weight, :code_point and :utf8.
+  attr_reader :pages
+
+  def initialize
+    @pages = {}
+    @lengths = []
+  end
+
+  # Reads +input+ line by line (it must respond to each_line), collects
+  # the weight and length tables and converts every page into character
+  # entries.
+  #
+  # Raises ArgumentError when a parsed page has no usable length entry.
+  def parse(input)
+    parse_ctype_uca(input)
+    normalize_pages
+  end
+
+  # Returns a Hash of weight (Array of Integers) => Array of the character
+  # entries that have that weight, visiting pages in ascending order.
+  def weight_based_characters
+    all_characters = sorted_pages.flat_map do |_page, characters|
+      characters
+    end
+    all_characters.group_by do |character|
+      character[:weight]
+    end
+  end
+
+  # Returns [page, characters] pairs ordered by ascending page number.
+  def sorted_pages
+    @pages.sort_by do |page, _characters|
+      page
+    end
+  end
+
+  private
+  def parse_ctype_uca(input)
+    current_page = nil
+    in_length = false
+    input.each_line do |line|
+      case line
+      when / page([\da-fA-F]{3})data\[\]=/
+        # Start of a weight table: following hex lines belong to this page.
+        current_page = $1.to_i(16)
+        @pages[current_page] = []
+      when /^\s*0x(?:[\da-z]+)(?:,\s*0x(?:[\da-z]+))*,?$/i
+        # Hex lines outside of any weight table are ignored.
+        next if current_page.nil?
+        weights = line.chomp.split(/,\s*/).collect do |component|
+          Integer(component)
+        end
+        @pages[current_page].concat(weights)
+      when / uca_length\[256\]=/
+        in_length = true
+      when /^\d+(?:,\d+)*,?$/
+        # Decimal lines only count while inside the uca_length table.
+        next unless in_length
+        current_lengths = line.chomp.split(/,/).collect do |length|
+          Integer(length)
+        end
+        @lengths.concat(current_lengths)
+      when /^\};$/
+        # End of whichever table was being read.
+        current_page = nil
+        in_length = false
+      end
+    end
+  end
+
+  # Replaces each page's flat weight list with one Hash per character.
+  # The page's entry in the length table gives the number of weights per
+  # character; the character's code point is derived from its position
+  # (page number in the high byte, index in the low byte).
+  def normalize_pages
+    @pages.each do |page, flatten_weights|
+      length = @lengths[page]
+      # Fail with a message naming the page instead of letting each_slice
+      # raise an opaque error for a nil or zero slice size.
+      if length.nil? || length.zero?
+        raise ArgumentError,
+              "missing or zero uca_length entry for page 0x%03x" % page
+      end
+      weights = flatten_weights.each_slice(length)
+      @pages[page] = weights.with_index.collect do |weight, i|
+        if weight.all?(&:zero?)
+          weight = [0]
+        else
+          # Drop trailing zero weights.
+          while weight.last.zero?
+            weight.pop
+          end
+        end
+        code_point = (page << 8) + i
+        {
+          :weight     => weight,
+          :code_point => code_point,
+          :utf8       => Unicode.to_utf8(code_point),
+        }
+      end
+    end
+  end
+end