# path: root/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/tool/dump_difference_uca.rb
# blob: 45e60c2973f135bb7c770e6d2924cb1482b51fe7
# Copyright (C) 2013  Kouhei Sutou <kou@clear-code.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Library General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Library General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

$LOAD_PATH.unshift(File.dirname(__FILE__))
require "parser"

# Require exactly one command line argument (the usage text names it as
# MYSQL_SOURCE/strings/ctype-uca.c).
# The usage text is written to standard error so that it cannot be mixed
# into redirected dump output; the exit status stays a failure status.
if ARGV.size != 1
  $stderr.puts("Usage: #{$0} MYSQL_SOURCE/strings/ctype-uca.c")
  exit(false)
end

# Parse the input given on the command line (read through ARGF) and report
# every weight that is shared by two or more characters.
parser = CTypeUCAParser.new
parser.parse(ARGF)

# Number of weights that are shared by two or more characters.
n_identicals = 0
# Number of non-first characters whose UTF-8 form has fewer bytes than the
# first character listed for the same weight.
# NOTE(review): presumably replacing such a character with the first one
# would grow the text, hence "expanded" -- confirm against the normalizer.
n_expanded_characters = 0

parser.weight_based_characters.each do |weight, characters|
  # Only groups with at least two characters are reported. Using "< 2"
  # also skips an empty group instead of failing on it.
  next if characters.size < 2

  n_identicals += 1

  # The first character of the group is the one the others are compared to.
  representative_bytesize = characters.first[:utf8].bytesize
  n_expanded_characters += characters.drop(1).count { |character|
    representative_bytesize > character[:utf8].bytesize
  }

  # Each weight component is printed as a zero padded hexadecimal number
  # with a "0x" prefix.
  formatted_weight = weight.map { |component| '%#07x' % component }.join(', ')
  puts "weight: #{formatted_weight}"
  characters.each do |character|
    p ["U+%04x" % character[:code_point], character[:utf8]]
  end
end

puts "Number of identical weights: #{n_identicals}"
puts "Number of expanded characters: #{n_expanded_characters}"