summaryrefslogtreecommitdiff
path: root/tool/enc-emoji4unicode.rb
blob: 1e7d45901f0d14a7b2078ca7ec18a045543e01ea (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env ruby

# example:
# ./enc-emoji4unicode.rb emoji4unicode.xml > ../enc/trans/emoji-exchange-tbl.rb

require 'rexml/document'
require File.expand_path("../transcode-tblgen", __FILE__)

# Reads the emoji4unicode XML data and writes carrier-to-carrier emoji
# conversion tables as Ruby source (entries of EMOJI_EXCHANGE_TBL).
class EmojiTable
  VERBOSE_MODE = false

  # xml_path:: path of the emoji4unicode XML file to load.
  #
  # Besides loading the document this builds the map from documented to
  # undocumented KDDI codepoints (see make_kddi_undoc_map).
  def initialize(xml_path)
    # The block form closes the file handle as soon as the document object
    # has been built from it, instead of leaving it open for the whole run.
    @doc = File.open(xml_path) {|file| REXML::Document.new(file) }
    @kddi_undoc = make_kddi_undoc_map()
  end

  # Calls +block+ once for every <e> element whose +from_carrier+ attribute
  # (looked up under the downcased carrier name) is a single upper-case
  # hexadecimal codepoint; a leading "*" or "+" marker is dropped first.
  # Elements whose source attribute is missing or not of that form are
  # skipped.
  #
  # The block receives a Hash with these keys:
  # :from::     UTF-8 bytes of the source codepoint, hex encoded
  # :to::       UTF-8 bytes of the replacement, hex encoded
  # :comment::  description of the mapping, "[name] U+XXXX -> ..."
  # :fallback:: true when the element has no +to_carrier+ value, so the
  #             text_fallback attribute or U+3013 (GETA) is used instead
  # :proposal:: true when the destination value starts with "+"
  #
  # Raises RuntimeError ("something wrong: ...") when the destination
  # attribute is neither empty nor a "+"-separated hex sequence.
  def conversion(from_carrier, to_carrier, &block)
    from_attr = from_carrier.downcase
    to_attr = to_carrier.downcase
    REXML::XPath.each(@doc.root, '//e') do |e|
      from = e.attribute(from_attr).to_s
      from = $1 if from =~ /^(?:\*|\+)(.+)$/ # proposed or unified
      next if from.empty? || from !~ /^[0-9A-F]+$/

      to = e.attribute(to_attr).to_s
      name = e.attribute('name').to_s
      from_cp = from.hex

      if to =~ /^(?:>|\*)?([0-9A-F\+]+)$/
        str_to = $1
        proposal = str_to.start_with?('+') # unicode "proposed" begins at "+"
        tos = str_to.sub(/^\+/, '').split('+')
        to_utf8 = tos.map(&:hex).pack("U*").unpack("H*").first
        target = tos.map{|c| "U+%X" % c.hex}.join(' ')
        fallback = false
      elsif to.empty?
        text_fallback = e.attribute('text_fallback').to_s
        proposal = false
        fallback = true
        if text_fallback.empty?
          to_utf8 = "\u{3013}".unpack("H*").first # geta
          target = "U+3013 (GETA)"
        else
          to_utf8 = text_fallback.unpack("H*").first
          target = %("#{text_fallback}")
        end
      else
        raise "something wrong: %s -> %s" % [from, to]
      end

      block.call(:from => ucs_to_utf8(from_cp),
                 :to => to_utf8,
                 :comment => "[%s] U+%X -> %s" % [name, from_cp, target],
                 :fallback => fallback,
                 :proposal => proposal)
    end
  end

  # Writes one EMOJI_EXCHANGE_TBL[from][to] array literal to +io+.
  #
  # io::           object responding to #puts
  # from_carrier:: source carrier name; "Unicode" selects plain "UTF-8"
  # to_carrier::   destination carrier name, same convention
  #
  # Fallback and proposed mappings are written as :undef. When the source
  # carrier is "KDDI" every entry is written twice: once keyed by the
  # documented codepoint and once by its undocumented counterpart.
  def generate(io, from_carrier, to_carrier)
    from_encoding = encoding_name(from_carrier)
    to_encoding   = encoding_name(to_carrier)
    io.puts "EMOJI_EXCHANGE_TBL['#{from_encoding}']['#{to_encoding}'] = ["
    if from_carrier == "KDDI"
      io.puts "  # for documented codepoints"
      write_entries(io, from_carrier, to_carrier)
      io.puts "  # for undocumented codepoints"
      write_entries(io, from_carrier, to_carrier) {|from| kddi_undoc_utf8(from) }
    else
      write_entries(io, from_carrier, to_carrier)
    end
    io.puts "]"
    io.puts
  end

  private

  # Encoding name used as a table key for +carrier+.
  def encoding_name(carrier)
    (carrier == "Unicode") ? "UTF-8" : "UTF8-" + carrier
  end

  # Writes one table row per conversion entry. When a block is given it is
  # called with the hex source bytes and its result is used as the row key.
  def write_entries(io, from_carrier, to_carrier)
    conversion(from_carrier, to_carrier) do |params|
      from = params[:from]
      from = yield(from) if block_given?
      undefined = params[:fallback] || params[:proposal]
      to = undefined ? ":undef" : %Q{"#{params[:to]}"}
      io.puts %{  ["#{from}", #{to}], # #{params[:comment]}}
    end
  end

  # Hex UTF-8 bytes of the undocumented KDDI codepoint paired with the
  # documented codepoint given as hex UTF-8 bytes. Raises RuntimeError
  # naming the codepoint when the map has no entry for it.
  def kddi_undoc_utf8(from)
    undoc = @kddi_undoc.fetch(utf8_to_ucs(from)) do |cp|
      raise "no undocumented KDDI codepoint for U+%X" % cp
    end
    ucs_to_utf8(undoc)
  end

  # Hex-encoded UTF-8 bytes -> first codepoint as an Integer.
  def utf8_to_ucs(cp)
    [cp].pack("H*").unpack("U*").first
  end

  # Integer codepoint -> hex-encoded UTF-8 bytes.
  def ucs_to_utf8(cp)
    [cp].pack("U*").unpack("H*").first
  end

  # Builds a Hash pairing entries of the two KDDI map sources position by
  # position after sorting both by their Shift_JIS side.
  # NOTE(review): assumes the first source yields [ucs, sjis] pairs and the
  # second [sjis, ucs] pairs, as the variable names suggest -- confirm
  # against citrus_decode_mapsrc in transcode-tblgen.
  #
  # Raises RuntimeError ("no match sjis codepoint") when the Shift_JIS
  # values of a pair differ or the second source runs out of entries.
  def make_kddi_undoc_map()
    pub_to_sjis = citrus_decode_mapsrc(
      "mskanji", 2, "UCS/EMOJI_SHIFT_JIS-KDDI").sort_by{|_ucs, sjis| sjis}
    sjis_to_undoc = citrus_decode_mapsrc(
      "mskanji", 2, "EMOJI_SHIFT_JIS-KDDI-UNDOC/UCS").sort_by{|sjis, _ucs| sjis}
    pub_to_sjis.zip(sjis_to_undoc).each_with_object({}) do |(pub, undoc), map|
      raise "no match sjis codepoint" if undoc.nil? || pub[1] != undoc[0]
      map[pub[0]] = undoc[1]
    end
  end
end

# Command-line driver: loads the XML file named by the first argument and
# prints a conversion table for every ordered pair of distinct carriers.
if ARGV.empty?
  # The usage text goes to stderr (abort also exits with status 1): stdout
  # is normally redirected into the generated table file, where a usage
  # message would otherwise end up unnoticed.
  abort "usage: #$0 [emoji4unicode.xml]"
end
# NOTE(review): $srcdir is not read in this file; presumably the helpers
# loaded from transcode-tblgen consult it -- confirm before renaming.
$srcdir = File.expand_path("../../enc/trans", __FILE__)
emoji_table = EmojiTable.new(ARGV[0])

companies = %w(DoCoMo KDDI SoftBank Unicode)

io = STDOUT
io.puts "EMOJI_EXCHANGE_TBL = Hash.new{|h,k| h[k] = {}}"
companies.each do |from_company|
  companies.each do |to_company|
    next if from_company == to_company
    emoji_table.generate(io, from_company, to_company)
  end
end