summaryrefslogtreecommitdiff
path: root/lib/unicode_normalize/normalize.rb
blob: b27cdadaaac9526281f477de5dbb251aeabe9389 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# coding: utf-8
# frozen_string_literal: false

# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)

# This file, the companion file tables.rb (autogenerated), and the module,
# constants, and method defined herein are part of the implementation of the
# built-in String class, not part of the standard library. They should
# therefore never be gemified. They implement the methods
# String#unicode_normalize, String#unicode_normalize!, and String#unicode_normalized?.
#
# They are placed here because they are written in Ruby. They are loaded on
# demand when any of the three methods mentioned above is executed for the
# first time. This reduces the memory footprint and startup time for scripts
# and applications that do not use those methods.
#
# The name and even the existence of the module UnicodeNormalize and all of its
# content are purely an implementation detail, and should not be exposed in
# any test or spec or otherwise.

require_relative 'tables'


module UnicodeNormalize  # :nodoc:
  ## Constant for max hash capacity to avoid DoS attack
  MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow

  ## Regular Expressions and Hash Constants
  REGEXP_D = Regexp.compile(REGEXP_D_STRING, Regexp::EXTENDED)
  REGEXP_C = Regexp.compile(REGEXP_C_STRING, Regexp::EXTENDED)
  REGEXP_K = Regexp.compile(REGEXP_K_STRING, Regexp::EXTENDED)
  NF_HASH_D = Hash.new do |hash, key|
                         hash.shift if hash.length>MAX_HASH_LENGTH # prevent DoS attack
                         hash[key] = nfd_one(key)
                       end
  NF_HASH_C = Hash.new do |hash, key|
                         hash.shift if hash.length>MAX_HASH_LENGTH # prevent DoS attack
                         hash[key] = nfc_one(key)
                       end

  ## Constants For Hangul
  # for details such as the meaning of the identifiers below, please see
  # http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf, pp. 144/145
  SBASE = 0xAC00
  LBASE = 0x1100
  VBASE = 0x1161
  TBASE = 0x11A7
  LCOUNT = 19
  VCOUNT = 21
  TCOUNT = 28
  NCOUNT = VCOUNT * TCOUNT
  SCOUNT = LCOUNT * NCOUNT

  # Unicode-based encodings (except UTF-8)
  UNICODE_ENCODINGS = [Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE,
                       Encoding::GB18030, Encoding::UCS_2BE, Encoding::UCS_4BE]

  ## Hangul Algorithm
  def self.hangul_decomp_one(target)
    syllable_index = target.ord - SBASE
    return target if syllable_index < 0 || syllable_index >= SCOUNT
    l = LBASE + syllable_index / NCOUNT
    v = VBASE + (syllable_index % NCOUNT) / TCOUNT
    t = TBASE + syllable_index % TCOUNT
    (t==TBASE ? [l, v] : [l, v, t]).pack('U*') + target[1..-1]
  end

  def self.hangul_comp_one(string)
    length = string.length
    if length>1 and 0 <= (lead =string[0].ord-LBASE) and lead  < LCOUNT and
                    0 <= (vowel=string[1].ord-VBASE) and vowel < VCOUNT
      lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
      if length>2 and 0 < (trail=string[2].ord-TBASE) and trail < TCOUNT
        (lead_vowel + trail).chr(Encoding::UTF_8) + string[3..-1]
      else
        lead_vowel.chr(Encoding::UTF_8) + string[2..-1]
      end
    else
      string
    end
  end

  ## Canonical Ordering
  def self.canonical_ordering_one(string)
    sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
    (sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort
      (0..i).each do |j|
        later_class = sorting[j+1].last
        if 0<later_class and later_class<sorting[j].last
          sorting[j], sorting[j+1] = sorting[j+1], sorting[j]
        end
      end
    end
    return sorting.collect(&:first).join('')
  end

  ## Normalization Forms for Patterns (not whole Strings)
  def self.nfd_one(string)
    string = string.chars.map! {|c| DECOMPOSITION_TABLE[c] || c}.join('')
    canonical_ordering_one(hangul_decomp_one(string))
  end

  def self.nfc_one(string)
    nfd_string = nfd_one string
    start = nfd_string[0]
    last_class = CLASS_TABLE[start]-1
    accents = ''
    nfd_string[1..-1].each_char do |accent|
      accent_class = CLASS_TABLE[accent]
      if last_class<accent_class and composite = COMPOSITION_TABLE[start+accent]
        start = composite
      else
        accents << accent
        last_class = accent_class
      end
    end
    hangul_comp_one(start+accents)
  end

  def self.normalize(string, form = :nfc)
    encoding = string.encoding
    case encoding
    when Encoding::UTF_8
      case form
      when :nfc then
        string.gsub REGEXP_C, NF_HASH_C
      when :nfd then
        string.gsub REGEXP_D, NF_HASH_D
      when :nfkc then
        string.gsub(REGEXP_K, KOMPATIBLE_TABLE).gsub(REGEXP_C, NF_HASH_C)
      when :nfkd then
        string.gsub(REGEXP_K, KOMPATIBLE_TABLE).gsub(REGEXP_D, NF_HASH_D)
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    when Encoding::US_ASCII
      string
    when *UNICODE_ENCODINGS
      normalize(string.encode(Encoding::UTF_8), form).encode(encoding)
    else
      raise Encoding::CompatibilityError, "Unicode Normalization not appropriate for #{encoding}"
    end
  end

  def self.normalized?(string, form = :nfc)
    encoding = string.encoding
    case encoding
    when Encoding::UTF_8
      case form
      when :nfc then
        string.scan REGEXP_C do |match|
          return false  if NF_HASH_C[match] != match
        end
        true
      when :nfd then
        string.scan REGEXP_D do |match|
          return false  if NF_HASH_D[match] != match
        end
        true
      when :nfkc then
        normalized?(string, :nfc) and string !~ REGEXP_K
      when :nfkd then
        normalized?(string, :nfd) and string !~ REGEXP_K
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    when Encoding::US_ASCII
      true
    when *UNICODE_ENCODINGS
      normalized? string.encode(Encoding::UTF_8), form
    else
      raise Encoding::CompatibilityError, "Unicode Normalization not appropriate for #{encoding}"
    end
  end
end # module