summaryrefslogtreecommitdiff
path: root/src/third_party/re2/dist/re2/make_unicode_groups.py
blob: 46aef40cfb1cac90970ddc8f22a4b537496518b6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/python
# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Generate C++ tables for Unicode Script and Category groups."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import unicode

# Text printed at the start of the generated C++ output: the DO-NOT-EDIT
# banner, the include of the groups header, and the opening of namespace re2.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

# Text printed at the end of the generated C++ output; closes namespace re2.
_trailer = """

}  // namespace re2

"""

# Running totals of 16-bit and 32-bit ranges.  PrintGroup adds to them and
# main reports them in a comment in the generated output.
n16 = 0
n32 = 0

def MakeRanges(codes):
  """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]].

  Args:
    codes: iterable of integers.  Runs are detected only between
      neighbouring elements, so the input is expected to be in ascending
      order.

  Returns:
    A list of inclusive [lo, hi] pairs, one per run of consecutive values,
    in input order.  An empty input yields an empty list.
  """
  ranges = []
  for c in codes:
    # Extend the open range when c directly follows its upper bound.
    # Testing `ranges` first replaces the old "last = -100" sentinel, which
    # raised IndexError when the first value happened to be -99.
    if ranges and c == ranges[-1][1] + 1:
      ranges[-1][1] = c
    else:
      ranges.append([c, c])
  return ranges

def PrintRanges(type, name, ranges):
  """Write a C++ static const array of ranges to stdout.

  Args:
    type: C++ element type used in the declaration (e.g. "URange16").
    name: identifier of the emitted array.
    ranges: iterable of (lo, hi) pairs; each becomes one "{ lo, hi }," row.
  """
  lines = ["static const %s %s[] = {" % (type, name)]
  lines.extend("\t{ %d, %d }," % (first, last) for first, last in ranges)
  lines.append("};")
  print("\n".join(lines))

# def PrintCodes(type, name, codes):
#   """Print the codes as an array of type named name."""
#   print("static %s %s[] = {" % (type, name))
#   for c in codes:
#     print("\t%d," % (c,))
#   print("};")

def PrintGroup(name, codes):
  """Emit the range tables for one group and return its UGroup literal.

  The values in codes are split at 65536: those below go into a URange16
  table named <name>_range16, the rest into a URange32 table named
  <name>_range32.  Each non-empty table is printed to stdout, and the
  module-level counters n16 and n32 grow by the number of ranges in the
  respective table.

  Returns:
    The text of a C++ initializer of the form
    { "<name>", +1, <16-bit table or 0>, <count>, <32-bit table or 0>, <count> }.
  """
  global n16
  global n32

  # See unicode_groups.h for a description of the data structure.
  narrow = MakeRanges([cp for cp in codes if cp < 65536])
  wide = MakeRanges([cp for cp in codes if cp >= 65536])
  n16 += len(narrow)
  n32 += len(wide)

  # Singleton ranges stay in the 16-bit range table; no separate table of
  # single code points is emitted.
  fields = ["\"%s\"" % (name,), "+1"]
  for ctype, suffix, table in (("URange16", "_range16", narrow),
                               ("URange32", "_range32", wide)):
    if table:
      PrintRanges(ctype, name + suffix, table)
      fields.append("%s%s, %d" % (name, suffix, len(table)))
    else:
      # An empty table is represented by a null pointer and a zero count.
      fields.append("0, 0")
  return "{ " + ", ".join(fields) + " }"

def main():
  """Write the generated unicode_groups.cc contents to stdout.

  Prints the file header, then the range tables for every category followed
  by every script (each set visited in sorted name order), then a comment
  with the range totals, the unicode_groups array built from the sorted
  UGroup literals, the num_unicode_groups count, and finally the trailer.
  """
  categories = unicode.Categories()
  scripts = unicode.Scripts()
  print(_header)

  # Categories come first, then scripts; each table is indexed by group name.
  literals = []
  for table in (categories, scripts):
    for group in sorted(table):
      literals.append(PrintGroup(group, table[group]))

  # The totals are only complete once every group has been printed.
  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("const UGroup unicode_groups[] = {")
  for literal in sorted(literals):
    print("\t%s," % (literal,))
  print("};")
  print("const int num_unicode_groups = %d;" % (len(literals),))
  print(_trailer)

# Generate the tables only when run as a script, not when imported.
if __name__ == '__main__':
  main()