tools/gen-break-table.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

#!/usr/bin/python

from __future__ import print_function, division, absolute_import
import sys
import os.path
from collections import OrderedDict


# Leading '#' comment lines of every data file loaded so far, kept verbatim
# (including the trailing newline).
header = []
# Maps a property name (after the prefix is applied) to its list of inclusive
# (start, end) code point tuples, in the order the names were first seen.
ranges = OrderedDict()


def load_data(filename, prefix=""):
        """Parse one property data file into the module-level tables.

        Data lines have the form ``XXXX[..YYYY] ; Name [# comment]`` where the
        code points are hexadecimal.  Blank lines and lines starting with
        ``#`` are skipped.  The run of ``#`` lines at the very top of the file
        is appended to ``header``.

        Each parsed range is stored in ``ranges`` under ``prefix + Name``.  A
        range that starts exactly one past the end of the previously stored
        range for the same name is merged into it.

        Args:
                filename: path of the data file to read.
                prefix: string prepended to every property name from this file.

        Raises:
                ValueError: if a data line has no ``;`` separator or a code
                        point is not valid hexadecimal.
        """
        # Context manager so the handle is released even if reading fails;
        # the original left the file open.
        with open(filename) as f:
                lines = f.readlines()

        # Only the uninterrupted comment block at the top counts as header.
        for line in lines:
                if not line.startswith("#"):
                        break
                header.append(line)

        for line in lines:
                line = line.strip()
                if not line or line[0] == '#':
                        continue
                # Drop any trailing comment, then keep the first two fields.
                rang, typ = [s.strip() for s in line.split('#')[0].split(';')[:2]]
                typ = prefix + typ

                bounds = [int(s, 16) for s in rang.split('..')]
                if len(bounds) > 1:
                        start, end = bounds
                else:
                        start = end = bounds[0]

                spans = ranges.setdefault(typ, [])
                if spans and spans[-1][1] == start - 1:
                        # Contiguous with the previous range: extend it.
                        spans[-1] = (spans[-1][0], end)
                else:
                        spans.append((start, end))


def onecondition(start, end):
        """Return a C expression testing ``wc`` against the range [start, end].

        A single code point gives ``wc == 0xXXXX``.  A proper range gives the
        parenthesised ``(wc >= 0xXXXX && wc <= 0xYYYY)``.  Values are printed
        as upper-case hex, zero-padded to at least four digits.  When start
        is greater than end the empty string is returned.
        """
        if start == end:
                return 'wc == 0x{0}'.format(format(start, '04X'))
        if start < end:
                lower = 'wc >= 0x' + format(start, '04X')
                upper = 'wc <= 0x' + format(end, '04X')
                return '(' + lower + ' && ' + upper + ')'
        return ''


# print out the numbers in compact form
def print_if_branch(ranges):
        """Print one C ``if`` that ORs a test for every (start, end) in ranges.

        The individual tests come from onecondition() and are joined with
        ``||`` followed by a newline.  The ``if`` is followed by a tab-indented
        ``return TRUE;`` line and then an unconditional ``return FALSE;``.
        """
        joined = "||\n".join([onecondition(lo, hi) for lo, hi in ranges])
        for text in ("if (%s)" % joined, "\treturn TRUE;", "return FALSE;"):
                print(text)


def print_one_line(start, end):
        """Print ``if (<range test>)`` for [start, end]; needs start < end.

        Nothing is printed when start is not strictly less than end.  The
        range test itself is built by onecondition().
        """
        if not start < end:
                return
        print('if (%s)' % onecondition(start, end))

def print_ranges(ranges):
        """Print C code that returns TRUE when ``wc`` falls in one of ranges.

        Up to four ranges are written as a single ``if`` whose condition ORs
        the tests from onecondition(), followed by ``return TRUE;``.

        A longer list is wrapped in a guard on its overall span, taken from
        the first range's start to the last range's end, and the braced body
        is produced by print_balanced_search().  This assumes ranges is sorted
        in ascending order, which is how the callers in this file pass it.

        An empty list prints nothing.

        Args:
                ranges: sequence of inclusive (start, end) code point tuples.
        """
        if not ranges:
                # No code point can match. The original emitted "if ()" here,
                # which is not valid C, followed by "return TRUE;".
                return

        if len(ranges) <= 4:
                statement = " ||\n".join(
                        [onecondition(start, end) for start, end in ranges])
                print('if (' + statement + ')')
                print('\treturn TRUE;')
                return

        # Cheap rejection test on the whole span before searching inside it.
        print_one_line(ranges[0][0], ranges[-1][1])
        print('{')
        print_balanced_search(ranges)
        print('}')


# print if branch like 4-way balanced search
def print_balanced_search(ranges):
        """Print the body of a C membership test over ranges.

        Lists of at most four ranges are handed to print_ranges() directly.
        Longer lists are cut into four consecutive slices at one quarter, one
        half and one half plus one quarter of the length (all rounded down),
        and each slice is handed to print_ranges() in order.  In both cases a
        final ``return FALSE;`` is printed.
        """
        count = len(ranges)
        if count <= 4:
                print_ranges(ranges)
                print("return FALSE;")
                return

        quarter = count // 4
        half = count // 2
        cuts = [0, quarter, half, half + quarter, count]
        for lo, hi in zip(cuts[:-1], cuts[1:]):
                print_ranges(ranges[lo:hi])

        print("return FALSE;")


def print_table():
        """Print the complete generated C header to standard output.

        The output is, in order: a comment block that includes every line
        collected in ``header``; the include guard and ``#include <glib.h>``;
        one ``_pango_is_<Name>`` inline function for each selected property
        present in ``ranges``; one ``_pango_is_EastAsianWide`` function over
        the combined EastAsian_F, EastAsian_W and EastAsian_H ranges; and the
        closing guard.

        Raises:
                KeyError: if any of the three EastAsian_* entries is missing
                        from ``ranges``.
        """
        opening = (
                "/* == Start of generated table == */",
                "/*",
                " * The following tables are generated by running:",
                " *",
                " *   ./gen-break-table.py SentenceBreakProperty.txt IndicSyllabicCategory.txt EastAsianWidth.txt | indent",
                " *",
                " * on files with these headers:",
                " *",
        )
        for text in opening:
                print(text)
        for comment in header:
                print(" * %s" % comment.strip())
        # Empty strings stand for the blank separator lines.
        guard_open = (
                " */",
                "",
                "#ifndef PANGO_BREAK_TABLE_H",
                "#define PANGO_BREAK_TABLE_H",
                "",
                "#include <glib.h>",
                "",
        )
        for text in guard_open:
                print(text)

        # Only these properties get their own predicate function.
        selected = (
                'STerm',
                'Virama',
                'Vowel_Dependent',
                'Consonant_Prefixed',
                'Consonant_Preceding_Repha',
        )
        for typ, spans in ranges.items():
                if typ in selected:
                        print()
                        print("static inline gboolean _pango_is_%s (gunichar wc)" % typ)
                        print("{")
                        print_balanced_search(sorted(spans))
                        print("}")

        wide = ranges["EastAsian_F"] + ranges["EastAsian_W"] + ranges["EastAsian_H"]
        print("static inline gboolean _pango_is_EastAsianWide (gunichar wc)")
        print("{")
        print_balanced_search(sorted(wide))
        print("}")

        for text in ("", "#endif /* PANGO_BREAK_TABLE_H */", "", "/* == End of generated table == */"):
                print(text)


if __name__ == "__main__":
        # Exactly three data files are required, in this order.
        arguments = sys.argv[1:]
        if len(arguments) != 3:
                print("usage: ./gen-break-table.py SentenceBreakProperty.txt IndicSyllabicCategory.txt EastAsianWidth.txt | indent", file=sys.stderr)
                sys.exit(1)

        sentence_break_path, indic_category_path, east_asian_path = arguments
        load_data(sentence_break_path)
        load_data(indic_category_path)
        # East Asian width classes are stored under an "EastAsian_" prefix.
        load_data(east_asian_path, "EastAsian_")
        print_table()