1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
include
def _parse_case_mappings(lines):
    """Collect the simple case folding mappings found in `lines`.

    `lines` is an iterable of text lines of the form
    "<code>; <status>; <mapping>; # <comment>", where <code> and <mapping> are
    hexadecimal codepoints. Blank and comment-only lines are skipped.

    Returns a dict mapping each original codepoint (int) to its folded
    codepoint (int) for entries whose status is 'C' or 'S'.

    Raises ValueError if a data line does not split into four fields.
    """
    case_mappings = {}
    for line in lines:
        # Everything from the first '#' onwards is a comment. partition() keeps
        # the line intact when there is no '#', whereas slicing with the -1
        # returned by find() would silently drop the last character.
        data = line.partition('#')[0]
        if not data.strip():
            continue  # Blank or comment-only line.

        # Parse the data on the line
        values = data.split("; ")
        if len(values) != 4:
            # Raised explicitly (not asserted) so the check survives `python -O`.
            raise ValueError("Malformed case folding line: %r" % line)

        status = values[1]
        if status in ('C', 'S'):
            # We only include the "Common" and "Simple" mappings. "Full" case
            # folding mappings expand certain letters to multiple codepoints,
            # which we currently do not support.
            case_mappings[int(values[0], 16)] = int(values[2], 16)
    return case_mappings


def generate(unicode_casefold_file, target):
    """Generates a C++ source file that contains a Unicode case folding
    function.

    The case folding function contains a switch statement with cases for every
    Unicode codepoint that has a case folding mapping.

    Args:
        unicode_casefold_file: Path of the case folding data file to read.
        target: Path of the C++ source file to write (overwritten if present).

    Raises:
        ValueError: If a data line does not have four "; "-separated fields.
    """
    # Parse the input before touching the target, so that a missing or
    # malformed data file does not leave a truncated C++ file behind.
    # Plain 'r' is used because the 'U' mode flag was removed in Python 3.11;
    # Python 3 text mode already normalizes newlines.
    with open(unicode_casefold_file, 'r') as cf_file:
        case_mappings = _parse_case_mappings(cf_file)

    turkishMapping = {
        0x49: 0x131,   # I -> ı
        0x130: 0x069,  # İ -> i
    }

    # Make sure we include each mapping in turkishMapping in the cases below.
    # This ensures we handle them even if we'd skip the letter in non-turkish
    # mode (such codepoints map to themselves outside of Turkish mode).
    all_mappings = dict(case_mappings)
    for codepoint in turkishMapping:
        all_mappings.setdefault(codepoint, codepoint)

    # The context manager guarantees the output is flushed and closed even if
    # one of the writes raises.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongo/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        # NOTE: the continuation lines of this literal are emitted verbatim, so
        # they must not be re-indented to match the surrounding Python code.
        out.write(
            """char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) {
if (codepoint <= 0x7f) {
if (codepoint >= 'A' && codepoint <= 'Z') {
return (mode == CaseFoldMode::kTurkish && codepoint == 'I')
? 0x131
: (codepoint | 0x20); // Set the ascii lowercase bit on the character.
}
return codepoint;
}
switch (codepoint) {\n""")

        # Emit the cases in ascending codepoint order.
        for codepoint in sorted(all_mappings):
            if codepoint <= 0x7f:
                continue  # ascii is special cased above.
            folded = all_mappings[codepoint]
            if codepoint in turkishMapping:
                out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n"
                          % (codepoint, turkishMapping[codepoint], folded))
            else:
                out.write("case 0x%x: return 0x%x;\n" % (codepoint, folded))

        out.write("default: return codepoint;\n }\n}")
        out.write(closeNamespaces())
if __name__ == "__main__":
    # Usage: <script> <case folding data file> <output C++ source file>
    casefold_path, output_path = sys.argv[1], sys.argv[2]
    generate(casefold_path, output_path)
|