summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts/unicode/gen_casefold_map.py
blob: 19003693a2f9b16a85771906b14ff294203aca1a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys

from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
    include

def generate(unicode_casefold_file, target):
    """Generates a C++ source file that contains a Unicode case folding
       function.

    The case folding function contains a switch statement with cases for every
    Unicode codepoint that has a case folding mapping.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    case_mappings = {}

    cf_file = open(unicode_casefold_file, 'rU')

    for line in cf_file:
        # Filter out blank lines and lines that start with #
        data = line[:line.find('#')]
        if(data == ""):
            continue

        # Parse the data on the line
        values = data.split("; ")
        assert(len(values) == 4)

        status = values[1]
        if status == 'C' or status == 'S':
            # We only include the "Common" and "Simple" mappings. "Full" case
            # folding mappings expand certain letters to multiple codepoints,
            # which we currently do not support.
            original_codepoint = int(values[0], 16)
            codepoint_mapping  = int(values[2], 16)
            case_mappings[original_codepoint] = codepoint_mapping

    turkishMapping = {
        0x49: 0x131,  # I -> ı
        0x130: 0x069,   # İ -> i
    }

    out.write(
        """char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) {
               if (codepoint <= 0x7f) {
                    if (codepoint >= 'A' && codepoint <= 'Z') {
                       return (mode == CaseFoldMode::kTurkish && codepoint == 'I')
                              ? 0x131
                              : (codepoint | 0x20); // Set the ascii lowercase bit on the character.
                   }
                   return codepoint;
               }

               switch (codepoint) {\n""")

    mappings_list = []

    for mapping in case_mappings:
        mappings_list.append((mapping, case_mappings[mapping]))

    # Make sure we include each mapping in turkishMapping in the cases below. This ensures we handle
    # them even if we'd skip the letter in non-turkish mode.
    for mapping in turkishMapping:
        if mapping not in case_mappings:
            mappings_list.append((mapping, mapping))

    sorted_mappings = sorted(mappings_list, key=lambda mapping: mapping[0])

    for mapping in sorted_mappings:
        if mapping[0] <= 0x7f:
            continue # ascii is special cased above.

        if mapping[0] in turkishMapping:
            out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n"
                      % (mapping[0], turkishMapping[mapping[0]], mapping[1]))
        else:
            out.write("case 0x%x: return 0x%x;\n"%mapping)

    out.write("\
    default: return codepoint;\n    }\n}")

    out.write(closeNamespaces())

if __name__ == "__main__":
    generate(sys.argv[1], sys.argv[2])