#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
from unicodedata import normalize, unidata_version

from gen_helper import (getCopyrightNotice, openNamespaces, closeNamespaces,
                        include)

# Codepoints that carry the Unicode "Diacritic" property.
diacritics = set()
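
# Input format note (illustrative lines, not copied verbatim from the file):
# entries in the Unicode PropList.txt passed to this script look like
#   0300..034E    ; Diacritic # Mn  [79] COMBINING GRAVE ACCENT..COMBINING UPPER DOT
#   005E          ; Diacritic # Sk        CIRCUMFLEX ACCENT
# load_diacritics() drops everything from '#' onward and records every
# codepoint whose property field is "Diacritic" in the set above.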
def load_diacritics(unicode_proplist_file):
    proplist_file = open(unicode_proplist_file, 'r')

    for line in proplist_file:
        # Filter out blank lines and lines that start with #
        data = line[:line.find('#')]
        if data == "":
            continue

        # Parse the data on the line
        values = data.split("; ")
        assert len(values) == 2

        uproperty = values[1].strip()
        if uproperty == "Diacritic":
            if len(values[0].split('..')) == 2:
                # A range of codepoints, e.g. "0300..034E"
                codepoint_range = values[0].split('..')

                start = int(codepoint_range[0], 16)
                end = int(codepoint_range[1], 16) + 1

                for i in range(start, end):
                    diacritics.add(i)
            else:
                # A single codepoint
                diacritics.add(int(values[0], 16))


# Maps a codepoint to the codepoint of its diacritic-free equivalent.
diacritic_mappings = {}
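
# Illustrative example of the mapping computed below, for U+00E9
# (LATIN SMALL LETTER E WITH ACUTE):
#   a = U+00E9
#   d = NFD(a) = 'e' (U+0065) + U+0301 COMBINING ACUTE ACCENT
#   r = 'e'             (U+0301 has the Diacritic property and is dropped)
#   c = NFC(r) = 'e'    (a single codepoint, so the mapping is kept)
# giving diacritic_mappings[0xE9] = 0x65.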
def add_diacritic_mapping(codepoint):
    # a : original unicode character
    # d : decomposed unicode character
    # r : decomposed unicode character with diacritics removed
    # c : recomposed unicode character with diacritics removed
    a = chr(codepoint)
    d = normalize('NFD', a)
    r = ''

    for i in range(len(d)):
        if ord(d[i]) not in diacritics:
            r += d[i]

    c = normalize('NFC', r)

    # Only use mappings where the final recomposed form is a single codepoint
    if a != c and len(c) == 1:
        assert c != '\0'  # This is used to indicate the codepoint is a pure diacritic.
        assert ord(c) not in diacritics
        diacritic_mappings[codepoint] = ord(c[0])


def add_diacritic_range(start, end):
    for x in range(start, end + 1):
        add_diacritic_mapping(x)
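
# The file written by generate() has roughly this shape (illustrative excerpt;
# the real case list is produced from the Unicode data loaded above, and pure
# diacritics appear as cases that return 0):
#
#   char32_t codepointRemoveDiacritics(char32_t codepoint) {
#       switch (codepoint) {
#       case 0xe9: return 0x65;
#       ...
#       default: return codepoint;
#       }
#   }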
def generate(target):
    """Generates a C++ source file that contains a diacritic removal mapping
    function.

    The generated function contains a switch statement with cases for every
    character in Unicode that has a removable combining diacritical mark.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    # Map diacritics from 0 to the maximum Unicode codepoint
    add_diacritic_range(0x0000, 0x10FFFF)

    # Pure diacritics map to 0, which marks the codepoint for removal.
    for diacritic in diacritics:
        diacritic_mappings[diacritic] = 0

    out.write("""char32_t codepointRemoveDiacritics(char32_t codepoint) {
    switch (codepoint) {\n""")

    mappings_list = []
    for mapping in diacritic_mappings:
        mappings_list.append((mapping, diacritic_mappings[mapping]))

    sorted_mappings = sorted(mappings_list, key=lambda mapping: mapping[0])

    for mapping in sorted_mappings:
        out.write("    case " + hex(mapping[0]) + ": return " +
                  hex(mapping[1]) + ";\n")

    out.write("    default: return codepoint;\n    }\n}")

    out.write(closeNamespaces())
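
# Expected invocation (the script and output file names here are placeholders):
#   python3 <this script> <path to Unicode 8.0 PropList.txt> <output .cpp path>
# sys.argv[1] is the PropList.txt file and sys.argv[2] is the C++ file to write.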
if __name__ == "__main__":
    if unidata_version != '8.0.0':
        print("ERROR: This script must be run with a version of Python that "
              "contains the Unicode 8.0.0 Character Database.")
        sys.exit(1)
    load_diacritics(sys.argv[1])
    generate(sys.argv[2])