src/mongo/db/fts/unicode/gen_delimiter_list.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys

from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
    include

def generate(unicode_proplist_file, target):
    """Generates a C++ source file that contains a delimiter checking function.

    The delimiter checking function contains a switch statement with cases for 
    every delimiter in the Unicode Character Database with the properties 
    specified in delim_properties.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    delim_codepoints = set()

    proplist_file = open(unicode_proplist_file, 'rU')

    delim_properties = ["White_Space", 
                        "Dash", 
                        "Hyphen", 
                        "Quotation_Mark", 
                        "Terminal_Punctuation", 
                        "Pattern_Syntax", 
                        "STerm"]

    for line in proplist_file:
        # Filter out blank lines and lines that start with #
        data = line[:line.find('#')]
        if(data == ""):
            continue

        # Parse the data on the line
        values = data.split("; ")
        assert(len(values) == 2)

        uproperty = values[1].strip()
        if uproperty in delim_properties:
            if len(values[0].split('..')) == 2:
                codepoint_range = values[0].split('..')

                start = int(codepoint_range[0], 16)
                end   = int(codepoint_range[1], 16) + 1

                for i in range(start, end):
                    if i not in delim_codepoints: 
                        delim_codepoints.add(i)
            else:
                if int(values[0], 16) not in delim_codepoints:
                    delim_codepoints.add(int(values[0], 16))

    # As of Unicode 8.0.0, all of the delimiters we used for text index 
    # version 2 are also in the list.
    out.write("static const bool englishAsciiDelimiters[128] = {\n")
    for cp in range(0x80):
        if cp == ord("'"):
            out.write("    0, // ' special case\n")
        else:
            out.write("    %d, // 0x%x\n" % (cp in delim_codepoints, cp))
    out.write("};\n")

    out.write("static const bool nonEnglishAsciiDelimiters[128] = {\n")
    for cp in range(0x80):
        out.write("    %d, // 0x%x\n" % (cp in delim_codepoints, cp))
    out.write("};\n")

    out.write("""bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) {
    if (codepoint <= 0x7f) {
        if (lang == DelimiterListLanguage::kEnglish) {
            return englishAsciiDelimiters[codepoint];
        }
        return nonEnglishAsciiDelimiters[codepoint];
    }

    switch (codepoint) {\n""")

    for delim in sorted(delim_codepoints):
        if delim <= 0x7f: # ascii codepoints handled in lists above.
            continue
        out.write("\
    case " + str(hex(delim)) + ": return true;\n")

    out.write("\
    default: return false;\n    }\n}")

    out.write(closeNamespaces())

if __name__ == "__main__":
    generate(sys.argv[1], sys.argv[2])