diff options
author | Mathew Robinson <chasinglogic@gmail.com> | 2019-02-19 10:50:57 -0500 |
---|---|---|
committer | Mathew Robinson <chasinglogic@gmail.com> | 2019-04-08 14:08:49 -0400 |
commit | 8dd6d4755734ed37c1b98dfdefce3ca6bc65f1f6 (patch) | |
tree | 69e936c4953cbead2e3bae2690157c5fe75e709d /src/mongo/db/fts | |
parent | c600aa9d7423eca8151daf626e2799d9a6c7b31c (diff) | |
download | mongo-8dd6d4755734ed37c1b98dfdefce3ca6bc65f1f6.tar.gz |
SERVER-32295 Support Python 3
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r-- | src/mongo/db/fts/generate_stop_words.py | 9 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/gen_casefold_map.py | 15 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/gen_delimiter_list.py | 23 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/gen_diacritic_list.py | 10 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/gen_diacritic_map.py | 18 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/gen_helper.py | 3 |
6 files changed, 46 insertions, 32 deletions
diff --git a/src/mongo/db/fts/generate_stop_words.py b/src/mongo/db/fts/generate_stop_words.py index 31603eb92ed..0d356a2a351 100644 --- a/src/mongo/db/fts/generate_stop_words.py +++ b/src/mongo/db/fts/generate_stop_words.py @@ -1,7 +1,7 @@ import sys def generate( header, source, language_files ): - out = open( header, "wb" ) + out = open( header, "w" ) out.write( """ #pragma once #include <set> @@ -18,8 +18,8 @@ namespace fts { - out = open( source, "wb" ) - out.write( '#include "%s"' % header.rpartition( "/" )[2].rpartition( "\\" )[2] ) + out = open( source, "w", encoding='utf-8') + out.write( '#include "{}"'.format(header.rpartition( "/" )[2].rpartition( "\\" )[2]) ) out.write( """ namespace mongo { namespace fts { @@ -35,12 +35,13 @@ namespace fts { out.write( ' {\n' ) out.write( ' const char* const words[] = {\n' ) for word in open( l_file, "rb" ): - out.write( ' "%s",\n' % word.strip() ) + out.write( ' "%s",\n' % word.decode('utf-8').strip() ) out.write( ' };\n' ) out.write( ' const size_t wordcnt = sizeof(words) / sizeof(words[0]);\n' ) out.write( ' std::set< std::string >& l = (*m)["%s"];\n' % l ) out.write( ' l.insert(&words[0], &words[wordcnt]);\n' ) out.write( ' }\n' ) + out.write( """ } } // namespace fts diff --git a/src/mongo/db/fts/unicode/gen_casefold_map.py b/src/mongo/db/fts/unicode/gen_casefold_map.py index 19003693a2f..98378d94fb1 100644 --- a/src/mongo/db/fts/unicode/gen_casefold_map.py +++ b/src/mongo/db/fts/unicode/gen_casefold_map.py @@ -6,6 +6,7 @@ import sys from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \ include + def generate(unicode_casefold_file, target): """Generates a C++ source file that contains a Unicode case folding function. @@ -13,7 +14,7 @@ def generate(unicode_casefold_file, target): The case folding function contains a switch statement with cases for every Unicode codepoint that has a case folding mapping. """ - out = open(target, "w") + out = open(target, "w", encoding='utf-8') out.write(getCopyrightNotice()) out.write(include("mongo/db/fts/unicode/codepoints.h")) @@ -22,9 +23,10 @@ def generate(unicode_casefold_file, target): case_mappings = {} - cf_file = open(unicode_casefold_file, 'rU') + cf_file = open(unicode_casefold_file, 'rb') for line in cf_file: + line = line.decode('utf-8') # Filter out blank lines and lines that start with # data = line[:line.find('#')] if(data == ""): @@ -76,18 +78,19 @@ def generate(unicode_casefold_file, target): for mapping in sorted_mappings: if mapping[0] <= 0x7f: - continue # ascii is special cased above. + continue # ascii is special cased above. if mapping[0] in turkishMapping: - out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n" - % (mapping[0], turkishMapping[mapping[0]], mapping[1])) + out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n" % + (mapping[0], turkishMapping[mapping[0]], mapping[1])) else: - out.write("case 0x%x: return 0x%x;\n"%mapping) + out.write("case 0x%x: return 0x%x;\n" % mapping) out.write("\ default: return codepoint;\n }\n}") out.write(closeNamespaces()) + if __name__ == "__main__": generate(sys.argv[1], sys.argv[2]) diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py index 6cb007ab52a..152fcd77993 100644 --- a/src/mongo/db/fts/unicode/gen_delimiter_list.py +++ b/src/mongo/db/fts/unicode/gen_delimiter_list.py @@ -5,6 +5,7 @@ import sys from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \ include + def generate(unicode_proplist_file, target): """Generates a C++ source file that contains a delimiter checking function. @@ -21,25 +22,22 @@ def generate(unicode_proplist_file, target): delim_codepoints = set() - proplist_file = open(unicode_proplist_file, 'rU') + proplist_file = open(unicode_proplist_file, 'r') - delim_properties = ["White_Space", - "Dash", - "Hyphen", - "Quotation_Mark", - "Terminal_Punctuation", - "Pattern_Syntax", - "STerm"] + delim_properties = [ + "White_Space", "Dash", "Hyphen", "Quotation_Mark", "Terminal_Punctuation", "Pattern_Syntax", + "STerm" + ] for line in proplist_file: # Filter out blank lines and lines that start with # data = line[:line.find('#')] - if(data == ""): + if (data == ""): continue # Parse the data on the line values = data.split("; ") - assert(len(values) == 2) + assert (len(values) == 2) uproperty = values[1].strip() if uproperty in delim_properties: @@ -47,7 +45,7 @@ def generate(unicode_proplist_file, target): codepoint_range = values[0].split('..') start = int(codepoint_range[0], 16) - end = int(codepoint_range[1], 16) + 1 + end = int(codepoint_range[1], 16) + 1 for i in range(start, end): if i not in delim_codepoints: @@ -82,7 +80,7 @@ def generate(unicode_proplist_file, target): switch (codepoint) {\n""") for delim in sorted(delim_codepoints): - if delim <= 0x7f: # ascii codepoints handled in lists above. + if delim <= 0x7f: # ascii codepoints handled in lists above. continue out.write("\ case " + str(hex(delim)) + ": return true;\n") @@ -92,5 +90,6 @@ def generate(unicode_proplist_file, target): out.write(closeNamespaces()) + if __name__ == "__main__": generate(sys.argv[1], sys.argv[2]) diff --git a/src/mongo/db/fts/unicode/gen_diacritic_list.py b/src/mongo/db/fts/unicode/gen_diacritic_list.py index baab6e0b9b7..3859e0e7fe3 100644 --- a/src/mongo/db/fts/unicode/gen_diacritic_list.py +++ b/src/mongo/db/fts/unicode/gen_diacritic_list.py @@ -5,6 +5,7 @@ import sys from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \ include + def generate(unicode_proplist_file, target): """Generates a C++ source file that contains a diacritic checking function. @@ -20,17 +21,17 @@ def generate(unicode_proplist_file, target): diacritics = set() - proplist_file = open(unicode_proplist_file, 'rU') + proplist_file = open(unicode_proplist_file, 'r') for line in proplist_file: # Filter out blank lines and lines that start with # data = line[:line.find('#')] - if(data == ""): + if (data == ""): continue # Parse the data on the line values = data.split("; ") - assert(len(values) == 2) + assert (len(values) == 2) uproperty = values[1].strip() if uproperty in "Diacritic": @@ -38,7 +39,7 @@ def generate(unicode_proplist_file, target): codepoint_range = values[0].split('..') start = int(codepoint_range[0], 16) - end = int(codepoint_range[1], 16) + 1 + end = int(codepoint_range[1], 16) + 1 for i in range(start, end): if i not in diacritics: @@ -59,5 +60,6 @@ def generate(unicode_proplist_file, target): out.write(closeNamespaces()) + if __name__ == "__main__": generate(sys.argv[1], sys.argv[2]) diff --git a/src/mongo/db/fts/unicode/gen_diacritic_map.py b/src/mongo/db/fts/unicode/gen_diacritic_map.py index d77a7d1dd16..bad8919c24c 100644 --- a/src/mongo/db/fts/unicode/gen_diacritic_map.py +++ b/src/mongo/db/fts/unicode/gen_diacritic_map.py @@ -8,18 +8,19 @@ from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \ diacritics = set() + def load_diacritics(unicode_proplist_file): proplist_file = open(unicode_proplist_file, 'r') for line in proplist_file: # Filter out blank lines and lines that start with # data = line[:line.find('#')] - if(data == ""): + if (data == ""): continue # Parse the data on the line values = data.split("; ") - assert(len(values) == 2) + assert (len(values) == 2) uproperty = values[1].strip() if uproperty == "Diacritic": @@ -27,7 +28,7 @@ def load_diacritics(unicode_proplist_file): codepoint_range = values[0].split('..') start = int(codepoint_range[0], 16) - end = int(codepoint_range[1], 16) + 1 + end = int(codepoint_range[1], 16) + 1 for i in range(start, end): if i not in diacritics: @@ -36,8 +37,10 @@ def load_diacritics(unicode_proplist_file): if int(values[0], 16) not in diacritics: diacritics.add(int(values[0], 16)) + diacritic_mappings = {} + def add_diacritic_mapping(codepoint): # a : original unicode character # d : decomposed unicode character @@ -45,7 +48,7 @@ def add_diacritic_mapping(codepoint): # c : recomposed unicode character with diacritics removed a = chr(codepoint) d = normalize('NFD', a) - r = u'' + r = '' for i in range(len(d)): if ord(d[i]) not in diacritics: @@ -55,14 +58,16 @@ def add_diacritic_mapping(codepoint): # Only use mappings where the final recomposed form is a single codepoint if (a != c and len(c) == 1): - assert c != '\0' # This is used to indicate the codepoint is a pure diacritic. + assert c != '\0' # This is used to indicate the codepoint is a pure diacritic. assert ord(c) not in diacritics diacritic_mappings[codepoint] = ord(c[0]) + def add_diacritic_range(start, end): for x in range(start, end + 1): add_diacritic_mapping(x) + def generate(target): """Generates a C++ source file that contains a diacritic removal mapping function. @@ -101,8 +106,9 @@ def generate(target): out.write(closeNamespaces()) + if __name__ == "__main__": - if(unidata_version != '8.0.0'): + if (unidata_version != '8.0.0'): print("""ERROR: This script must be run with a version of Python that \ contains the Unicode 8.0.0 Character Database.""") sys.exit(1) diff --git a/src/mongo/db/fts/unicode/gen_helper.py b/src/mongo/db/fts/unicode/gen_helper.py index 5825bea57de..9f470904c4a 100644 --- a/src/mongo/db/fts/unicode/gen_helper.py +++ b/src/mongo/db/fts/unicode/gen_helper.py @@ -30,11 +30,14 @@ def getCopyrightNotice(): * THIS IS A GENERATED FILE, DO NOT MODIFY. */\n\n""" + def openNamespaces(): return "namespace mongo {\nnamespace unicode {\n\n" + def closeNamespaces(): return "\n} // namespace unicode\n} // namespace mongo\n" + def include(header): return '#include "' + header + '"\n' |