summary | refs | log | tree | commit | diff
path: root/src/mongo/db/fts
diff options
context:
space:
mode:
author: Mathew Robinson <chasinglogic@gmail.com> 2019-02-19 10:50:57 -0500
committer: Mathew Robinson <chasinglogic@gmail.com> 2019-04-08 14:08:49 -0400
commit: 8dd6d4755734ed37c1b98dfdefce3ca6bc65f1f6 (patch)
tree: 69e936c4953cbead2e3bae2690157c5fe75e709d /src/mongo/db/fts
parent: c600aa9d7423eca8151daf626e2799d9a6c7b31c (diff)
download: mongo-8dd6d4755734ed37c1b98dfdefce3ca6bc65f1f6.tar.gz
SERVER-32295 Support Python 3
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r-- src/mongo/db/fts/generate_stop_words.py 9
-rw-r--r-- src/mongo/db/fts/unicode/gen_casefold_map.py 15
-rw-r--r-- src/mongo/db/fts/unicode/gen_delimiter_list.py 23
-rw-r--r-- src/mongo/db/fts/unicode/gen_diacritic_list.py 10
-rw-r--r-- src/mongo/db/fts/unicode/gen_diacritic_map.py 18
-rw-r--r-- src/mongo/db/fts/unicode/gen_helper.py 3
6 files changed, 46 insertions, 32 deletions
diff --git a/src/mongo/db/fts/generate_stop_words.py b/src/mongo/db/fts/generate_stop_words.py
index 31603eb92ed..0d356a2a351 100644
--- a/src/mongo/db/fts/generate_stop_words.py
+++ b/src/mongo/db/fts/generate_stop_words.py
@@ -1,7 +1,7 @@
import sys
def generate( header, source, language_files ):
- out = open( header, "wb" )
+ out = open( header, "w" )
out.write( """
#pragma once
#include <set>
@@ -18,8 +18,8 @@ namespace fts {
- out = open( source, "wb" )
- out.write( '#include "%s"' % header.rpartition( "/" )[2].rpartition( "\\" )[2] )
+ out = open( source, "w", encoding='utf-8')
+ out.write( '#include "{}"'.format(header.rpartition( "/" )[2].rpartition( "\\" )[2]) )
out.write( """
namespace mongo {
namespace fts {
@@ -35,12 +35,13 @@ namespace fts {
out.write( ' {\n' )
out.write( ' const char* const words[] = {\n' )
for word in open( l_file, "rb" ):
- out.write( ' "%s",\n' % word.strip() )
+ out.write( ' "%s",\n' % word.decode('utf-8').strip() )
out.write( ' };\n' )
out.write( ' const size_t wordcnt = sizeof(words) / sizeof(words[0]);\n' )
out.write( ' std::set< std::string >& l = (*m)["%s"];\n' % l )
out.write( ' l.insert(&words[0], &words[wordcnt]);\n' )
out.write( ' }\n' )
+
out.write( """
}
} // namespace fts
diff --git a/src/mongo/db/fts/unicode/gen_casefold_map.py b/src/mongo/db/fts/unicode/gen_casefold_map.py
index 19003693a2f..98378d94fb1 100644
--- a/src/mongo/db/fts/unicode/gen_casefold_map.py
+++ b/src/mongo/db/fts/unicode/gen_casefold_map.py
@@ -6,6 +6,7 @@ import sys
from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
include
+
def generate(unicode_casefold_file, target):
"""Generates a C++ source file that contains a Unicode case folding
function.
@@ -13,7 +14,7 @@ def generate(unicode_casefold_file, target):
The case folding function contains a switch statement with cases for every
Unicode codepoint that has a case folding mapping.
"""
- out = open(target, "w")
+ out = open(target, "w", encoding='utf-8')
out.write(getCopyrightNotice())
out.write(include("mongo/db/fts/unicode/codepoints.h"))
@@ -22,9 +23,10 @@ def generate(unicode_casefold_file, target):
case_mappings = {}
- cf_file = open(unicode_casefold_file, 'rU')
+ cf_file = open(unicode_casefold_file, 'rb')
for line in cf_file:
+ line = line.decode('utf-8')
# Filter out blank lines and lines that start with #
data = line[:line.find('#')]
if(data == ""):
@@ -76,18 +78,19 @@ def generate(unicode_casefold_file, target):
for mapping in sorted_mappings:
if mapping[0] <= 0x7f:
- continue # ascii is special cased above.
+ continue # ascii is special cased above.
if mapping[0] in turkishMapping:
- out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n"
- % (mapping[0], turkishMapping[mapping[0]], mapping[1]))
+ out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n" %
+ (mapping[0], turkishMapping[mapping[0]], mapping[1]))
else:
- out.write("case 0x%x: return 0x%x;\n"%mapping)
+ out.write("case 0x%x: return 0x%x;\n" % mapping)
out.write("\
default: return codepoint;\n }\n}")
out.write(closeNamespaces())
+
if __name__ == "__main__":
generate(sys.argv[1], sys.argv[2])
diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py
index 6cb007ab52a..152fcd77993 100644
--- a/src/mongo/db/fts/unicode/gen_delimiter_list.py
+++ b/src/mongo/db/fts/unicode/gen_delimiter_list.py
@@ -5,6 +5,7 @@ import sys
from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
include
+
def generate(unicode_proplist_file, target):
"""Generates a C++ source file that contains a delimiter checking function.
@@ -21,25 +22,22 @@ def generate(unicode_proplist_file, target):
delim_codepoints = set()
- proplist_file = open(unicode_proplist_file, 'rU')
+ proplist_file = open(unicode_proplist_file, 'r')
- delim_properties = ["White_Space",
- "Dash",
- "Hyphen",
- "Quotation_Mark",
- "Terminal_Punctuation",
- "Pattern_Syntax",
- "STerm"]
+ delim_properties = [
+ "White_Space", "Dash", "Hyphen", "Quotation_Mark", "Terminal_Punctuation", "Pattern_Syntax",
+ "STerm"
+ ]
for line in proplist_file:
# Filter out blank lines and lines that start with #
data = line[:line.find('#')]
- if(data == ""):
+ if (data == ""):
continue
# Parse the data on the line
values = data.split("; ")
- assert(len(values) == 2)
+ assert (len(values) == 2)
uproperty = values[1].strip()
if uproperty in delim_properties:
@@ -47,7 +45,7 @@ def generate(unicode_proplist_file, target):
codepoint_range = values[0].split('..')
start = int(codepoint_range[0], 16)
- end = int(codepoint_range[1], 16) + 1
+ end = int(codepoint_range[1], 16) + 1
for i in range(start, end):
if i not in delim_codepoints:
@@ -82,7 +80,7 @@ def generate(unicode_proplist_file, target):
switch (codepoint) {\n""")
for delim in sorted(delim_codepoints):
- if delim <= 0x7f: # ascii codepoints handled in lists above.
+ if delim <= 0x7f: # ascii codepoints handled in lists above.
continue
out.write("\
case " + str(hex(delim)) + ": return true;\n")
@@ -92,5 +90,6 @@ def generate(unicode_proplist_file, target):
out.write(closeNamespaces())
+
if __name__ == "__main__":
generate(sys.argv[1], sys.argv[2])
diff --git a/src/mongo/db/fts/unicode/gen_diacritic_list.py b/src/mongo/db/fts/unicode/gen_diacritic_list.py
index baab6e0b9b7..3859e0e7fe3 100644
--- a/src/mongo/db/fts/unicode/gen_diacritic_list.py
+++ b/src/mongo/db/fts/unicode/gen_diacritic_list.py
@@ -5,6 +5,7 @@ import sys
from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
include
+
def generate(unicode_proplist_file, target):
"""Generates a C++ source file that contains a diacritic checking function.
@@ -20,17 +21,17 @@ def generate(unicode_proplist_file, target):
diacritics = set()
- proplist_file = open(unicode_proplist_file, 'rU')
+ proplist_file = open(unicode_proplist_file, 'r')
for line in proplist_file:
# Filter out blank lines and lines that start with #
data = line[:line.find('#')]
- if(data == ""):
+ if (data == ""):
continue
# Parse the data on the line
values = data.split("; ")
- assert(len(values) == 2)
+ assert (len(values) == 2)
uproperty = values[1].strip()
if uproperty in "Diacritic":
@@ -38,7 +39,7 @@ def generate(unicode_proplist_file, target):
codepoint_range = values[0].split('..')
start = int(codepoint_range[0], 16)
- end = int(codepoint_range[1], 16) + 1
+ end = int(codepoint_range[1], 16) + 1
for i in range(start, end):
if i not in diacritics:
@@ -59,5 +60,6 @@ def generate(unicode_proplist_file, target):
out.write(closeNamespaces())
+
if __name__ == "__main__":
generate(sys.argv[1], sys.argv[2])
diff --git a/src/mongo/db/fts/unicode/gen_diacritic_map.py b/src/mongo/db/fts/unicode/gen_diacritic_map.py
index d77a7d1dd16..bad8919c24c 100644
--- a/src/mongo/db/fts/unicode/gen_diacritic_map.py
+++ b/src/mongo/db/fts/unicode/gen_diacritic_map.py
@@ -8,18 +8,19 @@ from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
diacritics = set()
+
def load_diacritics(unicode_proplist_file):
proplist_file = open(unicode_proplist_file, 'r')
for line in proplist_file:
# Filter out blank lines and lines that start with #
data = line[:line.find('#')]
- if(data == ""):
+ if (data == ""):
continue
# Parse the data on the line
values = data.split("; ")
- assert(len(values) == 2)
+ assert (len(values) == 2)
uproperty = values[1].strip()
if uproperty == "Diacritic":
@@ -27,7 +28,7 @@ def load_diacritics(unicode_proplist_file):
codepoint_range = values[0].split('..')
start = int(codepoint_range[0], 16)
- end = int(codepoint_range[1], 16) + 1
+ end = int(codepoint_range[1], 16) + 1
for i in range(start, end):
if i not in diacritics:
@@ -36,8 +37,10 @@ def load_diacritics(unicode_proplist_file):
if int(values[0], 16) not in diacritics:
diacritics.add(int(values[0], 16))
+
diacritic_mappings = {}
+
def add_diacritic_mapping(codepoint):
# a : original unicode character
# d : decomposed unicode character
@@ -45,7 +48,7 @@ def add_diacritic_mapping(codepoint):
# c : recomposed unicode character with diacritics removed
a = chr(codepoint)
d = normalize('NFD', a)
- r = u''
+ r = ''
for i in range(len(d)):
if ord(d[i]) not in diacritics:
@@ -55,14 +58,16 @@ def add_diacritic_mapping(codepoint):
# Only use mappings where the final recomposed form is a single codepoint
if (a != c and len(c) == 1):
- assert c != '\0' # This is used to indicate the codepoint is a pure diacritic.
+ assert c != '\0' # This is used to indicate the codepoint is a pure diacritic.
assert ord(c) not in diacritics
diacritic_mappings[codepoint] = ord(c[0])
+
def add_diacritic_range(start, end):
for x in range(start, end + 1):
add_diacritic_mapping(x)
+
def generate(target):
"""Generates a C++ source file that contains a diacritic removal mapping
function.
@@ -101,8 +106,9 @@ def generate(target):
out.write(closeNamespaces())
+
if __name__ == "__main__":
- if(unidata_version != '8.0.0'):
+ if (unidata_version != '8.0.0'):
print("""ERROR: This script must be run with a version of Python that \
contains the Unicode 8.0.0 Character Database.""")
sys.exit(1)
diff --git a/src/mongo/db/fts/unicode/gen_helper.py b/src/mongo/db/fts/unicode/gen_helper.py
index 5825bea57de..9f470904c4a 100644
--- a/src/mongo/db/fts/unicode/gen_helper.py
+++ b/src/mongo/db/fts/unicode/gen_helper.py
@@ -30,11 +30,14 @@ def getCopyrightNotice():
* THIS IS A GENERATED FILE, DO NOT MODIFY.
*/\n\n"""
+
def openNamespaces():
return "namespace mongo {\nnamespace unicode {\n\n"
+
def closeNamespaces():
return "\n} // namespace unicode\n} // namespace mongo\n"
+
def include(header):
return '#include "' + header + '"\n'