SERVER-32295 Support Python 3

author: Mathew Robinson <chasinglogic@gmail.com> 2019-02-19 10:50:57 -0500
committer: Mathew Robinson <chasinglogic@gmail.com> 2019-04-08 14:08:49 -0400
commit: 8dd6d4755734ed37c1b98dfdefce3ca6bc65f1f6 (patch)
tree: 69e936c4953cbead2e3bae2690157c5fe75e709d /src/mongo/db/fts
parent: c600aa9d7423eca8151daf626e2799d9a6c7b31c (diff)
download: mongo-8dd6d4755734ed37c1b98dfdefce3ca6bc65f1f6.tar.gz
6 files changed, 46 insertions, 32 deletions
diff --git a/src/mongo/db/fts/generate_stop_words.py b/src/mongo/db/fts/generate_stop_words.py
index 31603eb92ed..0d356a2a351 100644
--- a/src/mongo/db/fts/generate_stop_words.py
+++ b/src/mongo/db/fts/generate_stop_words.py
@@ -1,7 +1,7 @@
 import sys
 
 def generate( header, source, language_files ):
-    out = open( header, "wb" )
+    out = open( header, "w" )
     out.write( """
 #pragma once
 #include <set>
@@ -18,8 +18,8 @@ namespace fts {
 
 
 
-    out = open( source, "wb" )
-    out.write( '#include "%s"' % header.rpartition( "/" )[2].rpartition( "\\" )[2] )
+    out = open( source, "w", encoding='utf-8')
+    out.write( '#include "{}"'.format(header.rpartition( "/" )[2].rpartition( "\\" )[2]) )
     out.write( """
 namespace mongo {
 namespace fts {
@@ -35,12 +35,13 @@ namespace fts {
         out.write( '  {\n' )
         out.write( '   const char* const words[] = {\n' )
         for word in open( l_file, "rb" ):
-            out.write( '       "%s",\n' % word.strip() )
+            out.write( '       "%s",\n' % word.decode('utf-8').strip() )
         out.write( '   };\n' )
         out.write( '   const size_t wordcnt = sizeof(words) / sizeof(words[0]);\n' )
         out.write( '   std::set< std::string >& l = (*m)["%s"];\n' % l )
         out.write( '   l.insert(&words[0], &words[wordcnt]);\n' )
         out.write( '  }\n' )
+
     out.write( """
   }
 } // namespace fts
diff --git a/src/mongo/db/fts/unicode/gen_casefold_map.py b/src/mongo/db/fts/unicode/gen_casefold_map.py
index 19003693a2f..98378d94fb1 100644
--- a/src/mongo/db/fts/unicode/gen_casefold_map.py
+++ b/src/mongo/db/fts/unicode/gen_casefold_map.py
@@ -6,6 +6,7 @@ import sys
 from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
     include
 
+
 def generate(unicode_casefold_file, target):
     """Generates a C++ source file that contains a Unicode case folding
        function.
@@ -13,7 +14,7 @@ def generate(unicode_casefold_file, target):
     The case folding function contains a switch statement with cases for every
     Unicode codepoint that has a case folding mapping.
     """
-    out = open(target, "w")
+    out = open(target, "w", encoding='utf-8')
 
     out.write(getCopyrightNotice())
     out.write(include("mongo/db/fts/unicode/codepoints.h"))
@@ -22,9 +23,10 @@ def generate(unicode_casefold_file, target):
 
     case_mappings = {}
 
-    cf_file = open(unicode_casefold_file, 'rU')
+    cf_file = open(unicode_casefold_file, 'rb')
 
     for line in cf_file:
+        line = line.decode('utf-8')
         # Filter out blank lines and lines that start with #
         data = line[:line.find('#')]
         if(data == ""):
@@ -76,18 +78,19 @@ def generate(unicode_casefold_file, target):
 
     for mapping in sorted_mappings:
         if mapping[0] <= 0x7f:
-            continue # ascii is special cased above.
+            continue  # ascii is special cased above.
 
         if mapping[0] in turkishMapping:
-            out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n"
-                      % (mapping[0], turkishMapping[mapping[0]], mapping[1]))
+            out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n" %
+                      (mapping[0], turkishMapping[mapping[0]], mapping[1]))
         else:
-            out.write("case 0x%x: return 0x%x;\n"%mapping)
+            out.write("case 0x%x: return 0x%x;\n" % mapping)
 
     out.write("\
     default: return codepoint;\n    }\n}")
 
     out.write(closeNamespaces())
 
+
 if __name__ == "__main__":
     generate(sys.argv[1], sys.argv[2])
diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py
index 6cb007ab52a..152fcd77993 100644
--- a/src/mongo/db/fts/unicode/gen_delimiter_list.py
+++ b/src/mongo/db/fts/unicode/gen_delimiter_list.py
@@ -5,6 +5,7 @@ import sys
 from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
     include
 
+
 def generate(unicode_proplist_file, target):
     """Generates a C++ source file that contains a delimiter checking function.
 
@@ -21,25 +22,22 @@ def generate(unicode_proplist_file, target):
 
     delim_codepoints = set()
 
-    proplist_file = open(unicode_proplist_file, 'rU')
+    proplist_file = open(unicode_proplist_file, 'r')
 
-    delim_properties = ["White_Space",
-                        "Dash",
-                        "Hyphen",
-                        "Quotation_Mark",
-                        "Terminal_Punctuation",
-                        "Pattern_Syntax",
-                        "STerm"]
+    delim_properties = [
+        "White_Space", "Dash", "Hyphen", "Quotation_Mark", "Terminal_Punctuation", "Pattern_Syntax",
+        "STerm"
+    ]
 
     for line in proplist_file:
         # Filter out blank lines and lines that start with #
         data = line[:line.find('#')]
-        if(data == ""):
+        if (data == ""):
             continue
 
         # Parse the data on the line
         values = data.split("; ")
-        assert(len(values) == 2)
+        assert (len(values) == 2)
 
         uproperty = values[1].strip()
         if uproperty in delim_properties:
@@ -47,7 +45,7 @@ def generate(unicode_proplist_file, target):
                 codepoint_range = values[0].split('..')
 
                 start = int(codepoint_range[0], 16)
-                end   = int(codepoint_range[1], 16) + 1
+                end = int(codepoint_range[1], 16) + 1
 
                 for i in range(start, end):
                     if i not in delim_codepoints:
@@ -82,7 +80,7 @@ def generate(unicode_proplist_file, target):
     switch (codepoint) {\n""")
 
     for delim in sorted(delim_codepoints):
-        if delim <= 0x7f: # ascii codepoints handled in lists above.
+        if delim <= 0x7f:  # ascii codepoints handled in lists above.
             continue
         out.write("\
     case " + str(hex(delim)) + ": return true;\n")
@@ -92,5 +90,6 @@ def generate(unicode_proplist_file, target):
 
     out.write(closeNamespaces())
 
+
 if __name__ == "__main__":
     generate(sys.argv[1], sys.argv[2])
diff --git a/src/mongo/db/fts/unicode/gen_diacritic_list.py b/src/mongo/db/fts/unicode/gen_diacritic_list.py
index baab6e0b9b7..3859e0e7fe3 100644
--- a/src/mongo/db/fts/unicode/gen_diacritic_list.py
+++ b/src/mongo/db/fts/unicode/gen_diacritic_list.py
@@ -5,6 +5,7 @@ import sys
 from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
     include
 
+
 def generate(unicode_proplist_file, target):
     """Generates a C++ source file that contains a diacritic checking function.
 
@@ -20,17 +21,17 @@ def generate(unicode_proplist_file, target):
 
     diacritics = set()
 
-    proplist_file = open(unicode_proplist_file, 'rU')
+    proplist_file = open(unicode_proplist_file, 'r')
 
     for line in proplist_file:
         # Filter out blank lines and lines that start with #
         data = line[:line.find('#')]
-        if(data == ""):
+        if (data == ""):
             continue
 
         # Parse the data on the line
         values = data.split("; ")
-        assert(len(values) == 2)
+        assert (len(values) == 2)
 
         uproperty = values[1].strip()
         if uproperty in "Diacritic":
@@ -38,7 +39,7 @@ def generate(unicode_proplist_file, target):
                 codepoint_range = values[0].split('..')
 
                 start = int(codepoint_range[0], 16)
-                end   = int(codepoint_range[1], 16) + 1
+                end = int(codepoint_range[1], 16) + 1
 
                 for i in range(start, end):
                     if i not in diacritics:
@@ -59,5 +60,6 @@ def generate(unicode_proplist_file, target):
 
     out.write(closeNamespaces())
 
+
 if __name__ == "__main__":
     generate(sys.argv[1], sys.argv[2])
diff --git a/src/mongo/db/fts/unicode/gen_diacritic_map.py b/src/mongo/db/fts/unicode/gen_diacritic_map.py
index d77a7d1dd16..bad8919c24c 100644
--- a/src/mongo/db/fts/unicode/gen_diacritic_map.py
+++ b/src/mongo/db/fts/unicode/gen_diacritic_map.py
@@ -8,18 +8,19 @@ from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
 
 diacritics = set()
 
+
 def load_diacritics(unicode_proplist_file):
     proplist_file = open(unicode_proplist_file, 'r')
 
     for line in proplist_file:
         # Filter out blank lines and lines that start with #
         data = line[:line.find('#')]
-        if(data == ""):
+        if (data == ""):
             continue
 
         # Parse the data on the line
         values = data.split("; ")
-        assert(len(values) == 2)
+        assert (len(values) == 2)
 
         uproperty = values[1].strip()
         if uproperty == "Diacritic":
@@ -27,7 +28,7 @@ def load_diacritics(unicode_proplist_file):
                 codepoint_range = values[0].split('..')
 
                 start = int(codepoint_range[0], 16)
-                end   = int(codepoint_range[1], 16) + 1
+                end = int(codepoint_range[1], 16) + 1
 
                 for i in range(start, end):
                     if i not in diacritics:
@@ -36,8 +37,10 @@ def load_diacritics(unicode_proplist_file):
                 if int(values[0], 16) not in diacritics:
                     diacritics.add(int(values[0], 16))
 
+
 diacritic_mappings = {}
 
+
 def add_diacritic_mapping(codepoint):
     # a : original unicode character
     # d : decomposed unicode character
@@ -45,7 +48,7 @@ def add_diacritic_mapping(codepoint):
     # c : recomposed unicode character with diacritics removed
     a = chr(codepoint)
     d = normalize('NFD', a)
-    r = u''
+    r = ''
 
     for i in range(len(d)):
         if ord(d[i]) not in diacritics:
@@ -55,14 +58,16 @@ def add_diacritic_mapping(codepoint):
 
     # Only use mappings where the final recomposed form is a single codepoint
     if (a != c and len(c) == 1):
-        assert c != '\0' # This is used to indicate the codepoint is a pure diacritic.
+        assert c != '\0'  # This is used to indicate the codepoint is a pure diacritic.
         assert ord(c) not in diacritics
         diacritic_mappings[codepoint] = ord(c[0])
 
+
 def add_diacritic_range(start, end):
     for x in range(start, end + 1):
         add_diacritic_mapping(x)
 
+
 def generate(target):
     """Generates a C++ source file that contains a diacritic removal mapping
        function.
@@ -101,8 +106,9 @@ def generate(target):
 
     out.write(closeNamespaces())
 
+
 if __name__ == "__main__":
-    if(unidata_version != '8.0.0'):
+    if (unidata_version != '8.0.0'):
         print("""ERROR: This script must be run with a version of Python that \
             contains the Unicode 8.0.0 Character Database.""")
         sys.exit(1)
diff --git a/src/mongo/db/fts/unicode/gen_helper.py b/src/mongo/db/fts/unicode/gen_helper.py
index 5825bea57de..9f470904c4a 100644
--- a/src/mongo/db/fts/unicode/gen_helper.py
+++ b/src/mongo/db/fts/unicode/gen_helper.py
@@ -30,11 +30,14 @@ def getCopyrightNotice():
  *    THIS IS A GENERATED FILE, DO NOT MODIFY.
  */\n\n"""
 
+
 def openNamespaces():
     return "namespace mongo {\nnamespace unicode {\n\n"
 
+
 def closeNamespaces():
     return "\n} //  namespace unicode\n} //  namespace mongo\n"
 
+
 def include(header):
     return '#include "' + header + '"\n'
author	Mathew Robinson <chasinglogic@gmail.com>	2019-02-19 10:50:57 -0500
committer	Mathew Robinson <chasinglogic@gmail.com>	2019-04-08 14:08:49 -0400
commit	8dd6d4755734ed37c1b98dfdefce3ca6bc65f1f6 (patch)
tree	69e936c4953cbead2e3bae2690157c5fe75e709d /src/mongo/db/fts
parent	c600aa9d7423eca8151daf626e2799d9a6c7b31c (diff)
download	mongo-8dd6d4755734ed37c1b98dfdefce3ca6bc65f1f6.tar.gz