1 files changed, 132 insertions, 0 deletions
diff --git a/bin/cython-generate-lexicon.py b/bin/cython-generate-lexicon.py
new file mode 100755
index 000000000..e28441585
--- /dev/null
+++ b/bin/cython-generate-lexicon.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+
+#
+#   Updates Cython's Lexicon.py with the unicode characters that are accepted as
+#   identifiers. Should be run with the most recent version of Python possible
+#   to ensure that Lexicon is as complete as possible.
+#
+#   Python3 only (it relies on str.isidentifier which is a Python 3 addition)
+#
+#   Run with either
+#    --overwrite    to update the existing Lexicon.py file (default)
+#    --here         to create a copy of Lexicon.py in the current directory
+
+import functools
+import re
+import os
+import sys
+
+# Prefer the Cython package that sits next to this script's bin/ directory
+# (i.e. a development checkout) over any other Cython found on sys.path.
+cythonpath = os.path.dirname(              # parent of the bin directory
+    os.path.dirname(os.path.realpath(__file__))  # bin directory
+)
+if os.path.exists(os.path.join(cythonpath, "Cython")):
+    # Must happen before the Cython import below so the local copy wins.
+    sys.path.insert(0, cythonpath)
+    print("Found (and using) local cython directory")
+# otherwise we aren't in a development directory; use whatever is importable
+
+from Cython.Compiler import Lexicon
+
+
+def main():
+    """Regenerate the identifier character tables inside Lexicon.py.
+
+    Accepts at most one command line argument: ``--overwrite`` (the default)
+    rewrites the imported Lexicon.py in place, while ``--here`` writes the
+    updated copy to ``Lexicon.py`` in the current directory.  Any other
+    invocation prints a usage message and returns without writing anything.
+
+    Only the text between the ``# BEGIN GENERATED CODE`` and
+    ``# END GENERATED CODE`` marker lines is replaced.  If the markers are
+    not found, a warning is printed and no file is written.
+    """
+    arg = '--overwrite'
+    if len(sys.argv) == 2:
+        arg = sys.argv[1]
+    if len(sys.argv) > 2 or arg not in ('--overwrite', '--here'):
+        print("""Call the script with either:
+  --overwrite    to update the existing Lexicon.py file (default)
+  --here         to create an version of Lexicon.py in the current directory
+""")
+        return
+
+    print("Reading file", Lexicon.__file__)
+    # The generated tables embed raw non-ASCII characters, so read and write
+    # explicitly as UTF-8 instead of relying on the platform default encoding.
+    with open(Lexicon.__file__, 'r', encoding='utf-8') as f:
+        # The capturing group keeps the marker lines themselves in ``parts``:
+        # [before, BEGIN marker, old generated code, END marker, after].
+        parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())
+
+    # Check the markers before doing the (slow) scan over all code points.
+    if len(parts) not in (4, 5) or ' GENERATED CODE' not in parts[1] or ' GENERATED CODE' not in parts[3]:
+        print("Warning: generated code section not found - code not inserted")
+        return
+
+    generated_code = (
+        f"# generated with:\n"
+        f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
+        "\n"
+        f"{generate_character_sets()}\n"
+    )
+
+    parts[2] = generated_code
+    output = "".join(parts)
+
+    if arg == "--here":
+        outfile = "Lexicon.py"
+    else:
+        assert arg == "--overwrite"
+        outfile = Lexicon.__file__
+
+    print("Writing to file", outfile)
+    with open(outfile, 'w', encoding='utf-8') as f:
+        f.write(output)
+
+
+# The easiest way to generate an appropriate character set is just to use the str.isidentifier method
+# An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
+@functools.lru_cache()
+def get_start_characters_as_number():
+    """Return the code points that are valid as the first identifier character.
+
+    The result is an ascending list of integers, determined by asking
+    ``str.isidentifier`` about every single-character string.  The list is
+    cached, so every call returns the same list object; callers should not
+    mutate it.
+    """
+    # range() excludes its end point, so add one to also test sys.maxunicode.
+    return [i for i in range(sys.maxunicode + 1) if chr(i).isidentifier()]
+
+
+def get_continue_characters_as_number():
+    """Return the code points that are valid after the first identifier character.
+
+    The result is an ascending list of integers.  Each code point is tested
+    by appending it to ``'a'`` and asking ``str.isidentifier`` whether the
+    two-character string is still an identifier.
+    """
+    # range() excludes its end point, so add one to also test sys.maxunicode.
+    return [i for i in range(sys.maxunicode + 1) if ('a' + chr(i)).isidentifier()]
+
+
+def get_continue_not_start_as_number():
+    """Return the code points valid inside an identifier but not at its start.
+
+    The result is an ascending list of integers: the continuation characters
+    with all start characters removed.
+    """
+    start_set = set(get_start_characters_as_number())
+    continue_set = set(get_continue_characters_as_number())
+    # Sanity check: the subtraction below only makes sense if the start
+    # characters form a subset of the continuation characters.
+    assert start_set.issubset(continue_set), (
+        "We assume that all identifier start characters are also continuation characters."
+    )
+    continue_only = continue_set - start_set
+    return sorted(continue_only)
+
+
+def to_ranges(char_num_list):
+    """Compress a collection of code points into single characters and ranges.
+
+    Returns a pair of strings ``(single_chars, ranges)``:
+
+    * ``single_chars`` holds every character whose neighbours are both absent
+      from the input;
+    * ``ranges`` holds two characters per run of consecutive code points, the
+      first and the last character of the run (a closed range).
+
+    Input order does not matter and duplicates are ignored.  Empty input
+    yields two empty strings.
+    """
+    code_points = sorted(set(char_num_list))
+    if not code_points:
+        return '', ''
+
+    single_chars = []
+    ranges = []
+
+    def close_run(first, last):
+        # Record one finished run of consecutive code points.
+        if first == last:
+            single_chars.append(chr(first))
+        else:
+            ranges.append(chr(first) + chr(last))
+
+    run_start = previous = code_points[0]
+    for current in code_points[1:]:
+        if current - 1 != previous:
+            # discontinuous: the run that ended at ``previous`` is complete
+            close_run(run_start, previous)
+            run_start = current
+        previous = current
+    # The last run has no discontinuity after it, so emit it explicitly;
+    # otherwise the final character or range would be lost.
+    close_run(run_start, previous)
+
+    return ''.join(single_chars), ''.join(ranges)
+
+
+def make_split_strings(chars, splitby=60, indent="    "):
+    """Format ``chars`` as consecutive ``u"..."`` literals, one per line.
+
+    ``chars`` is cut into pieces of at most ``splitby`` characters.  Each
+    piece is wrapped in ``u"`` and ``"`` and prefixed with ``indent``, and
+    the lines are joined with newlines.  No escaping is applied to the
+    characters.  Empty ``chars`` produces just ``indent``.
+    """
+    literals = []
+    for offset in range(0, len(chars), splitby):
+        piece = chars[offset:offset + splitby]
+        literals.append('u"{}"'.format(piece))
+    line_break = "\n" + indent
+    return indent + line_break.join(literals)
+
+
+def generate_character_sets():
+    """Build the source text of the four character-set assignments.
+
+    For the identifier start characters and for the continuation-only
+    characters, two assignments are produced: ``<name>_any`` holding the
+    isolated characters and ``<name>_range`` holding the range end points,
+    as returned by ``to_ranges``.  Each value is written as a parenthesised
+    sequence of string literals.  All assignments are returned concatenated
+    into one string.
+    """
+    sources = (
+        ("unicode_start_ch", get_start_characters_as_number),
+        ("unicode_continuation_ch", get_continue_not_start_as_number),
+    )
+    chunks = []
+    for char_type, produce_numbers in sources:
+        singles, ranges = to_ranges(produce_numbers())
+        for set_type, chars in (("any", singles), ("range", ranges)):
+            body = make_split_strings(chars)
+            chunks.append(f"{char_type}_{set_type} = (\n{body}\n)\n")
+
+    return "".join(chunks)
+
+
+# Run the generator only when this file is executed as a script.
+if __name__ == "__main__":
+    main()