summaryrefslogtreecommitdiff
path: root/bin/cython-generate-lexicon.py
diff options
context:
space:
mode:
Diffstat (limited to 'bin/cython-generate-lexicon.py')
-rwxr-xr-xbin/cython-generate-lexicon.py132
1 files changed, 132 insertions, 0 deletions
diff --git a/bin/cython-generate-lexicon.py b/bin/cython-generate-lexicon.py
new file mode 100755
index 000000000..e28441585
--- /dev/null
+++ b/bin/cython-generate-lexicon.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+
+#
+# Updates Cython's Lexicon.py with the unicode characters that are accepted as
+# identifiers. Should be run with the most recent version of Python possible
+# to ensure that Lexicon is as complete as possible.
+#
+# Python3 only (it relies on str.isidentifier which is a Python 3 addition)
+#
+# Run with either
+# --overwrite to update the existing Lexicon.py file
+# --here to create a copy of Lexicon.py in the current directory
+
+import functools
+import re
+import os
+import sys
+
# Make sure we import the right Cython: this script sits in the "bin"
# directory, so the directory that would hold a local "Cython" package is
# two levels above the resolved location of this file.
cythonpath = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
if os.path.exists(os.path.join(cythonpath, "Cython")):
    # Put the local checkout first so it shadows any installed Cython.
    sys.path.insert(0, cythonpath)
    print("Found (and using) local cython directory")
# Otherwise we aren't in a development directory and the regular import
# path is used unchanged.
+
+from Cython.Compiler import Lexicon
+
+
def main():
    """Regenerate the identifier character tables inside Lexicon.py.

    The single optional command-line argument (read from ``sys.argv``)
    selects where the result is written:

    * ``--overwrite`` (the default) rewrites the imported ``Lexicon.py``
      in place;
    * ``--here`` writes the updated copy to ``Lexicon.py`` in the current
      working directory.

    The text between the ``# BEGIN GENERATED CODE`` and
    ``# END GENERATED CODE`` markers is replaced by freshly generated
    character sets.  When the arguments are invalid a usage message is
    printed; when the markers cannot be located a warning is printed.  In
    both cases nothing is written and the function simply returns.
    """
    arg = '--overwrite'
    if len(sys.argv) == 2:
        arg = sys.argv[1]
    if len(sys.argv) > 2 or arg not in ['--overwrite','--here']:
        print("""Call the script with either:
 --overwrite to update the existing Lexicon.py file (default)
 --here to create an version of Lexicon.py in the current directory
""")
        return

    generated_code = (
        f"# generated with:\n"
        f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
        "\n"
        f"{generate_character_sets()}\n"
    )

    print("Reading file", Lexicon.__file__)
    # The generated section contains literal non-ASCII characters, so the
    # file must be read and written as UTF-8 explicitly; relying on the
    # locale's default encoding can fail or corrupt the file on platforms
    # where that default is not UTF-8.
    with open(Lexicon.__file__, 'r', encoding='utf-8') as f:
        # The capturing group keeps the marker lines in the result, giving
        # [before, BEGIN marker, old generated code, END marker, after].
        parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())

    if len(parts) not in (4,5) or ' GENERATED CODE' not in parts[1] or ' GENERATED CODE' not in parts[3]:
        print("Warning: generated code section not found - code not inserted")
        return

    # Swap only the text between the two markers; everything else is kept.
    parts[2] = generated_code
    output = "".join(parts)

    if arg == "--here":
        outfile = "Lexicon.py"
    else:
        assert arg == "--overwrite"
        outfile = Lexicon.__file__

    print("Writing to file", outfile)
    with open(outfile, 'w', encoding='utf-8') as f:
        f.write(output)
+
+
+# The easiest way to generate an appropriate character set is just to use the str.isidentifier method
+# An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
@functools.lru_cache()
def get_start_characters_as_number():
    """Return the code points that may begin a Python identifier.

    Each code point is accepted when the one-character string built from it
    passes ``str.isidentifier``.  The result is an ascending list of ints;
    it is cached, so every call hands back the same list object.
    """
    # NOTE: range() stops just short of sys.maxunicode itself; that last
    # code point (U+10FFFF) is a Unicode noncharacter, so nothing is lost.
    return [
        code_point
        for code_point in range(sys.maxunicode)
        if chr(code_point).isidentifier()
    ]
+
+
def get_continue_characters_as_number():
    """Return the code points that may appear after the first identifier character.

    The result is an ascending list of ints.
    """
    # Prefixing a known-good start character ('a') means the candidate is
    # judged in a non-initial position by str.isidentifier.
    return [
        code_point
        for code_point in range(sys.maxunicode)
        if ('a' + chr(code_point)).isidentifier()
    ]
+
+
def get_continue_not_start_as_number():
    """Return code points valid inside an identifier but not as its first character.

    The result is an ascending list of ints: the continuation characters
    with every start character removed.
    """
    start_set = set(get_start_characters_as_number())
    continue_set = set(get_continue_characters_as_number())
    # Sanity check on the interpreter's tables: the subtraction below only
    # makes sense if start characters are a subset of continuation ones.
    assert start_set <= continue_set, \
        "We assume that all identifier start characters are also continuation characters."
    return sorted(continue_set - start_set)
+
+
def to_ranges(char_num_list):
    """Compress a collection of code points into single characters and ranges.

    Parameters:
        char_num_list: iterable of integer code points, in any order.
            Duplicates are ignored.

    Returns:
        A ``(single_chars, ranges)`` tuple of strings.  ``single_chars``
        holds one character for every isolated code point.  ``ranges`` holds
        two characters (first, last) for every run of two or more
        consecutive code points, i.e. a sequence of closed ranges.  Both
        strings are empty when the input is empty.
    """
    code_points = sorted(set(char_num_list))
    single_chars = []
    ranges = []
    if not code_points:
        return '', ''

    def flush(first, last):
        # Record one finished run of consecutive code points.
        if first == last:
            single_chars.append(chr(first))
        else:
            ranges.append(chr(first) + chr(last))

    run_start = previous = code_points[0]
    for current in code_points[1:]:
        if current != previous + 1:
            # Discontinuity: the run ending at `previous` is complete.
            flush(run_start, previous)
            run_start = current
        previous = current

    # The last run is never followed by a discontinuity, so it has to be
    # emitted explicitly; otherwise the highest code points would be lost.
    flush(run_start, previous)

    return ''.join(single_chars), ''.join(ranges)
+
+
def make_split_strings(chars, splitby=60, indent=" "):
    """Render *chars* as source text made of consecutive ``u"..."`` literals.

    Parameters:
        chars: the string to split up.
        splitby: maximum number of characters placed in each literal.
        indent: text placed in front of every literal.

    Returns:
        The literals, one per line and each prefixed with *indent*.  For an
        empty *chars* the result is just *indent*.
    """
    literals = []
    for offset in range(0, len(chars), splitby):
        piece = chars[offset:offset + splitby]
        literals.append('u"' + piece + '"')
    separator = "\n" + indent
    return indent + separator.join(literals)
+
+
def generate_character_sets():
    """Build the source text of the generated character-set assignments.

    For the identifier start characters and for the continuation-only
    characters, two assignments are produced: ``<name>_any`` with the
    isolated characters and ``<name>_range`` with the closed ranges, each
    written as a parenthesised group of string literals.

    Returns:
        All assignments concatenated into one string.
    """
    sources = (
        ("unicode_start_ch", get_start_characters_as_number),
        ("unicode_continuation_ch", get_continue_not_start_as_number),
    )

    blocks = []
    for prefix, producer in sources:
        singles, ranges = to_ranges(producer())
        for suffix, chars in (("any", singles), ("range", ranges)):
            blocks.append(
                f"{prefix}_{suffix} = (\n{make_split_strings(chars)}\n)\n"
            )
    return "".join(blocks)
+
+
# Run the generator only when this file is executed as a script, so that
# importing it has no effect beyond the definitions above.
if __name__ == "__main__":
    main()