diff options
Diffstat (limited to 'bin/cython-generate-lexicon.py')
-rwxr-xr-x | bin/cython-generate-lexicon.py | 132 |
1 files changed, 132 insertions, 0 deletions
#!/usr/bin/env python3

#
#   Updates Cython's Lexicon.py with the unicode characters that are accepted as
#   identifiers. Should be run with the most recent version of Python possible
#   to ensure that Lexicon is as complete as possible.
#
#   Python3 only (it relies on str.isidentifier which is a Python 3 addition)
#
#   Run with either
#    --overwrite    to update the existing Lexicon.py file
#    --here         to create a copy of Lexicon.py in the current directory

import functools
import re
import os
import sys

# Make sure we import the right Cython
cythonpath, _ = os.path.split(os.path.realpath(__file__))  # bin directory
cythonpath, _ = os.path.split(cythonpath)
if os.path.exists(os.path.join(cythonpath, "Cython")):
    sys.path.insert(0, cythonpath)
    print("Found (and using) local cython directory")
# else we aren't in a development directory


def main():
    """Regenerate the "GENERATED CODE" section of Cython's Lexicon.py.

    Reads the Lexicon module that is importable (preferring the local checkout
    put on ``sys.path`` above), replaces the text between the
    ``# BEGIN GENERATED CODE`` and ``# END GENERATED CODE`` markers with
    freshly generated character sets, and writes the result either back to
    that file (``--overwrite``, the default) or to ``Lexicon.py`` in the
    current directory (``--here``).

    Prints a usage message and returns without writing anything when the
    command line is not understood, and prints a warning and returns when
    the markers cannot be found.
    """
    # Imported here (first thing, after the sys.path adjustment at module
    # level) so that the pure helper functions below can be imported and
    # exercised without Cython being installed.
    from Cython.Compiler import Lexicon

    arg = '--overwrite'
    if len(sys.argv) == 2:
        arg = sys.argv[1]
    if len(sys.argv) > 2 or arg not in ['--overwrite', '--here']:
        print("""Call the script with either:
  --overwrite    to update the existing Lexicon.py file (default)
  --here         to create an version of Lexicon.py in the current directory
""")
        return

    generated_code = (
        "# generated with:\n"
        f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
        "\n"
        f"{generate_character_sets()}\n"
    )

    print("Reading file", Lexicon.__file__)
    # The generated section contains raw non-ASCII characters, so the file
    # must be read and written as UTF-8 regardless of the locale default.
    with open(Lexicon.__file__, 'r', encoding='utf-8') as f:
        # The capturing group keeps the marker lines in the result:
        # [before, BEGIN marker, old generated code, END marker, after]
        parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())

    if len(parts) not in (4, 5) or ' GENERATED CODE' not in parts[1] or ' GENERATED CODE' not in parts[3]:
        print("Warning: generated code section not found - code not inserted")
        return

    parts[2] = generated_code
    output = "".join(parts)

    if arg == "--here":
        outfile = "Lexicon.py"
    else:
        assert arg == "--overwrite"
        outfile = Lexicon.__file__

    print("Writing to file", outfile)
    with open(outfile, 'w', encoding='utf-8') as f:
        f.write(output)


# The easiest way to generate an appropriate character set is just to use the str.isidentifier method
# An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
@functools.lru_cache()
def get_start_characters_as_number():
    """Return the code points that are valid as the first character of an identifier.

    The result is cached; callers must not mutate the returned list.
    """
    # "+ 1" so that sys.maxunicode itself is tested as well.
    return [i for i in range(sys.maxunicode + 1) if str.isidentifier(chr(i))]


def get_continue_characters_as_number():
    """Return the code points that are valid after the first character of an identifier."""
    # Prefix with a known-good start character so only the continuation
    # position is being tested.
    return [i for i in range(sys.maxunicode + 1) if str.isidentifier('a' + chr(i))]


def get_continue_not_start_as_number():
    """Return, sorted, the code points valid as continuation but not as start characters."""
    start = set(get_start_characters_as_number())
    cont = set(get_continue_characters_as_number())
    assert start <= cont, \
        "We assume that all identifier start characters are also continuation characters."
    return sorted(cont - start)


def to_ranges(char_num_list):
    """Compress a collection of code points into single characters and closed ranges.

    Returns a pair of strings ``(single_chars, ranges)``:

    * ``single_chars`` holds every code point that has no adjacent neighbour
      in the input, one character each;
    * ``ranges`` holds two characters (first, last) for every run of two or
      more consecutive code points.

    Duplicates in the input are ignored and an empty input gives ``('', '')``.
    """
    char_num_list = sorted(set(char_num_list))
    single_chars = []
    ranges = []
    if not char_num_list:
        return '', ''

    def flush(first, last):
        # Record one finished run, either as a lone character or as a range.
        if first == last:
            single_chars.append(chr(first))
        else:
            ranges.append(chr(first) + chr(last))

    first_good_val = previous = char_num_list[0]
    for value in char_num_list[1:]:
        if value - 1 != previous:
            # discontinuous
            flush(first_good_val, previous)
            first_good_val = value
        previous = value
    # The last run has no following discontinuity to trigger it, so it has
    # to be emitted explicitly; otherwise the highest characters are lost.
    flush(first_good_val, previous)

    return ''.join(single_chars), ''.join(ranges)


def make_split_strings(chars, splitby=60, indent="    "):
    """Format ``chars`` as indented ``u"..."`` literals of at most ``splitby`` characters each.

    The literals are placed on separate lines so that, inside parentheses,
    Python concatenates them back into one string.  An empty ``chars`` gives
    a single ``u""`` so the parenthesised expression is still a string rather
    than an empty tuple.
    """
    lines = [f'u"{chars[i:i+splitby]}"' for i in range(0, len(chars), splitby)]
    if not lines:
        lines = ['u""']
    return indent + f"\n{indent}".join(lines)


def generate_character_sets():
    """Return the source text of the four character-set assignments.

    For each of ``unicode_start_ch`` and ``unicode_continuation_ch`` an
    ``_any`` (single characters) and a ``_range`` (range end points)
    assignment is produced.
    """
    declarations = []
    for char_type, char_generator in [
        ("unicode_start_ch", get_start_characters_as_number),
        ("unicode_continuation_ch", get_continue_not_start_as_number),
    ]:
        for set_type, chars in zip(("any", "range"), to_ranges(char_generator())):
            declarations.append(
                f"{char_type}_{set_type} = (\n"
                f"{make_split_strings(chars)}\n"
                f")\n"
            )

    return "".join(declarations)


if __name__ == "__main__":
    main()