bin/cython-generate-lexicon.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132

#!/usr/bin/env python3

#
#   Updates Cython's Lexicon.py with the unicode characters that are accepted as
#   identifiers. Should be run with the most recent version of Python possible
#   to ensure that Lexicon is as complete as possible.
#
#   Python3 only (it relies on str.isidentifier which is a Python 3 addition)
#
#   Run with either
#    --overwrite    to update the existing Lexicon.py file
#    --here         to create a copy of Lexicon.py in the current directory

import functools
import re
import os
import sys

# Make sure we import the right Cython
cythonpath, _ = os.path.split(os.path.realpath(__file__)) # bin directory
cythonpath, _ = os.path.split(cythonpath)
if os.path.exists(os.path.join(cythonpath, "Cython")):
    sys.path.insert(0, cythonpath)
    print("Found (and using) local cython directory")
# else we aren't in a development directory

from Cython.Compiler import Lexicon


def main():
    arg = '--overwrite'
    if len(sys.argv) == 2:
        arg = sys.argv[1]
    if len(sys.argv) > 2 or arg not in ['--overwrite','--here']:
        print("""Call the script with either:
  --overwrite    to update the existing Lexicon.py file (default)
  --here         to create an version of Lexicon.py in the current directory
""")
        return

    generated_code = (
        f"# generated with:\n"
        f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
        "\n"
        f"{generate_character_sets()}\n"
    )

    print("Reading file", Lexicon.__file__)
    with open(Lexicon.__file__, 'r') as f:
        parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())

    if len(parts) not in (4,5) or ' GENERATED CODE' not in parts[1] or ' GENERATED CODE' not in parts[3]:
        print("Warning: generated code section not found - code not inserted")
        return

    parts[2] = generated_code
    output = "".join(parts)

    if arg == "--here":
        outfile = "Lexicon.py"
    else:
        assert arg == "--overwrite"
        outfile = Lexicon.__file__

    print("Writing to file", outfile)
    with open(outfile, 'w') as f:
        f.write(output)


# The easiest way to generate an appropriate character set is just to use the str.isidentifier method
# An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
@functools.lru_cache()
def get_start_characters_as_number():
    return [ i for i in range(sys.maxunicode) if str.isidentifier(chr(i)) ]


def get_continue_characters_as_number():
    return [ i for i in range(sys.maxunicode) if str.isidentifier('a'+chr(i)) ]


def get_continue_not_start_as_number():
    start = get_start_characters_as_number()
    cont = get_continue_characters_as_number()
    assert set(start) <= set(cont), \
        "We assume that all identifier start characters are also continuation characters."
    return sorted(set(cont).difference(start))


def to_ranges(char_num_list):
    # Convert the large lists of character digits to
    #  list of characters
    #  a list pairs of characters representing closed ranges
    char_num_list = sorted(char_num_list)
    first_good_val = char_num_list[0]

    single_chars = []
    ranges = []
    for n in range(1, len(char_num_list)):
        if char_num_list[n]-1 != char_num_list[n-1]:
            # discontinuous
            if first_good_val == char_num_list[n-1]:
                single_chars.append(chr(char_num_list[n-1]))
            else:
                ranges.append(chr(first_good_val) + chr(char_num_list[n-1]))
            first_good_val = char_num_list[n]

    return ''.join(single_chars), ''.join(ranges)


def make_split_strings(chars, splitby=60, indent="    "):
    lines = [f'u"{chars[i:i+splitby]}"' for i in range(0, len(chars), splitby)]
    return indent + f"\n{indent}".join(lines)


def generate_character_sets():
    declarations = []
    for char_type, char_generator in [
        ("unicode_start_ch", get_start_characters_as_number),
        ("unicode_continuation_ch", get_continue_not_start_as_number),
    ]:
        for set_type, chars in zip(("any", "range"), to_ranges(char_generator())):
            declarations.append(
                f"{char_type}_{set_type} = (\n"
                f"{make_split_strings(chars)}\n"
                f")\n"
            )

    return "".join(declarations)


if __name__ == "__main__":
    main()