diff options
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r-- | Lib/sre_compile.py | 19 |
1 file changed, 9 insertions, 10 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 46eac9c070..b984a54b03 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -13,7 +13,6 @@
 import _sre, sys
 import sre_parse
 from sre_constants import *
-from _sre import MAXREPEAT
 
 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
 
@@ -277,10 +276,10 @@ def _mk_bitmap(bits):
 # set is constructed. Then, this bitmap is sliced into chunks of 256
 # characters, duplicate chunks are eliminated, and each chunk is
 # given a number. In the compiled expression, the charset is
-# represented by a 16-bit word sequence, consisting of one word for
-# the number of different chunks, a sequence of 256 bytes (128 words)
+# represented by a 32-bit word sequence, consisting of one word for
+# the number of different chunks, a sequence of 256 bytes (64 words)
 # of chunk numbers indexed by their original chunk position, and a
-# sequence of chunks (16 words each).
+# sequence of 256-bit chunks (8 words each).
 
 # Compression is normally good: in a typical charset, large ranges of
 # Unicode will be either completely excluded (e.g. if only cyrillic
@@ -293,9 +292,9 @@ def _mk_bitmap(bits):
 # less significant byte is a bit index in the chunk (just like the
 # CHARSET matching).
 
-# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
+# The BIGCHARSET opcode still supports only subsets
 # of the basic multilingual plane; an efficient representation
-# for all of UTF-16 has not yet been developed. This means,
+# for all of Unicode has not yet been developed. This means,
 # in particular, that negated charsets cannot be represented as
 # bigcharsets.
 
@@ -319,11 +318,13 @@ def _optimize_unicode(charset, fixup):
                 # XXX: could expand category
                 return charset # cannot compress
     except IndexError:
-        # non-BMP characters
+        # non-BMP characters; XXX now they should work
         return charset
     if negate:
         if sys.maxunicode != 65535:
             # XXX: negation does not work with big charsets
+            # XXX2: now they should work, but removing this will make the
+            # charmap 17 times bigger
             return charset
         for i in range(65536):
             charmap[i] = not charmap[i]
@@ -344,7 +345,7 @@ def _optimize_unicode(charset, fixup):
     else:
         code = 'I'
     # Convert block indices to byte array of 256 bytes
-    mapping = array.array('b', mapping).tobytes()
+    mapping = array.array('B', mapping).tobytes()
     # Convert byte array to word array
     mapping = array.array(code, mapping)
     assert mapping.itemsize == _sre.CODESIZE
@@ -356,8 +357,6 @@ def _optimize_unicode(charset, fixup):
 def _simple(av):
     # check if av is a "simple" operator
     lo, hi = av[2].getwidth()
-    if lo == 0 and hi == MAXREPEAT:
-        raise error("nothing to repeat")
     return lo == hi == 1 and av[2][0][0] != SUBPATTERN
 
 def _compile_info(code, pattern, flags):