Diffstat (limited to 'Lib/sre_compile.py')
 Lib/sre_compile.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 46eac9c070..b984a54b03 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -13,7 +13,6 @@
import _sre, sys
import sre_parse
from sre_constants import *
-from _sre import MAXREPEAT

assert _sre.MAGIC == MAGIC, "SRE module mismatch"
@@ -277,10 +276,10 @@ def _mk_bitmap(bits):
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
-# represented by a 16-bit word sequence, consisting of one word for
-# the number of different chunks, a sequence of 256 bytes (128 words)
+# represented by a 32-bit word sequence, consisting of one word for
+# the number of different chunks, a sequence of 256 bytes (64 words)
# of chunk numbers indexed by their original chunk position, and a
-# sequence of chunks (16 words each).
+# sequence of 256-bit chunks (8 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
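The word counts quoted in the updated comment can be sanity-checked from the sizes involved. A minimal sketch, assuming the 32-bit code word (_sre.CODESIZE == 4) that the comment now describes:

CODESIZE = 4                         # bytes per compiled code word
mapping_words = 256 // CODESIZE      # 256 one-byte chunk numbers
assert mapping_words == 64           # "a sequence of 256 bytes (64 words)"
chunk_words = 256 // (8 * CODESIZE)  # a chunk is one bit per character
assert chunk_words == 8              # "256-bit chunks (8 words each)"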
@@ -293,9 +292,9 @@ def _mk_bitmap(bits):
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

-# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
+# The BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
-# for all of UTF-16 has not yet been developed. This means,
+# for all of Unicode has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
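The byte split described above can be illustrated with a small stand-alone lookup. This is a sketch, not the engine's implementation; in_bigcharset, mapping, and chunks are hypothetical names, and each chunk is modeled as a 256-bit integer for brevity:

def in_bigcharset(codepoint, mapping, chunks):
    # Only BMP subsets are representable, per the limitation above.
    if codepoint > 0xFFFF:
        return False
    hi, lo = codepoint >> 8, codepoint & 0xFF  # more/less significant byte
    chunk = chunks[mapping[hi]]     # high byte selects a deduplicated chunk
    return bool(chunk & (1 << lo))  # low byte is a bit index in the chunk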
@@ -319,11 +318,13 @@ def _optimize_unicode(charset, fixup):
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
-        # non-BMP characters
+        # non-BMP characters; XXX now they should work
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
+            # XXX2: now they should work, but removing this will make the
+            # charmap 17 times bigger
            return charset
        for i in range(65536):
            charmap[i] = not charmap[i]
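The factor in the new XXX2 comment is straightforward arithmetic: extending the charmap from the 0x10000 entries of the BMP to all 0x110000 Unicode code points multiplies its size by exactly 17:

assert (0x10FFFF + 1) // 0x10000 == 17   # hence "17 times bigger"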
@@ -344,7 +345,7 @@ def _optimize_unicode(charset, fixup):
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
-    mapping = array.array('b', mapping).tobytes()
+    mapping = array.array('B', mapping).tobytes()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
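The 'b'-to-'B' typecode fix matters because chunk numbers are unsigned: a 65536-character charmap can yield up to 256 distinct chunks, and any chunk number above 127 overflows a signed byte. A minimal reproduction:

import array

array.array('B', [200])   # fine: unsigned byte holds 0..255
array.array('b', [200])   # raises OverflowError: signed byte tops out at 127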
@@ -356,8 +357,6 @@ def _optimize_unicode(charset, fixup):
def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
-    if lo == 0 and hi == MAXREPEAT:
-        raise error("nothing to repeat")
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _compile_info(code, pattern, flags):
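With the MAXREPEAT guard removed, _simple is purely a width test: a repeat target is "simple" when it always matches exactly one character and is not a group, which is what lets the compiler pick the cheaper single-character repeat opcodes. A sketch of the width half of that predicate, probing it through sre_parse (is_simple is a hypothetical helper; it omits the SUBPATTERN check for brevity):

import sre_parse

def is_simple(pattern):
    # getwidth() returns the (min, max) match width of the parsed pattern.
    lo, hi = sre_parse.parse(pattern).getwidth()
    return lo == hi == 1

print(is_simple("a"))      # True: a literal always matches one character
print(is_simple("[abc]"))  # True: a character class has width 1
print(is_simple("ab"))     # False: fixed width, but two characters
print(is_simple("a*"))     # False: variable width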