summary | refs | log | tree | commit | diff
path: root/reader.py
diff options
context:
space:
mode:
author: Marcel Bargull <marcel.bargull@udo.edu> 2018-03-16 19:53:59 +0100
committer: Marcel Bargull <marcel.bargull@udo.edu> 2018-03-16 19:53:59 +0100
commit: 185d4253e5ece454aef0446dd6372be04656ad62 (patch)
tree: 632e69d095ec0bf72b5cd73e28393157af0218c3 /reader.py
parent: 541b5e265d0dfa41176e60483f453ae3013b24e5 (diff)
download: ruamel.yaml-185d4253e5ece454aef0446dd6372be04656ad62.tar.gz
str.translate slow for non-ASCII; use bytes.translate/regex
Diffstat (limited to 'reader.py')
-rw-r--r-- reader.py 46
1 files changed, 32 insertions, 14 deletions
diff --git a/reader.py b/reader.py
index a045514..ab2e8b9 100644
--- a/reader.py
+++ b/reader.py
@@ -184,28 +184,46 @@ class Reader(object):
try:
re.compile(u'[^\U00010000]')
except:
+ NON_PRINTABLE = re.compile(
+ u'[^\x09\x0A\x0D\x20-\x7E\x85'
+ u'\xA0-\uD7FF'
+ u'\uE000-\uFFFD'
+ u']'
+ )
UNICODE_SIZE = 2
else:
+ NON_PRINTABLE = RegExp(
+ u'[^\x09\x0A\x0D\x20-\x7E\x85'
+ u'\xA0-\uD7FF'
+ u'\uE000-\uFFFD'
+ u'\U00010000-\U0010FFFF'
+ u']'
+ )
UNICODE_SIZE = 4
- class _NonPrintable:
- def __getitem__(self, i):
- if (
- 0x20 <= i <= 0x7E or
- i in {0x09, 0x0A, 0x0D, 0x85} or
- 0xA0 <= i <= 0xD7FF or
- 0xE000 <= i <= 0xFFFD or
- 0x00010000 <= i <= 0x0010FFFF):
- return None
- return i
-
@classmethod
- def _get_non_printable(cls, data):
- non_printables = data.translate(cls._NonPrintable())
+ def _get_non_printable_ascii(
+ cls, data, printable=b'\x09\x0A\x0D' + bytes(range(0x20, 0x7E+1))):
+ ascii_bytes = data.encode('ascii')
+ non_printables = ascii_bytes.translate(None, printable)
if not non_printables:
return None
non_printable = non_printables[:1]
- return data.index(non_printable), non_printable
+ return ascii_bytes.index(non_printable), non_printable.decode('ascii')
+
+ @classmethod
+ def _get_non_printable_regex(cls, data):
+ match = cls.NON_PRINTABLE.search(data)
+ if not bool(match):
+ return None
+ return match.start(), match.group()
+
+ @classmethod
+ def _get_non_printable(cls, data):
+ try:
+ return cls._get_non_printable_ascii(data)
+ except UnicodeEncodeError:
+ return cls._get_non_printable_regex(data)
def check_printable(self, data):
# type: (Any) -> None