diff options
author | Marcel Bargull <marcel.bargull@udo.edu> | 2018-03-16 19:53:59 +0100 |
---|---|---|
committer | Marcel Bargull <marcel.bargull@udo.edu> | 2018-03-16 19:53:59 +0100 |
commit | 185d4253e5ece454aef0446dd6372be04656ad62 (patch) | |
tree | 632e69d095ec0bf72b5cd73e28393157af0218c3 /reader.py | |
parent | 541b5e265d0dfa41176e60483f453ae3013b24e5 (diff) | |
download | ruamel.yaml-185d4253e5ece454aef0446dd6372be04656ad62.tar.gz |
str.translate slow for non-ASCII; use bytes.translate/regex
Diffstat (limited to 'reader.py')
-rw-r--r-- | reader.py | 46 |
1 files changed, 32 insertions, 14 deletions
@@ -184,28 +184,46 @@ class Reader(object): try: re.compile(u'[^\U00010000]') except: + NON_PRINTABLE = re.compile( + u'[^\x09\x0A\x0D\x20-\x7E\x85' + u'\xA0-\uD7FF' + u'\uE000-\uFFFD' + u']' + ) UNICODE_SIZE = 2 else: + NON_PRINTABLE = RegExp( + u'[^\x09\x0A\x0D\x20-\x7E\x85' + u'\xA0-\uD7FF' + u'\uE000-\uFFFD' + u'\U00010000-\U0010FFFF' + u']' + ) UNICODE_SIZE = 4 - class _NonPrintable: - def __getitem__(self, i): - if ( - 0x20 <= i <= 0x7E or - i in {0x09, 0x0A, 0x0D, 0x85} or - 0xA0 <= i <= 0xD7FF or - 0xE000 <= i <= 0xFFFD or - 0x00010000 <= i <= 0x0010FFFF): - return None - return i - @classmethod - def _get_non_printable(cls, data): - non_printables = data.translate(cls._NonPrintable()) + def _get_non_printable_ascii( + cls, data, printable=b'\x09\x0A\x0D' + bytes(range(0x20, 0x7E+1))): + ascii_bytes = data.encode('ascii') + non_printables = ascii_bytes.translate(None, printable) if not non_printables: return None non_printable = non_printables[:1] - return data.index(non_printable), non_printable + return ascii_bytes.index(non_printable), non_printable.decode('ascii') + + @classmethod + def _get_non_printable_regex(cls, data): + match = cls.NON_PRINTABLE.search(data) + if not bool(match): + return None + return match.start(), match.group() + + @classmethod + def _get_non_printable(cls, data): + try: + return cls._get_non_printable_ascii(data) + except UnicodeEncodeError: + return cls._get_non_printable_regex(data) def check_printable(self, data): # type: (Any) -> None |