summaryrefslogtreecommitdiff
path: root/reader.py
diff options
context:
space:
mode:
Diffstat (limited to 'reader.py')
-rw-r--r--reader.py53
1 files changed, 42 insertions, 11 deletions
diff --git a/reader.py b/reader.py
index 9ad2b5f..c454925 100644
--- a/reader.py
+++ b/reader.py
@@ -25,9 +25,10 @@ import re
from ruamel.yaml.error import YAMLError, FileMark, StringMark, YAMLStreamError
from ruamel.yaml.compat import text_type, binary_type, PY3
+from ruamel.yaml.util import RegExp
if False: # MYPY
- from typing import Any, Dict, Optional, List, Union, Text # NOQA
+ from typing import Any, Dict, Optional, List, Union, Text, Tuple # NOQA
from ruamel.yaml.compat import StreamTextType # NOQA
__all__ = ['Reader', 'ReaderError']
@@ -181,29 +182,59 @@ class Reader(object):
# 4 if 32 bit unicode supported, 2 e.g. on MacOS (issue 56)
try:
- NON_PRINTABLE = re.compile(
+ re.compile(u'[^\U00010000]')
+ except:
+ NON_PRINTABLE = RegExp(
u'[^\x09\x0A\x0D\x20-\x7E\x85'
u'\xA0-\uD7FF'
u'\uE000-\uFFFD'
- u'\U00010000-\U0010FFFF'
u']'
)
- UNICODE_SIZE = 4
- except:
- NON_PRINTABLE = re.compile(
+ UNICODE_SIZE = 2
+ else:
+ NON_PRINTABLE = RegExp(
u'[^\x09\x0A\x0D\x20-\x7E\x85'
u'\xA0-\uD7FF'
u'\uE000-\uFFFD'
+ u'\U00010000-\U0010FFFF'
u']'
)
- UNICODE_SIZE = 2
+ UNICODE_SIZE = 4
+
+ _printable_ascii = ('\x09\x0A\x0D' + ''.join(map(chr, range(0x20, 0x7F)))).encode('ascii')
+
+ @classmethod
+ def _get_non_printable_ascii(cls, data):
+ # type: (Text, bytes) -> Union[None, Tuple[int, Text]]
+ ascii_bytes = data.encode('ascii')
+ non_printables = ascii_bytes.translate(None, cls._printable_ascii)
+ if not non_printables:
+ return None
+ non_printable = non_printables[:1]
+ return ascii_bytes.index(non_printable), non_printable.decode('ascii')
+
+ @classmethod
+ def _get_non_printable_regex(cls, data):
+ # type: (Text) -> Union[None, Tuple[int, Text]]
+ match = cls.NON_PRINTABLE.search(data)
+ if not bool(match):
+ return None
+ return match.start(), match.group()
+
+ @classmethod
+ def _get_non_printable(cls, data):
+ # type: (Text) -> Union[None, Tuple[int, Text]]
+ try:
+ return cls._get_non_printable_ascii(data)
+ except UnicodeEncodeError:
+ return cls._get_non_printable_regex(data)
def check_printable(self, data):
# type: (Any) -> None
- match = self.NON_PRINTABLE.search(data)
- if bool(match):
- character = match.group()
- position = self.index + (len(self.buffer) - self.pointer) + match.start()
+ non_printable_match = self._get_non_printable(data)
+ if non_printable_match is not None:
+ start, character = non_printable_match
+ position = self.index + (len(self.buffer) - self.pointer) + start
raise ReaderError(self.name, position, ord(character),
'unicode', "special characters are not allowed")