From f3cbb5a5d930580400502c45934884c24fa21f3d Mon Sep 17 00:00:00 2001 From: xi Date: Wed, 15 Feb 2006 21:57:37 +0000 Subject: Stream and Marker are cleaned up. git-svn-id: http://svn.pyyaml.org/branches/pyyaml3000@45 18f92427-320e-0410-9341-c67f048884a3 --- lib/yaml/error.py | 4 + lib/yaml/scanner.py | 59 ++++---- lib/yaml/stream.py | 226 ++++++++++++++++++++++++++++-- tests/data/invalid-character.stream-error | Bin 0 -> 2209 bytes tests/data/invalid-utf8-byte.stream-error | 18 +++ tests/data/odd-utf16.stream-error | Bin 0 -> 2463 bytes tests/test_appliance.py | 16 ++- tests/test_marker.py | 25 ++-- tests/test_stream.py | 31 ++++ tests/test_yaml.py | 1 + 10 files changed, 322 insertions(+), 58 deletions(-) create mode 100644 lib/yaml/error.py create mode 100644 tests/data/invalid-character.stream-error create mode 100644 tests/data/invalid-utf8-byte.stream-error create mode 100644 tests/data/odd-utf16.stream-error create mode 100644 tests/test_stream.py diff --git a/lib/yaml/error.py b/lib/yaml/error.py new file mode 100644 index 0000000..536adad --- /dev/null +++ b/lib/yaml/error.py @@ -0,0 +1,4 @@ + +class YAMLError(Exception): + pass + diff --git a/lib/yaml/scanner.py b/lib/yaml/scanner.py index c8a8ed9..c30faee 100644 --- a/lib/yaml/scanner.py +++ b/lib/yaml/scanner.py @@ -124,7 +124,7 @@ class Scanner: # # Stream supports the following methods # self.stream.peek(k=1) # peek the next k characters - # self.stream.read(k=1) # read the next k characters and move the + # self.stream.forward(k=1) # read the next k characters and move the # # pointer self.stream = Stream(source, data) @@ -442,7 +442,7 @@ class Scanner: # Add DOCUMENT-START or DOCUMENT-END. start_marker = self.stream.get_marker() - self.stream.read(3) + self.stream.forward(3) end_marker = self.stream.get_marker() self.tokens.append(TokenClass(start_marker, end_marker)) @@ -465,7 +465,7 @@ class Scanner: # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START. 
start_marker = self.stream.get_marker() - self.stream.read() + self.stream.forward() end_marker = self.stream.get_marker() self.tokens.append(TokenClass(start_marker, end_marker)) @@ -488,7 +488,7 @@ class Scanner: # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END. start_marker = self.stream.get_marker() - self.stream.read() + self.stream.forward() end_marker = self.stream.get_marker() self.tokens.append(TokenClass(start_marker, end_marker)) @@ -514,7 +514,7 @@ class Scanner: # Add ENTRY. start_marker = self.stream.get_marker() - self.stream.read() + self.stream.forward() end_marker = self.stream.get_marker() self.tokens.append(EntryToken(start_marker, end_marker)) @@ -540,7 +540,7 @@ class Scanner: # Add KEY. start_marker = self.stream.get_marker() - self.stream.read() + self.stream.forward() end_marker = self.stream.get_marker() self.tokens.append(KeyToken(start_marker, end_marker)) @@ -576,7 +576,7 @@ class Scanner: # Add VALUE. start_marker = self.stream.get_marker() - self.stream.read() + self.stream.forward() end_marker = self.stream.get_marker() self.tokens.append(ValueToken(start_marker, end_marker)) @@ -727,12 +727,12 @@ class Scanner: found = False while not found: while self.stream.peek() == u' ': - self.stream.read() + self.stream.forward() if self.stream.peek() == u'#': while self.stream.peek() not in u'\r\n': - self.stream.read() + self.stream.forward() if self.stream.peek() in u'\r\n': - self.stream.read() + self.stream.forward() if not self.flow_level: self.allow_simple_key = True else: @@ -747,20 +747,20 @@ class Scanner: else: self.tokens.append(ReservedDirectiveToken('', marker, marker)) while self.stream.peek() not in u'\0\r\n': - self.stream.read() - self.stream.read() + self.stream.forward() + self.stream.forward() def scan_anchor(self, TokenClass): start_marker = self.stream.get_marker() while self.stream.peek() not in u'\0 \t\r\n,:': - self.stream.read() + self.stream.forward() end_marker = self.stream.get_marker() 
self.tokens.append(TokenClass('', start_marker, end_marker)) def scan_tag(self): start_marker = self.stream.get_marker() while self.stream.peek() not in u'\0 \t\r\n': - self.stream.read() + self.stream.forward() end_marker = self.stream.get_marker() self.tokens.append(TagToken('', start_marker, end_marker)) @@ -771,12 +771,12 @@ class Scanner: indent = 1 while True: while self.stream.peek() and self.stream.peek() and self.stream.peek() not in u'\0\r\n\x85\u2028\u2029': - self.stream.read() + self.stream.forward() if self.stream.peek() != u'\0': - self.stream.read() + self.stream.forward() count = 0 while count < indent and self.stream.peek() == u' ': - self.stream.read() + self.stream.forward() count += 1 if count < indent and self.stream.peek() not in u'#\r\n\x85\u2028\u2029': break @@ -784,15 +784,16 @@ class Scanner: def scan_flow_scalar(self, double): marker = self.stream.get_marker() - quote = self.stream.read() + quote = self.stream.peek() + self.stream.forward() while self.stream.peek() != quote: if double and self.stream.peek() == u'\\': - self.stream.read(2) + self.stream.forward(2) elif not double and self.stream.peek(3)[1:] == u'\'\'': - self.stream.read(3) + self.stream.forward(3) else: - self.stream.read(1) - self.stream.read(1) + self.stream.forward(1) + self.stream.forward(1) self.tokens.append(ScalarToken('', False, marker, marker)) def scan_plain(self): @@ -803,24 +804,24 @@ class Scanner: marker = self.stream.get_marker() while True: while self.stream.peek() == u' ': - self.stream.read() + self.stream.forward() space = True while self.stream.peek() not in u'\0\r\n?:,[]{}#' \ or (not space and self.stream.peek() == '#') \ or (not self.flow_level and self.stream.peek() in '?,[]{}') \ or (not self.flow_level and self.stream.peek() == ':' and self.stream.peek(2)[1] not in u' \0\r\n'): space = self.stream.peek() not in u' \t' - self.stream.read() + self.stream.forward() self.allow_simple_key = False if self.stream.peek() not in u'\r\n': break while 
self.stream.peek() in u'\r\n': - self.stream.read() + self.stream.forward() if not self.flow_level: self.allow_simple_key = True count = 0 while self.stream.peek() == u' ' and count < indent: - self.stream.read() + self.stream.forward() count += 1 if count < indent: break @@ -833,3 +834,9 @@ class Scanner: def fail(self, message): raise ScannerError(message) +#try: +# import psyco +# psyco.bind(Scanner) +#except ImportError: +# pass + diff --git a/lib/yaml/stream.py b/lib/yaml/stream.py index 47f72a2..644d815 100644 --- a/lib/yaml/stream.py +++ b/lib/yaml/stream.py @@ -1,31 +1,229 @@ +# This module contains abstractions for the input stream. You don't have to +# looks further, there are no pretty code. +# +# We define two classes here. +# +# Marker(source, line, column) +# It's just a record and its only use is producing nice error messages. +# Parser does not use it for any other purposes. +# +# Stream(source, data) +# Stream determines the encoding of `data` and converts it to unicode. +# Stream provides the following methods and attributes: +# stream.peek(length=1) - return the next `length` characters +# stream.forward(length=1) - move the current position to `length` characters. +# stream.index - the number of the current character. +# stream.line, stream.column - the line and the column of the current character. -from marker import Marker + +from error import YAMLError + +import codecs, re + +# Unfortunately, codec functions in Python 2.3 does not support the `finish` +# arguments, so we have to write our own wrappers. 
+ +try: + codecs.utf_8_decode('', 'strict', False) + from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode + +except TypeError: + + def utf_16_le_decode(data, errors, finish=False): + if not finish and len(data) % 2 == 1: + data = data[:-1] + return codecs.utf_16_le_decode(data, errors) + + def utf_16_be_decode(data, errors, finish=False): + if not finish and len(data) % 2 == 1: + data = data[:-1] + return codecs.utf_16_be_decode(data, errors) + + def utf_8_decode(data, errors, finish=False): + if not finish: + # We are trying to remove a possible incomplete multibyte character + # from the suffix of the data. + # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd. + # All further bytes are in the range 0x80 to 0xbf. + # UTF-8 encoded UCS characters may be up to six bytes long. + count = 0 + while count < 5 and count < len(data) \ + and '\x80' <= data[-count-1] <= '\xBF': + count += 1 + if count < 5 and count < len(data) \ + and '\xC0' <= data[-count-1] <= '\xFD': + data = data[:-count-1] + return codecs.utf_8_decode(data, errors) + +class Marker: + + def __init__(self, source, line, column, buffer, pointer): + self.source = source + self.line = line + self.column = column + self.buffer = buffer + self.pointer = pointer + + def get_snippet(self, max_length=79): + if self.buffer is None: + return None + head = '' + start = self.pointer + while start > 0 and self.buffer[start-1] not in u'\0\r\n\x85\u2028\u2029': + start -= 1 + if self.pointer-start > max_length/2-1: + head = ' ... ' + start += 5 + break + tail = '' + end = self.pointer + while end < len(self.buffer) and self.buffer[end] not in u'\0\r\n\x85\u2028\u2029': + end += 1 + if end-self.pointer > max_length/2-1: + tail = ' ... 
' + end -= 5 + break + snippet = self.buffer[start:end].encode('utf-8') + return head + snippet + tail + '\n' \ + + ' '*(self.pointer-start+len(head)) + '^' + '\n' + +class StreamError(YAMLError): + + def __init__(self, source, encoding, character, position, reason): + self.source = source + self.encoding = encoding + self.character = character + self.position = position + self.reason = reason + + def __str__(self): + if isinstance(self.character, str): + return "'%s' codec can't decode byte #x%02x: %s\n" \ + "\tin file '%s', position %d." \ + % (self.encoding, ord(self.character), self.reason, + self.source, self.position) + else: + return "unacceptable character #x%04x: %s\n" \ + "\tin file '%s', position %d." \ + % (ord(self.character), self.reason, + self.source, self.position) class Stream: + # Stream: + # - determines the data encoding and converts it to unicode, + # - checks if characters are in allowed range, + # - adds '\0' to the end. + + # Yeah, it's ugly and slow. def __init__(self, source, data): self.source = source - self.data = unicode(data, 'utf-8')+u'\0' + self.stream = None + self.stream_pointer = 0 + self.eof = True + self.buffer = u'' + self.pointer = 0 + self.raw_buffer = None + self.raw_decode = None self.index = 0 self.line = 0 self.column = 0 + if isinstance(data, unicode): + self.check_printable(data) + self.buffer = data+u'\0' + elif isinstance(data, str): + self.raw_buffer = data + self.determine_encoding() + else: + self.stream = data + self.eof = False + self.raw_buffer = '' + self.determine_encoding() - def peek(self, k=1): - return self.data[self.index:self.index+k] + def peek(self, length=1): + if self.pointer+length >= len(self.buffer): + self.update(length) + return self.buffer[self.pointer:self.pointer+length] - def read(self, k=1): - value = self.data[self.index:self.index+k] - for i in range(k): - if self.index >= len(self.data): - break - if self.data[self.index] in u'\r\n\x85\u2028\u2029': + def forward(self, length=1): + if 
self.pointer+length+1 >= len(self.buffer): + self.update(length+1) + for k in range(length): + ch = self.buffer[self.pointer] + self.pointer += 1 + self.index += 1 + if ch in u'\n\x85\u2028\u2029' \ + or (ch == u'\r' and self.buffer[self.pointer] != u'\n'): self.line += 1 self.column = 0 - else: + elif ch != u'\uFEFF': self.column += 1 - self.index += 1 - return value def get_marker(self): - return Marker(self.source, self.data, self.index, self.line, self.column) + if self.stream is None: + return Marker(self.source, self.line, self.column, + self.buffer, self.pointer) + else: + return Marker(self.source, self.line, self.column, None, None) + + def determine_encoding(self): + while not self.eof and len(self.raw_buffer) < 2: + self.update_raw() + if self.raw_buffer.startswith(codecs.BOM_UTF16_LE): + self.raw_decode = utf_16_le_decode + elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE): + self.raw_decode = utf_16_be_decode + else: + self.raw_decode = utf_8_decode + self.update(1) + + NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]') + def check_printable(self, data): + match = self.NON_PRINTABLE.search(data) + if match: + character = match.group() + position = self.index+(len(self.buffer)-self.pointer)+match.start() + raise StreamError(self.source, 'unicode', character, position, + "control characters are not allowed") + + def update(self, length): + if self.raw_buffer is None: + return + self.buffer = self.buffer[self.pointer:] + self.pointer = 0 + while len(self.buffer) < length: + if not self.eof: + self.update_raw() + try: + data, converted = self.raw_decode(self.raw_buffer, + 'strict', self.eof) + except UnicodeDecodeError, exc: + character = exc.object[exc.start] + if self.stream is not None: + position = self.stream_pointer-len(self.raw_buffer)+exc.start + else: + position = exc.start + raise StreamError(self.source, exc.encoding, + character, position, exc.reason) + self.check_printable(data) + self.buffer += data + 
self.raw_buffer = self.raw_buffer[converted:] + if self.eof: + self.buffer += u'\0' + self.raw_buffer = None + break + + def update_raw(self, size=1024): + data = self.stream.read(size) + if data: + self.raw_buffer += data + self.stream_pointer += len(data) + else: + self.eof = True + +#try: +# import psyco +# psyco.bind(Stream) +#except ImportError: +# pass diff --git a/tests/data/invalid-character.stream-error b/tests/data/invalid-character.stream-error new file mode 100644 index 0000000..03687b0 Binary files /dev/null and b/tests/data/invalid-character.stream-error differ diff --git a/tests/data/invalid-utf8-byte.stream-error b/tests/data/invalid-utf8-byte.stream-error new file mode 100644 index 0000000..15111c3 --- /dev/null +++ b/tests/data/invalid-utf8-byte.stream-error @@ -0,0 +1,18 @@ +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- 
+------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +Invalid byte ('\xFF'): ÿ <-- +------------------------------------------------------------------------------------------------------------------------------- diff --git a/tests/data/odd-utf16.stream-error b/tests/data/odd-utf16.stream-error new file mode 100644 index 0000000..37da060 Binary files /dev/null and b/tests/data/odd-utf16.stream-error differ diff --git a/tests/test_appliance.py b/tests/test_appliance.py index d113f16..0c7adee 100644 --- a/tests/test_appliance.py +++ b/tests/test_appliance.py @@ -5,15 +5,15 @@ class TestAppliance(unittest.TestCase): DATA = 'tests/data' - tests = {} + all_tests = {} for filename in os.listdir(DATA): if os.path.isfile(os.path.join(DATA, filename)): root, ext = os.path.splitext(filename) - tests.setdefault(root, []).append(ext) + all_tests.setdefault(root, []).append(ext) def add_tests(cls, method_name, *extensions): - for test in cls.tests: - available_extensions = cls.tests[test] + for test in cls.all_tests: + available_extensions = cls.all_tests[test] for ext in extensions: if ext not in available_extensions: break @@ -22,7 +22,13 @@ 
class TestAppliance(unittest.TestCase): def test_method(self, test=test, filenames=filenames): getattr(self, '_'+method_name)(test, *filenames) test = test.replace('-', '_') - test_method.__name__ = '%s_%s' % (method_name, test) + try: + test_method.__name__ = '%s_%s' % (method_name, test) + except TypeError: + import new + test_method = new.function(test_method.func_code, test_method.func_globals, + '%s_%s' % (method_name, test), test_method.func_defaults, + test_method.func_closure) setattr(cls, test_method.__name__, test_method) add_tests = classmethod(add_tests) diff --git a/tests/test_marker.py b/tests/test_marker.py index 4570098..3a98c35 100644 --- a/tests/test_marker.py +++ b/tests/test_marker.py @@ -1,7 +1,7 @@ import test_appliance -from yaml.marker import Marker +from yaml.stream import Marker class TestMarker(test_appliance.TestAppliance): @@ -18,18 +18,17 @@ class TestMarker(test_appliance.TestAppliance): else: column += 1 index += 1 - for str_type in [str, unicode]: - marker = Marker(test_name, str_type(input), index, line, column) - snippet = marker.get_snippet() - #print "INPUT:" - #print input - #print "SNIPPET:" - #print snippet - self.failUnless(isinstance(snippet, str)) - self.failUnlessEqual(snippet.count('\n'), 2) - data, pointer, dummy = snippet.split('\n') - self.failUnless(len(data) < 80) - self.failUnlessEqual(data[len(pointer)-1], '*') + marker = Marker(test_name, line, column, unicode(input), index) + snippet = marker.get_snippet() + #print "INPUT:" + #print input + #print "SNIPPET:" + #print snippet + self.failUnless(isinstance(snippet, str)) + self.failUnlessEqual(snippet.count('\n'), 2) + data, pointer, dummy = snippet.split('\n') + self.failUnless(len(data) < 80) + self.failUnlessEqual(data[len(pointer)-1], '*') TestMarker.add_tests('testMarkers', '.markers') diff --git a/tests/test_stream.py b/tests/test_stream.py new file mode 100644 index 0000000..706c6a9 --- /dev/null +++ b/tests/test_stream.py @@ -0,0 +1,31 @@ + +import 
test_appliance +from yaml.stream import Stream, StreamError + +class TestStreamErrors(test_appliance.TestAppliance): + + def _testStreamUnicodeErrors(self, test_name, stream_filename): + try: + data = unicode(file(stream_filename, 'rb').read(), 'utf-8') + except UnicodeDecodeError: + return + self.failUnlessRaises(StreamError, lambda: self._load(stream_filename, data)) + + def _testStreamStringErrors(self, test_name, stream_filename): + data = file(stream_filename, 'rb').read() + self.failUnlessRaises(StreamError, lambda: self._load(stream_filename, data)) + + def _testStreamFileErrors(self, test_name, stream_filename): + data = file(stream_filename, 'rb') + self.failUnlessRaises(StreamError, lambda: self._load(stream_filename, data)) + + def _load(self, stream_filename, data): + stream = Stream(stream_filename, data) + while stream.peek() != u'\0': + stream.forward() + +TestStreamErrors.add_tests('testStreamUnicodeErrors', '.stream-error') +TestStreamErrors.add_tests('testStreamStringErrors', '.stream-error') +TestStreamErrors.add_tests('testStreamFileErrors', '.stream-error') + + diff --git a/tests/test_yaml.py b/tests/test_yaml.py index c9be9dd..389ecd8 100644 --- a/tests/test_yaml.py +++ b/tests/test_yaml.py @@ -2,6 +2,7 @@ import unittest from test_marker import * +from test_stream import * from test_canonical import * from test_tokens import * from test_structure import * -- cgit v1.2.1