author    xi <xi@18f92427-320e-0410-9341-c67f048884a3>  2006-02-15 21:57:37 +0000
committer xi <xi@18f92427-320e-0410-9341-c67f048884a3>  2006-02-15 21:57:37 +0000
commit    f3cbb5a5d930580400502c45934884c24fa21f3d (patch)
tree      a70255f8870360572163c798bb682df664750eb6
parent    da5f57d7b0fe6c1b333fe863f1c395a8f5443ad1 (diff)
download  pyyaml-f3cbb5a5d930580400502c45934884c24fa21f3d.tar.gz
Stream and Marker are cleaned up.
git-svn-id: http://svn.pyyaml.org/branches/pyyaml3000@45 18f92427-320e-0410-9341-c67f048884a3
-rw-r--r--  lib/yaml/error.py                            4
-rw-r--r--  lib/yaml/scanner.py                         59
-rw-r--r--  lib/yaml/stream.py                         226
-rw-r--r--  tests/data/invalid-character.stream-error  bin 0 -> 2209 bytes
-rw-r--r--  tests/data/invalid-utf8-byte.stream-error   18
-rw-r--r--  tests/data/odd-utf16.stream-error          bin 0 -> 2463 bytes
-rw-r--r--  tests/test_appliance.py                     16
-rw-r--r--  tests/test_marker.py                        25
-rw-r--r--  tests/test_stream.py                        31
-rw-r--r--  tests/test_yaml.py                           1
10 files changed, 322 insertions, 58 deletions
diff --git a/lib/yaml/error.py b/lib/yaml/error.py
new file mode 100644
index 0000000..536adad
--- /dev/null
+++ b/lib/yaml/error.py
@@ -0,0 +1,4 @@
+
+class YAMLError(Exception):
+ pass
+
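The new lib/yaml/error.py introduces a shared exception base class; StreamError in lib/yaml/stream.py (further down in this patch) derives from it. A minimal Python 2 sketch of how calling code could rely on that hierarchy (the driving loop mirrors the _load helper in tests/test_stream.py; the source name and sample bytes are made up):

    from yaml.error import YAMLError
    from yaml.stream import Stream

    def scan_all(source_name, data):
        # StreamError is a YAMLError, so one except clause covers both.
        try:
            stream = Stream(source_name, data)
            while stream.peek() != u'\0':
                stream.forward()
        except YAMLError, exc:
            print 'failed:', exc

    scan_all('<example>', '\xff')   # invalid UTF-8 byte -> StreamError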
diff --git a/lib/yaml/scanner.py b/lib/yaml/scanner.py
index c8a8ed9..c30faee 100644
--- a/lib/yaml/scanner.py
+++ b/lib/yaml/scanner.py
@@ -124,7 +124,7 @@ class Scanner:
#
# Stream supports the following methods
# self.stream.peek(k=1) # peek the next k characters
- # self.stream.read(k=1) # read the next k characters and move the
+ # self.stream.forward(k=1) # read the next k characters and move the
# # pointer
self.stream = Stream(source, data)
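The scanner now treats the stream as peek-then-advance: peek(k) looks at the next k characters without consuming them, and forward(k) moves past them while tracking index, line and column. A small Python 2 sketch of that contract, assuming the Stream class added in lib/yaml/stream.py by this patch:

    from yaml.stream import Stream

    stream = Stream('<example>', u'key: value\n')
    while stream.peek() != u'\0':      # peek() never consumes input
        ch = stream.peek()
        stream.forward()               # forward() updates index, line, column
    print stream.index, stream.line, stream.column   # -> 11 1 0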
@@ -442,7 +442,7 @@ class Scanner:
# Add DOCUMENT-START or DOCUMENT-END.
start_marker = self.stream.get_marker()
- self.stream.read(3)
+ self.stream.forward(3)
end_marker = self.stream.get_marker()
self.tokens.append(TokenClass(start_marker, end_marker))
@@ -465,7 +465,7 @@ class Scanner:
# Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
start_marker = self.stream.get_marker()
- self.stream.read()
+ self.stream.forward()
end_marker = self.stream.get_marker()
self.tokens.append(TokenClass(start_marker, end_marker))
@@ -488,7 +488,7 @@ class Scanner:
# Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
start_marker = self.stream.get_marker()
- self.stream.read()
+ self.stream.forward()
end_marker = self.stream.get_marker()
self.tokens.append(TokenClass(start_marker, end_marker))
@@ -514,7 +514,7 @@ class Scanner:
# Add ENTRY.
start_marker = self.stream.get_marker()
- self.stream.read()
+ self.stream.forward()
end_marker = self.stream.get_marker()
self.tokens.append(EntryToken(start_marker, end_marker))
@@ -540,7 +540,7 @@ class Scanner:
# Add KEY.
start_marker = self.stream.get_marker()
- self.stream.read()
+ self.stream.forward()
end_marker = self.stream.get_marker()
self.tokens.append(KeyToken(start_marker, end_marker))
@@ -576,7 +576,7 @@ class Scanner:
# Add VALUE.
start_marker = self.stream.get_marker()
- self.stream.read()
+ self.stream.forward()
end_marker = self.stream.get_marker()
self.tokens.append(ValueToken(start_marker, end_marker))
@@ -727,12 +727,12 @@ class Scanner:
found = False
while not found:
while self.stream.peek() == u' ':
- self.stream.read()
+ self.stream.forward()
if self.stream.peek() == u'#':
while self.stream.peek() not in u'\r\n':
- self.stream.read()
+ self.stream.forward()
if self.stream.peek() in u'\r\n':
- self.stream.read()
+ self.stream.forward()
if not self.flow_level:
self.allow_simple_key = True
else:
@@ -747,20 +747,20 @@ class Scanner:
else:
self.tokens.append(ReservedDirectiveToken('', marker, marker))
while self.stream.peek() not in u'\0\r\n':
- self.stream.read()
- self.stream.read()
+ self.stream.forward()
+ self.stream.forward()
def scan_anchor(self, TokenClass):
start_marker = self.stream.get_marker()
while self.stream.peek() not in u'\0 \t\r\n,:':
- self.stream.read()
+ self.stream.forward()
end_marker = self.stream.get_marker()
self.tokens.append(TokenClass('', start_marker, end_marker))
def scan_tag(self):
start_marker = self.stream.get_marker()
while self.stream.peek() not in u'\0 \t\r\n':
- self.stream.read()
+ self.stream.forward()
end_marker = self.stream.get_marker()
self.tokens.append(TagToken('', start_marker, end_marker))
@@ -771,12 +771,12 @@ class Scanner:
indent = 1
while True:
while self.stream.peek() and self.stream.peek() and self.stream.peek() not in u'\0\r\n\x85\u2028\u2029':
- self.stream.read()
+ self.stream.forward()
if self.stream.peek() != u'\0':
- self.stream.read()
+ self.stream.forward()
count = 0
while count < indent and self.stream.peek() == u' ':
- self.stream.read()
+ self.stream.forward()
count += 1
if count < indent and self.stream.peek() not in u'#\r\n\x85\u2028\u2029':
break
@@ -784,15 +784,16 @@ class Scanner:
def scan_flow_scalar(self, double):
marker = self.stream.get_marker()
- quote = self.stream.read()
+ quote = self.stream.peek()
+ self.stream.forward()
while self.stream.peek() != quote:
if double and self.stream.peek() == u'\\':
- self.stream.read(2)
+ self.stream.forward(2)
elif not double and self.stream.peek(3)[1:] == u'\'\'':
- self.stream.read(3)
+ self.stream.forward(3)
else:
- self.stream.read(1)
- self.stream.read(1)
+ self.stream.forward(1)
+ self.stream.forward(1)
self.tokens.append(ScalarToken('', False, marker, marker))
def scan_plain(self):
@@ -803,24 +804,24 @@ class Scanner:
marker = self.stream.get_marker()
while True:
while self.stream.peek() == u' ':
- self.stream.read()
+ self.stream.forward()
space = True
while self.stream.peek() not in u'\0\r\n?:,[]{}#' \
or (not space and self.stream.peek() == '#') \
or (not self.flow_level and self.stream.peek() in '?,[]{}') \
or (not self.flow_level and self.stream.peek() == ':' and self.stream.peek(2)[1] not in u' \0\r\n'):
space = self.stream.peek() not in u' \t'
- self.stream.read()
+ self.stream.forward()
self.allow_simple_key = False
if self.stream.peek() not in u'\r\n':
break
while self.stream.peek() in u'\r\n':
- self.stream.read()
+ self.stream.forward()
if not self.flow_level:
self.allow_simple_key = True
count = 0
while self.stream.peek() == u' ' and count < indent:
- self.stream.read()
+ self.stream.forward()
count += 1
if count < indent:
break
@@ -833,3 +834,9 @@ class Scanner:
def fail(self, message):
raise ScannerError(message)
+#try:
+# import psyco
+# psyco.bind(Scanner)
+#except ImportError:
+# pass
+
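Most call sites in the scanner change mechanically from read(k) to forward(k); the only place that used read()'s return value, scan_flow_scalar, now peeks first and then advances. The same pattern as a stand-alone Python 2 sketch (the input is invented):

    from yaml.stream import Stream

    stream = Stream('<example>', u'"quoted scalar"')
    # old: quote = self.stream.read()   -- consume and capture in one call
    # new: capture with peek(), then consume with forward()
    quote = stream.peek()    # u'"'
    stream.forward()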
diff --git a/lib/yaml/stream.py b/lib/yaml/stream.py
index 47f72a2..644d815 100644
--- a/lib/yaml/stream.py
+++ b/lib/yaml/stream.py
@@ -1,31 +1,229 @@
+# This module contains abstractions for the input stream. You don't have to
+# look further, there is no pretty code here.
+#
+# We define two classes here.
+#
+# Marker(source, line, column)
+# It's just a record and its only use is producing nice error messages.
+# The parser does not use it for any other purpose.
+#
+# Stream(source, data)
+# Stream determines the encoding of `data` and converts it to unicode.
+# Stream provides the following methods and attributes:
+# stream.peek(length=1) - return the next `length` characters
+# stream.forward(length=1) - move the current position `length` characters forward.
+# stream.index - the number of the current character.
+# stream.line, stream.column - the line and the column of the current character.
-from marker import Marker
+
+from error import YAMLError
+
+import codecs, re
+
+# Unfortunately, the codec functions in Python 2.3 do not support the `finish`
+# argument, so we have to write our own wrappers.
+
+try:
+ codecs.utf_8_decode('', 'strict', False)
+ from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode
+
+except TypeError:
+
+ def utf_16_le_decode(data, errors, finish=False):
+ if not finish and len(data) % 2 == 1:
+ data = data[:-1]
+ return codecs.utf_16_le_decode(data, errors)
+
+ def utf_16_be_decode(data, errors, finish=False):
+ if not finish and len(data) % 2 == 1:
+ data = data[:-1]
+ return codecs.utf_16_be_decode(data, errors)
+
+ def utf_8_decode(data, errors, finish=False):
+ if not finish:
+ # We are trying to remove a possible incomplete multibyte character
+ # from the suffix of the data.
+ # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd.
+ # All further bytes are in the range 0x80 to 0xbf.
+ # UTF-8 encoded UCS characters may be up to six bytes long.
+ count = 0
+ while count < 5 and count < len(data) \
+ and '\x80' <= data[-count-1] <= '\xBF':
+ count += 1
+ if count < 5 and count < len(data) \
+ and '\xC0' <= data[-count-1] <= '\xFD':
+ data = data[:-count-1]
+ return codecs.utf_8_decode(data, errors)
+
+class Marker:
+
+ def __init__(self, source, line, column, buffer, pointer):
+ self.source = source
+ self.line = line
+ self.column = column
+ self.buffer = buffer
+ self.pointer = pointer
+
+ def get_snippet(self, max_length=79):
+ if self.buffer is None:
+ return None
+ head = ''
+ start = self.pointer
+ while start > 0 and self.buffer[start-1] not in u'\0\r\n\x85\u2028\u2029':
+ start -= 1
+ if self.pointer-start > max_length/2-1:
+ head = ' ... '
+ start += 5
+ break
+ tail = ''
+ end = self.pointer
+ while end < len(self.buffer) and self.buffer[end] not in u'\0\r\n\x85\u2028\u2029':
+ end += 1
+ if end-self.pointer > max_length/2-1:
+ tail = ' ... '
+ end -= 5
+ break
+ snippet = self.buffer[start:end].encode('utf-8')
+ return head + snippet + tail + '\n' \
+ + ' '*(self.pointer-start+len(head)) + '^' + '\n'
+
+class StreamError(YAMLError):
+
+ def __init__(self, source, encoding, character, position, reason):
+ self.source = source
+ self.encoding = encoding
+ self.character = character
+ self.position = position
+ self.reason = reason
+
+ def __str__(self):
+ if isinstance(self.character, str):
+ return "'%s' codec can't decode byte #x%02x: %s\n" \
+ "\tin file '%s', position %d." \
+ % (self.encoding, ord(self.character), self.reason,
+ self.source, self.position)
+ else:
+ return "unacceptable character #x%04x: %s\n" \
+ "\tin file '%s', position %d." \
+ % (ord(self.character), self.reason,
+ self.source, self.position)
class Stream:
+ # Stream:
+ # - determines the data encoding and converts it to unicode,
+ # - checks if characters are in allowed range,
+ # - adds '\0' to the end.
+
+ # Yeah, it's ugly and slow.
def __init__(self, source, data):
self.source = source
- self.data = unicode(data, 'utf-8')+u'\0'
+ self.stream = None
+ self.stream_pointer = 0
+ self.eof = True
+ self.buffer = u''
+ self.pointer = 0
+ self.raw_buffer = None
+ self.raw_decoder = None
self.index = 0
self.line = 0
self.column = 0
+ if isinstance(data, unicode):
+ self.check_printable(data)
+ self.buffer = data+u'\0'
+ elif isinstance(data, str):
+ self.raw_buffer = data
+ self.determine_encoding()
+ else:
+ self.stream = data
+ self.eof = False
+ self.raw_buffer = ''
+ self.determine_encoding()
- def peek(self, k=1):
- return self.data[self.index:self.index+k]
+ def peek(self, length=1):
+ if self.pointer+length >= len(self.buffer):
+ self.update(length)
+ return self.buffer[self.pointer:self.pointer+length]
- def read(self, k=1):
- value = self.data[self.index:self.index+k]
- for i in range(k):
- if self.index >= len(self.data):
- break
- if self.data[self.index] in u'\r\n\x85\u2028\u2029':
+ def forward(self, length=1):
+ if self.pointer+length+1 >= len(self.buffer):
+ self.update(length+1)
+ for k in range(length):
+ ch = self.buffer[self.pointer]
+ self.pointer += 1
+ self.index += 1
+ if ch in u'\n\x85\u2028\u2029' \
+ or (ch == u'\r' and self.buffer[self.pointer+1] != u'\n'):
self.line += 1
self.column = 0
- else:
+ elif ch != u'\uFEFF':
self.column += 1
- self.index += 1
- return value
def get_marker(self):
- return Marker(self.source, self.data, self.index, self.line, self.column)
+ if self.stream is None:
+ return Marker(self.source, self.line, self.column,
+ self.buffer, self.pointer)
+ else:
+ return Marker(self.source, self.line, self.column, None, None)
+
+ def determine_encoding(self):
+ while not self.eof and len(self.raw_buffer) < 2:
+ self.update_raw()
+ if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
+ self.raw_decode = utf_16_le_decode
+ elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
+ self.raw_decode = utf_16_be_decode
+ else:
+ self.raw_decode = utf_8_decode
+ self.update(1)
+
+ NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
+ def check_printable(self, data):
+ match = self.NON_PRINTABLE.search(data)
+ if match:
+ character = match.group()
+ position = self.index+(len(self.buffer)-self.pointer)+match.start()
+ raise StreamError(self.source, 'unicode', character, position,
+ "control characters are not allowed")
+
+ def update(self, length):
+ if self.raw_buffer is None:
+ return
+ self.buffer = self.buffer[self.pointer:]
+ self.pointer = 0
+ while len(self.buffer) < length:
+ if not self.eof:
+ self.update_raw()
+ try:
+ data, converted = self.raw_decode(self.raw_buffer,
+ 'strict', self.eof)
+ except UnicodeDecodeError, exc:
+ character = exc.object[exc.start]
+ if self.stream is not None:
+ position = self.stream_pointer-len(self.raw_buffer)+exc.start
+ else:
+ position = exc.start
+ raise StreamError(self.source, exc.encoding,
+ character, position, exc.reason)
+ self.check_printable(data)
+ self.buffer += data
+ self.raw_buffer = self.raw_buffer[converted:]
+ if self.eof:
+ self.buffer += u'\0'
+ self.raw_buffer = None
+ break
+
+ def update_raw(self, size=1024):
+ data = self.stream.read(size)
+ if data:
+ self.raw_buffer += data
+ self.stream_pointer += len(data)
+ else:
+ self.eof = True
+
+#try:
+# import psyco
+# psyco.bind(Stream)
+#except ImportError:
+# pass
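The rewritten Stream accepts already-decoded unicode, a plain byte string, or a file-like object; byte input is decoded as UTF-16 (when a BOM is found) or UTF-8, checked for non-printable characters, and terminated with u'\0'. A hedged Python 2 usage sketch (the file name is invented for illustration; file input is read in 1024-byte chunks by update_raw):

    from yaml.stream import Stream, StreamError

    s1 = Stream('<unicode>', u'- item\n')        # checked for printability only
    s2 = Stream('<bytes>', '- item\n')           # encoding from BOM, else UTF-8
    s3 = Stream('document.yaml', file('document.yaml', 'rb'))   # decoded incrementally

    try:
        while s3.peek() != u'\0':
            s3.forward()
    except StreamError, exc:
        # carries source, encoding, offending character, position and reason
        print exc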
diff --git a/tests/data/invalid-character.stream-error b/tests/data/invalid-character.stream-error
new file mode 100644
index 0000000..03687b0
--- /dev/null
+++ b/tests/data/invalid-character.stream-error
Binary files differ
diff --git a/tests/data/invalid-utf8-byte.stream-error b/tests/data/invalid-utf8-byte.stream-error
new file mode 100644
index 0000000..15111c3
--- /dev/null
+++ b/tests/data/invalid-utf8-byte.stream-error
@@ -0,0 +1,18 @@
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+Invalid byte ('\xFF'): ÿ <--
+-------------------------------------------------------------------------------------------------------------------------------
diff --git a/tests/data/odd-utf16.stream-error b/tests/data/odd-utf16.stream-error
new file mode 100644
index 0000000..37da060
--- /dev/null
+++ b/tests/data/odd-utf16.stream-error
Binary files differ
diff --git a/tests/test_appliance.py b/tests/test_appliance.py
index d113f16..0c7adee 100644
--- a/tests/test_appliance.py
+++ b/tests/test_appliance.py
@@ -5,15 +5,15 @@ class TestAppliance(unittest.TestCase):
DATA = 'tests/data'
- tests = {}
+ all_tests = {}
for filename in os.listdir(DATA):
if os.path.isfile(os.path.join(DATA, filename)):
root, ext = os.path.splitext(filename)
- tests.setdefault(root, []).append(ext)
+ all_tests.setdefault(root, []).append(ext)
def add_tests(cls, method_name, *extensions):
- for test in cls.tests:
- available_extensions = cls.tests[test]
+ for test in cls.all_tests:
+ available_extensions = cls.all_tests[test]
for ext in extensions:
if ext not in available_extensions:
break
@@ -22,7 +22,13 @@ class TestAppliance(unittest.TestCase):
def test_method(self, test=test, filenames=filenames):
getattr(self, '_'+method_name)(test, *filenames)
test = test.replace('-', '_')
- test_method.__name__ = '%s_%s' % (method_name, test)
+ try:
+ test_method.__name__ = '%s_%s' % (method_name, test)
+ except TypeError:
+ import new
+ test_method = new.function(test_method.func_code, test_method.func_globals,
+ '%s_%s' % (method_name, test), test_method.func_defaults,
+ test_method.func_closure)
setattr(cls, test_method.__name__, test_method)
add_tests = classmethod(add_tests)
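The test_appliance change works around Python 2.3, where a function's __name__ attribute is read-only: if plain assignment raises TypeError, the test method is rebuilt through the new module with the desired name. Roughly the same trick in isolation (Python 2 only; the target name is made up):

    import new

    def template(self):
        pass

    try:
        template.__name__ = 'test_stream_error_example'
    except TypeError:
        # Python 2.3: build a fresh function object carrying the new name
        template = new.function(template.func_code, template.func_globals,
                'test_stream_error_example', template.func_defaults,
                template.func_closure)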
diff --git a/tests/test_marker.py b/tests/test_marker.py
index 4570098..3a98c35 100644
--- a/tests/test_marker.py
+++ b/tests/test_marker.py
@@ -1,7 +1,7 @@
import test_appliance
-from yaml.marker import Marker
+from yaml.stream import Marker
class TestMarker(test_appliance.TestAppliance):
@@ -18,18 +18,17 @@ class TestMarker(test_appliance.TestAppliance):
else:
column += 1
index += 1
- for str_type in [str, unicode]:
- marker = Marker(test_name, str_type(input), index, line, column)
- snippet = marker.get_snippet()
- #print "INPUT:"
- #print input
- #print "SNIPPET:"
- #print snippet
- self.failUnless(isinstance(snippet, str))
- self.failUnlessEqual(snippet.count('\n'), 2)
- data, pointer, dummy = snippet.split('\n')
- self.failUnless(len(data) < 80)
- self.failUnlessEqual(data[len(pointer)-1], '*')
+ marker = Marker(test_name, line, column, unicode(input), index)
+ snippet = marker.get_snippet()
+ #print "INPUT:"
+ #print input
+ #print "SNIPPET:"
+ #print snippet
+ self.failUnless(isinstance(snippet, str))
+ self.failUnlessEqual(snippet.count('\n'), 2)
+ data, pointer, dummy = snippet.split('\n')
+ self.failUnless(len(data) < 80)
+ self.failUnlessEqual(data[len(pointer)-1], '*')
TestMarker.add_tests('testMarkers', '.markers')
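Marker's constructor order also changes to (source, line, column, buffer, pointer), and get_snippet() now renders the caret line itself. A tiny Python 2 illustration of the expected two-line snippet shape (the in-memory buffer is made up):

    from yaml.stream import Marker

    buffer = u'key: *alias\n'
    marker = Marker('<example>', 0, 5, buffer, 5)
    print marker.get_snippet()
    # key: *alias
    #      ^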
diff --git a/tests/test_stream.py b/tests/test_stream.py
new file mode 100644
index 0000000..706c6a9
--- /dev/null
+++ b/tests/test_stream.py
@@ -0,0 +1,31 @@
+
+import test_appliance
+from yaml.stream import Stream, StreamError
+
+class TestStreamErrors(test_appliance.TestAppliance):
+
+ def _testStreamUnicodeErrors(self, test_name, stream_filename):
+ try:
+ data = unicode(file(stream_filename, 'rb').read(), 'utf-8')
+ except:
+ return
+ self.failUnlessRaises(StreamError, lambda: self._load(stream_filename, data))
+
+ def _testStreamStringErrors(self, test_name, stream_filename):
+ data = file(stream_filename, 'rb').read()
+ self.failUnlessRaises(StreamError, lambda: self._load(stream_filename, data))
+
+ def _testStreamFileErrors(self, test_name, stream_filename):
+ data = file(stream_filename, 'rb')
+ self.failUnlessRaises(StreamError, lambda: self._load(stream_filename, data))
+
+ def _load(self, stream_filename, data):
+ stream = Stream(stream_filename, data)
+ while stream.peek() != u'\0':
+ stream.forward()
+
+TestStreamErrors.add_tests('testStreamUnicodeErrors', '.stream-error')
+TestStreamErrors.add_tests('testStreamStringErrors', '.stream-error')
+TestStreamErrors.add_tests('testStreamFileErrors', '.stream-error')
+
+
diff --git a/tests/test_yaml.py b/tests/test_yaml.py
index c9be9dd..389ecd8 100644
--- a/tests/test_yaml.py
+++ b/tests/test_yaml.py
@@ -2,6 +2,7 @@
import unittest
from test_marker import *
+from test_stream import *
from test_canonical import *
from test_tokens import *
from test_structure import *