summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author xi <xi@18f92427-320e-0410-9341-c67f048884a3> 2006-02-18 23:52:18 +0000
committer xi <xi@18f92427-320e-0410-9341-c67f048884a3> 2006-02-18 23:52:18 +0000
commit 6070191305af2f626f202d33470e0f76cd0aaa1f (patch)
tree 865998b24c156e88888d6c5934874815f2629175
parent f43239e1ad0bf85c5b748e3934117af36603b2f8 (diff)
download pyyaml-6070191305af2f626f202d33470e0f76cd0aaa1f.tar.gz
Scanner is complete.
git-svn-id: http://svn.pyyaml.org/branches/pyyaml3000@48 18f92427-320e-0410-9341-c67f048884a3
-rw-r--r-- Makefile 2
-rw-r--r-- lib/yaml/reader.py 7
-rw-r--r-- lib/yaml/scanner.py 727
-rw-r--r-- lib/yaml/tokens.py 17
-rw-r--r-- tests/data/invalid-block-scalar-indicator.error-message 2
-rw-r--r-- tests/data/invalid-escape-character.error-message 1
-rw-r--r-- tests/data/invalid-escape-numbers.error-message 1
-rw-r--r-- tests/data/invalid-indentation-for-quoted-scalar.error-message 2
-rw-r--r-- tests/data/invalid-indentation-indicator-1.error-message 2
-rw-r--r-- tests/data/invalid-indentation-indicator-2.error-message 2
-rw-r--r-- tests/data/invalid-starting-character.error-message 1
-rw-r--r-- tests/data/spec-06-01.data 4
-rw-r--r-- tests/data/spec-06-06.canonical 2
-rw-r--r-- tests/data/spec-07-10.canonical 2
-rw-r--r-- tests/data/spec-08-08.canonical 2
-rw-r--r-- tests/data/spec-08-10.canonical 4
-rw-r--r-- tests/data/spec-09-02.canonical 2
-rw-r--r-- tests/data/spec-09-12.data 4
-rw-r--r-- tests/data/spec-09-16.data 4
-rw-r--r-- tests/data/spec-09-25.data 2
-rw-r--r-- tests/data/spec-09-26.canonical 2
-rw-r--r-- tests/data/spec-09-27.canonical 2
-rw-r--r-- tests/data/spec-09-28.canonical 2
-rw-r--r-- tests/data/spec-10-10.data 2
-rw-r--r-- tests/data/spec-10-14.canonical 2
-rw-r--r-- tests/data/unclosed-quoted-scalar.error-message 2
-rw-r--r-- tests/test_appliance.py 2
-rw-r--r-- tests/test_structure.py 40
-rw-r--r-- tests/test_tokens.py 4
29 files changed, 729 insertions, 119 deletions
diff --git a/Makefile b/Makefile
index f8207b6..ca53bd7 100644
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,7 @@ install: build
${PYTHON} setup.py install ${PARAMETERS}
test: build
- ${PYTHON} tests/test_build.py -v ${TEST}
+ ${PYTHON} tests/test_build.py ${TEST}
clean:
${PYTHON} setup.py clean -a
diff --git a/lib/yaml/reader.py b/lib/yaml/reader.py
index 4316e22..73838ee 100644
--- a/lib/yaml/reader.py
+++ b/lib/yaml/reader.py
@@ -158,7 +158,12 @@ class Reader:
self.raw_buffer = ''
self.determine_encoding()
- def peek(self, length=1):
+ def peek(self, index=0):
+ if self.pointer+index+1 >= len(self.buffer):
+ self.update(index+1)
+ return self.buffer[self.pointer+index]
+
+ def prefix(self, length=1):
if self.pointer+length >= len(self.buffer):
self.update(length)
return self.buffer[self.pointer:self.pointer+length]
diff --git a/lib/yaml/scanner.py b/lib/yaml/scanner.py
index a1785ef..c83a551 100644
--- a/lib/yaml/scanner.py
+++ b/lib/yaml/scanner.py
@@ -24,12 +24,11 @@ class ScannerError(YAMLError):
# key: "valu\?e"
# ^
def __init__(self, context=None, context_marker=None,
- problem=None, problem_marker=None, description=None):
+ problem=None, problem_marker=None):
self.context = context
self.context_marker = context_marker
self.problem = problem
self.problem_marker = problem_marker
- self.description = description
def __str__(self):
lines = []
@@ -39,8 +38,6 @@ class ScannerError(YAMLError):
lines.append(place)
if marker is not None:
lines.append(str(marker))
- if self.description is not None:
- lines.append(self.description)
return '\n'.join(lines)
class SimpleKey:
@@ -62,9 +59,10 @@ class Scanner:
# the end.
#
# Reader supports the following methods
- # self.reader.peek(k=1) # peek the next k characters
- # self.reader.forward(k=1) # read the next k characters and move the
- # # pointer
+ # self.reader.peek(i=0) # peek the next i-th character
+ # self.reader.prefix(l=1) # peek the next l characters
+ # self.reader.forward(l=1) # read the next l characters
+ # and move the pointer
self.reader = reader
# Had we reached the end of the stream?
@@ -161,9 +159,13 @@ class Scanner:
# Peek the next character.
ch = self.reader.peek()
- # Is it the end of reader?
+ # Is it the end of stream?
if ch == u'\0':
- return self.fetch_end()
+ return self.fetch_stream_end()
+
+ # Is it the byte order mark?
+ if ch == u'\uFEFF':
+ return self.fetch_bom()
# Is it a directive?
if ch == u'%' and self.check_directive():
@@ -240,7 +242,9 @@ class Scanner:
return self.fetch_plain()
# No? It's an error. Let's produce a nice error message.
- self.invalid_token()
+ raise ScannerError("while scanning for the next token", None,
+ "found character %r that cannot start any token"
+ % ch.encode('utf-8'), self.reader.get_marker())
# Simple keys treatment.
@@ -342,7 +346,7 @@ class Scanner:
# Fetchers.
- def fetch_end(self):
+ def fetch_stream_end(self):
# Set the current indentation to -1.
self.unwind_indent(-1)
@@ -360,6 +364,33 @@ class Scanner:
# The reader is ended.
self.done = True
+ def fetch_bom(self):
+ # We consider the BOM marker as a DOCUMENT-END indicator unless it's
+ # the first character in the stream. It's a reasonable approximation
+ # of the specification requirements. We can follow the specification
+ # literally, but it will require a new token class. Probably later.
+
+ # We ignore BOM if it is the first character in the stream.
+ if self.reader.index == 0:
+ slef.reader.forward()
+
+ # Otherwise we issue DOCUMENT-END.
+ else:
+
+ # Set the current indentation to -1.
+ self.unwind_indent(-1)
+
+ # Reset simple keys. Note that there could not be a block
+ # collection after BOM.
+ self.remove_possible_simple_key()
+ self.allow_simple_key = False
+
+ # Add DOCUMENT-END.
+ start_marker = self.reader.get_marker()
+ self.reader.forward()
+ end_marker = self.reader.get_marker()
+ self.tokens.append(DocumentEndToken(start_marker, end_marker))
+
def fetch_directive(self):
# Set the current indentation to -1.
@@ -637,8 +668,8 @@ class Scanner:
# DOCUMENT-START: ^ '---' (' '|'\n')
if self.reader.column == 0:
- prefix = self.reader.peek(4)
- if prefix[:3] == u'---' and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
+ if self.reader.prefix(3) == u'---' \
+ and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
return True
def check_document_end(self):
@@ -646,7 +677,8 @@ class Scanner:
# DOCUMENT-END: ^ '...' (' '|'\n')
if self.reader.column == 0:
prefix = self.reader.peek(4)
- if prefix[:3] == u'...' and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
+ if self.reader.prefix(3) == u'...' \
+ and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
return True
def check_entry(self):
@@ -657,8 +689,8 @@ class Scanner:
# ENTRY(block context): '-' (' '|'\n')
else:
- prefix = self.reader.peek(2)
- return prefix[0] == u'-' and prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
+ return self.reader.peek() == u'-' \
+ and self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
def check_key(self):
@@ -668,8 +700,7 @@ class Scanner:
# KEY(block context): '?' (' '|'\n')
else:
- prefix = self.reader.peek(2)
- return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
+ return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
def check_value(self):
@@ -679,11 +710,26 @@ class Scanner:
# VALUE(block context): ':' (' '|'\n')
else:
- prefix = self.reader.peek(2)
- return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
+ return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
def check_plain(self):
- return True
+
+ # A plain scalar may start with any non-space character except:
+ # '-', '?', ':', ',', '[', ']', '{', '}',
+ # '#', '&', '*', '!', '|', '>', '\'', '\"',
+ # '%', '@', '`'.
+ #
+ # It may also start with
+ # '-', '?', ':'
+ # if it is followed by a non-space character.
+ #
+ # Note that we limit the last rule to the block context (except the
+ # '-' character) because we want the flow context to be space
+ # independent.
+ ch = self.reader.peek()
+ return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
+ or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
+ and (ch == '-' or (not self.flow_level and ch in u'?:')))
# Scanners.
@@ -705,95 +751,615 @@ class Scanner:
found = True
def scan_directive(self):
- marker = self.reader.get_marker()
- if self.reader.peek(5) == u'%YAML ':
- token = YAMLDirectiveToken(1, 1, marker, marker)
- elif self.reader.peek(4) == u'%TAG ':
- token = TagDirectiveToken(marker, marker)
+ # See the specification for details.
+ start_marker = self.reader.get_marker()
+ self.reader.forward()
+ name = self.scan_directive_name(start_marker)
+ value = None
+ if name == u'YAML':
+ value = self.scan_yaml_directive_value(start_marker)
+ end_marker = self.reader.get_marker()
+ elif name == u'TAG':
+ value = self.scan_tag_directive_value(start_marker)
+ end_marker = self.reader.get_marker()
else:
- token = ReservedDirectiveToken('', marker, marker)
- while self.reader.peek() not in u'\0\r\n':
+ end_marker = self.reader.get_marker()
+ while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
+ self.reader.forward()
+ self.scan_directive_ignored_line(start_marker)
+ return DirectiveToken(name, value, start_marker, end_marker)
+
+ def scan_directive_name(self, start_marker):
+ # See the specification for details.
+ length = 0
+ ch = self.reader.peek(length)
+ while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
+ or ch in u'-_':
+ length += 1
+ ch = self.reader.peek(length)
+ if not length:
+ raise ScannerError("while scanning a directive", start_marker,
+ "expected directive name, but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ value = self.reader.prefix(length)
+ self.reader.forward(length)
+ ch = self.reader.peek()
+ if ch not in u'\0 \r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a directive" % name, start_marker,
+ "expected alphabetic or numeric character, but found %r"
+ % ch.encode('utf-8'), self.reader.get_marker())
+ return value
+
+ def scan_yaml_directive_value(self, start_marker):
+ # See the specification for details.
+ while self.reader.peek() == u' ':
self.reader.forward()
+ major = self.scan_yaml_directive_number(start_marker)
+ if self.reader.peek() != '.':
+ raise ScannerError("while scanning a directive", start_marker,
+ "expected a digit or '.', but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
self.reader.forward()
- return token
+ minor = self.scan_yaml_directive_number(start_marker)
+ if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a directive", start_marker,
+ "expected a digit or ' ', but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ return (major, minor)
+
+ def scan_yaml_directive_number(self, start_marker):
+ # See the specification for details.
+ ch = self.reader.peek()
+ if not (u'0' <= ch <= '9'):
+ raise ScannerError("while scanning a directive", start_marker,
+ "expected a digit, but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ length = 0
+ while u'0' <= self.reader.peek(length) <= u'9':
+ length += 1
+ value = int(self.reader.prefix(length))
+ self.reader.forward(length)
+ return value
+
+ def scan_tag_directive_value(self, start_marker):
+ # See the specification for details.
+ while self.reader.peek() == u' ':
+ self.reader.forward()
+ handle = self.scan_tag_directive_handle(start_marker)
+ while self.reader.peek() == u' ':
+ self.reader.forward()
+ prefix = self.scan_tag_directive_prefix(start_marker)
+ return (handle, prefix)
+
+ def scan_tag_directive_handle(self, start_marker):
+ # See the specification for details.
+ value = self.scan_tag_handle('directive', start_marker)
+ if self.reader.peek() != u' ':
+ raise ScannerError("while scanning a directive", start_marker,
+ "expected ' ', but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ return value
+
+ def scan_tag_directive_prefix(self, start_marker):
+ # See the specification for details.
+ value = self.scan_tag_uri('directive', start_marker)
+ ch = self.reader.peek()
+ if ch not in u'\0 \r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a directive", start_marker,
+ "expected ' ', but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ return value
+
+ def scan_directive_ignored_line(self, start_marker):
+ # See the specification for details.
+ while self.reader.peek() == u' ':
+ self.reader.forward()
+ if self.reader.peek() == u'#':
+ while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
+ self.reader.forward()
+ ch = self.reader.peek()
+ if ch not in u'\0\r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a directive", start_marker,
+ "expected a comment or a line break, but found %r"
+ % ch.encode('utf-8'), self.reader.get_marker())
+ self.scan_line_break()
def scan_anchor(self, TokenClass):
+ # The specification does not restrict characters for anchors and
+ # aliases. This may lead to problems, for instance, the document:
+ # [ *alias, value ]
+ # can be interpreted in two ways, as
+ # [ "value" ]
+ # and
+ # [ *alias , "value" ]
+ # Therefore we restrict anchor and alias names to ASCII letters, digits, '-' and '_'.
start_marker = self.reader.get_marker()
- while self.reader.peek() not in u'\0 \t\r\n,:':
- self.reader.forward()
+ indicator = self.reader.peek()
+ if indicator == '*':
+ name = 'alias'
+ else:
+ name = 'anchor'
+ self.reader.forward()
+ length = 0
+ ch = self.reader.peek(length)
+ while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
+ or ch in u'-_':
+ length += 1
+ ch = self.reader.peek(length)
+ if not length:
+ raise ScannerError("while scanning an %s" % name, start_marker,
+ "expected anchor name, but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ value = self.reader.prefix(length)
+ self.reader.forward(length)
+ ch = self.reader.peek()
+ if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
+ raise ScannerError("while scanning an %s" % name, start_marker,
+ "expected alphabetic or numeric character, but found %r"
+ % ch.encode('utf-8'), self.reader.get_marker())
end_marker = self.reader.get_marker()
- return TokenClass('', start_marker, end_marker)
+ return TokenClass(value, start_marker, end_marker)
def scan_tag(self):
+ # See the specification for details.
start_marker = self.reader.get_marker()
- while self.reader.peek() not in u'\0 \t\r\n':
+ ch = self.reader.peek(1)
+ if ch == u'<':
+ handle = None
+ self.reader.forward(2)
+ suffix = self.scan_tag_uri('tag', start_marker)
+ if self.reader.peek() != u'>':
+ raise ScannerError("while parsing a tag", start_marking,
+ "expected '>', but got %r" % self.reader.peek().encode('utf-8'),
+ self.reader.get_marker())
+ self.reader.forward()
+ elif ch in u'\0 \t\r\n\x85\u2028\u2029':
+ handle = None
+ suffix = u'!'
self.reader.forward()
+ else:
+ length = 1
+ use_handle = False
+ while ch not in u'\0 \r\n\x85\u2028\u2029':
+ if ch == u'!':
+ use_handle = True
+ break
+ length += 1
+ ch = self.reader.peek(length)
+ handle = u'!'
+ if use_handle:
+ handle = self.scan_tag_handle('tag', start_marker)
+ else:
+ handle = u'!'
+ self.reader.forward()
+ suffix = self.scan_tag_uri('tag', start_marker)
+ ch = self.reader.peek()
+ if ch not in u'\0 \r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a tag", start_marker,
+ "expected ' ', but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ value = (handle, suffix)
end_marker = self.reader.get_marker()
- return TagToken('', start_marker, end_marker)
+ return TagToken(value, start_marker, end_marker)
def scan_block_scalar(self, folded):
+ # See the specification for details.
+
+ chunks = []
start_marker = self.reader.get_marker()
- indent = self.indent+1
- if indent < 1:
- indent = 1
- while True:
- while self.reader.peek() and self.reader.peek() and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
+
+ # Scan the header.
+ self.reader.forward()
+ chomping, increment = self.scan_block_scalar_indicators(start_marker)
+ self.scan_block_scalar_ignored_line(start_marker)
+
+ # Determine the indentation level and go to the first non-empty line.
+ min_indent = self.indent+1
+ if min_indent < 1:
+ min_indent = 1
+ if increment is None:
+ breaks, max_indent, end_marker = self.scan_block_scalar_indentation()
+ indent = max(min_indent, max_indent)
+ else:
+ indent = min_indent+increment-1
+ breaks, end_marker = self.scan_block_scalar_breaks(indent)
+ line_break = u''
+
+ # Scan the inner part of the block scalar.
+ while self.reader.column == indent and self.reader.peek() != u'\0':
+ chunks.extend(breaks)
+ leading_non_space = self.reader.peek() not in u' \t'
+ length = 0
+ while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029':
+ length += 1
+ chunks.append(self.reader.prefix(length))
+ self.reader.forward(length)
+ line_break = self.scan_line_break()
+ breaks, end_marker = self.scan_block_scalar_breaks(indent)
+ if self.reader.column == indent and self.reader.peek() != u'\0':
+ # Unfortunately, folding rules are ambiguous.
+ #
+ # This is the folding according to the specification:
+ #
+ #if folded and line_break == u'\n' \
+ # and leading_non_space and self.reader.peek() not in u' \t':
+ # if not breaks:
+ # chunks.append(u' ')
+ #else:
+ # chunks.append(line_break)
+ #
+ # This is Clark Evans's interpretation (also in the spec
+ # examples):
+ #
+ if folded and line_break == u'\n':
+ if not breaks:
+ if self.reader.peek() not in ' \t':
+ chunks.append(u' ')
+ else:
+ chunks.append(line_break)
+ else:
+ chunks.append(line_break)
+ else:
+ break
+
+ # Chomp the tail.
+ if chomping is not False:
+ chunks.append(line_break)
+ if chomping is True:
+ chunks.extend(breaks)
+
+ # We are done.
+ return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
+
+ def scan_block_scalar_indicators(self, start_marker):
+ # See the specification for details.
+ chomping = None
+ increment = None
+ ch = self.reader.peek()
+ if ch in u'+-':
+ if ch == '+':
+ chomping = True
+ else:
+ chomping = False
+ self.reader.forward()
+ ch = self.reader.peek()
+ if ch in u'0123456789':
+ increment = int(ch)
+ if increment == 0:
+ raise ScannerError("while scanning a block scalar", start_marker,
+ "expected indentation indicator in the range 1-9, but found 0",
+ self.reader.get_marker())
self.reader.forward()
- if self.reader.peek() != u'\0':
+ elif ch in u'0123456789':
+ increment = int(ch)
+ if increment == 0:
+ raise ScannerError("while scanning a block scalar", start_marker,
+ "expected indentation indicator in the range 1-9, but found 0",
+ self.reader.get_marker())
+ self.reader.forward()
+ ch = self.reader.peek()
+ if ch in u'+-':
+ if ch == '+':
+ chomping = True
+ else:
+ chomping = False
self.reader.forward()
- count = 0
- while count < indent and self.reader.peek() == u' ':
+ ch = self.reader.peek()
+ if ch not in u'\0 \r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a block scalar", start_marker,
+ "expected chomping or indentation indicators, but found %r"
+ % ch.encode('utf-8'), self.reader.get_marker())
+ return chomping, increment
+
+ def scan_block_scalar_ignored_line(self, start_marker):
+ # See the specification for details.
+ while self.reader.peek() == u' ':
+ self.reader.forward()
+ if self.reader.peek() == u'#':
+ while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
self.reader.forward()
- count += 1
- if count < indent and self.reader.peek() not in u'#\r\n\x85\u2028\u2029':
- break
- return ScalarToken('', False, start_marker, start_marker)
+ ch = self.reader.peek()
+ if ch not in u'\0\r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a block scalar", start_marker,
+ "expected a comment or a line break, but found %r"
+ % ch.encode('utf-8'), self.reader.get_marker())
+ self.scan_line_break()
+
+ def scan_block_scalar_indentation(self):
+ # See the specification for details.
+ chunks = []
+ max_indent = 0
+ end_marker = self.reader.get_marker()
+ while self.reader.peek() in u' \r\n\x85\u2028\u2029':
+ if self.reader.peek() != u' ':
+ chunks.append(self.scan_line_break())
+ end_marker = self.reader.get_marker()
+ else:
+ self.reader.forward()
+ if self.reader.column > max_indent:
+ max_indent = self.reader.column
+ return chunks, max_indent, end_marker
+
+ def scan_block_scalar_breaks(self, indent):
+ # See the specification for details.
+ chunks = []
+ end_marker = self.reader.get_marker()
+ while self.reader.column < indent and self.reader.peek() == u' ':
+ self.reader.forward()
+ while self.reader.peek() in u'\r\n\x85\u2028\u2029':
+ chunks.append(self.scan_line_break())
+ end_marker = self.reader.get_marker()
+ while self.reader.column < indent and self.reader.peek() == u' ':
+ self.reader.forward()
+ return chunks, end_marker
def scan_flow_scalar(self, double):
- marker = self.reader.get_marker()
+ # See the specification for details.
+ chunks = []
+ start_marker = self.reader.get_marker()
+ indent = self.indent+1
+ if indent == 0:
+ indent = 1
quote = self.reader.peek()
self.reader.forward()
+ chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
while self.reader.peek() != quote:
- if double and self.reader.peek() == u'\\':
+ chunks.extend(self.scan_flow_scalar_spaces(double, indent, start_marker))
+ chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
+ self.reader.forward()
+ end_marker = self.reader.get_marker()
+ return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
+
+ ESCAPE_REPLACEMENTS = {
+ u'0': u'\0',
+ u'a': u'\x07',
+ u'b': u'\x08',
+ u't': u'\x09',
+ u'\t': u'\x09',
+ u'n': u'\x0A',
+ u'v': u'\x0B',
+ u'f': u'\x0C',
+ u'r': u'\x0D',
+ u'e': u'\x1B',
+ u' ': u'\x20',
+ u'\"': u'\"',
+ u'\\': u'\\',
+ u'N': u'\x85',
+ u'_': u'\xA0',
+ u'L': u'\u2028',
+ u'P': u'\u2029',
+ }
+
+ ESCAPE_CODES = {
+ u'x': 2,
+ u'u': 4,
+ u'U': 8,
+ }
+
+ def scan_flow_scalar_non_spaces(self, double, indent, start_marker):
+ # See the specification for details.
+ chunks = []
+ while True:
+ length = 0
+ while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
+ length += 1
+ if length:
+ chunks.append(self.reader.prefix(length))
+ self.reader.forward(length)
+ ch = self.reader.peek()
+ if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
+ chunks.append(u'\'')
self.reader.forward(2)
- elif not double and self.reader.peek(3)[1:] == u'\'\'':
- self.reader.forward(3)
+ elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
+ chunks.append(ch)
+ self.reader.forward()
+ elif double and ch == u'\\':
+ self.reader.forward()
+ ch = self.reader.peek()
+ if ch in self.ESCAPE_REPLACEMENTS:
+ chunks.append(self.ESCAPE_REPLACEMENTS[ch])
+ self.reader.forward()
+ elif ch in self.ESCAPE_CODES:
+ length = self.ESCAPE_CODES[ch]
+ self.reader.forward()
+ for k in range(length):
+ if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
+ raise ScannerError("while scanning a double-quoted scalar", start_marker,
+ "expected escape sequence of %d hexdecimal numbers, but found %r" %
+ (length, self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
+ code = int(self.reader.prefix(length), 16)
+ chunks.append(unichr(code))
+ self.reader.forward(length)
+ elif ch in u'\r\n\x85\u2028\u2029':
+ self.scan_line_break()
+ chunks.extend(self.scan_flow_scalar_breaks(double, indent, start_marker))
+ else:
+ raise ScannerError("while scanning a double-quoted scalar", start_marker,
+ "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_marker())
+ else:
+ return chunks
+
+ def scan_flow_scalar_spaces(self, double, indent, start_marker):
+ # See the specification for details.
+ chunks = []
+ length = 0
+ while self.reader.peek(length) in u' \t':
+ length += 1
+ whitespaces = self.reader.prefix(length)
+ self.reader.forward(length)
+ ch = self.reader.peek()
+ if ch == u'\0':
+ raise ScannerError("while scanning a quoted scalar", start_marker,
+ "found unexpected end of stream", self.reader.get_marker())
+ elif ch in u'\r\n\x85\u2028\u2029':
+ line_break = self.scan_line_break()
+ breaks = self.scan_flow_scalar_breaks(double, indent, start_marker)
+ if line_break != u'\n':
+ chunks.append(line_break)
+ elif not breaks:
+ chunks.append(u' ')
+ chunks.extend(breaks)
+ else:
+ chunks.append(whitespaces)
+ return chunks
+
+ def scan_flow_scalar_breaks(self, double, indent, start_marker):
+ # See the specification for details.
+ chunks = []
+ while True:
+ while self.reader.column < indent and self.reader.peek() == u' ':
+ self.reader.forward()
+ if self.reader.column < indent \
+ and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
+ s = 's'
+ if indent == 1:
+ s = ''
+ raise ScannerError("while scanning a quoted scalar", start_marker,
+ "expected %d space%s indentation, but found %r"
+ % (indent, s, self.reader.peek().encode('utf-8')),
+ self.reader.get_marker())
+ while self.reader.peek() in u' \t':
+ self.reader.forward()
+ if self.reader.peek() in u'\r\n\x85\u2028\u2029':
+ chunks.append(self.scan_line_break())
else:
- self.reader.forward(1)
- self.reader.forward(1)
- return ScalarToken('', False, marker, marker)
+ return chunks
def scan_plain(self):
+ # See the specification for details.
+ # We add an additional restriction for the flow context:
+ # plain scalars in the flow context cannot contain ':' and '?'.
+ # We also keep track of the `allow_simple_key` flag here.
+ chunks = []
+ start_marker = self.reader.get_marker()
+ end_marker = start_marker
indent = self.indent+1
- if indent < 1:
+ if indent == 0:
indent = 1
- space = False
- marker = self.reader.get_marker()
+ spaces = []
while True:
- while self.reader.peek() == u' ':
- self.reader.forward()
- space = True
- while self.reader.peek() not in u'\0\r\n?:,[]{}#' \
- or (not space and self.reader.peek() == '#') \
- or (not self.flow_level and self.reader.peek() in '?,[]{}') \
- or (not self.flow_level and self.reader.peek() == ':' and self.reader.peek(2)[1] not in u' \0\r\n'):
- space = self.reader.peek() not in u' \t'
- self.reader.forward()
- self.allow_simple_key = False
- if self.reader.peek() not in u'\r\n':
+ length = 0
+ if self.reader.peek() == u'#':
break
- while self.reader.peek() in u'\r\n':
- self.reader.forward()
- if not self.flow_level:
- self.allow_simple_key = True
- count = 0
- while self.reader.peek() == u' ' and count < indent:
- self.reader.forward()
- count += 1
- if count < indent:
+ while True:
+ ch = self.reader.peek(length)
+ if ch in u'\0 \t\r\n\x85\u2028\u2029' \
+ or (not self.flow_level and ch == u':' and
+ self.reader.peek(length+1) in u'\0 \t\r\n\x28\u2028\u2029') \
+ or (self.flow_level and ch in u',:?[]{}'):
+ break
+ length += 1
+ if length == 0:
+ break
+ self.allow_simple_key = False
+ chunks.extend(spaces)
+ chunks.append(self.reader.prefix(length))
+ self.reader.forward(length)
+ end_marker = self.reader.get_marker()
+ spaces = self.scan_plain_spaces(indent)
+ if not spaces or self.reader.peek() == u'#' \
+ or self.reader.column < indent:
break
- space = True
- return ScalarToken('', True, marker, marker)
+ return ScalarToken(u''.join(chunks), True, start_marker, end_marker)
+
+ def scan_plain_spaces(self, indent):
+ # See the specification for details.
+ # The specification is really confusing about tabs in plain scalars.
+ # We just forbid them completely. Do not use tabs in YAML!
+ chunks = []
+ length = 0
+ while self.reader.peek(length) in u' ':
+ length += 1
+ whitespaces = self.reader.prefix(length)
+ self.reader.forward(length)
+ ch = self.reader.peek()
+ if ch in u'\r\n\x85\u2028\u2029':
+ line_break = self.scan_line_break()
+ self.allow_simple_key = True
+ breaks = []
+ while self.reader.peek() in u' \r\n\x85\u2028\u2029':
+ if self.reader.peek() == ' ':
+ self.reader.forward()
+ else:
+ breaks.append(self.scan_line_break())
+ if line_break != u'\n':
+ chunks.append(line_break)
+ elif not breaks:
+ chunks.append(u' ')
+ chunks.extend(breaks)
+ elif whitespaces:
+ chunks.append(whitespaces)
+ return chunks
+
+ def scan_tag_handle(self, name, start_marker):
+ # See the specification for details.
+ # For some strange reason, the specification does not allow '_' in
+ # tag handles. I have allowed it anyway.
+ if self.reader.peek() != u'!':
+ raise ScannerError("while scanning a %s" % name, start_marker,
+ "expected '!', but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ length = 1
+ ch = self.reader.peek(length)
+ if ch != u' ':
+ while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
+ or ch in u'-_':
+ length += 1
+ ch = self.reader.peek(length)
+ if ch != u'!':
+ self.reader.forward(length)
+ raise ScannerError("while scanning a %s" % name, start_marker,
+ "expected '!', but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ length += 1
+ value = self.reader.prefix(length)
+ self.reader.forward(length)
+ return value
+
+ def scan_tag_uri(self, name, start_marker):
+ # See the specification for details.
+ # Note: we do not check if URI is well-formed.
+ chunks = []
+ length = 0
+ ch = self.reader.peek(length)
+ while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
+ or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
+ if ch == u'%':
+ chunks.append(self.reader.prefix(length))
+ self.reader.forward(length)
+ length = 0
+ chunks.append(self.scan_uri_escapes(name, start_marker))
+ else:
+ length += 1
+ ch = self.reader.peek(length)
+ if length:
+ chunks.append(self.reader.prefix(length))
+ self.reader.forward(length)
+ length = 0
+ if not chunks:
+ raise ScannerError("while parsing a %s" % name, start_marker,
+ "expected URI, but found %r" % ch.encode('utf-8'),
+ self.reader.get_marker())
+ return u''.join(chunks)
+
+ def scan_uri_escapes(self, name, start_marker):
+ # See the specification for details.
+ bytes = []
+ marker = self.reader.get_marker()
+ while self.reader.peek() == u'%':
+ self.reader.forward()
+ for k in range(2):
+ if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
+ raise ScannerError("while scanning a %s" % name, start_marker,
+ "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
+ (self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
+ bytes.append(chr(int(self.reader.prefix(2), 16)))
+ self.reader.forward(2)
+ try:
+ value = unicode(''.join(bytes), 'utf-8')
+ except UnicodeDecodeError, exc:
+ raise ScannerError("while scanning a %s" % name, start_marker, str(exc), marker)
+ return value
def scan_line_break(self):
# Transforms:
@@ -806,7 +1372,7 @@ class Scanner:
# default : ''
ch = self.reader.peek()
if ch in u'\r\n\x85':
- if self.reader.peek(2) == u'\r\n':
+ if self.reader.prefix(2) == u'\r\n':
self.forward(2)
else:
self.reader.forward()
@@ -816,9 +1382,6 @@ class Scanner:
return ch
return u''
- def invalid_token(self):
- self.fail("invalid token")
-
#try:
# import psyco
# psyco.bind(Scanner)
diff --git a/lib/yaml/tokens.py b/lib/yaml/tokens.py
index 275d4a3..85d0b30 100644
--- a/lib/yaml/tokens.py
+++ b/lib/yaml/tokens.py
@@ -6,22 +6,9 @@ class Token:
class DirectiveToken(Token):
code = '<directive>'
-
-class YAMLDirectiveToken(DirectiveToken):
- code = '<%YAML directive>'
- def __init__(self, major_version, minor_version, start_marker, end_marker):
- self.major_version = major_version
- self.minor_version = minor_version
- self.start_marker = start_marker
- self.end_marker = end_marker
-
-class TagDirectiveToken(DirectiveToken):
- code = '<%TAG directive>'
-
-class ReservedDirectiveToken(DirectiveToken):
- code = '<unknown directive>'
- def __init__(self, name, start_marker, end_marker):
+ def __init__(self, name, value, start_marker, end_marker):
self.name = name
+ self.value = value
self.start_marker = start_marker
self.end_marker = end_marker
diff --git a/tests/data/invalid-block-scalar-indicator.error-message b/tests/data/invalid-block-scalar-indicator.error-message
new file mode 100644
index 0000000..16a6db1
--- /dev/null
+++ b/tests/data/invalid-block-scalar-indicator.error-message
@@ -0,0 +1,2 @@
+--- > what is this? # a comment
+data
diff --git a/tests/data/invalid-escape-character.error-message b/tests/data/invalid-escape-character.error-message
new file mode 100644
index 0000000..a95ab76
--- /dev/null
+++ b/tests/data/invalid-escape-character.error-message
@@ -0,0 +1 @@
+"some escape characters are \ncorrect, but this one \?\nis not\n"
diff --git a/tests/data/invalid-escape-numbers.error-message b/tests/data/invalid-escape-numbers.error-message
new file mode 100644
index 0000000..614ec9f
--- /dev/null
+++ b/tests/data/invalid-escape-numbers.error-message
@@ -0,0 +1 @@
+"hm.... \u123?"
diff --git a/tests/data/invalid-indentation-for-quoted-scalar.error-message b/tests/data/invalid-indentation-for-quoted-scalar.error-message
new file mode 100644
index 0000000..b885db3
--- /dev/null
+++ b/tests/data/invalid-indentation-for-quoted-scalar.error-message
@@ -0,0 +1,2 @@
+test: "foo
+bar"
diff --git a/tests/data/invalid-indentation-indicator-1.error-message b/tests/data/invalid-indentation-indicator-1.error-message
new file mode 100644
index 0000000..a3cd12f
--- /dev/null
+++ b/tests/data/invalid-indentation-indicator-1.error-message
@@ -0,0 +1,2 @@
+--- >0 # not valid
+data
diff --git a/tests/data/invalid-indentation-indicator-2.error-message b/tests/data/invalid-indentation-indicator-2.error-message
new file mode 100644
index 0000000..eefb6ec
--- /dev/null
+++ b/tests/data/invalid-indentation-indicator-2.error-message
@@ -0,0 +1,2 @@
+--- >-0
+data
diff --git a/tests/data/invalid-starting-character.error-message b/tests/data/invalid-starting-character.error-message
new file mode 100644
index 0000000..bb81c60
--- /dev/null
+++ b/tests/data/invalid-starting-character.error-message
@@ -0,0 +1 @@
+@@@@@@@@@@@@@@@@@@@
diff --git a/tests/data/spec-06-01.data b/tests/data/spec-06-01.data
index b5496c1..6134ba1 100644
--- a/tests/data/spec-06-01.data
+++ b/tests/data/spec-06-01.data
@@ -8,5 +8,7 @@ Not indented:
Flow style: [ # Leading spaces
By two, # in flow style
Also by two, # are neither
- Still by two # content nor
+# Tabs are not allowed:
+# Still by two # content nor
+ Still by two # content nor
] # indentation.
diff --git a/tests/data/spec-06-06.canonical b/tests/data/spec-06-06.canonical
index 5e13dde..513d07a 100644
--- a/tests/data/spec-06-06.canonical
+++ b/tests/data/spec-06-06.canonical
@@ -6,5 +6,5 @@
? !!str "quoted"
: !!str "text lines",
? !!str "block"
- : !!str "text lines\n"
+ : !!str "text\n lines\n"
}
diff --git a/tests/data/spec-07-10.canonical b/tests/data/spec-07-10.canonical
index 5f1b3dc..1db650a 100644
--- a/tests/data/spec-07-10.canonical
+++ b/tests/data/spec-07-10.canonical
@@ -3,7 +3,7 @@
!!str "Root flow scalar"
%YAML 1.1
---
-!!str "Root block scalar"
+!!str "Root block scalar\n"
%YAML 1.1
---
!!map {
diff --git a/tests/data/spec-08-08.canonical b/tests/data/spec-08-08.canonical
index 4f95c0f..d3f8b1a 100644
--- a/tests/data/spec-08-08.canonical
+++ b/tests/data/spec-08-08.canonical
@@ -12,4 +12,4 @@
!!str "foo bar"
%YAML 1.1
---
-!!str "foo bar\n"
+!!str "foo\n"
diff --git a/tests/data/spec-08-10.canonical b/tests/data/spec-08-10.canonical
index a6702c3..8281c5e 100644
--- a/tests/data/spec-08-10.canonical
+++ b/tests/data/spec-08-10.canonical
@@ -4,9 +4,9 @@
? !!str "block styles" : !!map {
? !!str "scalars" : !!map {
? !!str "literal"
- : !!str "#!!/usr/bin/perl\n\
+ : !!str "#!/usr/bin/perl\n\
print \"Hello,
- world!!\\n\";\n",
+ world!\\n\";\n",
? !!str "folded"
: !!str "This sentence
is false.\n"
diff --git a/tests/data/spec-09-02.canonical b/tests/data/spec-09-02.canonical
index 6771065..6f8f41a 100644
--- a/tests/data/spec-09-02.canonical
+++ b/tests/data/spec-09-02.canonical
@@ -3,5 +3,5 @@
!!str "as space \
trimmed\n\
specific\L\n\
- escaped\t\
+ escaped\t\n\
none"
diff --git a/tests/data/spec-09-12.data b/tests/data/spec-09-12.data
index dd4a9c2..b9a3ac5 100644
--- a/tests/data/spec-09-12.data
+++ b/tests/data/spec-09-12.data
@@ -1,8 +1,8 @@
# Outside flow collection:
- ::std::vector
-- Up, up and away!
+- Up, up, and away!
- -123
# Inside flow collection:
- [ '::std::vector',
- "Up, up and away!",
+ "Up, up, and away!",
-123 ]
diff --git a/tests/data/spec-09-16.data b/tests/data/spec-09-16.data
index eec5c76..473beb9 100644
--- a/tests/data/spec-09-16.data
+++ b/tests/data/spec-09-16.data
@@ -1 +1,3 @@
- as space … trimmed …… specific
… none
+# Tabs are confusing:
+# as space/trimmed/specific/none
+ as space … trimmed …… specific
… none
diff --git a/tests/data/spec-09-25.data b/tests/data/spec-09-25.data
index 84da455..f6303a1 100644
--- a/tests/data/spec-09-25.data
+++ b/tests/data/spec-09-25.data
@@ -1,3 +1,3 @@
| # Simple block scalar
literal
- text
+ text
diff --git a/tests/data/spec-09-26.canonical b/tests/data/spec-09-26.canonical
index cbbf46a..3029a11 100644
--- a/tests/data/spec-09-26.canonical
+++ b/tests/data/spec-09-26.canonical
@@ -1,3 +1,3 @@
%YAML 1.1
---
-!!str "\nliteral\n\ntext\n"
+!!str "\n\nliteral\n\ntext\n"
diff --git a/tests/data/spec-09-27.canonical b/tests/data/spec-09-27.canonical
index cbbf46a..3029a11 100644
--- a/tests/data/spec-09-27.canonical
+++ b/tests/data/spec-09-27.canonical
@@ -1,3 +1,3 @@
%YAML 1.1
---
-!!str "\nliteral\n\ntext\n"
+!!str "\n\nliteral\n\ntext\n"
diff --git a/tests/data/spec-09-28.canonical b/tests/data/spec-09-28.canonical
index cbbf46a..3029a11 100644
--- a/tests/data/spec-09-28.canonical
+++ b/tests/data/spec-09-28.canonical
@@ -1,3 +1,3 @@
%YAML 1.1
---
-!!str "\nliteral\n\ntext\n"
+!!str "\n\nliteral\n\ntext\n"
diff --git a/tests/data/spec-10-10.data b/tests/data/spec-10-10.data
index 03abb4e..0888b05 100644
--- a/tests/data/spec-10-10.data
+++ b/tests/data/spec-10-10.data
@@ -1,5 +1,5 @@
{
-? explicit key1 : Explicit value,
+? explicit key1 : explicit value,
? explicit key2 : , # Explicit empty
? explicit key3, # Empty value
simple key1 : explicit value,
diff --git a/tests/data/spec-10-14.canonical b/tests/data/spec-10-14.canonical
index ec1ef7b..e87c880 100644
--- a/tests/data/spec-10-14.canonical
+++ b/tests/data/spec-10-14.canonical
@@ -3,7 +3,7 @@
!!map {
? !!str "plain key"
: !!null "",
- ? !!str "quoted key\n"
+ ? !!str "quoted key"
: !!seq [
!!str "one",
!!str "two",
diff --git a/tests/data/unclosed-quoted-scalar.error-message b/tests/data/unclosed-quoted-scalar.error-message
new file mode 100644
index 0000000..8537429
--- /dev/null
+++ b/tests/data/unclosed-quoted-scalar.error-message
@@ -0,0 +1,2 @@
+'foo
+ bar
diff --git a/tests/test_appliance.py b/tests/test_appliance.py
index 29fec89..c471398 100644
--- a/tests/test_appliance.py
+++ b/tests/test_appliance.py
@@ -251,8 +251,10 @@ class CanonicalScanner:
chunks.append(self.QUOTE_REPLACES[ch])
start = self.index
elif self.data[self.index] == u'\n':
+ chunks.append(self.data[start:self.index])
chunks.append(u' ')
self.index += 1
+ start = self.index
ignore_spaces = True
elif ignore_spaces and self.data[self.index] == u' ':
self.index += 1
diff --git a/tests/test_structure.py b/tests/test_structure.py
index 754bc5e..12c87d0 100644
--- a/tests/test_structure.py
+++ b/tests/test_structure.py
@@ -76,8 +76,46 @@ class TestParser(test_appliance.TestAppliance):
self._compare(item1, item2)
else:
self.failUnlessEqual(value1.__class__.__name__, value2.__class__.__name__)
- if isinstance(value1, SequenceNode) or isinstance(value1, MappingNode):
+ if isinstance(value1, SequenceNode): # or isinstance(value1, MappingNode):
self._compare(value1.value, value2.value)
+ elif isinstance(value1, ScalarNode):
+ self.failUnlessEqual(value1.value, value2.value)
TestParser.add_tests('testParser', '.data', '.canonical')
+class TestParserOnCanonical(test_appliance.TestAppliance):
+
+ def _testParserOnCanonical(self, test_name, canonical_filename):
+ documents1 = None
+ documents2 = None
+ try:
+ parser = Parser(Scanner(Reader(file(canonical_filename, 'rb'))))
+ documents1 = parser.parse()
+ canonical = test_appliance.CanonicalParser(canonical_filename, file(canonical_filename, 'rb').read())
+ documents2 = canonical.parse()
+ self._compare(documents1, documents2)
+ except:
+ print
+ print "DATA:"
+ print file(canonical_filename, 'rb').read()
+ print "DOCUMENTS1:", documents1
+ print "DOCUMENTS2:", documents2
+ raise
+
+ def _compare(self, value1, value2):
+ if value1 is None and hasattr(value2, 'tag') and value2.tag == 'tag:yaml.org,2002:null':
+ return
+ self.failUnlessEqual(type(value1), type(value2))
+ if isinstance(value1, list) or isinstance(value1, tuple):
+ self.failUnlessEqual(len(value1), len(value2))
+ for item1, item2 in zip(value1, value2):
+ self._compare(item1, item2)
+ else:
+ self.failUnlessEqual(value1.__class__.__name__, value2.__class__.__name__)
+ if isinstance(value1, SequenceNode) or isinstance(value1, MappingNode):
+ self._compare(value1.value, value2.value)
+ elif isinstance(value1, ScalarNode):
+ self.failUnlessEqual(value1.value, value2.value)
+
+TestParserOnCanonical.add_tests('testParserOnCanonical', '.canonical')
+
diff --git a/tests/test_tokens.py b/tests/test_tokens.py
index c7f5aef..f5daaf2 100644
--- a/tests/test_tokens.py
+++ b/tests/test_tokens.py
@@ -27,9 +27,7 @@ class TestTokens(test_appliance.TestAppliance):
# value: :
replaces = {
- YAMLDirectiveToken: '%',
- TagDirectiveToken: '%',
- ReservedDirectiveToken: '%',
+ DirectiveToken: '%',
DocumentStartToken: '---',
DocumentEndToken: '...',
AliasToken: '*',