diff options
author | xi <xi@18f92427-320e-0410-9341-c67f048884a3> | 2006-02-17 22:39:52 +0000 |
---|---|---|
committer | xi <xi@18f92427-320e-0410-9341-c67f048884a3> | 2006-02-17 22:39:52 +0000 |
commit | f43239e1ad0bf85c5b748e3934117af36603b2f8 (patch) | |
tree | 5ed078e200669c13e272db423333dfcb8163ec19 | |
parent | 38c4ac622779f326b137dff0704501db3f431728 (diff) | |
download | pyyaml-f43239e1ad0bf85c5b748e3934117af36603b2f8.tar.gz |
Working on the scanner.
git-svn-id: http://svn.pyyaml.org/branches/pyyaml3000@47 18f92427-320e-0410-9341-c67f048884a3
-rw-r--r-- | lib/yaml/parser.py | 12 | ||||
-rw-r--r-- | lib/yaml/reader.py | 18 | ||||
-rw-r--r-- | lib/yaml/scanner.py | 136 | ||||
-rw-r--r-- | lib/yaml/tokens.py | 38 | ||||
-rw-r--r-- | tests/data/forbidden-entry.error-message | 2 | ||||
-rw-r--r-- | tests/data/forbidden-key.error-message | 2 | ||||
-rw-r--r-- | tests/data/forbidden-value.error-message | 1 | ||||
-rw-r--r-- | tests/data/invalid-character.error-message | bin | 0 -> 2209 bytes | |||
-rw-r--r-- | tests/data/invalid-simple-key.error-message | 3 | ||||
-rw-r--r-- | tests/data/invalid-utf8-byte.error-message | 18 | ||||
-rw-r--r-- | tests/data/spec-10-07.data | 3 | ||||
-rw-r--r-- | tests/data/unclosed-bracket.error-message | 5 | ||||
-rw-r--r-- | tests/test_appliance.py | 6 | ||||
-rw-r--r-- | tests/test_errors.py | 32 | ||||
-rw-r--r-- | tests/test_marker.py | 8 | ||||
-rw-r--r-- | tests/test_tokens.py | 4 | ||||
-rw-r--r-- | tests/test_yaml.py | 1 |
17 files changed, 215 insertions, 74 deletions
diff --git a/lib/yaml/parser.py b/lib/yaml/parser.py index 07865dd..b7c5aa7 100644 --- a/lib/yaml/parser.py +++ b/lib/yaml/parser.py @@ -95,23 +95,23 @@ class Parser: def parse_stream(self): documents = [] - if not self.is_token(DirectiveToken, DocumentStartToken, EndToken): + if not self.is_token(DirectiveToken, DocumentStartToken, StreamEndToken): documents.append(self.parse_block_node()) - while not self.is_token(EndToken): + while not self.is_token(StreamEndToken): while self.is_token(DirectiveToken): self.get_token() if not self.is_token(DocumentStartToken): self.fail('DOCUMENT-START is expected') self.get_token() if self.is_token(DirectiveToken, - DocumentStartToken, DocumentEndToken, EndToken): + DocumentStartToken, DocumentEndToken, StreamEndToken): documents.append(None) else: documents.append(self.parse_block_node()) while self.is_token(DocumentEndToken): self.get_token() - if not self.is_token(EndToken): - self.fail("END is expected") + if not self.is_token(StreamEndToken): + self.fail("STREAM-END is expected") return documents def parse_block_node(self): @@ -284,5 +284,5 @@ class Parser: def fail(self, message): marker = self.scanner.peek_token().start_marker - raise Error(message+':\n'+marker.get_snippet()) + raise ParserError(message+':\n'+marker.get_snippet()) diff --git a/lib/yaml/reader.py b/lib/yaml/reader.py index 6094f8c..4316e22 100644 --- a/lib/yaml/reader.py +++ b/lib/yaml/reader.py @@ -65,7 +65,7 @@ class Marker: self.buffer = buffer self.pointer = pointer - def get_snippet(self, max_length=79): + def get_snippet(self, indent=4, max_length=75): if self.buffer is None: return None head = '' @@ -85,8 +85,16 @@ class Marker: end -= 5 break snippet = self.buffer[start:end].encode('utf-8') - return head + snippet + tail + '\n' \ - + ' '*(self.pointer-start+len(head)) + '^' + '\n' + return ' '*indent + head + snippet + tail + '\n' \ + + ' '*(indent+self.pointer-start+len(head)) + '^' + + def __str__(self): + snippet = self.get_snippet() + where = " in \"%s\", line %d, column %d" \ + % (self.name, self.line+1, self.column+1) + if snippet is not None: + where += ":\n"+snippet + return where class ReaderError(YAMLError): @@ -100,12 +108,12 @@ class ReaderError(YAMLError): def __str__(self): if isinstance(self.character, str): return "'%s' codec can't decode byte #x%02x: %s\n" \ - "\tin '%s', position %d." \ + " in \"%s\", position %d" \ % (self.encoding, ord(self.character), self.reason, self.name, self.position) else: return "unacceptable character #x%04x: %s\n" \ - "\tin '%s', position %d." \ + " in \"%s\", position %d" \ % (ord(self.character), self.reason, self.name, self.position) diff --git a/lib/yaml/scanner.py b/lib/yaml/scanner.py index b834478..a1785ef 100644 --- a/lib/yaml/scanner.py +++ b/lib/yaml/scanner.py @@ -23,7 +23,25 @@ class ScannerError(YAMLError): # in '...', line 5, column 15: # key: "valu\?e" # ^ - pass + def __init__(self, context=None, context_marker=None, + problem=None, problem_marker=None, description=None): + self.context = context + self.context_marker = context_marker + self.problem = problem + self.problem_marker = problem_marker + self.description = description + + def __str__(self): + lines = [] + for (place, marker) in [(self.context, self.context_marker), + (self.problem, self.problem_marker)]: + if place is not None: + lines.append(place) + if marker is not None: + lines.append(str(marker)) + if self.description is not None: + lines.append(self.description) + return '\n'.join(lines) class SimpleKey: def __init__(self, token_number, required, index, line, column, marker): @@ -140,9 +158,6 @@ class Scanner: # and decrease the current indentation level. self.unwind_indent(self.reader.column) - #print - #print self.reader.get_marker().get_snippet() - # Peek the next character. ch = self.reader.peek() @@ -256,7 +271,8 @@ class Scanner: if key.line != self.reader.line \ or self.reader.index-key.index > 1024: if key.required: - self.fail("simple key is required") + raise ScannerError("while scanning a simple key", key.marker, + "could not found expected ':'", self.reader.get_marker()) del self.possible_simple_keys[level] def save_possible_simple_key(self): @@ -267,6 +283,10 @@ class Scanner: # Check if a simple key is required at the current position. required = not self.flow_level and self.indent == self.reader.column + # A simple key is required only if it is the first token in the current + # line. Therefore it is always allowed. + assert self.allow_simple_key or not required + # The next token might be a simple key. Let's save it's number and # position. if self.allow_simple_key: @@ -280,24 +300,31 @@ class Scanner: index, line, column, marker) self.possible_simple_keys[self.flow_level] = key - # A simple key is required at the current position. - elif required: - self.fail("simple key is required") - def remove_possible_simple_key(self): # Remove the saved possible key position at the current flow level. if self.flow_level in self.possible_simple_keys: key = self.possible_simple_keys[self.flow_level] - if key.required: - self.fail("simple key is required") + + # I don't think it's possible, but I could be wrong. + assert not key.required + #if key.required: + # raise ScannerError("while scanning a simple key", key.marker, + # "could not found expected ':'", self.reader.get_marker()) # Indentation functions. def unwind_indent(self, column): # In flow context, tokens should respect indentation. + # Actually the condition should be `self.indent >= column` according to + # the spec. But this condition will prohibit intuitively correct + # constructions such as + # key : { + # } if self.flow_level and self.indent > column: - self.fail("invalid intendation in the flow context") + raise ScannerError(None, None, + "invalid intendation or unclosed '[' or '{'", + self.reader.get_marker()) # In block context, we may need to issue the BLOCK-END tokens. while self.indent > column: @@ -328,7 +355,7 @@ class Scanner: marker = self.reader.get_marker() # Add END. - self.tokens.append(EndToken(marker, marker)) + self.tokens.append(StreamEndToken(marker, marker)) # The reader is ended. self.done = True @@ -343,7 +370,7 @@ class Scanner: self.allow_simple_key = False # Scan and add DIRECTIVE. - self.scan_directive() + self.tokens.append(self.scan_directive()) def fetch_document_start(self): self.fetch_document_indicator(DocumentStartToken) @@ -420,7 +447,9 @@ class Scanner: # Are we allowed to start a new entry? if not self.allow_simple_key: - self.fail("Cannot start a new entry here") + raise ScannerError(None, None, + "sequence entries are not allowed here", + self.reader.get_marker()) # We may need to add BLOCK-SEQUENCE-START. if self.add_indent(self.reader.column): @@ -446,7 +475,9 @@ class Scanner: # Are we allowed to start a key (not nessesary a simple)? if not self.allow_simple_key: - self.fail("Cannot start a new key here") + raise ScannerError(None, None, + "mapping keys are not allowed here", + self.reader.get_marker()) # We may need to add BLOCK-MAPPING-START. if self.add_indent(self.reader.column): @@ -489,6 +520,18 @@ class Scanner: # It must be a part of a complex key. else: + # Block context needs additional checks. + # (Do we really need them? They will be catched by the parser + # anyway.) + if not self.flow_level: + + # We are allowed to start a complex value if and only if + # we can start a simple key. + if not self.allow_simple_key: + raise ScannerError(None, None, + "mapping values are not allowed here", + self.reader.get_marker()) + # Simple keys are allowed after ':' in the block context. self.allow_simple_key = not self.flow_level @@ -510,7 +553,7 @@ class Scanner: self.allow_simple_key = False # Scan and add ALIAS. - self.scan_anchor(AliasToken) + self.tokens.append(self.scan_anchor(AliasToken)) def fetch_anchor(self): @@ -521,7 +564,7 @@ class Scanner: self.allow_simple_key = False # Scan and add ANCHOR. - self.scan_anchor(AnchorToken) + self.tokens.append(self.scan_anchor(AnchorToken)) def fetch_tag(self): @@ -532,7 +575,7 @@ class Scanner: self.allow_simple_key = False # Scan and add TAG. - self.scan_tag() + self.tokens.append(self.scan_tag()) def fetch_literal(self): self.fetch_block_scalar(folded=False) @@ -549,7 +592,7 @@ class Scanner: self.remove_possible_simple_key() # Scan and add SCALAR. - self.scan_block_scalar(folded) + self.tokens.append(self.scan_block_scalar(folded)) def fetch_single(self): self.fetch_flow_scalar(double=False) @@ -566,7 +609,7 @@ class Scanner: self.allow_simple_key = False # Scan and add SCALAR. - self.scan_flow_scalar(double) + self.tokens.append(self.scan_flow_scalar(double)) def fetch_plain(self): @@ -579,7 +622,7 @@ class Scanner: self.allow_simple_key = False # Scan and add SCALAR. May change `allow_simple_key`. - self.scan_plain() + self.tokens.append(self.scan_plain()) # Checkers. @@ -645,15 +688,17 @@ class Scanner: # Scanners. def scan_to_next_token(self): + # We ignore spaces, line breaks and comments. + # If we find a line break in the block context, we set the flag + # `allow_simple_key` on. found = False while not found: while self.reader.peek() == u' ': self.reader.forward() if self.reader.peek() == u'#': - while self.reader.peek() not in u'\r\n': + while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029': self.reader.forward() - if self.reader.peek() in u'\r\n': - self.reader.forward() + if self.scan_line_break(): if not self.flow_level: self.allow_simple_key = True else: @@ -662,28 +707,29 @@ class Scanner: def scan_directive(self): marker = self.reader.get_marker() if self.reader.peek(5) == u'%YAML ': - self.tokens.append(YAMLDirectiveToken(1, 1, marker, marker)) + token = YAMLDirectiveToken(1, 1, marker, marker) elif self.reader.peek(4) == u'%TAG ': - self.tokens.append(TagDirectiveToken(marker, marker)) + token = TagDirectiveToken(marker, marker) else: - self.tokens.append(ReservedDirectiveToken('', marker, marker)) + token = ReservedDirectiveToken('', marker, marker) while self.reader.peek() not in u'\0\r\n': self.reader.forward() self.reader.forward() + return token def scan_anchor(self, TokenClass): start_marker = self.reader.get_marker() while self.reader.peek() not in u'\0 \t\r\n,:': self.reader.forward() end_marker = self.reader.get_marker() - self.tokens.append(TokenClass('', start_marker, end_marker)) + return TokenClass('', start_marker, end_marker) def scan_tag(self): start_marker = self.reader.get_marker() while self.reader.peek() not in u'\0 \t\r\n': self.reader.forward() end_marker = self.reader.get_marker() - self.tokens.append(TagToken('', start_marker, end_marker)) + return TagToken('', start_marker, end_marker) def scan_block_scalar(self, folded): start_marker = self.reader.get_marker() @@ -701,7 +747,7 @@ class Scanner: count += 1 if count < indent and self.reader.peek() not in u'#\r\n\x85\u2028\u2029': break - self.tokens.append(ScalarToken('', False, start_marker, start_marker)) + return ScalarToken('', False, start_marker, start_marker) def scan_flow_scalar(self, double): marker = self.reader.get_marker() @@ -715,7 +761,7 @@ class Scanner: else: self.reader.forward(1) self.reader.forward(1) - self.tokens.append(ScalarToken('', False, marker, marker)) + return ScalarToken('', False, marker, marker) def scan_plain(self): indent = self.indent+1 @@ -747,14 +793,32 @@ class Scanner: if count < indent: break space = True - self.tokens.append(ScalarToken('', True, marker, marker)) + return ScalarToken('', True, marker, marker) + + def scan_line_break(self): + # Transforms: + # '\r\n' : '\n' + # '\r' : '\n' + # '\n' : '\n' + # '\x85' : '\n' + # '\u2028' : '\u2028' + # '\u2029 : '\u2029' + # default : '' + ch = self.reader.peek() + if ch in u'\r\n\x85': + if self.reader.peek(2) == u'\r\n': + self.forward(2) + else: + self.reader.forward() + return u'\n' + elif ch in u'\u2028\u2029': + self.reader.forward() + return ch + return u'' def invalid_token(self): self.fail("invalid token") - def fail(self, message): - raise ScannerError(message) - #try: # import psyco # psyco.bind(Scanner) diff --git a/lib/yaml/tokens.py b/lib/yaml/tokens.py index 3bfa8b2..275d4a3 100644 --- a/lib/yaml/tokens.py +++ b/lib/yaml/tokens.py @@ -5,9 +5,10 @@ class Token: self.end_marker = end_marker class DirectiveToken(Token): - pass + code = '<directive>' class YAMLDirectiveToken(DirectiveToken): + code = '<%YAML directive>' def __init__(self, major_version, minor_version, start_marker, end_marker): self.major_version = major_version self.minor_version = minor_version @@ -15,72 +16,77 @@ class YAMLDirectiveToken(DirectiveToken): self.end_marker = end_marker class TagDirectiveToken(DirectiveToken): - pass + code = '<%TAG directive>' class ReservedDirectiveToken(DirectiveToken): + code = '<unknown directive>' def __init__(self, name, start_marker, end_marker): self.name = name self.start_marker = start_marker self.end_marker = end_marker class DocumentStartToken(Token): - pass + code = '<document start>' class DocumentEndToken(Token): - pass + code = '<document end>' -class EndToken(Token): - pass +class StreamEndToken(Token): + code = '<stream end>' class BlockSequenceStartToken(Token): - pass + code = '<block sequence start>' class BlockMappingStartToken(Token): - pass + code = '<block mapping end>' class BlockEndToken(Token): - pass + code = '<block end>' class FlowSequenceStartToken(Token): - pass + code = '[' class FlowMappingStartToken(Token): - pass + code = '{' class FlowSequenceEndToken(Token): - pass + code = ']' class FlowMappingEndToken(Token): - pass + code = '}' class KeyToken(Token): - pass + code = '?' class ValueToken(Token): - pass + code = ':' class EntryToken(Token): - pass + code = '- or ,' class AliasToken(Token): + code = '<alias>' def __init__(self, value, start_marker, end_marker): self.value = value self.start_marker = start_marker self.end_marker = end_marker class AnchorToken(Token): + code = '<anchor>' def __init__(self, value, start_marker, end_marker): self.value = value self.start_marker = start_marker self.end_marker = end_marker class TagToken(Token): + code = '<tag>' def __init__(self, value, start_marker, end_marker): self.value = value self.start_marker = start_marker self.end_marker = end_marker class ScalarToken(Token): + code = '<scalar>' def __init__(self, value, plain, start_marker, end_marker): self.value = value self.plain = plain diff --git a/tests/data/forbidden-entry.error-message b/tests/data/forbidden-entry.error-message new file mode 100644 index 0000000..f2e3079 --- /dev/null +++ b/tests/data/forbidden-entry.error-message @@ -0,0 +1,2 @@ +test: - foo + - bar diff --git a/tests/data/forbidden-key.error-message b/tests/data/forbidden-key.error-message new file mode 100644 index 0000000..da9b471 --- /dev/null +++ b/tests/data/forbidden-key.error-message @@ -0,0 +1,2 @@ +test: ? foo + : bar diff --git a/tests/data/forbidden-value.error-message b/tests/data/forbidden-value.error-message new file mode 100644 index 0000000..efd7ce5 --- /dev/null +++ b/tests/data/forbidden-value.error-message @@ -0,0 +1 @@ +test: key: value diff --git a/tests/data/invalid-character.error-message b/tests/data/invalid-character.error-message Binary files differnew file mode 100644 index 0000000..03687b0 --- /dev/null +++ b/tests/data/invalid-character.error-message diff --git a/tests/data/invalid-simple-key.error-message b/tests/data/invalid-simple-key.error-message new file mode 100644 index 0000000..a58deec --- /dev/null +++ b/tests/data/invalid-simple-key.error-message @@ -0,0 +1,3 @@ +key: value +invalid simple key +next key: next value diff --git a/tests/data/invalid-utf8-byte.error-message b/tests/data/invalid-utf8-byte.error-message new file mode 100644 index 0000000..15111c3 --- /dev/null +++ b/tests/data/invalid-utf8-byte.error-message @@ -0,0 +1,18 @@ +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------------------------------------------------------------------------- +Invalid byte ('\xFF'): ÿ <-- +------------------------------------------------------------------------------------------------------------------------------- diff --git a/tests/data/spec-10-07.data b/tests/data/spec-10-07.data index 46d7d09..ff943fb 100644 --- a/tests/data/spec-10-07.data +++ b/tests/data/spec-10-07.data @@ -1,6 +1,5 @@ { -#? : value # Empty key -? ~ : value, # Empty key +? : value, # Empty key ? explicit key: value, simple key : value, diff --git a/tests/data/unclosed-bracket.error-message b/tests/data/unclosed-bracket.error-message new file mode 100644 index 0000000..1d07a46 --- /dev/null +++ b/tests/data/unclosed-bracket.error-message @@ -0,0 +1,5 @@ +test: + - [ foo: bar + - baz +"we could have detected the unclosed bracket on the above line, but this would forbid such syntax as": { +} diff --git a/tests/test_appliance.py b/tests/test_appliance.py index 0c7adee..29fec89 100644 --- a/tests/test_appliance.py +++ b/tests/test_appliance.py @@ -69,7 +69,7 @@ class Token: args.append(repr(self.value)) return "%s(%s)" % (self.__class__.__name__, ''.join(args)) -class EndToken(Token): +class StreamEndToken(Token): pass class DirectiveToken(Token): @@ -132,7 +132,7 @@ class CanonicalScanner: self.find_token() ch = self.data[self.index] if ch == u'\0': - tokens.append(EndToken()) + tokens.append(StreamEndToken()) break elif ch == u'%': tokens.append(self.scan_directive()) @@ -285,7 +285,7 @@ class CanonicalParser: # stream: document* END def parse_stream(self): documents = [] - while not self.test_token(EndToken): + while not self.test_token(StreamEndToken): if self.test_token(DirectiveToken, DocumentStartToken): documents.append(self.parse_document()) else: diff --git a/tests/test_errors.py b/tests/test_errors.py new file mode 100644 index 0000000..2b6e9a2 --- /dev/null +++ b/tests/test_errors.py @@ -0,0 +1,32 @@ + +import test_appliance + +from yaml.error import YAMLError +from yaml.reader import Reader +from yaml.scanner import Scanner + +class TestErrors(test_appliance.TestAppliance): + + def _testErrors(self, test_name, invalid_filename): + #self._load(invalid_filename) + self.failUnlessRaises(YAMLError, lambda: self._load(invalid_filename)) + + def _testStringErrors(self, test_name, invalid_filename): + #self._load_string(invalid_filename) + self.failUnlessRaises(YAMLError, lambda: self._load_string(invalid_filename)) + + def _load(self, filename): + reader = Reader(file(filename, 'rb')) + scanner = Scanner(reader) + while scanner.peek_token(): + scanner.get_token() + + def _load_string(self, filename): + reader = Reader(file(filename, 'rb').read()) + scanner = Scanner(reader) + while scanner.peek_token(): + scanner.get_token() + +TestErrors.add_tests('testErrors', '.error-message') +TestErrors.add_tests('testStringErrors', '.error-message') + diff --git a/tests/test_marker.py b/tests/test_marker.py index ac36bd5..9ea4474 100644 --- a/tests/test_marker.py +++ b/tests/test_marker.py @@ -19,15 +19,15 @@ class TestMarker(test_appliance.TestAppliance): column += 1 index += 1 marker = Marker(test_name, line, column, unicode(input), index) - snippet = marker.get_snippet() + snippet = marker.get_snippet(indent=2, max_length=79) #print "INPUT:" #print input #print "SNIPPET:" #print snippet self.failUnless(isinstance(snippet, str)) - self.failUnlessEqual(snippet.count('\n'), 2) - data, pointer, dummy = snippet.split('\n') - self.failUnless(len(data) < 80) + self.failUnlessEqual(snippet.count('\n'), 1) + data, pointer = snippet.split('\n') + self.failUnless(len(data) < 82) self.failUnlessEqual(data[len(pointer)-1], '*') TestMarker.add_tests('testMarkers', '.markers') diff --git a/tests/test_tokens.py b/tests/test_tokens.py index 1343e57..c7f5aef 100644 --- a/tests/test_tokens.py +++ b/tests/test_tokens.py @@ -54,7 +54,7 @@ class TestTokens(test_appliance.TestAppliance): try: scanner = Scanner(Reader(file(data_filename, 'rb'))) tokens1 = [] - while not isinstance(scanner.peek_token(), EndToken): + while not isinstance(scanner.peek_token(), StreamEndToken): tokens1.append(scanner.get_token()) tokens1 = [self.replaces[t.__class__] for t in tokens1] self.failUnlessEqual(tokens1, tokens2) @@ -76,7 +76,7 @@ class TestScanner(test_appliance.TestAppliance): try: scanner = Scanner(Reader(file(filename, 'rb'))) tokens = [] - while not isinstance(scanner.peek_token(), EndToken): + while not isinstance(scanner.peek_token(), StreamEndToken): tokens.append(scanner.get_token().__class__.__name__) except: print diff --git a/tests/test_yaml.py b/tests/test_yaml.py index f391a50..fd7b5ac 100644 --- a/tests/test_yaml.py +++ b/tests/test_yaml.py @@ -6,6 +6,7 @@ from test_reader import * from test_canonical import * from test_tokens import * from test_structure import * +from test_errors import * def main(module='__main__'): unittest.main(module) |