| author | xi <xi@18f92427-320e-0410-9341-c67f048884a3> | 2006-02-19 22:17:28 +0000 |
|---|---|---|
| committer | xi <xi@18f92427-320e-0410-9341-c67f048884a3> | 2006-02-19 22:17:28 +0000 |
| commit | 282bbe171b1c36ce94185dbb95fbd9d5b971d21f (patch) | |
| tree | 76547cfe9bdb93f11d732ea7b388e69d55dfa5bf | |
| parent | 6070191305af2f626f202d33470e0f76cd0aaa1f (diff) | |
| download | pyyaml-282bbe171b1c36ce94185dbb95fbd9d5b971d21f.tar.gz | |
Parser is done. Add iterator interfaces for Scanner and Parser.
git-svn-id: http://svn.pyyaml.org/branches/pyyaml3000@51 18f92427-320e-0410-9341-c67f048884a3
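
The new public entry points in lib/yaml/__init__.py (see the diff below) can be exercised as follows — a minimal sketch against this revision's API, written in Python 2 to match the code base; the input document is illustrative:

```python
import yaml

data = """
- first
- second: some value
"""

# scan() wires Reader -> Scanner and returns an iterator over tokens.
for token in yaml.scan(data):
    print token

# parse() adds Parser on top and returns an iterator over parsing events.
for event in yaml.parse(data):
    print event
```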
| -rw-r--r-- | lib/yaml/__init__.py | 19 |
| -rw-r--r-- | lib/yaml/events.py | 49 |
| -rw-r--r-- | lib/yaml/parser.py | 583 |
| -rw-r--r-- | lib/yaml/scanner.py | 150 |
| -rw-r--r-- | lib/yaml/tokens.py | 51 |
| -rw-r--r-- | tests/data/spec-05-08.canonical | 4 |
| -rw-r--r-- | tests/data/spec-06-01.canonical | 2 |
| -rw-r--r-- | tests/data/spec-09-20.canonical | 2 |
| -rw-r--r-- | tests/data/spec-09-30.canonical | 4 |
| -rw-r--r-- | tests/data/spec-09-31.canonical | 4 |
| -rw-r--r-- | tests/data/spec-09-32.canonical | 4 |
| -rw-r--r-- | tests/data/spec-09-33.canonical | 4 |
| -rw-r--r-- | tests/test_appliance.py | 198 |
| -rw-r--r-- | tests/test_canonical.py | 13 |
| -rw-r--r-- | tests/test_errors.py | 6 |
| -rw-r--r-- | tests/test_structure.py | 129 |
| -rw-r--r-- | tests/test_tokens.py | 13 |
17 files changed, 713 insertions, 522 deletions
diff --git a/lib/yaml/__init__.py b/lib/yaml/__init__.py
index e69de29..cae7cde 100644
--- a/lib/yaml/__init__.py
+++ b/lib/yaml/__init__.py
@@ -0,0 +1,19 @@
+
+from reader import Reader
+from scanner import Scanner
+from parser import Parser
+
+from tokens import *
+from events import *
+
+def scan(data, Reader=Reader, Scanner=Scanner):
+    reader = Reader(data)
+    scanner = Scanner(reader)
+    return iter(scanner)
+
+def parse(data, Reader=Reader, Scanner=Scanner, Parser=Parser):
+    reader = Reader(data)
+    scanner = Scanner(reader)
+    parser = Parser(scanner)
+    return iter(parser)
+
diff --git a/lib/yaml/events.py b/lib/yaml/events.py
new file mode 100644
index 0000000..6ecb772
--- /dev/null
+++ b/lib/yaml/events.py
@@ -0,0 +1,49 @@
+
+class Event:
+    def __init__(self, start_marker, end_marker):
+        self.start_marker = start_marker
+        self.end_marker = end_marker
+    def __repr__(self):
+        attributes = [key for key in self.__dict__
+                if not key.endswith('_marker')]
+        attributes.sort()
+        arguments = ', '.join(['%s=%r' % (key, getattr(self, key))
+                for key in attributes])
+        return '%s(%s)' % (self.__class__.__name__, arguments)
+
+class NodeEvent(Event):
+    def __init__(self, anchor, tag, start_marker, end_marker):
+        self.anchor = anchor
+        self.tag = tag
+        self.start_marker = start_marker
+        self.end_marker = end_marker
+
+class AliasEvent(NodeEvent):
+    def __init__(self, name, start_marker, end_marker):
+        self.name = name
+        self.start_marker = start_marker
+        self.end_marker = end_marker
+
+class ScalarEvent(NodeEvent):
+    def __init__(self, anchor, tag, value, start_marker, end_marker):
+        self.anchor = anchor
+        self.tag = tag
+        self.value = value
+        self.start_marker = start_marker
+        self.end_marker = end_marker
+
+class CollectionEvent(NodeEvent):
+    pass
+
+class SequenceEvent(CollectionEvent):
+    pass
+
+class MappingEvent(CollectionEvent):
+    pass
+
+class CollectionEndEvent(Event):
+    pass
+
+class StreamEndEvent(Event):
+    pass
+
diff --git a/lib/yaml/parser.py b/lib/yaml/parser.py
index b7c5aa7..858d906 100644
--- a/lib/yaml/parser.py
+++ b/lib/yaml/parser.py
@@ -1,6 +1,8 @@
 
-# Production rules:
-# stream ::= implicit_document? explicit_document* END
+# YAML can be parsed by an LL(1) parser!
+#
+# We use the following production rules:
+# stream ::= implicit_document? explicit_document* STREAM-END
 # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
 # implicit_document ::= block_node DOCUMENT-END?
 # block_node ::= ALIAS | properties? block_content
@@ -9,20 +11,37 @@
 # block_content ::= block_collection | flow_collection | SCALAR
 # flow_content ::= flow_collection | SCALAR
 # block_collection ::= block_sequence | block_mapping
-# block_sequence ::= BLOCK-SEQUENCE-START (ENTRY block_node?)* BLOCK-END
+# block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
 # block_mapping ::= BLOCK-MAPPING_START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
 # block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
-# indentless_block_sequence ::= (ENTRY block_node?)+
+# indentless_block_sequence ::= (BLOCK-ENTRY block_node?)+
 # flow_collection ::= flow_sequence | flow_mapping
-# flow_sequence ::= FLOW-SEQUENCE-START (flow_sequence_entry ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
-# flow_mapping ::= FLOW-MAPPING-START flow_mapping_entry ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
-# flow_sequence_entry ::= flow_node | KEY flow_node (VALUE flow_node?)?
-# flow_mapping_entry ::= flow_node | KEY flow_node (VALUE flow_node?)?
+# flow_sequence ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
+# flow_mapping ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
+# flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
+# flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
+#
+# Note that there is a slight deviation from the specification. We require a
+# non-empty node content if ANCHOR or TAG is specified. This disallow such
+# documents as
+#
+#   key:    !!str   # empty value
+#
+# This is done to prevent ambiguity in parsing tags and aliases:
+#
+#   { !!perl/YAML::Parser: value }
+#
+# What is it? Should it be interpreted as
+#   { ? !<tag:yaml.org,2002:perl/YAML::Parser> '' : value }
+# or
+#   { ? !<tag:yaml.org,2002:perl/YAML::Parser:> value : '' }
+# Since we disallow non-empty node content, tags are always followed by spaces
+# or line breaks.
 
-# FIRST(rule) sets:
-# stream: {}
+# FIRST sets:
+# stream: FIRST(block_node) + { DIRECTIVE DOCUMENT-START }
 # explicit_document: { DIRECTIVE DOCUMENT-START }
-# implicit_document: block_node
+# implicit_document: FIRST(block_node)
 # block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
 # flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
 # block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
@@ -31,7 +50,7 @@
 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
 # block_sequence: { BLOCK-SEQUENCE-START }
 # block_mapping: { BLOCK-MAPPING-START }
-# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START ENTRY }
+# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
 # indentless_sequence: { ENTRY }
 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
 # flow_sequence: { FLOW-SEQUENCE-START }
@@ -41,78 +60,131 @@
 
 from error import YAMLError
 from tokens import *
+from events import *
 
 class ParserError(YAMLError):
-    pass
 
-class Node:
-    def __repr__(self):
-        args = []
-        for attribute in ['anchor', 'tag', 'value']:
-            if hasattr(self, attribute):
-                args.append(repr(getattr(self, attribute)))
-        return "%s(%s)" % (self.__class__.__name__, ', '.join(args))
+    def __init__(self, context=None, context_marker=None,
+            problem=None, problem_marker=None):
+        self.context = context
+        self.context_marker = context_marker
+        self.problem = problem
+        self.problem_marker = problem_marker
 
-class AliasNode(Node):
-    def __init__(self, anchor):
-        self.anchor = anchor
-
-class ScalarNode(Node):
-    def __init__(self, anchor, tag, value):
-        self.anchor = anchor
-        self.tag = tag
-        self.value = value
-
-class SequenceNode(Node):
-    def __init__(self, anchor, tag, value):
-        self.anchor = anchor
-        self.tag = tag
-        self.value = value
-
-class MappingNode(Node):
-    def __init__(self, anchor, tag, value):
-        self.anchor = anchor
-        self.tag = tag
-        self.value = value
+    def __str__(self):
+        lines = []
+        for (place, marker) in [(self.context, self.context_marker),
+                (self.problem, self.problem_marker)]:
+            if place is not None:
+                lines.append(place)
+                if marker is not None:
+                    lines.append(str(marker))
+        return '\n'.join(lines)
 
 class Parser:
+    # Since writing an LL(1) parser is a straightforward task, we do not give
+    # many comments here.
+    # Note that we use Python generators. If you rewrite the parser to another
+    # language, you may replace all 'yield'-s with event handler calls.
+
+    DEFAULT_TAGS = {
+        u'!':   u'!',
+        u'!!':  u'tag:yaml.org,2002:',
+    }
 
     def __init__(self, scanner):
         self.scanner = scanner
+        self.current_event = None
+        self.yaml_version = None
+        self.tag_handles = {}
+        self.event_generator = self.parse_stream()
 
-    def is_token(self, *choices):
-        token = self.scanner.peek_token()
-        for choice in choices:
-            if isinstance(token, choices):
-                return True
+    def check(self, *choices):
+        # Check the type of the next event.
+        if self.current_event is None:
+            try:
+                self.current_event = self.event_generator.next()
+            except StopIteration:
+                pass
+        if self.current_event is not None:
+            for choice in choices:
+                if isinstance(self.current_event, choice):
+                    return True
         return False
 
-    def get_token(self):
-        return self.scanner.get_token()
+    def get(self):
+        # Get the next event.
+        if self.current_event is None:
+            try:
+                self.current_event = self.event_generator.next()
+            except StopIteration:
+                pass
+        value = self.current_event
+        self.current_event = None
+        return value
 
-    def parse(self):
-        return self.parse_stream()
+    def __iter__(self):
+        # Iterator protocol.
+        return self.event_generator
 
     def parse_stream(self):
-        documents = []
-        if not self.is_token(DirectiveToken, DocumentStartToken, StreamEndToken):
-            documents.append(self.parse_block_node())
-        while not self.is_token(StreamEndToken):
-            while self.is_token(DirectiveToken):
-                self.get_token()
-            if not self.is_token(DocumentStartToken):
-                self.fail('DOCUMENT-START is expected')
-            self.get_token()
-            if self.is_token(DirectiveToken,
+        # implicit_document? explicit_document* STREAM-END
+
+        # Parse implicit document.
+        if not self.scanner.check(DirectiveToken, DocumentStartToken,
+                StreamEndToken):
+            self.tag_handles = self.DEFAULT_TAGS
+            for event in self.parse_block_node():
+                yield event
+
+        # Parse explicit documents.
+        while not self.scanner.check(StreamEndToken):
+            self.process_directives()
+            if not self.scanner.check(DocumentStartToken):
+                raise ParserError(None, None,
+                        "expected '<document start>', but found %r"
+                        % self.scanner.peek().id,
+                        self.scanner.peek().start_marker)
+            token = self.scanner.get()
+            if self.scanner.check(DirectiveToken,
                     DocumentStartToken, DocumentEndToken, StreamEndToken):
-                documents.append(None)
+                yield self.process_empty_scalar(token.end_marker)
             else:
-                documents.append(self.parse_block_node())
-            while self.is_token(DocumentEndToken):
-                self.get_token()
-        if not self.is_token(StreamEndToken):
-            self.fail("STREAM-END is expected")
-        return documents
+                for event in self.parse_block_node():
+                    yield event
+            while self.scanner.check(DocumentEndToken):
+                self.scanner.get()
+
+        # Parse end of stream.
+        token = self.scanner.get()
+        yield StreamEndEvent(token.start_marker, token.end_marker)
+
+    def process_directives(self):
+        # DIRECTIVE*
+        self.yaml_version = None
+        self.tag_handles = {}
+        while self.scanner.check(DirectiveToken):
+            token = self.scanner.get()
+            if token.name == u'YAML':
+                if self.yaml_version is not None:
+                    raise ParserError(None, None,
+                            "found duplicate YAML directive", token.start_marker())
+                major, minor = token.value
+                if major != 1:
+                    raise ParserError(None, None,
+                            "found incompatible YAML document (version 1.* is required)",
+                            token.start_marker())
+                self.yaml_version = token.value
+            elif token.name == u'TAG':
+                handle, prefix = token.value
+                if handle in self.tag_handles:
+                    raise ParserError(None, None,
+                            "duplicate tag handle %r" % handle.encode('utf-8'),
+                            token.start_marker())
+                self.tag_handles[handle] = prefix
+        for key in self.DEFAULT_TAGS:
+            if key not in self.tag_handles:
+                self.tag_handles[key] = self.DEFAULT_TAGS[key]
 
     def parse_block_node(self):
         return self.parse_node(block=True)
@@ -124,165 +196,254 @@ class Parser:
         return self.parse_node(block=True, indentless_sequence=True)
 
     def parse_node(self, block=False, indentless_sequence=False):
-        if self.is_token(AliasToken):
-            token = self.get_token()
-            return AliasNode(token.value)
-        anchor = None
-        tag = None
-        if self.is_token(AnchorToken):
-            anchor = self.get_token().value
-            if self.is_token(TagToken):
-                tag = self.get_token().value
-        elif self.is_token(TagToken):
-            tag = self.get_token().value
-            if self.is_token(AnchorToken):
-                anchor = self.get_token().value
-        if indentless_sequence and self.is_token(EntryToken):
-            NodeClass = SequenceNode
-            value = self.parse_indentless_sequence()
+        # block_node ::= ALIAS | properties? block_content
+        # flow_node ::= ALIAS | properties? flow_content
+        # properties ::= TAG ANCHOR? | ANCHOR TAG?
+        # block_content ::= block_collection | flow_collection | SCALAR
+        # flow_content ::= flow_collection | SCALAR
+        # block_collection ::= block_sequence | block_mapping
+        # block_node_or_indentless_sequence ::= ALIAS | properties?
+        #                                       (block_content | indentless_block_sequence)
+        if self.scanner.check(AliasToken):
+            token = self.scanner.get()
+            yield AliasEvent(token.value, token.start_marker, token.end_marker)
         else:
-            if self.is_token(ScalarToken):
-                NodeClass = ScalarNode
-            elif self.is_token(BlockSequenceStartToken, FlowSequenceStartToken):
-                NodeClass = SequenceNode
-            elif self.is_token(BlockMappingStartToken, FlowMappingStartToken):
-                NodeClass = MappingNode
-            if block:
-                value = self.parse_block_content()
+            anchor = None
+            tag = None
+            start_marker = end_marker = tag_marker = None
+            if self.scanner.check(AnchorToken):
+                token = self.scanner.get()
+                start_marker = end_marker = token.start_marker
+                anchor = token.value
+                if self.scanner.check(TagToken):
+                    token = self.scanner.get()
+                    end_marker = tag_marker = token.start_marker
+                    tag = token.value
+            elif self.scanner.check(TagToken):
+                token = self.scanner.get()
+                start_marker = end_marker = tag_marker = token.start_marker
+                tag = token.value
+                if self.scanner.check(AnchorToken):
+                    token = self.scanner.get()
+                    end_marker = token.start_marker
+                    anchor = token.value
+            if tag is not None:
+                handle, suffix = tag
+                if handle is not None:
+                    if handle not in self.tag_handles:
+                        raise ParserError("while parsing a node", start_marker,
+                                "found undefined tag handle %r" % handle.encode('utf-8'),
+                                tag_marker)
+                    tag = self.tag_handles[handle]+suffix
+                else:
+                    tag = suffix
+            if tag is None:
+                if not (self.scanner.check(ScalarToken) and
+                        self.scanner.peek().plain):
+                    tag = u'!'
+            if start_marker is None:
+                start_marker = self.scanner.peek().start_marker
+            event = None
+            collection_events = None
+            if indentless_sequence and self.scanner.check(BlockEntryToken):
+                end_marker = self.scanner.peek().end_marker
+                event = SequenceEvent(anchor, tag, start_marker, end_marker)
+                collection_events = self.parse_indentless_sequence()
             else:
-                value = self.parse_flow_content()
-        return NodeClass(anchor, tag, value)
-
-    def parse_block_content(self):
-        if self.is_token(ScalarToken):
-            return self.get_token().value
-        elif self.is_token(BlockSequenceStartToken):
-            return self.parse_block_sequence()
-        elif self.is_token(BlockMappingStartToken):
-            return self.parse_block_mapping()
-        elif self.is_token(FlowSequenceStartToken):
-            return self.parse_flow_sequence()
-        elif self.is_token(FlowMappingStartToken):
-            return self.parse_flow_mapping()
-        else:
-            self.fail('block content is expected')
-
-    def parse_flow_content(self):
-        if self.is_token(ScalarToken):
-            return self.get_token().value
-        elif self.is_token(FlowSequenceStartToken):
-            return self.parse_flow_sequence()
-        elif self.is_token(FlowMappingStartToken):
-            return self.parse_flow_mapping()
-        else:
-            self.fail('flow content is expected')
+                if self.scanner.check(ScalarToken):
+                    token = self.scanner.get()
+                    end_marker = token.end_marker
+                    event = ScalarEvent(anchor, tag, token.value,
+                            start_marker, end_marker)
+                elif self.scanner.check(FlowSequenceStartToken):
+                    end_marker = self.scanner.peek().end_marker
+                    event = SequenceEvent(anchor, tag, start_marker, end_marker)
+                    collection_events = self.parse_flow_sequence()
+                elif self.scanner.check(FlowMappingStartToken):
+                    end_marker = self.scanner.peek().end_marker
+                    event = MappingEvent(anchor, tag, start_marker, end_marker)
+                    collection_events = self.parse_flow_mapping()
+                elif block and self.scanner.check(BlockSequenceStartToken):
+                    end_marker = self.scanner.peek().start_marker
+                    event = SequenceEvent(anchor, tag, start_marker, end_marker)
+                    collection_events = self.parse_block_sequence()
+                elif block and self.scanner.check(BlockMappingStartToken):
+                    end_marker = self.scanner.peek().start_marker
+                    event = MappingEvent(anchor, tag, start_marker, end_marker)
+                    collection_events = self.parse_block_mapping()
+                else:
+                    if block:
+                        node = 'block'
+                    else:
+                        node = 'flow'
+                    token = self.scanner.peek()
+                    raise ParserError("while scanning a %s node" % node, start_marker,
+                            "expected the node content, but found %r" % token.id,
+                            token.start_marker)
+            yield event
+            if collection_events is not None:
+                for event in collection_events:
+                    yield event
 
     def parse_block_sequence(self):
-        sequence = []
-        if not self.is_token(BlockSequenceStartToken):
-            self.fail('BLOCK-SEQUENCE-START is expected')
-        self.get_token()
-        while self.is_token(EntryToken):
-            self.get_token()
-            if not self.is_token(EntryToken, BlockEndToken):
-                sequence.append(self.parse_block_node())
+        # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
+        token = self.scanner.get()
+        start_marker = token.start_marker
+        while self.scanner.check(BlockEntryToken):
+            token = self.scanner.get()
+            if not self.scanner.check(BlockEntryToken, BlockEndToken):
+                for event in self.parse_block_node():
+                    yield event
             else:
-                sequence.append(None)
-        if not self.is_token(BlockEndToken):
-            self.fail('BLOCK-END is expected')
-        self.get_token()
-        return sequence
+                yield self.process_empty_scalar(token.end_marker)
+        if not self.scanner.check(BlockEndToken):
+            token = self.scanner.peek()
+            raise ParserError("while scanning a block collection", start_marker,
+                    "expected <block end>, but found %r" % token.id, token.start_marker)
+        token = self.scanner.get()
+        yield CollectionEndEvent(token.start_marker, token.end_marker)
 
     def parse_indentless_sequence(self):
-        sequence = []
-        while self.is_token(EntryToken):
-            self.get_token()
-            if not self.is_token(EntryToken):
-                sequence.append(self.parse_block_node())
+        # (BLOCK-ENTRY block_node?)+
+        while self.scanner.check(BlockEntryToken):
+            token = self.scanner.get()
+            if not self.scanner.check(BlockEntryToken,
+                    KeyToken, ValueToken, BlockEndToken):
+                for event in self.parse_block_node():
+                    yield event
             else:
-                sequence.append(None)
-        return sequence
+                yield self.process_empty_scalar(token.end_marker)
+        token = self.scanner.peek()
+        yield CollectionEndEvent(token.start_marker, token.start_marker)
 
     def parse_block_mapping(self):
-        mapping = []
-        if not self.is_token(BlockMappingStartToken):
-            self.fail('BLOCK-MAPPING-START is expected')
-        self.get_token()
-        while self.is_token(KeyToken, ValueToken):
-            key = None
-            value = None
-            if self.is_token(KeyToken):
-                self.get_token()
-                if not self.is_token(KeyToken, ValueToken, BlockEndToken):
-                    key = self.parse_block_node_or_indentless_sequence()
-            if self.is_token(ValueToken):
-                self.get_token()
-                if not self.is_token(KeyToken, ValueToken, BlockEndToken):
-                    value = self.parse_block_node_or_indentless_sequence()
-            mapping.append((key, value))
-        if not self.is_token(BlockEndToken):
-            self.fail('BLOCK-END is expected')
-        self.get_token()
-        return mapping
+        # BLOCK-MAPPING_START
+        #   ((KEY block_node_or_indentless_sequence?)?
+        #   (VALUE block_node_or_indentless_sequence?)?)*
+        # BLOCK-END
+        token = self.scanner.get()
+        start_marker = token.start_marker
+        while self.scanner.check(KeyToken, ValueToken):
+            if self.scanner.check(KeyToken):
+                token = self.scanner.get()
+                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
+                    for event in self.parse_block_node_or_indentless_sequence():
+                        yield event
+                else:
+                    yield self.process_empty_scalar(token.end_marker)
+            if self.scanner.check(ValueToken):
+                token = self.scanner.get()
+                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
+                    for event in self.parse_block_node_or_indentless_sequence():
+                        yield event
+                else:
+                    yield self.process_empty_scalar(token.end_marker)
+            else:
+                token = self.scanner.peek()
+                yield self.process_empty_scalar(token.start_marker)
+        if not self.scanner.check(BlockEndToken):
+            token = self.scanner.peek()
+            raise ParserError("while scanning a block mapping", start_marker,
+                    "expected <block end>, but found %r" % token.id, token.start_marker)
+        token = self.scanner.get()
+        yield CollectionEndEvent(token.start_marker, token.end_marker)
 
     def parse_flow_sequence(self):
-        sequence = []
-        if not self.is_token(FlowSequenceStartToken):
-            self.fail('FLOW-SEQUENCE-START is expected')
-        self.get_token()
-        while not self.is_token(FlowSequenceEndToken):
-            if self.is_token(KeyToken):
-                self.get_token()
-                key = None
-                value = None
-                if not self.is_token(ValueToken):
-                    key = self.parse_flow_node()
-                if self.is_token(ValueToken):
-                    self.get_token()
-                    if not self.is_token(EntryToken, FlowSequenceEndToken):
-                        value = self.parse_flow_node()
-                node = MappingNode(None, None, [(key, value)])
-                sequence.append(node)
+        # flow_sequence ::= FLOW-SEQUENCE-START
+        #                   (flow_sequence_entry FLOW-ENTRY)*
+        #                   flow_sequence_entry?
+        #                   FLOW-SEQUENCE-END
+        # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
+        #
+        # Note that while production rules for both flow_sequence_entry and
+        # flow_mapping_entry are equal, their interpretations are different.
+        # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
+        # generate an inline mapping (set syntax).
+        token = self.scanner.get()
+        start_marker = token.start_marker
+        while not self.scanner.check(FlowSequenceEndToken):
+            if self.scanner.check(KeyToken):
+                token = self.scanner.get()
+                yield MappingEvent(None, u'!',
+                        token.start_marker, token.end_marker)
+                if not self.scanner.check(ValueToken,
+                        FlowEntryToken, FlowSequenceEndToken):
+                    for event in self.parse_flow_node():
+                        yield event
+                else:
+                    yield self.process_empty_scalar(token.end_marker)
+                if self.scanner.check(ValueToken):
+                    token = self.scanner.get()
+                    if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
+                        for event in self.parse_flow_node():
+                            yield event
+                    else:
+                        yield self.process_empty_scalar(token.end_marker)
+                else:
+                    token = self.scanner.peek()
+                    yield self.process_empty_scalar(token.start_marker)
+                token = self.scanner.peek()
+                yield CollectionEndEvent(token.start_marker, token.start_marker)
             else:
-                sequence.append(self.parse_flow_node())
-            if not self.is_token(EntryToken, FlowSequenceEndToken):
-                self.fail("ENTRY or FLOW-SEQUENCE-END are expected")
-            if self.is_token(EntryToken):
-                self.get_token()
-        if not self.is_token(FlowSequenceEndToken):
-            self.fail('FLOW-SEQUENCE-END is expected')
-        self.get_token()
-        return sequence
+                for event in self.parse_flow_node():
+                    yield event
+            if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
+                token = self.scanner.peek()
+                raise ParserError("while scanning a flow sequence", start_marker,
+                        "expected ',' or ']', but got %r" % token.id, token.start_marker)
+            if self.scanner.check(FlowEntryToken):
+                self.scanner.get()
+        if not self.scanner.check(FlowSequenceEndToken):
+            token = self.scanner.peek()
+            raise ParserError("while scanning a flow sequence", start_marker,
+                    "expected ']', but found %r" % token.id, token.start_marker)
+        token = self.scanner.get()
+        yield CollectionEndEvent(token.start_marker, token.end_marker)
 
     def parse_flow_mapping(self):
-        mapping = []
-        if not self.is_token(FlowMappingStartToken):
-            self.fail('FLOW-MAPPING-START is expected')
-        self.get_token()
-        while not self.is_token(FlowMappingEndToken):
-            if self.is_token(KeyToken):
-                self.get_token()
-                key = None
-                value = None
-                if not self.is_token(ValueToken):
-                    key = self.parse_flow_node()
-                if self.is_token(ValueToken):
-                    self.get_token()
-                    if not self.is_token(EntryToken, FlowMappingEndToken):
-                        value = self.parse_flow_node()
-                mapping.append((key, value))
+        # flow_mapping ::= FLOW-MAPPING-START
+        #                  (flow_mapping_entry FLOW-ENTRY)*
+        #                  flow_mapping_entry?
+        #                  FLOW-MAPPING-END
+        # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
+        token = self.scanner.get()
+        start_marker = token.start_marker
+        while not self.scanner.check(FlowMappingEndToken):
+            if self.scanner.check(KeyToken):
+                token = self.scanner.get()
+                if not self.scanner.check(ValueToken,
+                        FlowEntryToken, FlowMappingEndToken):
+                    for event in self.parse_flow_node():
+                        yield event
+                else:
+                    yield self.process_empty_scalar(token.end_marker)
+                if self.scanner.check(ValueToken):
+                    token = self.scanner.get()
+                    if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
+                        for event in self.parse_flow_node():
+                            yield event
+                    else:
+                        yield self.process_empty_scalar(token.end_marker)
+                else:
+                    token = self.scanner.peek()
+                    yield self.process_empty_scalar(token.start_marker)
             else:
-                mapping.append((self.parse_flow_node(), None))
-            if not self.is_token(EntryToken, FlowMappingEndToken):
-                self.fail("ENTRY or FLOW-MAPPING-END are expected")
-            if self.is_token(EntryToken):
-                self.get_token()
-        if not self.is_token(FlowMappingEndToken):
-            self.fail('FLOW-MAPPING-END is expected')
-        self.get_token()
-        return mapping
+                for event in self.parse_flow_node():
+                    yield event
+                yield self.process_empty_scalar(self.scanner.peek().start_marker)
+        if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
+            token = self.scanner.peek()
+            raise ParserError("while scanning a flow mapping", start_marker,
+                    "expected ',' or '}', but got %r" % token.id, token.start_marker)
+        if self.scanner.check(FlowEntryToken):
+            self.scanner.get()
+        if not self.scanner.check(FlowMappingEndToken):
+            token = self.scanner.peek()
+            raise ParserError("while scanning a flow mapping", start_marker,
+                    "expected '}', but found %r" % token.id, token.start_marker)
+        token = self.scanner.get()
+        yield CollectionEndEvent(token.start_marker, token.end_marker)
 
-    def fail(self, message):
-        marker = self.scanner.peek_token().start_marker
-        raise ParserError(message+':\n'+marker.get_snippet())
+    def process_empty_scalar(self, marker):
+        return ScalarEvent(None, None, u'', marker, marker)
diff --git a/lib/yaml/scanner.py b/lib/yaml/scanner.py
index c83a551..220a99b 100644
--- a/lib/yaml/scanner.py
+++ b/lib/yaml/scanner.py
@@ -14,7 +14,6 @@
 from error import YAMLError
 from tokens import *
 
 class ScannerError(YAMLError):
-    # TODO:
     # ScannerError: while reading a quoted string
     #   in '...', line 5, column 10:
     #     key: "valu\?e"
@@ -23,6 +22,7 @@ class ScannerError(YAMLError):
     #   in '...', line 5, column 15:
     #     key: "valu\?e"
     #              ^
+
     def __init__(self, context=None, context_marker=None,
             problem=None, problem_marker=None):
         self.context = context
@@ -41,6 +41,8 @@ class ScannerError(YAMLError):
         return '\n'.join(lines)
 
 class SimpleKey:
+    # See below simple keys treatment.
+
     def __init__(self, token_number, required, index, line, column, marker):
         self.token_number = token_number
         self.required = required
@@ -114,23 +116,43 @@ class Scanner:
        # '[', or '{' tokens.
         self.possible_simple_keys = {}
 
-    # Two public methods.
+    # Public methods.
+
+    def check(self, *choices):
+        # Check if the next token is one of the given types.
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        if self.tokens:
+            for choice in choices:
+                if isinstance(self.tokens[0], choice):
+                    return True
+        return False
 
-    def peek_token(self):
-        """Get the current token."""
+    def peek(self):
+        # Return the next token, but do not delete if from the queue.
         while self.need_more_tokens():
             self.fetch_more_tokens()
         if self.tokens:
             return self.tokens[0]
 
-    def get_token(self):
-        "Get the current token and remove it from the list of pending tokens."""
+    def get(self):
+        # Return the next token.
         while self.need_more_tokens():
             self.fetch_more_tokens()
         if self.tokens:
             self.tokens_taken += 1
             return self.tokens.pop(0)
 
+    def __iter__(self):
+        # Iterator protocol.
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        while self.tokens:
+            self.tokens_taken += 1
+            yield self.tokens.pop(0)
+            while self.need_more_tokens():
+                self.fetch_more_tokens()
+
     # Private methods.
 
     def need_more_tokens(self):
@@ -163,10 +185,6 @@ class Scanner:
         if ch == u'\0':
             return self.fetch_stream_end()
 
-        # Is it the byte order mark?
-        if ch == u'\uFEFF':
-            return self.fetch_bom()
-
         # Is it a directive?
         if ch == u'%' and self.check_directive():
             return self.fetch_directive()
@@ -197,9 +215,13 @@ class Scanner:
         if ch == u'}':
             return self.fetch_flow_mapping_end()
 
-        # Is it the entry indicator?
-        if ch in u'-,' and self.check_entry():
-            return self.fetch_entry()
+        # Is it the flow entry indicator?
+        if ch in u',':
+            return self.fetch_flow_entry()
+
+        # Is it the block entry indicator?
+        if ch in u'-' and self.check_block_entry():
+            return self.fetch_block_entry()
 
         # Is it the key indicator?
         if ch == u'?' and self.check_key():
@@ -364,33 +386,6 @@ class Scanner:
         # The reader is ended.
         self.done = True
 
-    def fetch_bom(self):
-        # We consider the BOM marker as a DOCUMENT-END indicator unless it's
-        # the first character in the stream. It's a reasonable approximation
-        # of the specification requirements. We can follow the specification
-        # literally, but it will require a new token class. Probably later.
-
-        # We ignore BOM if it is the first character in the stream.
-        if self.reader.index == 0:
-            slef.reader.forward()
-
-        # Otherwise we issue DOCUMENT-END.
-        else:
-
-            # Set the current intendation to -1.
-            self.unwind_indent(-1)
-
-            # Reset simple keys. Note that there could not be a block
-            # collection after BOM.
-            self.remove_possible_simple_key()
-            self.allow_simple_key = False
-
-            # Add DOCUMENT-END.
-            start_marker = self.reader.get_marker()
-            self.reader.forward()
-            end_marker = self.reader.get_marker()
-            self.tokens.append(DocumentEndToken(start_marker, end_marker))
-
     def fetch_directive(self):
 
         # Set the current intendation to -1.
@@ -471,7 +466,21 @@ class Scanner:
         end_marker = self.reader.get_marker()
         self.tokens.append(TokenClass(start_marker, end_marker))
 
-    def fetch_entry(self):
+    def fetch_flow_entry(self):
+
+        # Simple keys are allowed after ','.
+        self.allow_simple_key = True
+
+        # Reset possible simple key on the current level.
+        self.remove_possible_simple_key()
+
+        # Add FLOW-ENTRY.
+        start_marker = self.reader.get_marker()
+        self.reader.forward()
+        end_marker = self.reader.get_marker()
+        self.tokens.append(FlowEntryToken(start_marker, end_marker))
+
+    def fetch_block_entry(self):
 
         # Block context needs additional checks.
         if not self.flow_level:
@@ -487,17 +496,22 @@ class Scanner:
                 marker = self.reader.get_marker()
                 self.tokens.append(BlockSequenceStartToken(marker, marker))
 
-        # Simple keys are allowed after '-' and ','.
+        # It's an error for the block entry to occur in the flow context,
+        # but we let the parser detect this.
+        else:
+            pass
+
+        # Simple keys are allowed after '-'.
         self.allow_simple_key = True
 
         # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
 
-        # Add ENTRY.
+        # Add BLOCK-ENTRY.
         start_marker = self.reader.get_marker()
         self.reader.forward()
         end_marker = self.reader.get_marker()
-        self.tokens.append(EntryToken(start_marker, end_marker))
+        self.tokens.append(BlockEntryToken(start_marker, end_marker))
 
     def fetch_key(self):
@@ -681,16 +695,10 @@ class Scanner:
                 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
             return True
 
-    def check_entry(self):
+    def check_block_entry(self):
 
-        # ENTRY(flow context):      ','
-        if self.flow_level:
-            return self.reader.peek() == u','
-
-        # ENTRY(block context):     '-' (' '|'\n')
-        else:
-            return self.reader.peek() == u'-'   \
-                    and self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
+        # BLOCK-ENTRY:      '-' (' '|'\n')
+        return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
 
     def check_key(self):
@@ -737,6 +745,12 @@ class Scanner:
         # We ignore spaces, line breaks and comments.
         # If we find a line break in the block context, we set the flag
         # `allow_simple_key` on.
+        # The byte order mark is stripped if it's the first character in the
+        # stream. We do not yet support BOM inside the stream as the
+        # specification requires. Any such mark will be considered as a part
+        # of the document.
+        if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
+            self.reader.forward()
         found = False
         while not found:
             while self.reader.peek() == u' ':
@@ -980,25 +994,25 @@ class Scanner:
                     # Unfortunately, folding rules are ambiguous.
                     #
                     # This is the folding according to the specification:
+
+                    if folded and line_break == u'\n'   \
+                            and leading_non_space and self.reader.peek() not in u' \t':
+                        if not breaks:
+                            chunks.append(u' ')
+                    else:
+                        chunks.append(line_break)
+
+                    # This is Clark Evans's interpretation (also in the spec
+                    # examples):
                     #
-                    #if folded and line_break == u'\n'   \
-                    #        and leading_non_space and self.reader.peek() not in u' \t':
+                    #if folded and line_break == u'\n':
                     #    if not breaks:
-                    #        chunks.append(u' ')
+                    #        if self.reader.peek() not in ' \t':
+                    #            chunks.append(u' ')
+                    #    else:
+                    #        chunks.append(line_break)
                     #else:
                     #    chunks.append(line_break)
-                    #
-                    # This is Clark Evans's interpretation (also in the spec
-                    # examples):
-                    #
-                    if folded and line_break == u'\n':
-                        if not breaks:
-                            if self.reader.peek() not in ' \t':
-                                chunks.append(u' ')
-                        else:
-                            chunks.append(line_break)
-                    else:
-                        chunks.append(line_break)
                 else:
                     break
diff --git a/lib/yaml/tokens.py b/lib/yaml/tokens.py
index 85d0b30..637ec87 100644
--- a/lib/yaml/tokens.py
+++ b/lib/yaml/tokens.py
@@ -3,9 +3,19 @@ class Token:
     def __init__(self, start_marker, end_marker):
         self.start_marker = start_marker
         self.end_marker = end_marker
+    def __repr__(self):
+        attributes = [key for key in self.__dict__
+                if not key.endswith('_marker')]
+        attributes.sort()
+        arguments = ', '.join(['%s=%r' % (key, getattr(self, key))
+                for key in attributes])
+        return '%s(%s)' % (self.__class__.__name__, arguments)
+
+#class BOMToken(Token):
+#    id = '<byte order mark>'
 
 class DirectiveToken(Token):
-    code = '<directive>'
+    id = '<directive>'
     def __init__(self, name, value, start_marker, end_marker):
         self.name = name
         self.value = value
@@ -13,67 +23,70 @@ class DirectiveToken(Token):
         self.end_marker = end_marker
 
 class DocumentStartToken(Token):
-    code = '<document start>'
+    id = '<document start>'
 
 class DocumentEndToken(Token):
-    code = '<document end>'
+    id = '<document end>'
 
 class StreamEndToken(Token):
-    code = '<stream end>'
+    id = '<stream end>'
 
 class BlockSequenceStartToken(Token):
-    code = '<block sequence start>'
+    id = '<block sequence start>'
 
 class BlockMappingStartToken(Token):
-    code = '<block mapping end>'
+    id = '<block mapping end>'
 
 class BlockEndToken(Token):
-    code = '<block end>'
+    id = '<block end>'
 
 class FlowSequenceStartToken(Token):
-    code = '['
+    id = '['
 
 class FlowMappingStartToken(Token):
-    code = '{'
+    id = '{'
 
 class FlowSequenceEndToken(Token):
-    code = ']'
+    id = ']'
 
 class FlowMappingEndToken(Token):
-    code = '}'
+    id = '}'
 
 class KeyToken(Token):
-    code = '?'
+    id = '?'
 
 class ValueToken(Token):
-    code = ':'
+    id = ':'
+
+class BlockEntryToken(Token):
+    id = '-'
 
-class EntryToken(Token):
-    code = '- or ,'
+class FlowEntryToken(Token):
+    id = ','
 
 class AliasToken(Token):
-    code = '<alias>'
+    id = '<alias>'
     def __init__(self, value, start_marker, end_marker):
         self.value = value
         self.start_marker = start_marker
         self.end_marker = end_marker
 
 class AnchorToken(Token):
-    code = '<anchor>'
+    id = '<anchor>'
     def __init__(self, value, start_marker, end_marker):
         self.value = value
         self.start_marker = start_marker
         self.end_marker = end_marker
 
 class TagToken(Token):
-    code = '<tag>'
+    id = '<tag>'
     def __init__(self, value, start_marker, end_marker):
         self.value = value
         self.start_marker = start_marker
         self.end_marker = end_marker
 
 class ScalarToken(Token):
-    code = '<scalar>'
+    id = '<scalar>'
     def __init__(self, value, plain, start_marker, end_marker):
         self.value = value
         self.plain = plain
diff --git a/tests/data/spec-05-08.canonical b/tests/data/spec-05-08.canonical
index fd8af6a..610bd68 100644
--- a/tests/data/spec-05-08.canonical
+++ b/tests/data/spec-05-08.canonical
@@ -1,8 +1,8 @@
 %YAML 1.1
 ---
 !!map {
-  ? !!str "double"
-  : !!str "text",
   ? !!str "single"
   : !!str "text",
+  ? !!str "double"
+  : !!str "text",
 }
diff --git a/tests/data/spec-06-01.canonical b/tests/data/spec-06-01.canonical
index a1e43ff..f17ec92 100644
--- a/tests/data/spec-06-01.canonical
+++ b/tests/data/spec-06-01.canonical
@@ -8,8 +8,8 @@
   ? !!str "Flow style"
   : !!seq [
       !!str "By two",
+      !!str "Also by two",
      !!str "Still by two",
-      !!str "Again by two",
     ]
 }
 }
diff --git a/tests/data/spec-09-20.canonical b/tests/data/spec-09-20.canonical
index 3f697e2..d03bef5 100644
--- a/tests/data/spec-09-20.canonical
+++ b/tests/data/spec-09-20.canonical
@@ -4,5 +4,5 @@
   !!str "detected\n",
   !!str "\n\n# detected\n",
   !!str " explicit\n",
-  !!str "\t detected\n",
+  !!str "\t\ndetected\n",
 ]
diff --git a/tests/data/spec-09-30.canonical b/tests/data/spec-09-30.canonical
index 5c32f16..fc37db1 100644
--- a/tests/data/spec-09-30.canonical
+++ b/tests/data/spec-09-30.canonical
@@ -1,7 +1,7 @@
 %YAML 1.1
 ---
 !!str "folded line\n\
-      next line\n\
+      next line\n\n\
       \  * bullet\n\
-      \  * list\n\
+      \  * list\n\n\
       last line\n"
diff --git a/tests/data/spec-09-31.canonical b/tests/data/spec-09-31.canonical
index 5c32f16..fc37db1 100644
--- a/tests/data/spec-09-31.canonical
+++ b/tests/data/spec-09-31.canonical
@@ -1,7 +1,7 @@
 %YAML 1.1
 ---
 !!str "folded line\n\
-      next line\n\
+      next line\n\n\
       \  * bullet\n\
-      \  * list\n\
+      \  * list\n\n\
       last line\n"
diff --git a/tests/data/spec-09-32.canonical b/tests/data/spec-09-32.canonical
index 5c32f16..fc37db1 100644
--- a/tests/data/spec-09-32.canonical
+++ b/tests/data/spec-09-32.canonical
@@ -1,7 +1,7 @@
 %YAML 1.1
 ---
 !!str "folded line\n\
-      next line\n\
+      next line\n\n\
       \  * bullet\n\
-      \  * list\n\
+      \  * list\n\n\
       last line\n"
diff --git a/tests/data/spec-09-33.canonical b/tests/data/spec-09-33.canonical
index 5c32f16..fc37db1 100644
--- a/tests/data/spec-09-33.canonical
+++ b/tests/data/spec-09-33.canonical
@@ -1,7 +1,7 @@
 %YAML 1.1
 ---
 !!str "folded line\n\
-      next line\n\
+      next line\n\n\
       \  * bullet\n\
-      \  * list\n\
+      \  * list\n\n\
       last line\n"
diff --git a/tests/test_appliance.py b/tests/test_appliance.py
index c471398..6879036 100644
--- a/tests/test_appliance.py
+++ b/tests/test_appliance.py
@@ -1,6 +1,9 @@
 
 import unittest, os
 
+from yaml.tokens import *
+from yaml.events import *
+
 class TestAppliance(unittest.TestCase):
 
     DATA = 'tests/data'
@@ -32,96 +35,12 @@ class TestAppliance(unittest.TestCase):
             setattr(cls, test_method.__name__, test_method)
     add_tests = classmethod(add_tests)
 
-class Node:
-    def __repr__(self):
-        args = []
-        for attribute in ['anchor', 'tag', 'value']:
-            if hasattr(self, attribute):
-                args.append(repr(getattr(self, attribute)))
-        return "%s(%s)" % (self.__class__.__name__, ', '.join(args))
-
-class AliasNode(Node):
-    def __init__(self, anchor):
-        self.anchor = anchor
-
-class ScalarNode(Node):
-    def __init__(self, anchor, tag, value):
-        self.anchor = anchor
-        self.tag = tag
-        self.value = value
-
-class SequenceNode(Node):
-    def __init__(self, anchor, tag, value):
-        self.anchor = anchor
-        self.tag = tag
-        self.value = value
-
-class MappingNode(Node):
-    def __init__(self, anchor, tag, value):
-        self.anchor = anchor
-        self.tag = tag
-        self.value = value
-
-class Token:
-    def __repr__(self):
-        args = []
-        if hasattr(self, 'value'):
-            args.append(repr(self.value))
-        return "%s(%s)" % (self.__class__.__name__, ''.join(args))
-
-class StreamEndToken(Token):
-    pass
-
-class DirectiveToken(Token):
-    pass
-
-class DocumentStartToken(Token):
-    pass
-
-class SequenceStartToken(Token):
-    pass
-
-class MappingStartToken(Token):
-    pass
-
-class SequenceEndToken(Token):
-    pass
-
-class MappingEndToken(Token):
-    pass
-
-class KeyToken(Token):
-    pass
-
-class ValueToken(Token):
-    pass
-
-class EntryToken(Token):
-    pass
-
-class AliasToken(Token):
-    def __init__(self, value):
-        self.value = value
-
-class AnchorToken(Token):
-    def __init__(self, value):
-        self.value = value
-
-class TagToken(Token):
-    def __init__(self, value):
-        self.value = value
-
-class ScalarToken(Token):
-    def __init__(self, value):
-        self.value = value
-
 class Error(Exception):
     pass
 
 class CanonicalScanner:
 
-    def __init__(self, source, data):
-        self.source = source
+    def __init__(self, data):
         self.data = unicode(data, 'utf-8')+u'\0'
         self.index = 0
 
@@ -132,34 +51,34 @@ class CanonicalScanner:
             self.find_token()
             ch = self.data[self.index]
             if ch == u'\0':
-                tokens.append(StreamEndToken())
+                tokens.append(StreamEndToken(None, None))
                 break
             elif ch == u'%':
                 tokens.append(self.scan_directive())
             elif ch == u'-' and self.data[self.index:self.index+3] == u'---':
                 self.index += 3
-                tokens.append(DocumentStartToken())
+                tokens.append(DocumentStartToken(None, None))
             elif ch == u'[':
                 self.index += 1
-                tokens.append(SequenceStartToken())
+                tokens.append(FlowSequenceStartToken(None, None))
             elif ch == u'{':
                 self.index += 1
-                tokens.append(MappingStartToken())
+                tokens.append(FlowMappingStartToken(None, None))
             elif ch == u']':
                 self.index += 1
-                tokens.append(SequenceEndToken())
+                tokens.append(FlowSequenceEndToken(None, None))
             elif ch == u'}':
                 self.index += 1
-                tokens.append(MappingEndToken())
+                tokens.append(FlowMappingEndToken(None, None))
             elif ch == u'?':
                 self.index += 1
-                tokens.append(KeyToken())
+                tokens.append(KeyToken(None, None))
            elif ch == u':':
                 self.index += 1
-                tokens.append(ValueToken())
+                tokens.append(ValueToken(None, None))
            elif ch == u',':
                 self.index += 1
-                tokens.append(EntryToken())
+                tokens.append(FlowEntryToken(None, None))
             elif ch == u'*' or ch == u'&':
                 tokens.append(self.scan_alias())
             elif ch == u'!':
@@ -176,7 +95,7 @@ class CanonicalScanner:
         if self.data[self.index:self.index+len(self.DIRECTIVE)] == self.DIRECTIVE and \
                 self.data[self.index+len(self.DIRECTIVE)] in u' \n\0':
             self.index += len(self.DIRECTIVE)
-            return DirectiveToken()
+            return DirectiveToken('YAML', (1, 1), None, None)
 
     def scan_alias(self):
         if self.data[self.index] == u'*':
@@ -188,7 +107,7 @@ class CanonicalScanner:
         while self.data[self.index] not in u', \n\0':
             self.index += 1
         value = self.data[start:self.index]
-        return TokenClass(value)
+        return TokenClass(value, None, None)
 
     def scan_tag(self):
         self.index += 1
@@ -198,9 +117,11 @@ class CanonicalScanner:
         value = self.data[start:self.index]
         if value[0] == u'!':
             value = 'tag:yaml.org,2002:'+value[1:]
-        else:
+        elif value[0] == u'<' and value[-1] == u'>':
             value = value[1:-1]
-        return TagToken(value)
+        else:
+            value = u'!'+value
+        return TagToken(value, None, None)
 
     QUOTE_CODES = {
         'x': 2,
@@ -264,7 +185,7 @@ class CanonicalScanner:
                 self.index += 1
             chunks.append(self.data[start:self.index])
             self.index += 1
-        return ScalarToken(u''.join(chunks))
+        return ScalarToken(u''.join(chunks), False, None, None)
 
     def find_token(self):
         found = False
@@ -281,83 +202,79 @@ class CanonicalScanner:
 
 class CanonicalParser:
 
-    def __init__(self, source, data):
-        self.scanner = CanonicalScanner(source, data)
+    def __init__(self, data):
+        self.scanner = CanonicalScanner(data)
+        self.events = []
 
     # stream: document* END
     def parse_stream(self):
-        documents = []
         while not self.test_token(StreamEndToken):
             if self.test_token(DirectiveToken, DocumentStartToken):
-                documents.append(self.parse_document())
+                self.parse_document()
             else:
                 raise Error("document is expected, got "+repr(self.tokens[self.index]))
-        return documents
+        self.events.append(StreamEndEvent(None, None))
 
-    # document: DIRECTIVE? DOCUMENT-START node?
+    # document: DIRECTIVE? DOCUMENT-START node
     def parse_document(self):
         node = None
         if self.test_token(DirectiveToken):
             self.consume_token(DirectiveToken)
         self.consume_token(DocumentStartToken)
-        if self.test_token(TagToken, AliasToken, AnchorToken, TagToken,
-                SequenceStartToken, MappingStartToken, ScalarToken):
-            node = self.parse_node()
-        return node
+        self.parse_node()
 
     # node: ALIAS | ANCHOR? TAG? (SCALAR|sequence|mapping)
     def parse_node(self):
         if self.test_token(AliasToken):
-            return AliasNode(self.get_value())
+            self.events.append(AliasEvent(self.get_value(), None, None))
         else:
             anchor = None
             if self.test_token(AnchorToken):
                 anchor = self.get_value()
-            tag = None
+            tag = u'!'
             if self.test_token(TagToken):
                 tag = self.get_value()
             if self.test_token(ScalarToken):
-                return ScalarNode(anchor, tag, self.get_value())
-            elif self.test_token(SequenceStartToken):
-                return SequenceNode(anchor, tag, self.parse_sequence())
-            elif self.test_token(MappingStartToken):
-                return MappingNode(anchor, tag, self.parse_mapping())
+                self.events.append(ScalarEvent(anchor, tag, self.get_value(), None, None))
+            elif self.test_token(FlowSequenceStartToken):
+                self.events.append(SequenceEvent(anchor, tag, None, None))
+                self.parse_sequence()
+            elif self.test_token(FlowMappingStartToken):
+                self.events.append(MappingEvent(anchor, tag, None, None))
+                self.parse_mapping()
             else:
                 raise Error("SCALAR, '[', or '{' is expected, got "+repr(self.tokens[self.index]))
 
     # sequence: SEQUENCE-START (node (ENTRY node)*)? ENTRY? SEQUENCE-END
     def parse_sequence(self):
-        values = []
-        self.consume_token(SequenceStartToken)
-        if not self.test_token(SequenceEndToken):
-            values.append(self.parse_node())
-            while not self.test_token(SequenceEndToken):
-                self.consume_token(EntryToken)
-                if not self.test_token(SequenceEndToken):
-                    values.append(self.parse_node())
-        self.consume_token(SequenceEndToken)
-        return values
+        self.consume_token(FlowSequenceStartToken)
+        if not self.test_token(FlowSequenceEndToken):
+            self.parse_node()
+            while not self.test_token(FlowSequenceEndToken):
+                self.consume_token(FlowEntryToken)
+                if not self.test_token(FlowSequenceEndToken):
+                    self.parse_node()
+        self.consume_token(FlowSequenceEndToken)
+        self.events.append(CollectionEndEvent(None, None))
 
     # mapping: MAPPING-START (map_entry (ENTRY map_entry)*)? ENTRY? MAPPING-END
     def parse_mapping(self):
-        values = []
-        self.consume_token(MappingStartToken)
-        if not self.test_token(MappingEndToken):
-            values.append(self.parse_map_entry())
-            while not self.test_token(MappingEndToken):
-                self.consume_token(EntryToken)
-                if not self.test_token(MappingEndToken):
-                    values.append(self.parse_map_entry())
-        self.consume_token(MappingEndToken)
-        return values
+        self.consume_token(FlowMappingStartToken)
+        if not self.test_token(FlowMappingEndToken):
+            self.parse_map_entry()
+            while not self.test_token(FlowMappingEndToken):
+                self.consume_token(FlowEntryToken)
+                if not self.test_token(FlowMappingEndToken):
+                    self.parse_map_entry()
+        self.consume_token(FlowMappingEndToken)
+        self.events.append(CollectionEndEvent(None, None))
 
     # map_entry: KEY node VALUE node
     def parse_map_entry(self):
         self.consume_token(KeyToken)
-        key = self.parse_node()
+        self.parse_node()
         self.consume_token(ValueToken)
-        value = self.parse_node()
-        return (key, value)
+        self.parse_node()
 
     def test_token(self, *choices):
         for choice in choices:
@@ -378,5 +295,6 @@ class CanonicalParser:
     def parse(self):
         self.tokens = self.scanner.scan()
         self.index = 0
-        return self.parse_stream()
+        self.parse_stream()
+        return self.events
diff --git a/tests/test_canonical.py b/tests/test_canonical.py
index add1f8e..7fa85dc 100644
--- a/tests/test_canonical.py
+++ b/tests/test_canonical.py
@@ -5,16 +5,17 @@ class TestCanonicalAppliance(test_appliance.TestAppliance):
 
     def _testCanonicalScanner(self, test_name, canonical_filename):
         data = file(canonical_filename, 'rb').read()
-        scanner = test_appliance.CanonicalScanner(canonical_filename, data)
+        scanner = test_appliance.CanonicalScanner(data)
         tokens = scanner.scan()
-        #print tokens
+        #for token in tokens:
+        #    print token
 
     def _testCanonicalParser(self, test_name, canonical_filename):
         data = file(canonical_filename, 'rb').read()
-        parser = test_appliance.CanonicalParser(canonical_filename, data)
-        documents = parser.parse()
-        #for document in documents:
-        #    print document
+        parser = test_appliance.CanonicalParser(data)
+        events = parser.parse()
+        #for event in events:
+        #    print event
 
 TestCanonicalAppliance.add_tests('testCanonicalScanner', '.canonical')
 TestCanonicalAppliance.add_tests('testCanonicalParser', '.canonical')
diff --git a/tests/test_errors.py b/tests/test_errors.py
index 2b6e9a2..431258b 100644
--- a/tests/test_errors.py
+++ b/tests/test_errors.py
@@ -18,14 +18,12 @@ class TestErrors(test_appliance.TestAppliance):
     def _load(self, filename):
         reader = Reader(file(filename, 'rb'))
         scanner = Scanner(reader)
-        while scanner.peek_token():
-            scanner.get_token()
+        return list(scanner)
 
     def _load_string(self, filename):
         reader = Reader(file(filename, 'rb').read())
         scanner = Scanner(reader)
-        while scanner.peek_token():
-            scanner.get_token()
+        return list(scanner)
 
 TestErrors.add_tests('testErrors', '.error-message')
 TestErrors.add_tests('testStringErrors', '.error-message')
diff --git a/tests/test_structure.py b/tests/test_structure.py
index 12c87d0..0bffbcd 100644
--- a/tests/test_structure.py
+++ b/tests/test_structure.py
@@ -12,8 +12,10 @@ class TestStructure(test_appliance.TestAppliance):
         node2 = eval(file(structure_filename, 'rb').read())
         try:
             parser = Parser(Scanner(Reader(file(data_filename, 'rb'))))
-            node1 = parser.parse()
-            node1 = [self._convert(n) for n in node1]
+            node1 = []
+            while not parser.check(StreamEndEvent):
+                node1.append(self._convert(parser))
+            parser.get()
             if len(node1) == 1:
                 node1 = node1[0]
             self.failUnlessEqual(node1, node2)
@@ -25,97 +27,110 @@ class TestStructure(test_appliance.TestAppliance):
             print "NODE2:", node2
             raise
 
-    def _convert(self, node):
-        if isinstance(node, ScalarNode):
-            return True
-        elif isinstance(node, SequenceNode):
+    def _convert(self, parser):
+        if parser.check(ScalarEvent):
+            event = parser.get()
+            if event.tag or event.anchor or event.value:
+                return True
+            else:
+                return None
+        elif parser.check(SequenceEvent):
+            parser.get()
             sequence = []
-            for item in node.value:
-                sequence.append(self._convert(item))
+            while not parser.check(CollectionEndEvent):
+                sequence.append(self._convert(parser))
+            parser.get()
             return sequence
-        elif isinstance(node, MappingNode):
+        elif parser.check(MappingEvent):
+            parser.get()
             mapping = []
-            for key, value in node.value:
-                mapping.append((self._convert(key), self._convert(value)))
+            while not parser.check(CollectionEndEvent):
+                key = self._convert(parser)
+                value = self._convert(parser)
+                mapping.append((key, value))
+            parser.get()
             return mapping
-        elif isinstance(node, AliasNode):
+        elif parser.check(AliasEvent):
+            parser.get()
             return '*'
         else:
-            return node
+            parser.get()
+            return '?'
 
 TestStructure.add_tests('testStructure', '.data', '.structure')
 
 class TestParser(test_appliance.TestAppliance):
 
     def _testParser(self, test_name, data_filename, canonical_filename):
-        documents1 = None
-        documents2 = None
+        events1 = None
+        events2 = None
         try:
             parser = Parser(Scanner(Reader(file(data_filename, 'rb'))))
-            documents1 = parser.parse()
-            canonical = test_appliance.CanonicalParser(canonical_filename, file(canonical_filename, 'rb').read())
-            documents2 = canonical.parse()
-            self._compare(documents1, documents2)
+            events1 = list(iter(parser))
+            canonical = test_appliance.CanonicalParser(file(canonical_filename, 'rb').read())
+            events2 = canonical.parse()
+            self._compare(events1, events2)
         except:
             print
             print "DATA1:"
             print file(data_filename, 'rb').read()
             print "DATA2:"
             print file(canonical_filename, 'rb').read()
-            print "DOCUMENTS1:", documents1
-            print "DOCUMENTS2:", documents2
+            print "EVENTS1:", events1
+            print "EVENTS2:", events2
             raise
 
-    def _compare(self, value1, value2):
-        if value1 is None and hasattr(value2, 'tag') and value2.tag == 'tag:yaml.org,2002:null':
-            return
-        self.failUnlessEqual(type(value1), type(value2))
-        if isinstance(value1, list) or isinstance(value1, tuple):
-            self.failUnlessEqual(len(value1), len(value2))
-            for item1, item2 in zip(value1, value2):
-                self._compare(item1, item2)
-        else:
-            self.failUnlessEqual(value1.__class__.__name__, value2.__class__.__name__)
-            if isinstance(value1, SequenceNode): # or isinstance(value1, MappingNode):
-                self._compare(value1.value, value2.value)
-            elif isinstance(value1, ScalarNode):
-                self.failUnlessEqual(value1.value, value2.value)
+    def _compare(self, events1, events2):
+        self.failUnlessEqual(len(events1), len(events2))
+        for event1, event2 in zip(events1, events2):
+            self.failUnlessEqual(event1.__class__, event2.__class__)
+            if isinstance(event1, AliasEvent):
+                #self.failUnlessEqual(event1.name, event2.name)
+                pass
+            elif isinstance(event1, ScalarEvent):
+                #self.failUnlessEqual(event1.anchor, event2.anchor)
+                #self.failUnlessEqual(event1.tag, event2.tag)
+                self.failUnlessEqual(event1.value, event2.value)
+            if isinstance(event1, CollectionEvent):
+                #self.failUnlessEqual(event1.anchor, event2.anchor)
+                #self.failUnlessEqual(event1.tag, event2.tag)
+                pass
 
 TestParser.add_tests('testParser', '.data', '.canonical')
 
 class TestParserOnCanonical(test_appliance.TestAppliance):
 
     def _testParserOnCanonical(self, test_name, canonical_filename):
-        documents1 = None
-        documents2 = None
+        events1 = None
+        events2 = None
         try:
             parser = Parser(Scanner(Reader(file(canonical_filename, 'rb'))))
-            documents1 = parser.parse()
-            canonical = test_appliance.CanonicalParser(canonical_filename, file(canonical_filename, 'rb').read())
-            documents2 = canonical.parse()
-            self._compare(documents1, documents2)
+            events1 = list(iter(parser))
+            canonical = test_appliance.CanonicalParser(file(canonical_filename, 'rb').read())
+            events2 = canonical.parse()
+            self._compare(events1, events2)
         except:
             print
             print "DATA:"
             print file(canonical_filename, 'rb').read()
-            print "DOCUMENTS1:", documents1
-            print "DOCUMENTS2:", documents2
+            print "EVENTS1:", events1
+            print "EVENTS2:", events2
             raise
 
-    def _compare(self, value1, value2):
-        if value1 is None and hasattr(value2, 'tag') and value2.tag == 'tag:yaml.org,2002:null':
-            return
-        self.failUnlessEqual(type(value1), type(value2))
-        if isinstance(value1, list) or isinstance(value1, tuple):
-            self.failUnlessEqual(len(value1), len(value2))
-            for item1, item2 in zip(value1, value2):
-                self._compare(item1, item2)
-        else:
-            self.failUnlessEqual(value1.__class__.__name__, value2.__class__.__name__)
-            if isinstance(value1, SequenceNode) or isinstance(value1, MappingNode):
-                self._compare(value1.value, value2.value)
-            elif isinstance(value1, ScalarNode):
-                self.failUnlessEqual(value1.value, value2.value)
+    def _compare(self, events1, events2):
+        self.failUnlessEqual(len(events1), len(events2))
+        for event1, event2 in zip(events1, events2):
+            self.failUnlessEqual(event1.__class__, event2.__class__)
+            if isinstance(event1, AliasEvent):
+                self.failUnlessEqual(event1.name, event2.name)
+            elif isinstance(event1, ScalarEvent):
+                self.failUnlessEqual(event1.anchor, event2.anchor)
+                self.failUnlessEqual(event1.tag, event2.tag)
+                self.failUnlessEqual(event1.value, event2.value)
+            if isinstance(event1, CollectionEvent):
+                self.failUnlessEqual(event1.anchor, event2.anchor)
+                self.failUnlessEqual(event1.tag, event2.tag)
 
 TestParserOnCanonical.add_tests('testParserOnCanonical', '.canonical')
diff --git a/tests/test_tokens.py b/tests/test_tokens.py
index f5daaf2..2ccc305 100644
--- a/tests/test_tokens.py
+++ b/tests/test_tokens.py
@@ -41,7 +41,8 @@ class TestTokens(test_appliance.TestAppliance):
         FlowSequenceEndToken: ']',
         FlowMappingStartToken: '{',
         FlowMappingEndToken: '}',
-        EntryToken: ',',
+        BlockEntryToken: ',',
+        FlowEntryToken: ',',
         KeyToken: '?',
         ValueToken: ':',
     }
@@ -52,8 +53,9 @@ class TestTokens(test_appliance.TestAppliance):
         try:
             scanner = Scanner(Reader(file(data_filename, 'rb')))
             tokens1 = []
-            while not isinstance(scanner.peek_token(), StreamEndToken):
-                tokens1.append(scanner.get_token())
+            for token in scanner:
+                if not isinstance(token, StreamEndToken):
+                    tokens1.append(token)
             tokens1 = [self.replaces[t.__class__] for t in tokens1]
             self.failUnlessEqual(tokens1, tokens2)
         except:
@@ -74,8 +76,9 @@ class TestScanner(test_appliance.TestAppliance):
         try:
             scanner = Scanner(Reader(file(filename, 'rb')))
             tokens = []
-            while not isinstance(scanner.peek_token(), StreamEndToken):
-                tokens.append(scanner.get_token().__class__.__name__)
+            for token in scanner:
+                if not isinstance(token, StreamEndToken):
+                    tokens.append(token.__class__.__name__)
         except:
             print
             print "DATA:"
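
For completeness, the pull-style interface the tests above rely on — Parser.check() to peek at the type of the next event and Parser.get() to consume it — can also be driven directly. A minimal sketch assuming this revision's classes; the YAML text and the scalar filtering are illustrative:

```python
from yaml import Reader, Scanner, Parser
from yaml.events import ScalarEvent, StreamEndEvent

# check() peeks at the type of the next event; get() consumes it.
parser = Parser(Scanner(Reader("key: value")))
while not parser.check(StreamEndEvent):
    event = parser.get()
    if isinstance(event, ScalarEvent):
        print event.value
```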