From c65cc8317f9cd1a432b75c7c6366680af003b5e4 Mon Sep 17 00:00:00 2001
From: xi
Date: Sun, 12 Feb 2006 23:19:54 +0000
Subject: Working on the scanner.

git-svn-id: http://svn.pyyaml.org/branches/pyyaml3000@39 18f92427-320e-0410-9341-c67f048884a3
---
 Makefile                       |   22 +
 lib/yaml/__init__.py           |    0
 lib/yaml/marker.py             |   35 ++
 lib/yaml/scanner.py            | 1007 ++++++++++++++++++++++++++++++++++++++++
 setup.py                       |   22 +
 tests/data/spec-02-01.tokens   |    1 +
 tests/data/spec-02-02.tokens   |    5 +
 tests/data/spec-02-03.tokens   |    4 +
 tests/data/spec-02-04.tokens   |    4 +
 tests/data/spec-02-05.tokens   |    5 +
 tests/data/spec-02-06.tokens   |    4 +
 tests/data/spec-02-07.tokens   |   12 +
 tests/data/spec-02-08.tokens   |   15 +
 tests/data/spec-02-09.tokens   |    5 +
 tests/data/spec-02-10.tokens   |    5 +
 tests/data/test_marker.markers |   38 ++
 tests/test_appliance.py        |   28 ++
 tests/test_build.py            |   12 +
 tests/test_marker.py           |   35 ++
 tests/test_tokens.py           |   62 +++
 tests/test_yaml.py             |   12 +
 21 files changed, 1333 insertions(+)
 create mode 100644 Makefile
 create mode 100644 lib/yaml/__init__.py
 create mode 100644 lib/yaml/marker.py
 create mode 100644 lib/yaml/scanner.py
 create mode 100644 setup.py
 create mode 100644 tests/data/spec-02-01.tokens
 create mode 100644 tests/data/spec-02-02.tokens
 create mode 100644 tests/data/spec-02-03.tokens
 create mode 100644 tests/data/spec-02-04.tokens
 create mode 100644 tests/data/spec-02-05.tokens
 create mode 100644 tests/data/spec-02-06.tokens
 create mode 100644 tests/data/spec-02-07.tokens
 create mode 100644 tests/data/spec-02-08.tokens
 create mode 100644 tests/data/spec-02-09.tokens
 create mode 100644 tests/data/spec-02-10.tokens
 create mode 100644 tests/data/test_marker.markers
 create mode 100644 tests/test_appliance.py
 create mode 100644 tests/test_build.py
 create mode 100644 tests/test_marker.py
 create mode 100644 tests/test_tokens.py
 create mode 100644 tests/test_yaml.py

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f8207b6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,22 @@
+
+.PHONY: default build force install test clean
+
+PYTHON=/usr/bin/python
+TEST=
+PARAMETERS=
+
+build:
+	${PYTHON} setup.py build ${PARAMETERS}
+
+force:
+	${PYTHON} setup.py build -f ${PARAMETERS}
+
+install: build
+	${PYTHON} setup.py install ${PARAMETERS}
+
+test: build
+	${PYTHON} tests/test_build.py -v ${TEST}
+
+clean:
+	${PYTHON} setup.py clean -a
+
diff --git a/lib/yaml/__init__.py b/lib/yaml/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lib/yaml/marker.py b/lib/yaml/marker.py
new file mode 100644
index 0000000..6ec12e6
--- /dev/null
+++ b/lib/yaml/marker.py
@@ -0,0 +1,35 @@
+
+class Marker:
+
+    def __init__(self, source, data, index, row, column):
+        self.source = source
+        self.data = data
+        self.index = index
+        self.row = row
+        self.column = column
+
+    def get_snippet(self, max_length=79):
+        if not isinstance(self.data, basestring):
+            return None
+        head = ''
+        start = self.index
+        while start > 0 and self.data[start-1] not in '\r\n':
+            start -= 1
+            if self.index-start > max_length/2-1:
+                head = ' ... '
+                start += 5
+                break
+        tail = ''
+        end = self.index
+        while end < len(self.data) and self.data[end] not in '\r\n':
+            end += 1
+            if end-self.index > max_length/2-1:
+                tail = ' ... '
+                end -= 5
+                break
+        snippet = self.data[start:end]
+        if isinstance(snippet, unicode):
+            snippet = snippet.encode('utf-8')
+        return head + snippet + tail + '\n'  \
+                + ' '*(self.index-start+len(head)) + '^' + '\n'
+
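A usage sketch for the new Marker class (not part of the patch; the input
string and index values are made up for illustration):

    from yaml.marker import Marker

    data = "key: @invalid value\n"
    # Point the marker at the offending '@' (index 5, row 0, column 5).
    marker = Marker('<string>', data, 5, 0, 5)
    print marker.get_snippet()
    # Expected output -- the offending line with a caret under column 5:
    #   key: @invalid value
    #        ^
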
diff --git a/lib/yaml/scanner.py b/lib/yaml/scanner.py
new file mode 100644
index 0000000..d871369
--- /dev/null
+++ b/lib/yaml/scanner.py
@@ -0,0 +1,1007 @@
+
+from marker import Marker
+from error import ParserError
+from stream import Stream
+
+class Scanner:
+
+    def __init__(self, source, data):
+        """Initialize the scanner."""
+        # The input stream. The Stream class does the dirty work of checking
+        # for BOM and converting the input data to Unicode. It also adds LF
+        # to the end if the data does not end with an EOL character.
+        #
+        # Stream supports the following methods:
+        #   self.stream.peek(k=1)   # peek the next k characters
+        #   self.stream.read(k=1)   # read the next k characters and move the
+        #                           # pointer
+        self.stream = Stream(source, data)
+
+        # Have we reached the end of the stream?
+        self.done = False
+
+        # The number of unclosed '{' and '['. `flow_level == 0` means block
+        # context.
+        self.flow_level = 0
+
+        # List of processed tokens that are not yet emitted.
+        self.tokens = []
+
+        # Number of tokens that were emitted through the `get_token` method.
+        self.tokens_taken = 0
+
+        # The current indentation level.
+        self.indent = -1
+
+        # Past indentation levels.
+        self.indents = []
+
+        # Variables related to simple key treatment.
+
+        # A simple key is a key that is not denoted by the '?' indicator.
+        # Example of simple keys:
+        #   ---
+        #   block simple key: value
+        #   ? not a simple key:
+        #   : { flow simple key: value }
+        # We emit the KEY token before all keys, so when we find a potential
+        # simple key, we try to locate the corresponding ':' indicator.
+        # Simple keys should be limited to a single line and 1024 characters.
+
+        # Can a block collection start at the current position? A block
+        # collection may start:
+        #   - at the beginning of the line (not counting spaces),
+        #   - after the block sequence indicator '-'.
+        self.allow_block_collection = True
+
+        # Can a simple key in flow context start at the current position? A
+        # simple key may start after the '{', '[', and ',' indicators.
+        self.allow_flow_simple_keys = False
+
+        # Keep track of possible simple keys. This is a dictionary. The key
+        # is `flow_level`; there can be no more than one possible simple key
+        # for each level. The value is a record of
+        #   (stream.index, stream.line, stream.column, token_number)
+        self.possible_simple_keys = {}
+
+    # Public methods:
+
+    def peek_token(self):
+        """Get the current token."""
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        if self.tokens:
+            return self.tokens[0]
+
+    def get_token(self):
+        """Get the current token and remove it from the list."""
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        if self.tokens:
+            self.tokens_taken += 1
+            return self.tokens.pop(0)
+
+    # Private methods:
+
+    def need_more_tokens(self):
+        if self.done:
+            return False
+        if not self.tokens:
+            return True
+        # The current token may be a potential simple key, so we
+        # need to look further.
+        if self.next_possible_simple_key() == self.tokens_taken:
+            return True
+
+    def fetch_more_tokens(self):
+
+        # Eat whitespaces and comments until we reach the next token.
+        self.find_next_token()
+
+        # Compare the current indentation and column. It may add some tokens
+        # and decrease the current indentation.
+        self.unwind_indents(self.stream.column)
+
+        # Peek the next character.
+        ch = self.stream.peek()
+
+        # Is it the end of stream?
+        if ch is None:
+            return self.fetch_end()
+
+        # Is it a directive?
+        if ch == u'%' and self.check_directive():
+            return self.fetch_directive()
+
+        # Is it the document start?
+        if ch == u'-' and self.check_document_start():
+            return self.fetch_document_start()
+
+        # Is it the document end?
+        if ch == u'.' and self.check_document_end():
+            return self.fetch_document_end()
+
+        # Note: the order of the following checks is NOT significant.
+
+        # Is it the entry indicator?
+        if ch in u'-,' and self.check_entry():
+            return self.fetch_entry()
+
+        # Is it the flow sequence start indicator?
+        if ch == u'[':
+            return self.fetch_flow_sequence_start()
+
+        # Is it the flow mapping start indicator?
+        if ch == u'{':
+            return self.fetch_flow_mapping_start()
+
+        # Is it the flow sequence end indicator?
+        if ch == u']':
+            return self.fetch_flow_sequence_end()
+
+        # Is it the flow mapping end indicator?
+        if ch == u'}':
+            return self.fetch_flow_mapping_end()
+
+        # Is it the key indicator?
+        if ch == u'?' and self.check_key():
+            return self.fetch_key()
+
+        # Is it the value indicator?
+        if ch == u':' and self.check_value():
+            return self.fetch_value()
+
+        # Is it an alias?
+        if ch == u'*':
+            return self.fetch_alias()
+
+        # Is it an anchor?
+        if ch == u'&':
+            return self.fetch_anchor()
+
+        # Is it a tag?
+        if ch == u'!':
+            return self.fetch_tag()
+
+        # Is it a literal scalar?
+        if ch == u'|':
+            return self.fetch_literal()
+
+        # Is it a folded scalar?
+        if ch == u'>':
+            return self.fetch_folded()
+
+        # Is it a single quoted scalar?
+        if ch == u'\'':
+            return self.fetch_single()
+
+        # Is it a double quoted scalar?
+        if ch == u'\"':
+            return self.fetch_double()
+
+        # It must be a plain scalar.
+        if self.check_plain():
+            return self.fetch_plain()
+
+        # No? It's an error then. Let's produce a nice error message.
+        self.invalid_token()
+
+    def fetch_end(self):
+
+        # Set the current indentation to -1.
+        self.unwind_indents(-1)
+
+        # Reset everything (not really needed).
+        self.allow_block_collection = False
+        self.allow_flow_simple_keys = False
+        self.possible_simple_keys = {}
+
+        # Add END.
+        marker = self.stream.get_marker()
+        self.tokens.append(EndToken(marker))
+
+        # The stream is ended.
+        self.done = True
+
+    def check_directive(self):
+
+        # Checking for
+        #   /* The beginning of the line */ '%'
+        # The '%' indicator is already checked.
+        if self.stream.column == 0:
+            return True
+
+    def check_document_start(self):
+
+        # Checking for
+        #   /* The beginning of the line */ '---' /* Space or EOL */
+        if self.stream.column == 0:
+            prefix = self.stream.peek(4)
+            if prefix[:3] == u'---' and prefix[3] in u' \t\r\n\x85\u2028\u2029':
+                return True
+
+    def fetch_document_start(self):
+
+        # Set the current indentation to -1.
+        self.unwind_indents(-1)
+
+        # No block collections after '---'.
+        self.allow_block_collection = False
+
+        # No flow simple keys (not needed -- we are in the block context).
+        self.allow_flow_simple_keys = False
+
+        # Reset possible simple keys (not needed -- EOL should have reset it).
+        self.possible_simple_keys = {}
+
+        start_marker = self.stream.get_marker()
+
+        # The characters are already checked, just move forward.
+        self.stream.read(3)
+
+        end_marker = self.stream.get_marker()
+
+        # Add DOCUMENT-START.
+        self.tokens.append(DocumentStartToken(start_marker, end_marker))
+
+    def check_document_end(self):
+        if self.stream.column == 0:
+            prefix = self.stream.peek(4)
+            if prefix[:3] == u'...' and prefix[3] in u' \t\r\n\x85\u2028\u2029':
+                return True
+
+    def fetch_document_end(self):
+        # The same code as `fetch_document_start`.
+
+        # Set the current indentation to -1.
+        self.unwind_indents(-1)
+
+        # Reset everything (not really needed).
+        self.allow_block_collection = False
+        self.allow_flow_simple_keys = False
+        self.possible_simple_keys = {}
+
+        start_marker = self.stream.get_marker()
+
+        # The characters are already checked, just move forward.
+        self.stream.read(3)
+
+        end_marker = self.stream.get_marker()
+
+        # Add DOCUMENT-END.
+        self.tokens.append(DocumentEndToken(start_marker, end_marker))
+
+
+# Tokens:
+# YAML_DIRECTIVE:     ^ '%' YAML ' '+ (version: \d+ '.' \d+) s-l-comments
+# TAG_DIRECTIVE:      ^ '%' TAG ' '+ (handle: '!' (word-char* '!')? ) (prefix: uri-char+) s-l-comments
+# RESERVED_DIRECTIVE: ^ '%' (directive-name: ns-char+) (' '+ (directive-parameter: ns-char+))* s-l-comments
+# DOCUMENT_START:     ^ '---' (' ' | b-any)
+# DOCUMENT_END:       ^ '...' (' ' | b-any)
+# TAG:                '!' ( ('<' uri-char+ '>') | uri-char* ) (' ' | b-any)
+# ANCHOR:             '&' ns-char+    <-- bug
+# ALIAS:              '*' ns-char+    <-- bug
+# ENTRY(block):       '-' (' ' | b-any)
+# KEY(block):         '?' (' ' | b-any)
+# VALUE(block):       ':' (' ' | b-any)
+# FLOW_SEQ_START:     '['
+# FLOW_SEQ_END:       ']'
+# FLOW_MAP_START:     '{'
+# FLOW_MAP_END:       '}'
+# KEY(flow):          '?'
+# VALUE(flow):        ':'
+# ENTRY(flow):        ','
+# PLAIN:              (plain-char - indicator) | ([-?:] plain-char) ...    <-- bugs
+# DOUBLE_QUOTED:      '"' ...
+# SINGLE_QUOTED:      ''' ...
+# LITERAL:            '|' ...
+# FOLDED:             '>' ...
+# BLOCK_SEQ_START:    indentation before '-'.
+# BLOCK_MAP_START:    indentation before '?' or a simple key.
+# BLOCK_END:          no indentation
+# LINE:               end of line
+
+# b-generic:  \r \n | \r | \n | #x85
+# b-specific: #x2028 | #x2029
+# b-any:      b-generic | b-specific
+# hex-digit:  [0-9A-Fa-f]
+# word-char:  [0-9A-Za-z-]
+# uri-char:   word-char | '%' hex-digit hex-digit | [;/?:@&=+$,_.!~*'()[]]

+# Production rules:
+# stream            :== implicit_document? explicit_document* END
+# explicit_document :== DIRECTIVE* DOCUMENT_START block_node? DOCUMENT_END?
+# implicit_document :== block_node DOCUMENT_END?
+# block_node        :== ALIAS | properties? block_content
+# flow_node         :== ALIAS | properties? flow_content
+# properties        :== TAG ANCHOR? | ANCHOR TAG?
+# block_content     :== block_collection | flow_collection | SCALAR
+# flow_content      :== flow_collection | SCALAR
+# block_collection  :== block_sequence | block_mapping
+# block_sequence    :== BLOCK_SEQ_START (ENTRY block_node?)* BLOCK_END
+# block_mapping     :== BLOCK_MAP_START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK_END
+# block_node_or_indentless_sequence :== ALIAS | properties? (block_content | indentless_block_sequence)
+# indentless_block_sequence :== (ENTRY block_node?)+
+# flow_collection   :== flow_sequence | flow_mapping
+# flow_sequence     :== FLOW_SEQ_START (flow_sequence_entry ENTRY)* flow_sequence_entry? FLOW_SEQ_END
+# flow_sequence_entry :== flow_node | KEY flow_node (VALUE flow_node?)?
+# flow_mapping      :== FLOW_MAP_START (flow_mapping_entry ENTRY)* flow_mapping_entry? FLOW_MAP_END
+# flow_mapping_entry :== flow_node | KEY flow_node (VALUE flow_node?)?
+
+# FIRST(rule) sets:
+# stream: {}
+# explicit_document: { DIRECTIVE DOCUMENT_START }
+# implicit_document: FIRST(block_node)
+# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK_SEQ_START BLOCK_MAP_START FLOW_SEQ_START FLOW_MAP_START }
+# flow_node: { ALIAS TAG ANCHOR SCALAR FLOW_SEQ_START FLOW_MAP_START }
+# block_content: { BLOCK_SEQ_START BLOCK_MAP_START FLOW_SEQ_START FLOW_MAP_START SCALAR }
+# flow_content: { FLOW_SEQ_START FLOW_MAP_START SCALAR }
+# block_collection: { BLOCK_SEQ_START BLOCK_MAP_START }
+# flow_collection: { FLOW_SEQ_START FLOW_MAP_START }
+# block_sequence: { BLOCK_SEQ_START }
+# block_mapping: { BLOCK_MAP_START }
+# block_node_or_indentless_sequence: { ALIAS TAG ANCHOR SCALAR BLOCK_SEQ_START BLOCK_MAP_START FLOW_SEQ_START FLOW_MAP_START ENTRY }
+# indentless_sequence: { ENTRY }
+# flow_sequence: { FLOW_SEQ_START }
+# flow_mapping: { FLOW_MAP_START }
+# flow_sequence_entry: { ALIAS TAG ANCHOR SCALAR FLOW_SEQ_START FLOW_MAP_START KEY }
+# flow_mapping_entry: { ALIAS TAG ANCHOR SCALAR FLOW_SEQ_START FLOW_MAP_START KEY }
+
+class Marker(object):
+
+    def __init__(self, source, data, index, length=0):
+        self.source = source
+        self.data = data
+        self.index = index
+        self.length = length
+        self._line = None
+        self._position = None
+
+    def line(self):
+        if not self._line:
+            self._make_line_position()
+        return self._line
+
+    def position(self):
+        if not self._position:
+            self._make_line_position()
+        return self._position
+
+    def _make_line_position(self):
+        line_start = self.data.rfind('\n', 0, self.index)+1
+        line_end = self.data.find('\n', self.index)+1
+        if line_end == 0:
+            line_end = len(self.data)
+        self._line = (line_start, line_end)
+        row = self.data.count('\n', 0, line_start)
+        col = self.index-line_start
+        self._position = (row, col)
+
+class Error(Exception):
+
+    def __init__(self, message=None, marker=None):
+        Exception.__init__(self)
+        self.message = message
+        self.marker = marker
+
+    def __str__(self):
+        if self.marker is not None:
+            row, col = self.marker.position()
+            start, end = self.marker.line()
+            error_position = "source \"%s\", line %s, column %s:\n%s\n"  \
+                    % (self.marker.source, row+1, col+1, self.marker.data[start:end].rstrip().encode('utf-8'))
+            error_pointer = " " * col + "^\n"
+        else:
+            error_position = ""
+            error_pointer = ""
+        if self.message is not None:
+            error_message = self.message
+        else:
+            error_message = "YAML error"
+        return error_position+error_pointer+error_message
+
+class _Scanner:
+
+    def scan(self, source, data):
+        self.source = source
+        self.data = data
+        self.flow_level = 0
+        self.indents = []
+        self.indent = -1
+        self.index = 0
+        self.line = 0
+        self.column = 0
+        self.allow_block_collection = True
+        self.guess_simple_key = False
+        self.guess_simple_key_token = None
+        self.guess_simple_key_indent = None
+        self.allow_flow_key = False
+        self.guess_flow_key_levels = []
+        self.guess_flow_key_tokens = []
+        self.tokens = []
+        while self.eat_ignored() or self.fetch_token():
+            pass
+        return self.tokens
+
+    def eat_ignored(self):
+        result = False
+        while self.eat_ignored_spaces() or self.eat_ignored_comment() or self.eat_ignored_newline():
+            result = True
+        return result
+
+    def eat_ignored_spaces(self):
+        result = False
+        while self.index < len(self.data) and self.data[self.index] == ' ':
+            self.index += 1
+            self.column += 1
+            result = True
+        return result
+
+    def eat_ignored_comment(self):
+        if self.index < len(self.data) and self.data[self.index] == '#':
+            self.eat_line()
+        return False
+
+    def eat_line(self):
+        result = False
+        while self.index < len(self.data) and self.data[self.index] not in '\r\n':
+            self.index += 1
+            self.column += 1
+            result = True
+        return result
+
+    def eat_ignored_newline(self):
+        if self.index < len(self.data) and self.data[self.index] in '\r\n':
+            if self.data[self.index:self.index+2] == '\r\n':
+                self.index += 2
+            else:
+                self.index += 1
+            self.line += 1
+            self.column = 0
+            self.allow_block_collection = True
+            return True
+        return False
+
+    def eat_ns(self):
+        result = False
+        while self.index < len(self.data) and self.data[self.index] not in ' \t\r\n':
+            self.index += 1
+            self.column += 1
+            result = True
+        return result
+
+    def eat_indent(self, indent=0):
+        if indent < self.indent:
+            indent = self.indent
+        if self.column != 0:
+            return False
+        count = 0
+        while self.index < len(self.data) and self.data[self.index] == ' ' and count < indent:
+            self.index += 1
+            self.column += 1
+            count += 1
+        return count == indent
+
+    def eat_double_quoted(self):
+        if self.index < len(self.data) and self.data[self.index] == '"':
+            self.index += 1
+            self.column += 1
+            while self.index < len(self.data) and self.data[self.index] != '"':
+                if self.data[self.index:self.index+2] in ['\\\\', '\\"']:
+                    self.index += 2
+                    self.column += 2
+                elif self.data[self.index] in '\r\n':
+                    self.eat_ignored_newline()
+                    if not self.eat_indent(1):
+                        self.error("Invalid indentation")
+                else:
+                    self.index += 1
+                    self.column += 1
+            if self.index < len(self.data) and self.data[self.index] == '"':
+                self.index += 1
+                self.column += 1
+                return True
+            else:
+                self.error("unclosed double quoted scalar")
+        else:
+            return False
+
+    def eat_single_quoted(self):
+        if self.index < len(self.data) and self.data[self.index] == '\'':
+            self.index += 1
+            self.column += 1
+            while self.index < len(self.data) and  \
+                    (self.data[self.index] != '\'' or self.data[self.index:self.index+2] == '\'\''):
+                if self.data[self.index:self.index+2] == '\'\'':
+                    self.index += 2
+                    self.column += 2
+                elif self.data[self.index] in '\r\n':
+                    self.eat_ignored_newline()
+                    if not self.eat_indent(1):
+                        self.error("Invalid indentation")
+                else:
+                    self.index += 1
+                    self.column += 1
+            if self.index < len(self.data) and self.data[self.index] == '\'':
+                self.index += 1
+                self.column += 1
+                return True
+            else:
+                self.error("unclosed single quoted scalar")
+        else:
+            return False
+
+    def eat_folded(self):
+        self.eat_block_scalar()
+
+    def eat_literal(self):
+        self.eat_block_scalar()
+
+    def eat_block_scalar(self):
+        if self.index < len(self.data) and self.data[self.index] in '>|':
+            self.eat_line()
+            if not self.eat_ignored_newline():
+                return True
+            indent = self.indent+1
+            if indent < 1:
+                indent = 1
+            while (self.eat_indent(indent) and ((self.eat_line() and self.eat_ignored_newline()) or (self.eat_ignored_newline()))) or  \
+                    (self.eat_ignored_comment() and self.eat_ignored_newline()) or  \
+                    self.eat_ignored_newline():
+                pass
+            return True
+        return False
+
+    def eat_block_plain(self):
+        return self.eat_plain(block=True)
+
+    def eat_flow_plain(self):
+        return self.eat_plain(block=False)
+
+    def eat_plain(self, block):
+        indent = self.indent+1
+        if indent < 1:
+            indent = 1
+        if self.index < len(self.data):
+            if self.data[self.index] not in ' \t\r\n-?:,[]{}#&*!|>\'"%@`' or  \
+                    (block and self.data[self.index] == '-' and self.data[self.index:self.index+2] not in ['-', '- ', '-\r', '-\n']) or  \
+                    (block and self.data[self.index] == '?' and self.data[self.index:self.index+2] not in ['?', '? ', '?\r', '?\n']) or  \
+                    (block and self.data[self.index] == ':' and self.data[self.index:self.index+2] not in [':', ': ', ':\r', ':\n']):
+                if block and self.allow_block_collection:
+                    self.guessing_simple_key()
+                if self.flow_level and self.allow_flow_key:
+                    self.guess_flow_key_levels.append(self.flow_level)
+                    self.guess_flow_key_tokens.append(len(self.tokens))
+                self.allow_flow_key = False
+                self.index += 1
+                self.column += 1
+                space = False
+                while True:
+                    self.eat_ignored_spaces()
+                    while self.index < len(self.data) and (
+                            self.data[self.index] not in '\r\n?:,[]{}#' or
+                            (not space and self.data[self.index] == '#') or
+                            (block and self.data[self.index] in '?,[]{}') or
+                            (block and self.data[self.index] == ':' and self.data[self.index:self.index+2] not in [':', ': ', ':\r', ':\n'])):
+                        space = self.data[self.index] not in ' \t'
+                        self.index += 1
+                        self.column += 1
+                    self.allow_block_collection = False
+                    if not (self.eat_ignored_newline() and self.eat_indent(indent)):
+                        break
+                    space = True
+                return True
+        return False
+
+    def no_simple_key(self):
+        self.guess_simple_key = False
+        self.guess_simple_key_token = None
+        self.guess_simple_key_indent = None
+
+    def guessing_simple_key(self):
+        self.guess_simple_key = True
+        self.guess_simple_key_token = len(self.tokens)
+        self.guess_simple_key_indent = self.column
+
+    def unwind_indents(self, level):
+        while self.indent > level:
+            if self.flow_level:
+                self.error("Invalid indentation")
+            self.tokens.append('BLOCK_END')
+            self.indent = self.indents.pop()
+            self.no_simple_key()
+
+    def fetch_token(self):
+        self.unwind_indents(self.column)
+        if self.index < len(self.data):
+            if self.column == 0:
+                if self.data[self.index] == '%':
+                    self.tokens.append('DIRECTIVE')
+                    self.eat_line()
+                    self.no_simple_key()
+                    return True
+                if self.data[self.index:self.index+3] == '---' and  \
+                        (not self.data[self.index+3:self.index+4] or self.data[self.index+3:self.index+4] in ' \r\n'):
+                    self.unwind_indents(-1)
+                    self.tokens.append('DOCUMENT_START')
+                    self.index += 3
+                    self.column += 3
+                    self.allow_block_collection = False
+                    self.allow_flow_key = False
+                    self.guess_flow_key_levels = []
+                    self.guess_flow_key_tokens = []
+                    self.no_simple_key()
+                    return True
+                if self.data[self.index:self.index+3] == '...' and  \
+                        (not self.data[self.index+3:self.index+4] or self.data[self.index+3:self.index+4] in ' \r\n'):
+                    self.unwind_indents(-1)
+                    self.tokens.append('DOCUMENT_END')
+                    self.index += 3
+                    self.column += 3
+                    self.allow_block_collection = False
+                    self.allow_flow_key = False
+                    self.guess_flow_key_levels = []
+                    self.guess_flow_key_tokens = []
+                    self.no_simple_key()
+                    return True
+            if self.data[self.index] in '[]{}':
+                if self.data[self.index] == '[':
+                    self.flow_level += 1
+                    self.allow_flow_key = True
+                    self.tokens.append('FLOW_SEQ_START')
+                elif self.data[self.index] == '{':
+                    self.flow_level += 1
+                    self.allow_flow_key = True
+                    self.tokens.append('FLOW_MAP_START')
+                elif self.data[self.index] == ']':
+                    if not self.flow_level:
+                        self.error("Extra ]")
+                    self.flow_level -= 1
+                    self.allow_flow_key = False
+                    self.tokens.append('FLOW_SEQ_END')
+                else:
+                    if not self.flow_level:
+                        self.error("Extra }")
+                    self.flow_level -= 1
+                    self.allow_flow_key = False
+                    self.tokens.append('FLOW_MAP_END')
+                while self.guess_flow_key_levels and self.guess_flow_key_levels[-1] > self.flow_level:
+                    self.guess_flow_key_levels.pop()
+                    self.guess_flow_key_tokens.pop()
+                self.index += 1
+                self.column += 1
+                self.allow_block_collection = False
+                return True
+            if self.data[self.index] in '!&*':
+                if self.flow_level and self.allow_flow_key:
+                    self.guess_flow_key_levels.append(self.flow_level)
+                    self.guess_flow_key_tokens.append(len(self.tokens))
+                if not self.flow_level and self.allow_block_collection:
+                    self.guessing_simple_key()
+                if self.data[self.index] == '!':
+                    self.tokens.append('TAG')
+                elif self.data[self.index] == '&':
+                    self.tokens.append('ANCHOR')
+                else:
+                    self.tokens.append('ALIAS')
+                self.eat_ns()
+                self.allow_flow_key = False
+                self.allow_block_collection = False
+                return True
+            if self.data[self.index] == '"':
+                if self.flow_level and self.allow_flow_key:
+                    self.guess_flow_key_levels.append(self.flow_level)
+                    self.guess_flow_key_tokens.append(len(self.tokens))
+                if not self.flow_level and self.allow_block_collection:
+                    self.guessing_simple_key()
+                self.tokens.append('SCALAR')
+                self.eat_double_quoted()
+                self.allow_flow_key = False
+                self.allow_block_collection = False
+                return True
+            if self.data[self.index] == '\'':
+                if self.flow_level and self.allow_flow_key:
+                    self.guess_flow_key_levels.append(self.flow_level)
+                    self.guess_flow_key_tokens.append(len(self.tokens))
+                if not self.flow_level and self.allow_block_collection:
+                    self.guessing_simple_key()
+                self.tokens.append('SCALAR')
+                self.eat_single_quoted()
+                self.allow_flow_key = False
+                self.allow_block_collection = False
+                return True
+            if not self.flow_level:
+                if self.data[self.index] in '-?:' and  \
+                        (not self.data[self.index+1:self.index+2] or self.data[self.index+1:self.index+2] in ' \r\n'):
+                    if self.guess_simple_key and self.data[self.index] == ':':
+                        self.tokens.insert(self.guess_simple_key_token, 'KEY')
+                        if self.guess_simple_key_indent > self.indent:
+                            self.indents.append(self.indent)
+                            self.indent = self.guess_simple_key_indent
+                            self.tokens.insert(self.guess_simple_key_token, 'BLOCK_MAP_START')
+                        self.tokens.append('VALUE')
+                        self.no_simple_key()
+                        self.index += 1
+                        self.column += 1
+                        self.allow_block_collection = False
+                        return True
+                    else:
+                        if not self.allow_block_collection:
+                            self.error("Block collection should start at the beginning of the line")
+                        if self.column > self.indent:
+                            self.indents.append(self.indent)
+                            self.indent = self.column
+                            if self.data[self.index] == '-':
+                                self.tokens.append('BLOCK_SEQ_START')
+                            else:
+                                self.tokens.append('BLOCK_MAP_START')
+                        if self.data[self.index] == '-':
+                            self.tokens.append('ENTRY')
+                        elif self.data[self.index] == '?':
+                            self.tokens.append('KEY')
+                        else:
+                            self.tokens.append('VALUE')
+                        self.index += 1
+                        self.column += 1
+                        #self.allow_block_collection = False
+                        self.allow_block_collection = True
+                        self.no_simple_key()
+                        return True
+                if self.data[self.index] == '>':
+                    self.no_simple_key()
+                    self.tokens.append('SCALAR')
+                    self.eat_folded()
+                    self.allow_block_collection = True
+                    return True
+                if self.data[self.index] == '|':
+                    self.no_simple_key()
+                    self.tokens.append('SCALAR')
+                    self.eat_literal()
+                    self.allow_block_collection = True
+                    return True
+                if self.eat_block_plain():
+                    self.tokens.append('SCALAR')
+                    return True
+            else:
+                if self.data[self.index] in ',?:':
+                    if self.data[self.index] == ',':
+                        self.tokens.append('ENTRY')
+                        while self.guess_flow_key_levels and self.guess_flow_key_levels[-1] >= self.flow_level:
+                            self.guess_flow_key_levels.pop()
+                            self.guess_flow_key_tokens.pop()
+                        self.allow_flow_key = True
+                    elif self.data[self.index] == '?':
+                        self.tokens.append('KEY')
+                        while self.guess_flow_key_levels and self.guess_flow_key_levels[-1] >= self.flow_level:
+                            self.guess_flow_key_levels.pop()
+                            self.guess_flow_key_tokens.pop()
+                        self.allow_flow_key = False
+                    else:
+                        self.tokens.append('VALUE')
+                        if self.guess_flow_key_levels and self.guess_flow_key_levels[-1] == self.flow_level:
+                            self.guess_flow_key_levels.pop()
+                            index = self.guess_flow_key_tokens.pop()
+                            self.tokens.insert(index, 'KEY')
+                        self.allow_flow_key = False
+                    self.index += 1
+                    self.column += 1
+                    return True
+                if self.eat_flow_plain():
+                    self.tokens.append('SCALAR')
+                    return True
+            self.error("Invalid token")
+        else:
+            self.unwind_indents(-1)
+
+    def error(self, message):
+        raise Error(message, Marker(self.source, self.data, self.index))
+
+class Parser:
+
+    def parse(self, source, data):
+        scanner = _Scanner()
+        self.tokens = scanner.scan(source, data)
+        self.tokens.append('END')
+        documents = self.parse_stream()
+        if len(documents) == 1:
+            return documents[0]
+        return documents
+
+    def parse_stream(self):
+        documents = []
+        if self.tokens[0] not in ['DIRECTIVE', 'DOCUMENT_START', 'END']:
+            documents.append(self.parse_block_node())
+        while self.tokens[0] != 'END':
+            while self.tokens[0] == 'DIRECTIVE':
+                self.tokens.pop(0)
+            if self.tokens[0] != 'DOCUMENT_START':
+                self.error('DOCUMENT_START is expected')
+            self.tokens.pop(0)
+            if self.tokens[0] in ['DIRECTIVE', 'DOCUMENT_START', 'DOCUMENT_END', 'END']:
+                documents.append(None)
+            else:
+                documents.append(self.parse_block_node())
+            while self.tokens[0] == 'DOCUMENT_END':
+                self.tokens.pop(0)
+        if self.tokens[0] != 'END':
+            self.error("END is expected")
+        return tuple(documents)
+
+    def parse_block_node(self):
+        if self.tokens[0] == 'ALIAS':
+            self.tokens.pop(0)
+            return '*'
+        if self.tokens[0] == 'TAG':
+            self.tokens.pop(0)
+            if self.tokens[0] == 'ANCHOR':
+                self.tokens.pop(0)
+        elif self.tokens[0] == 'ANCHOR':
+            self.tokens.pop(0)
+            if self.tokens[0] == 'TAG':
+                self.tokens.pop(0)
+        return self.parse_block_content()
+
+    def parse_flow_node(self):
+        if self.tokens[0] == 'ALIAS':
+            self.tokens.pop(0)
+            return '*'
+        if self.tokens[0] == 'TAG':
+            self.tokens.pop(0)
+            if self.tokens[0] == 'ANCHOR':
+                self.tokens.pop(0)
+        elif self.tokens[0] == 'ANCHOR':
+            self.tokens.pop(0)
+            if self.tokens[0] == 'TAG':
+                self.tokens.pop(0)
+        return self.parse_flow_content()
+
+    def parse_block_node_or_indentless_sequence(self):
+        if self.tokens[0] == 'ALIAS':
+            self.tokens.pop(0)
+            return '*'
+        if self.tokens[0] == 'TAG':
+            self.tokens.pop(0)
+            if self.tokens[0] == 'ANCHOR':
+                self.tokens.pop(0)
+        elif self.tokens[0] == 'ANCHOR':
+            self.tokens.pop(0)
+            if self.tokens[0] == 'TAG':
+                self.tokens.pop(0)
+        if self.tokens[0] == 'ENTRY':
+            return self.parse_indentless_sequence()
+        return self.parse_block_content()
+
+    def parse_block_content(self):
+        if self.tokens[0] == 'SCALAR':
+            self.tokens.pop(0)
+            return True
+        elif self.tokens[0] == 'BLOCK_SEQ_START':
+            return self.parse_block_sequence()
+        elif self.tokens[0] == 'BLOCK_MAP_START':
+            return self.parse_block_mapping()
+        elif self.tokens[0] == 'FLOW_SEQ_START':
+            return self.parse_flow_sequence()
+        elif self.tokens[0] == 'FLOW_MAP_START':
+            return self.parse_flow_mapping()
+        else:
+            self.error('block content is expected')
+
+    def parse_flow_content(self):
+        if self.tokens[0] == 'SCALAR':
+            self.tokens.pop(0)
+            return True
+        elif self.tokens[0] == 'FLOW_SEQ_START':
+            return self.parse_flow_sequence()
+        elif self.tokens[0] == 'FLOW_MAP_START':
+            return self.parse_flow_mapping()
+        else:
+            self.error('flow content is expected')
+
+    def parse_block_sequence(self):
+        sequence = []
+        if self.tokens[0] != 'BLOCK_SEQ_START':
+            self.error('BLOCK_SEQ_START is expected')
+        self.tokens.pop(0)
+        while self.tokens[0] == 'ENTRY':
+            self.tokens.pop(0)
+            if self.tokens[0] not in ['ENTRY', 'BLOCK_END']:
+                sequence.append(self.parse_block_node())
+            else:
+                sequence.append(None)
+        if self.tokens[0] != 'BLOCK_END':
+            self.error('BLOCK_END is expected')
+        self.tokens.pop(0)
+        return sequence
+
+    def parse_indentless_sequence(self):
+        sequence = []
+        while self.tokens[0] == 'ENTRY':
+            self.tokens.pop(0)
+            if self.tokens[0] not in ['ENTRY']:
+                sequence.append(self.parse_block_node())
+            else:
+                sequence.append(None)
+        return sequence
+
+    def parse_block_mapping(self):
+        mapping = []
+        if self.tokens[0] != 'BLOCK_MAP_START':
+            self.error('BLOCK_MAP_START is expected')
+        self.tokens.pop(0)
+        while self.tokens[0] in ['KEY', 'VALUE']:
+            key = None
+            value = None
+            if self.tokens[0] == 'KEY':
+                self.tokens.pop(0)
+                if self.tokens[0] not in ['KEY', 'VALUE', 'BLOCK_END']:
+                    key = self.parse_block_node_or_indentless_sequence()
+            if self.tokens[0] == 'VALUE':
+                self.tokens.pop(0)
+                if self.tokens[0] not in ['KEY', 'VALUE', 'BLOCK_END']:
+                    value = self.parse_block_node_or_indentless_sequence()
+            mapping.append((key, value))
+        if self.tokens[0] != 'BLOCK_END':
+            self.error('BLOCK_END is expected')
+        self.tokens.pop(0)
+        return mapping
+
+    def parse_flow_sequence(self):
+        sequence = []
+        if self.tokens[0] != 'FLOW_SEQ_START':
+            self.error('FLOW_SEQ_START is expected')
+        self.tokens.pop(0)
+        while self.tokens[0] != 'FLOW_SEQ_END':
+            if self.tokens[0] == 'KEY':
+                self.tokens.pop(0)
+                key = None
+                value = None
+                if self.tokens[0] != 'VALUE':
+                    key = self.parse_flow_node()
+                if self.tokens[0] == 'VALUE':
+                    self.tokens.pop(0)
+                    if self.tokens[0] not in ['ENTRY', 'FLOW_SEQ_END']:
+                        value = self.parse_flow_node()
+                sequence.append([(key, value)])
+            else:
+                sequence.append(self.parse_flow_node())
+            if self.tokens[0] not in ['ENTRY', 'FLOW_SEQ_END']:
+                self.error("ENTRY or FLOW_SEQ_END is expected")
+            if self.tokens[0] == 'ENTRY':
+                self.tokens.pop(0)
+        if self.tokens[0] != 'FLOW_SEQ_END':
+            self.error('FLOW_SEQ_END is expected')
+        self.tokens.pop(0)
+        return sequence
+
+    def parse_flow_mapping(self):
+        mapping = []
+        if self.tokens[0] != 'FLOW_MAP_START':
+            self.error('FLOW_MAP_START is expected')
+        self.tokens.pop(0)
+        while self.tokens[0] != 'FLOW_MAP_END':
+            if self.tokens[0] == 'KEY':
+                self.tokens.pop(0)
+                key = None
+                value = None
+                if self.tokens[0] != 'VALUE':
+                    key = self.parse_flow_node()
+                if self.tokens[0] == 'VALUE':
+                    self.tokens.pop(0)
+                    if self.tokens[0] not in ['ENTRY', 'FLOW_MAP_END']:
+                        value = self.parse_flow_node()
+                mapping.append((key, value))
+            else:
+                mapping.append((self.parse_flow_node(), None))
+            if self.tokens[0] not in ['ENTRY', 'FLOW_MAP_END']:
+                self.error("ENTRY or FLOW_MAP_END is expected")
+            if self.tokens[0] == 'ENTRY':
+                self.tokens.pop(0)
+        if self.tokens[0] != 'FLOW_MAP_END':
+            self.error('FLOW_MAP_END is expected')
+        self.tokens.pop(0)
+        return mapping
+
+    def error(self, message):
+        raise Error(message+': '+str(self.tokens))
+
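As a quick sanity check of the working classes (a sketch, not part of the
patch; the input document is made up), `_Scanner.scan` turns a small block
mapping into the token stream that the `.tokens` test data below encodes:

    from yaml.scanner import _Scanner, Parser

    tokens = _Scanner().scan('<string>', "key: value\n")
    print tokens
    # ['BLOCK_MAP_START', 'KEY', 'SCALAR', 'VALUE', 'SCALAR', 'BLOCK_END']
    # i.e. '{{ ? _ : _ ]}' in the mnemonics of tests/test_tokens.py

    # The Parser only checks structure at this stage; scalars come back as
    # True because the scanner does not yet attach any text to its tokens.
    print Parser().parse('<string>', "key: value\n")
    # [(True, True)]
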
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e6c1652
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,22 @@
+
+NAME = 'PyYAML3000'
+VERSION = '0.1'
+DESCRIPTION = "YAML parser (and emitter)"
+AUTHOR = "Kirill Simonov"
+AUTHOR_EMAIL = 'xi@resolvent.net'
+LICENSE = "BSD"
+
+from distutils.core import setup
+
+setup(
+    name=NAME,
+    version=VERSION,
+    description=DESCRIPTION,
+    author=AUTHOR,
+    author_email=AUTHOR_EMAIL,
+    license=LICENSE,
+
+    package_dir={'': 'lib'},
+    packages=['yaml'],
+)
+
diff --git a/tests/data/spec-02-01.tokens b/tests/data/spec-02-01.tokens
new file mode 100644
index 0000000..ce44cac
--- /dev/null
+++ b/tests/data/spec-02-01.tokens
@@ -0,0 +1 @@
+[[ , _ , _ , _ ]}
diff --git a/tests/data/spec-02-02.tokens b/tests/data/spec-02-02.tokens
new file mode 100644
index 0000000..e4e381b
--- /dev/null
+++ b/tests/data/spec-02-02.tokens
@@ -0,0 +1,5 @@
+{{
+? _ : _
+? _ : _
+? _ : _
+]}
diff --git a/tests/data/spec-02-03.tokens b/tests/data/spec-02-03.tokens
new file mode 100644
index 0000000..89815f2
--- /dev/null
+++ b/tests/data/spec-02-03.tokens
@@ -0,0 +1,4 @@
+{{
+? _ : [[ , _ , _ , _ ]}
+? _ : [[ , _ , _ , _ ]}
+]}
diff --git a/tests/data/spec-02-04.tokens b/tests/data/spec-02-04.tokens
new file mode 100644
index 0000000..9cb9815
--- /dev/null
+++ b/tests/data/spec-02-04.tokens
@@ -0,0 +1,4 @@
+[[
+, {{ ? _ : _ ? _ : _ ? _ : _ ]}
+, {{ ? _ : _ ? _ : _ ? _ : _ ]}
+]}
diff --git a/tests/data/spec-02-05.tokens b/tests/data/spec-02-05.tokens
new file mode 100644
index 0000000..3f6f1ab
--- /dev/null
+++ b/tests/data/spec-02-05.tokens
@@ -0,0 +1,5 @@
+[[
+, [ _ , _ , _ ]
+, [ _ , _ , _ ]
+, [ _ , _ , _ ]
+]}
diff --git a/tests/data/spec-02-06.tokens b/tests/data/spec-02-06.tokens
new file mode 100644
index 0000000..a1a5eef
--- /dev/null
+++ b/tests/data/spec-02-06.tokens
@@ -0,0 +1,4 @@
+{{
+? _ : { ? _ : _ , ? _ : _ }
+? _ : { ? _ : _ , ? _ : _ }
+]}
diff --git a/tests/data/spec-02-07.tokens b/tests/data/spec-02-07.tokens
new file mode 100644
index 0000000..ed48883
--- /dev/null
+++ b/tests/data/spec-02-07.tokens
@@ -0,0 +1,12 @@
+---
+[[
+, _
+, _
+, _
+]}
+
+---
+[[
+, _
+, _
+]}
diff --git a/tests/data/spec-02-08.tokens b/tests/data/spec-02-08.tokens
new file mode 100644
index 0000000..7d2c03d
--- /dev/null
+++ b/tests/data/spec-02-08.tokens
@@ -0,0 +1,15 @@
+---
+{{
+? _ : _
+? _ : _
+? _ : _
+]}
+...
+
+---
+{{
+? _ : _
+? _ : _
+? _ : _
+]}
+...
diff --git a/tests/data/spec-02-09.tokens b/tests/data/spec-02-09.tokens
new file mode 100644
index 0000000..b2ec10e
--- /dev/null
+++ b/tests/data/spec-02-09.tokens
@@ -0,0 +1,5 @@
+---
+{{
+? _ : [[ , _ , _ ]}
+? _ : [[ , _ , _ ]}
+]}
diff --git a/tests/data/spec-02-10.tokens b/tests/data/spec-02-10.tokens
new file mode 100644
index 0000000..26caa2b
--- /dev/null
+++ b/tests/data/spec-02-10.tokens
@@ -0,0 +1,5 @@
+---
+{{
+? _ : [[ , _ , & _ ]}
+? _ : [[ , * , _ ]}
+]}
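To read these data files: each `.tokens` file is the expected token stream
for a matching `.data` input, written with the one-character mnemonics
defined in tests/test_tokens.py below. The `.data` inputs are not added by
this commit, so `TestAppliance.add_tests` will skip these cases until they
land. Assuming spec-02-01.data carries example 2.1 from the YAML
specification (an assumption, since the input is not in this patch),

    - Mark McGwire
    - Sammy Sosa
    - Ken Griffey

the scanner should produce `[[ , _ , _ , _ ]}`: BLOCK_SEQ_START, three
ENTRY/SCALAR pairs, and BLOCK_END.
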
diff --git a/tests/data/test_marker.markers b/tests/data/test_marker.markers
new file mode 100644
index 0000000..7b08ee4
--- /dev/null
+++ b/tests/data/test_marker.markers
@@ -0,0 +1,38 @@
+---
+*The first line.
+The last line.
+---
+The first*line.
+The last line.
+---
+The first line.*
+The last line.
+---
+The first line.
+*The last line.
+---
+The first line.
+The last*line.
+---
+The first line.
+The last line.*
+---
+The first line.
+*The selected line.
+The last line.
+---
+The first line.
+The selected*line.
+The last line.
+---
+The first line.
+The selected line.*
+The last line.
+---
+*The only line.
+---
+The only*line.
+---
+The only line.*
+---
+Loooooooooooooooooooooooooooooooooooooooooooooong*Liiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiine
diff --git a/tests/test_appliance.py b/tests/test_appliance.py
new file mode 100644
index 0000000..372a68d
--- /dev/null
+++ b/tests/test_appliance.py
@@ -0,0 +1,28 @@
+
+import unittest, os
+
+class TestAppliance(unittest.TestCase):
+
+    DATA = 'tests/data'
+
+    tests = {}
+    for filename in os.listdir(DATA):
+        if os.path.isfile(os.path.join(DATA, filename)):
+            root, ext = os.path.splitext(filename)
+            tests.setdefault(root, []).append(ext)
+
+    def add_tests(cls, method_name, *extensions):
+        for test in cls.tests:
+            available_extensions = cls.tests[test]
+            for ext in extensions:
+                if ext not in available_extensions:
+                    break
+            else:
+                filenames = [os.path.join(cls.DATA, test+ext) for ext in extensions]
+                def test_method(self, test=test, filenames=filenames):
+                    getattr(self, '_'+method_name)(test, *filenames)
+                test = test.replace('-', '_')
+                test_method.__name__ = '%s_%s' % (method_name, test)
+                setattr(cls, test_method.__name__, test_method)
+    add_tests = classmethod(add_tests)
+
diff --git a/tests/test_build.py b/tests/test_build.py
new file mode 100644
index 0000000..50f7437
--- /dev/null
+++ b/tests/test_build.py
@@ -0,0 +1,12 @@
+
+def main():
+    import sys, os, distutils.util
+    #build_lib = os.path.join('build', 'lib.%s-%s' % (distutils.util.get_platform(), sys.version[0:3]))
+    build_lib = 'build/lib'
+    sys.path.insert(0, build_lib)
+    import test_yaml
+    test_yaml.main('test_yaml')
+
+if __name__ == '__main__':
+    main()
+
diff --git a/tests/test_marker.py b/tests/test_marker.py
new file mode 100644
index 0000000..4570098
--- /dev/null
+++ b/tests/test_marker.py
@@ -0,0 +1,35 @@
+
+import test_appliance
+
+from yaml.marker import Marker
+
+class TestMarker(test_appliance.TestAppliance):
+
+    def _testMarkers(self, test_name, markers_filename):
+        inputs = file(markers_filename, 'rb').read().split('---\n')[1:]
+        for input in inputs:
+            index = 0
+            line = 0
+            column = 0
+            while input[index] != '*':
+                if input[index] == '\n':
+                    line += 1
+                    column = 0
+                else:
+                    column += 1
+                index += 1
+            for str_type in [str, unicode]:
+                marker = Marker(test_name, str_type(input), index, line, column)
+                snippet = marker.get_snippet()
+                #print "INPUT:"
+                #print input
+                #print "SNIPPET:"
+                #print snippet
+                self.failUnless(isinstance(snippet, str))
+                self.failUnlessEqual(snippet.count('\n'), 2)
+                data, pointer, dummy = snippet.split('\n')
+                self.failUnless(len(data) < 80)
+                self.failUnlessEqual(data[len(pointer)-1], '*')
+
+TestMarker.add_tests('testMarkers', '.markers')
+
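TestMarker above and TestTokens below are populated by
`TestAppliance.add_tests` rather than by hand-written test methods. A sketch
of the effect (the generation actually happens via setattr at import time;
the file names here are hypothetical):

    # Given tests/data/spec-02-01.data and tests/data/spec-02-01.tokens,
    #   TestTokens.add_tests('testTokens', '.data', '.tokens')
    # defines a method equivalent to:
    def testTokens_spec_02_01(self):
        self._testTokens('spec-02-01',
                         'tests/data/spec-02-01.data',
                         'tests/data/spec-02-01.tokens')
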
diff --git a/tests/test_tokens.py b/tests/test_tokens.py
new file mode 100644
index 0000000..f233589
--- /dev/null
+++ b/tests/test_tokens.py
@@ -0,0 +1,62 @@
+
+import test_appliance
+
+from yaml.scanner import _Scanner
+
+class TestTokens(test_appliance.TestAppliance):
+
+    # Tokens mnemonic:
+    # directive:            %
+    # document_start:       ---
+    # document_end:         ...
+    # alias:                *
+    # anchor:               &
+    # tag:                  !
+    # scalar:               _
+    # block_sequence_start: [[
+    # block_mapping_start:  {{
+    # block_end:            ]}
+    # flow_sequence_start:  [
+    # flow_sequence_end:    ]
+    # flow_mapping_start:   {
+    # flow_mapping_end:     }
+    # entry:                ,
+    # key:                  ?
+    # value:                :
+
+    replaces = {
+        'DIRECTIVE': '%',
+        'DOCUMENT_START': '---',
+        'DOCUMENT_END': '...',
+        'ALIAS': '*',
+        'ANCHOR': '&',
+        'TAG': '!',
+        'SCALAR': '_',
+        'BLOCK_SEQ_START': '[[',
+        'BLOCK_MAP_START': '{{',
+        'BLOCK_END': ']}',
+        'FLOW_SEQ_START': '[',
+        'FLOW_SEQ_END': ']',
+        'FLOW_MAP_START': '{',
+        'FLOW_MAP_END': '}',
+        'ENTRY': ',',
+        'KEY': '?',
+        'VALUE': ':',
+    }
+
+    def _testTokens(self, test_name, data_filename, tokens_filename):
+        tokens1 = None
+        tokens2 = file(tokens_filename, 'rb').read().split()
+        try:
+            scanner = _Scanner()
+            tokens1 = scanner.scan(data_filename, file(data_filename, 'rb').read())
+            tokens1 = [self.replaces[t] for t in tokens1]
+            self.failUnlessEqual(tokens1, tokens2)
+        except:
+            print
+            print "TOKENS1:", tokens1
+            print "TOKENS2:", tokens2
+            raise
+
+TestTokens.add_tests('testTokens', '.data', '.tokens')
+
diff --git a/tests/test_yaml.py b/tests/test_yaml.py
new file mode 100644
index 0000000..54447b3
--- /dev/null
+++ b/tests/test_yaml.py
@@ -0,0 +1,12 @@
+
+import unittest
+
+from test_marker import *
+from test_tokens import *
+
+def main(module='__main__'):
+    unittest.main(module)
+
+if __name__ == '__main__':
+    main()
+
--
cgit v1.2.1