diff options
author | Benjamin Schubert <ben.c.schubert@gmail.com> | 2019-05-20 18:37:52 +0100 |
---|---|---|
committer | Benjamin Schubert <contact@benschubert.me> | 2019-05-29 19:49:46 +0100 |
commit | 865fbce3570fd0a07bb46e80db76d2dc9f3fa0e5 (patch) | |
tree | 76c646aadd334fa2f553c012461e4cc0fb10c5aa | |
parent | 2de2675f03e16303af289b5551356af256c961b9 (diff) | |
download | buildstream-865fbce3570fd0a07bb46e80db76d2dc9f3fa0e5.tar.gz |
_yaml: Cythonize `Representer`.
`Representer` is the main interface with the `ruamel` library to
parse the yaml files. Rewriting it with Cython introduces significant
performance gains.
Since `Representer` is not a python class anymore, we can't call
`getattr` on it, and therefore have to do a manual switch on the types
of events. While this is harder to read, it is also much more
performant.
Finally, stop using `yaml.parse`, and instead call the parser manually, in order
to avoid going in and out of the python code. This part could be made
even better in the future when `ruamel` becomes stable and if they
expose cython definitions, as they are coded in Cython.
-rw-r--r-- | src/buildstream/_yaml.pyx | 130 |
1 files changed, 86 insertions, 44 deletions
diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx index 3989fe82b..9ad5b34ce 100644 --- a/src/buildstream/_yaml.pyx +++ b/src/buildstream/_yaml.pyx @@ -156,13 +156,11 @@ class YAMLLoadError(Exception): # Mappings must only have string keys, values are always represented as # strings if they are scalar, or else as simple dictionaries and lists. # -class Representer: - __slots__ = ( - "_file_index", - "state", - "output", - "keys", - ) +cdef class Representer: + + cdef int _file_index + cdef str state + cdef list output, keys # Initialise a new representer # @@ -171,7 +169,7 @@ class Representer: # # Args: # file_index (int): The index of this YAML file - def __init__(self, file_index): + def __init__(self, int file_index): self._file_index = file_index self.state = "init" self.output = [] @@ -184,12 +182,14 @@ class Representer: # # Raises: # YAMLLoadError: Something went wrong. - def handle_event(self, event): + cdef void handle_event(self, event) except *: if getattr(event, "anchor", None) is not None: raise YAMLLoadError("Anchors are disallowed in BuildStream at line {} column {}" .format(event.start_mark.line, event.start_mark.column)) - if event.__class__.__name__ == "ScalarEvent": + cdef str event_name = event.__class__.__name__ + + if event_name == "ScalarEvent": if event.tag is not None: if not event.tag.startswith("tag:yaml.org,2002:"): raise YAMLLoadError( @@ -197,77 +197,112 @@ class Representer: "This is disallowed in BuildStream. At line {} column {}" .format(event.start_mark.line, event.start_mark.column)) - handler = "_handle_{}_{}".format(self.state, event.__class__.__name__) - handler = getattr(self, handler, None) + cdef object handler = self._get_handler_for_event(event_name) if handler is None: raise YAMLLoadError( "Invalid input detected. 
No handler for {} in state {} at line {} column {}" .format(event, self.state, event.start_mark.line, event.start_mark.column)) - self.state = handler(event) # pylint: disable=not-callable + # Cython weirdness here, we need to pass self to the function + self.state = handler(self, event) # pylint: disable=not-callable # Get the output of the YAML parse # # Returns: # (Node or None): Return the Node instance of the top level mapping or # None if there wasn't one. - def get_output(self): - try: + cdef Node get_output(self): + if len(self.output): return self.output[0] - except IndexError: - return None + return None - def _handle_init_StreamStartEvent(self, ev): + cdef object _get_handler_for_event(self, str event_name): + if self.state == "wait_list_item": + if event_name == "ScalarEvent": + return self._handle_wait_list_item_ScalarEvent + elif event_name == "MappingStartEvent": + return self._handle_wait_list_item_MappingStartEvent + elif event_name == "SequenceStartEvent": + return self._handle_wait_list_item_SequenceStartEvent + elif event_name == "SequenceEndEvent": + return self._handle_wait_list_item_SequenceEndEvent + elif self.state == "wait_value": + if event_name == "ScalarEvent": + return self._handle_wait_value_ScalarEvent + elif event_name == "MappingStartEvent": + return self._handle_wait_value_MappingStartEvent + elif event_name == "SequenceStartEvent": + return self._handle_wait_value_SequenceStartEvent + elif self.state == "wait_key": + if event_name == "ScalarEvent": + return self._handle_wait_key_ScalarEvent + elif event_name == "MappingEndEvent": + return self._handle_wait_key_MappingEndEvent + elif self.state == "stream": + if event_name == "DocumentStartEvent": + return self._handle_stream_DocumentStartEvent + elif event_name == "StreamEndEvent": + return self._handle_stream_StreamEndEvent + elif self.state == "doc": + if event_name == "MappingStartEvent": + return self._handle_doc_MappingStartEvent + elif event_name == "DocumentEndEvent": + 
return self._handle_doc_DocumentEndEvent + elif self.state == "init" and event_name == "StreamStartEvent": + return self._handle_init_StreamStartEvent + return None + + cdef str _handle_init_StreamStartEvent(self, object ev): return "stream" - def _handle_stream_DocumentStartEvent(self, ev): + cdef str _handle_stream_DocumentStartEvent(self, object ev): return "doc" - def _handle_doc_MappingStartEvent(self, ev): + cdef str _handle_doc_MappingStartEvent(self, object ev): newmap = Node({}, self._file_index, ev.start_mark.line, ev.start_mark.column) self.output.append(newmap) return "wait_key" - def _handle_wait_key_ScalarEvent(self, ev): + cdef str _handle_wait_key_ScalarEvent(self, object ev): self.keys.append(ev.value) return "wait_value" - def _handle_wait_value_ScalarEvent(self, ev): + cdef str _handle_wait_value_ScalarEvent(self, object ev): key = self.keys.pop() - self.output[-1].value[key] = \ + (<dict> (<Node> self.output[-1]).value)[key] = \ Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column) return "wait_key" - def _handle_wait_value_MappingStartEvent(self, ev): - new_state = self._handle_doc_MappingStartEvent(ev) + cdef str _handle_wait_value_MappingStartEvent(self, object ev): + cdef str new_state = self._handle_doc_MappingStartEvent(ev) key = self.keys.pop() - self.output[-2].value[key] = self.output[-1] + (<dict> (<Node> self.output[-2]).value)[key] = self.output[-1] return new_state - def _handle_wait_key_MappingEndEvent(self, ev): + cdef str _handle_wait_key_MappingEndEvent(self, object ev): # We've finished a mapping, so pop it off the output stack # unless it's the last one in which case we leave it if len(self.output) > 1: self.output.pop() - if type(self.output[-1].value) is list: + if type((<Node> self.output[-1]).value) is list: return "wait_list_item" else: return "wait_key" else: return "doc" - def _handle_wait_value_SequenceStartEvent(self, ev): + cdef str _handle_wait_value_SequenceStartEvent(self, object ev): 
self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column)) - self.output[-2].value[self.keys[-1]] = self.output[-1] + (<dict> (<Node> self.output[-2]).value)[self.keys[-1]] = self.output[-1] return "wait_list_item" - def _handle_wait_list_item_SequenceStartEvent(self, ev): - self.keys.append(len(self.output[-1].value)) + cdef str _handle_wait_list_item_SequenceStartEvent(self, object ev): + self.keys.append(len((<Node> self.output[-1]).value)) self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column)) - self.output[-2].value.append(self.output[-1]) + (<list> (<Node> self.output[-2]).value).append(self.output[-1]) return "wait_list_item" - def _handle_wait_list_item_SequenceEndEvent(self, ev): + cdef str _handle_wait_list_item_SequenceEndEvent(self, object ev): # When ending a sequence, we need to pop a key because we retain the # key until the end so that if we need to mutate the underlying entry # we can. @@ -278,22 +313,22 @@ class Representer: else: return "wait_key" - def _handle_wait_list_item_ScalarEvent(self, ev): - self.output[-1].value.append( + cdef str _handle_wait_list_item_ScalarEvent(self, object ev): + (<Node> self.output[-1]).value.append( Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column)) return "wait_list_item" - def _handle_wait_list_item_MappingStartEvent(self, ev): - new_state = self._handle_doc_MappingStartEvent(ev) - self.output[-2].value.append(self.output[-1]) + cdef str _handle_wait_list_item_MappingStartEvent(self, object ev): + cdef str new_state = self._handle_doc_MappingStartEvent(ev) + (<list> (<Node> self.output[-2]).value).append(self.output[-1]) return new_state - def _handle_doc_DocumentEndEvent(self, ev): + cdef str _handle_doc_DocumentEndEvent(self, object ev): if len(self.output) != 1: raise YAMLLoadError("Zero, or more than one document found in YAML stream") return "stream" - def _handle_stream_StreamEndEvent(self, ev): + cdef str 
_handle_stream_StreamEndEvent(self, object ev): return "init" @@ -348,12 +383,19 @@ cpdef Node load(str filename, str shortname=None, bint copy_tree=False, object p # Like load(), but doesnt require the data to be in a file # -def load_data(str data, int file_index=_SYNTHETIC_FILE_INDEX, str file_name=None, bint copy_tree=False): +cpdef Node load_data(str data, int file_index=_SYNTHETIC_FILE_INDEX, str file_name=None, bint copy_tree=False): + cdef Representer rep try: rep = Representer(file_index) - for event in yaml.parse(data, Loader=yaml.CBaseLoader): - rep.handle_event(event) + parser = yaml.CParser(data) + + try: + while parser.check_event(): + rep.handle_event(parser.get_event()) + finally: + parser.dispose() + contents = rep.get_output() except YAMLLoadError as e: raise LoadError(LoadErrorReason.INVALID_YAML, |