#
#  Copyright (C) 2018 Codethink Limited
#  Copyright (C) 2019 Bloomberg LLP
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library. If not, see <http://www.gnu.org/licenses/>.
#
#  Authors:
#        Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>
#        Daniel Silverstone <daniel.silverstone@codethink.co.uk>
#        James Ennis <james.ennis@codethink.co.uk>
#        Benjamin Schubert <bschubert@bloomberg.net>

import sys
import string
from contextlib import ExitStack
from collections import OrderedDict
from collections.abc import Mapping, Sequence
from copy import deepcopy

from ruamel import yaml
from ._exceptions import LoadError, LoadErrorReason


# Without this, pylint complains about all the `type(foo) is blah` checks
# because it feels isinstance() is more idiomatic.  Sadly, it is much slower to
# do `isinstance(foo, blah)` for reasons I am unable to fathom.  As such, we
# blanket disable the check for this module.
#
# pylint: disable=unidiomatic-typecheck


# Node()
#
# Container for YAML loaded data and its provenance
#
# All nodes returned (and all internal lists/strings) have this type (rather
# than a plain tuple, to distinguish them in things like node_sanitize)
#
# Members:
#   value (str/list/dict): The loaded value.
#   file_index (int): Index within _FILE_LIST (the list of FileInfo entries
#                     for loaded files).  Negative indices indicate synthetic
#                     nodes so that they can be referenced.
#   line (int): The line number within the file where the value appears.
#   column (int): The column number within the file where the value appears.
#
cdef class Node:

    def __init__(self, object value, int file_index, int line, int column):
        # The payload first, then the provenance triple (file, line, column)
        self.value = value
        self.file_index, self.line, self.column = file_index, line, column

    def __contains__(self, what):
        # Membership is answered by the wrapped value.  This is only really
        # meaningful for mapping nodes; list and string nodes are not
        # expected to be queried this way.
        found = what in self.value
        return found


# Metadata container for a yaml toplevel node.
#
# This class contains metadata around a yaml node in order to be able
# to trace back the provenance of a node to the file.
#
# Members:
#   filename (str): The file name as given to the loader
#   shortname (str): The short name of the file, for error reporting
#   displayname (str): The name shown to the user in provenance strings
#   toplevel (Node): The toplevel node loaded from the file (or None)
#   project (object): The project associated with the file (or None)
#
cdef class FileInfo:

    cdef str filename
    cdef str shortname
    cdef str displayname
    cdef Node toplevel
    cdef object project

    def __init__(self, str filename, str shortname, str displayname, Node toplevel, object project):
        # The three flavours of the file's name
        self.filename, self.shortname, self.displayname = filename, shortname, displayname
        # What was loaded from it, and on behalf of which project
        self.toplevel, self.project = toplevel, project


# File name handling
#
# Every load() and new_synthetic_file() call appends one FileInfo here; the
# position of that entry is the `file_index` stored in the resulting nodes.
cdef _FILE_LIST = []


# Nodes which do not come from any file use _SYNTHETIC_FILE_INDEX as their
# file number.  Nodes synthesised with a reference node instead reuse the file
# number and line of that reference node.  In both cases, a node which needs
# to be distinguishable from others gets a negative column number taken from
# the counter below (see next_synthetic_counter()), which only ever decreases.
cdef int _SYNTHETIC_FILE_INDEX = -1
cdef int __counter = 0

# next_synthetic_counter()
#
# Steps the module-wide synthetic counter down by one and hands back the
# new value, for use as the column number of a synthetic node.
#
# Returns:
#    (int): The freshly decremented counter value
#
cdef int next_synthetic_counter():
    global __counter
    __counter = __counter - 1
    return __counter


# Returned from node_get_provenance
#
# Describes where a node came from: the file it was loaded from (in its
# various name forms), the human readable line number, the column, and the
# toplevel node and project of that file.
#
# Args:
#    nodeish (Node): The node to describe, or None
#
# Members:
#    node (Node): The node which was passed in
#    filename, shortname, displayname (str): Names of the originating file,
#                                            empty for nodes without a file
#    line (int): One-based line number
#    col (int): Column number, negative for synthetic nodes
#    toplevel (Node): Toplevel node of the originating file, or None
#    project (object): Project of the originating file, or None
#    is_synthetic (bool): Whether the node did not come directly from a file
#
cdef class ProvenanceInformation:

    def __init__(self, Node nodeish):
        cdef FileInfo fileinfo

        self.node = nodeish
        # `file_index` is a C int and so can never be None; nodes without a
        # backing file carry a negative index (_SYNTHETIC_FILE_INDEX).  Such
        # an index must not be used to subscript _FILE_LIST, since it would
        # wrap around and report the most recently loaded file instead.
        if (nodeish is None) or (nodeish.file_index < 0):
            self.filename = ""
            self.shortname = ""
            self.displayname = ""
            self.line = 1
            self.col = 0
            self.toplevel = None
            self.project = None
        else:
            fileinfo = <FileInfo> _FILE_LIST[nodeish.file_index]
            self.filename = fileinfo.filename
            self.shortname = fileinfo.shortname
            self.displayname = fileinfo.displayname
            # We add 1 here to convert from computerish to humanish
            self.line = nodeish.line + 1
            self.col = nodeish.column
            self.toplevel = fileinfo.toplevel
            self.project = fileinfo.project
        # No file at all, or a column handed out by the synthetic counter
        self.is_synthetic = (self.filename == '') or (self.col < 0)

    # Convert a Provenance to a string for error reporting
    def __str__(self):
        if self.is_synthetic:
            return "{} [synthetic node]".format(self.displayname)
        else:
            return "{} [line {:d} column {:d}]".format(self.displayname, self.line, self.col)


# These exceptions are intended to be caught entirely within
# the BuildStream framework, hence they do not reside in the
# public exceptions.py

# CompositeError()
#
# Args:
#    path: Where the error occurred, stored as given on `path`
#    message: The error text, stored on `message` and used as the
#             exception's own message
#
class CompositeError(Exception):
    def __init__(self, path, message):
        self.path = path
        self.message = message
        super().__init__(message)


# YAMLLoadError()
#
# Raised by the Representer when the YAML event stream is not acceptable
# (anchors, non-core tags, unexpected events, wrong number of documents).
# load_data() catches it and re-raises it as a LoadError.
class YAMLLoadError(Exception):
    pass


# Represents the various states in which the Representer can be
# while parsing yaml.
cdef enum RepresenterState:
    # Inside a document: the toplevel mapping start or the document end is expected
    doc
    # Nothing consumed yet: the stream start is expected
    init
    # Inside the stream: a document start or the stream end is expected
    stream
    # Inside a mapping: a scalar key or the end of the mapping is expected
    wait_key
    # Inside a sequence: an item (scalar, mapping, sequence) or the end of the sequence is expected
    wait_list_item
    # A key was just read: its value (scalar, mapping, sequence) is expected
    wait_value


# Signature shared by all Representer state handlers: they receive the
# Representer and the YAML event, and return the state to move to.
ctypedef RepresenterState (*representer_action)(Representer, object)

# Representer for YAML events comprising input to the BuildStream format.
#
# All streams MUST represent a single document which must be a Mapping.
# Anything else is considered an error.
#
# Mappings must only have string keys, values are always represented as
# strings if they are scalar, or else as simple dictionaries and lists.
#
# The representer is a state machine: handle_event() looks up the handler
# matching the current state and the class name of the event, calls it, and
# stores the state which the handler returns.  A (state, event) pair without
# a handler is reported as invalid input.
#
cdef class Representer:

    # Index of the file being parsed, stored into every Node created
    cdef int _file_index
    # Current state of the state machine
    cdef RepresenterState state
    # `output` is the stack of container nodes currently being filled, with
    # the toplevel mapping at index 0.  `keys` is the stack of pending keys:
    # mapping keys (str) waiting for their value, and list positions (int)
    # pushed when a sequence is nested directly inside another sequence.
    cdef list output, keys

    # Initialise a new representer
    #
    # The file index is used to store into the Node instances so that the
    # provenance of the YAML can be tracked.
    #
    # Args:
    #   file_index (int): The index of this YAML file
    def __init__(self, int file_index):
        self._file_index = file_index
        self.state = RepresenterState.init
        self.output = []
        self.keys = []

    # Handle a YAML parse event
    #
    # Args:
    #   event (YAML Event): The event to be handled
    #
    # Raises:
    #   YAMLLoadError: Something went wrong.
    cdef void handle_event(self, event) except *:
        # Not every event type has an `anchor` attribute, hence getattr()
        if getattr(event, "anchor", None) is not None:
            raise YAMLLoadError("Anchors are disallowed in BuildStream at line {} column {}"
                                .format(event.start_mark.line, event.start_mark.column))

        cdef str event_name = event.__class__.__name__

        # Only tags from the core YAML namespace are tolerated on scalars
        if event_name == "ScalarEvent":
            if event.tag is not None:
                if not event.tag.startswith("tag:yaml.org,2002:"):
                    raise YAMLLoadError(
                        "Non-core tag expressed in input.  " +
                        "This is disallowed in BuildStream. At line {} column {}"
                        .format(event.start_mark.line, event.start_mark.column))

        # A NULL handler means this event is not acceptable in the current state
        cdef representer_action handler = self._get_handler_for_event(event_name)
        if not handler:
            raise YAMLLoadError(
                "Invalid input detected. No handler for {} in state {} at line {} column {}"
                .format(event, self.state, event.start_mark.line, event.start_mark.column))

        # Cython weirdness here, we need to pass self to the function
        self.state = <RepresenterState> handler(self, event)  # pylint: disable=not-callable

    # Get the output of the YAML parse
    #
    # Returns:
    #   (Node or None): Return the Node instance of the top level mapping or
    #                   None if there wasn't one.
    cdef Node get_output(self):
        if len(self.output):
            return self.output[0]
        return None

    # Select the handler for an event, given the current state
    #
    # Args:
    #   event_name (str): The class name of the YAML event
    #
    # Returns:
    #   (representer_action): The handler to call, or NULL if the event is
    #                         not acceptable in the current state
    cdef representer_action _get_handler_for_event(self, str event_name):
        if self.state == RepresenterState.wait_list_item:
            if event_name == "ScalarEvent":
                return self._handle_wait_list_item_ScalarEvent
            elif event_name == "MappingStartEvent":
                return self._handle_wait_list_item_MappingStartEvent
            elif event_name == "SequenceStartEvent":
                return self._handle_wait_list_item_SequenceStartEvent
            elif event_name == "SequenceEndEvent":
                return self._handle_wait_list_item_SequenceEndEvent
        elif self.state == RepresenterState.wait_value:
            if event_name == "ScalarEvent":
                return self._handle_wait_value_ScalarEvent
            elif event_name == "MappingStartEvent":
                return self._handle_wait_value_MappingStartEvent
            elif event_name == "SequenceStartEvent":
                return self._handle_wait_value_SequenceStartEvent
        elif self.state == RepresenterState.wait_key:
            if event_name == "ScalarEvent":
                return self._handle_wait_key_ScalarEvent
            elif event_name == "MappingEndEvent":
                return self._handle_wait_key_MappingEndEvent
        elif self.state == RepresenterState.stream:
            if event_name == "DocumentStartEvent":
                return self._handle_stream_DocumentStartEvent
            elif event_name == "StreamEndEvent":
                return self._handle_stream_StreamEndEvent
        elif self.state == RepresenterState.doc:
            if event_name == "MappingStartEvent":
                return self._handle_doc_MappingStartEvent
            elif event_name == "DocumentEndEvent":
                return self._handle_doc_DocumentEndEvent
        elif self.state == RepresenterState.init and event_name == "StreamStartEvent":
            return self._handle_init_StreamStartEvent
        return NULL

    # Stream start: nothing to record, move on to expecting documents
    cdef RepresenterState _handle_init_StreamStartEvent(self, object ev):
        return RepresenterState.stream

    # Document start: nothing to record, move on to expecting its content
    cdef RepresenterState _handle_stream_DocumentStartEvent(self, object ev):
        return RepresenterState.doc

    # A mapping starts: push a fresh mapping node onto the output stack.
    # Also reused by the other mapping start handlers, which additionally
    # attach the new node to its parent.
    cdef RepresenterState _handle_doc_MappingStartEvent(self, object ev):
        newmap = Node({}, self._file_index, ev.start_mark.line, ev.start_mark.column)
        self.output.append(newmap)
        return RepresenterState.wait_key

    # A mapping key was read: remember it until its value arrives
    cdef RepresenterState _handle_wait_key_ScalarEvent(self, object ev):
        self.keys.append(ev.value)
        return RepresenterState.wait_value

    # A scalar value was read: store it in the current mapping under the pending key
    cdef RepresenterState _handle_wait_value_ScalarEvent(self, object ev):
        key = self.keys.pop()
        (<dict> (<Node> self.output[-1]).value)[key] = \
            Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column)
        return RepresenterState.wait_key

    # A mapping starts as a value: push it, then store it in its parent
    # mapping (now second from the top of the stack) under the pending key
    cdef RepresenterState _handle_wait_value_MappingStartEvent(self, object ev):
        cdef RepresenterState new_state = self._handle_doc_MappingStartEvent(ev)
        key = self.keys.pop()
        (<dict> (<Node> self.output[-2]).value)[key] = self.output[-1]
        return new_state

    cdef RepresenterState _handle_wait_key_MappingEndEvent(self, object ev):
        # We've finished a mapping, so pop it off the output stack
        # unless it's the last one in which case we leave it
        if len(self.output) > 1:
            self.output.pop()
            # Resume filling the parent, which is either a list or a mapping
            if type((<Node> self.output[-1]).value) is list:
                return RepresenterState.wait_list_item
            else:
                return RepresenterState.wait_key
        else:
            return RepresenterState.doc

    # A sequence starts as a value: push it and store it in its parent
    # mapping.  The key is deliberately left on the key stack, it is popped
    # when the sequence ends.
    cdef RepresenterState _handle_wait_value_SequenceStartEvent(self, object ev):
        self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
        (<dict> (<Node> self.output[-2]).value)[self.keys[-1]] = self.output[-1]
        return RepresenterState.wait_list_item

    # A sequence starts inside a sequence: its position in the parent list
    # plays the role of the key, then the new list is pushed and appended
    # to its parent.
    cdef RepresenterState _handle_wait_list_item_SequenceStartEvent(self, object ev):
        self.keys.append(len((<Node> self.output[-1]).value))
        self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
        (<list> (<Node> self.output[-2]).value).append(self.output[-1])
        return RepresenterState.wait_list_item

    cdef RepresenterState _handle_wait_list_item_SequenceEndEvent(self, object ev):
        # When ending a sequence, we need to pop a key because we retain the
        # key until the end so that if we need to mutate the underlying entry
        # we can.
        key = self.keys.pop()
        self.output.pop()
        # An integer key means the finished sequence was an item of another
        # sequence, otherwise it was a value in a mapping
        if type(key) is int:
            return RepresenterState.wait_list_item
        else:
            return RepresenterState.wait_key

    # A scalar list item was read: append it to the current list
    cdef RepresenterState _handle_wait_list_item_ScalarEvent(self, object ev):
        (<Node> self.output[-1]).value.append(
            Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column))
        return RepresenterState.wait_list_item

    # A mapping starts as a list item: push it, then append it to its
    # parent list (now second from the top of the stack)
    cdef RepresenterState _handle_wait_list_item_MappingStartEvent(self, object ev):
        cdef RepresenterState new_state = self._handle_doc_MappingStartEvent(ev)
        (<list> (<Node> self.output[-2]).value).append(self.output[-1])
        return new_state

    # Document end: exactly the toplevel mapping must remain on the stack
    cdef RepresenterState _handle_doc_DocumentEndEvent(self, object ev):
        if len(self.output) != 1:
            raise YAMLLoadError("Zero, or more than one document found in YAML stream")
        return RepresenterState.stream

    # Stream end: return to the initial state
    cdef RepresenterState _handle_stream_StreamEndEvent(self, object ev):
        return RepresenterState.init


# load()
#
# Loads a dictionary from some YAML
#
# A FileInfo entry for the file is registered in _FILE_LIST before the file
# is read, and its index is recorded in the loaded nodes.
#
# Args:
#    filename (str): The YAML file to load
#    shortname (str): The filename in shorthand for error reporting (or None)
#    copy_tree (bool): Whether to make a copy, preserving the original toplevels
#                      for later serialization
#    project (Project): The (optional) project to associate the parsed YAML with
#
# Returns (Node): A loaded copy of the YAML file with provenance information
#
# Raises: LoadError
#
cpdef Node load(str filename, str shortname=None, bint copy_tree=False, object project=None):
    cdef str displayname
    cdef Py_ssize_t file_number

    if not shortname:
        shortname = filename

    # Files loaded through a junction are displayed with the junction name as prefix
    displayname = shortname
    if (project is not None) and (project.junction is not None):
        displayname = "{}:{}".format(project.junction.name, shortname)

    # The new entry goes at the end of the list, so its index is the current length
    file_number = len(_FILE_LIST)
    _FILE_LIST.append(FileInfo(filename, shortname, displayname, None, project))

    try:
        with open(filename) as f:
            contents = f.read()

        return load_data(contents,
                         file_index=file_number,
                         file_name=filename,
                         copy_tree=copy_tree)
    except FileNotFoundError as e:
        raise LoadError(LoadErrorReason.MISSING_FILE,
                        "Could not find file at {}".format(filename)) from e
    except IsADirectoryError as e:
        raise LoadError(LoadErrorReason.LOADING_DIRECTORY,
                        "{} is a directory. bst command expects a .bst file."
                        .format(filename)) from e
    except LoadError as e:
        # Prefix parse errors with the name of the file they came from
        raise LoadError(e.reason, "{}: {}".format(displayname, e)) from e


# load_data()
#
# Like load(), but doesn't require the data to be in a file
#
# Args:
#    data (str): The YAML text to parse
#    file_index (int): The index in _FILE_LIST of the file the data belongs
#                      to, or _SYNTHETIC_FILE_INDEX if there is none
#    file_name (str): The file name, only used for error reporting (or None)
#    copy_tree (bool): Whether to return a copy, preserving the original
#                      toplevel which is recorded in _FILE_LIST
#
# Returns (Node): The toplevel mapping node, an empty one if the data
#                 contained no document
#
# Raises: LoadError
#
cpdef Node load_data(str data, int file_index=_SYNTHETIC_FILE_INDEX, str file_name=None, bint copy_tree=False):
    cdef Representer rep
    cdef FileInfo f_info

    try:
        rep = Representer(file_index)
        parser = yaml.CParser(data)

        try:
            while parser.check_event():
                rep.handle_event(parser.get_event())
        finally:
            # Always release the parser, even when an event was rejected
            parser.dispose()

        contents = rep.get_output()
    except YAMLLoadError as e:
        raise LoadError(LoadErrorReason.INVALID_YAML,
                        "Malformed YAML:\n\n{}\n\n".format(e)) from e
    except Exception as e:
        # Anything else, typically errors raised by the YAML parser itself
        raise LoadError(LoadErrorReason.INVALID_YAML,
                        "Severely malformed YAML:\n\n{}\n\n".format(e)) from e

    if type(contents) != Node:
        # Special case allowance for None, when the loaded file has only comments in it.
        if contents is None:
            contents = Node({}, file_index, 0, 0)
        else:
            # Defensive only: the representer hands back either a Node or None.
            # Report the type of the offending object itself.
            raise LoadError(LoadErrorReason.INVALID_YAML,
                            "YAML file has content of type '{}' instead of expected type 'dict': {}"
                            .format(type(contents).__name__, file_name))

    # Store this away because we'll use it later for "top level" provenance.
    #
    # `file_index` is a C int and so is never None.  Synthetic data carries a
    # negative index and has no entry of its own in _FILE_LIST; indexing the
    # list with it would wrap around and clobber the most recently loaded
    # file's toplevel (or fail outright if nothing was loaded yet).
    if file_index >= 0:
        f_info = <FileInfo> _FILE_LIST[file_index]

        _FILE_LIST[file_index] = FileInfo(
            f_info.filename,
            f_info.shortname,
            f_info.displayname,
            contents,
            f_info.project,
        )

    if copy_tree:
        contents = node_copy(contents)
    return contents


# dump()
#
# Write a YAML node structure out to disk.
#
# This will always call `node_sanitize` on its input, so if you wanted
# to output something close to what you read in, consider using the
# `roundtrip_load` and `roundtrip_dump` function pair instead.
#
# Args:
#    contents (any): Content to write out
#    filename (str): The (optional) file name to write out to, passed on
#                    to `roundtrip_dump` as its `file` argument
def dump(object contents, str filename=None):
    sanitized = node_sanitize(contents)
    roundtrip_dump(sanitized, file=filename)


# node_get_provenance()
#
# Gets the provenance for a node
#
# Args:
#   node (Node): a dictionary node
#   key (str): key in the dictionary, or None for the provenance of
#              the dictionary itself
#   indices (list of indexes): Index path, in the case of list values
#
# Returns: The Provenance of the dict, member or list element
#
cpdef ProvenanceInformation node_get_provenance(Node node, str key=None, list indices=None):
    assert type(node.value) is dict

    if key is None:
        # Retrieving the provenance for this node directly
        return ProvenanceInformation(node)

    # Without an index path we want the member itself.  Only `indices` is
    # tested here: the key is known not to be None, and an empty string is
    # a legitimate key which must not fall through to the index walk below
    # (which would try to iterate over `indices` even when it is None).
    if not indices:
        return ProvenanceInformation(node.value.get(key))

    # Walk down the nested lists, one index at a time
    cdef Node nodeish = <Node> node.value.get(key)
    for idx in indices:
        nodeish = <Node> nodeish.value[idx]

    return ProvenanceInformation(nodeish)


# A sentinel to be used as a default argument for functions that need
# to distinguish between a kwarg set to None and an unset kwarg.  It is
# always compared by identity (`is _sentinel`).
_sentinel = object()


# node_get()
#
# Fetches a value from a dictionary node and checks it for
# an expected value. Use default_value when parsing a value
# which is only optionally supplied.
#
# Args:
#    node (Node): The dictionary node
#    expected_type (type): The expected type for the value being searched,
#                          or None to skip the type check
#    key (str): The key to get a value for in node
#    indices (list of ints): Optionally descend into lists of lists
#    default_value: Optionally return this value if the key is not found
#    allow_none: (bool): Allow None to be a valid value
#
# Returns:
#    The value if found in node, otherwise default_value is returned.
#    Dictionary values are returned as nodes, lists are returned as plain
#    lists and scalars as plain values.
#
# Raises:
#    LoadError, when the key is missing and no default_value was given,
#    or when the value found is not of the expected type
#
# Note:
#    Returned strings are stripped of leading and trailing whitespace
#
cpdef object node_get(Node node, object expected_type, str key, list indices=None, object default_value=_sentinel, bint allow_none=False):
    if indices is None:
        value = node.value.get(key, _sentinel)

        if value is _sentinel:
            if default_value is _sentinel:
                provenance = node_get_provenance(node)
                raise LoadError(LoadErrorReason.INVALID_DATA,
                                "{}: Dictionary did not contain expected key '{}'".format(provenance, key))

            # Wrap the default so that it goes through the same checks as a loaded value
            value = Node(default_value, _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter())
    else:
        # Implied type check of the element itself
        # No need to synthesise useful node content as we destructure it immediately
        value = Node(node_get(node, list, key), _SYNTHETIC_FILE_INDEX, 0, 0)
        for index in indices:
            value = value.value[index]
            # The fetched list holds plain values for scalars and lists, rewrap those
            if type(value) is not Node:
                value = Node(value, _SYNTHETIC_FILE_INDEX, 0, 0)

    # Optionally allow None as a valid value for any type.
    # Passing None as the default value also allows it.
    if value.value is None and (allow_none or default_value is None):
        return None

    if (expected_type is not None) and (type(value.value) is not expected_type):
        # Attempt basic conversions if possible, typically we want to
        # be able to specify numeric values and convert them to strings,
        # but we don't want to try converting dicts/lists
        try:
            if expected_type == bool and type(value.value) is str:
                # Don't coerce booleans to string, this makes "False" strings evaluate to True
                # We don't structure into full nodes since there's no need.
                if value.value in ('True', 'true'):
                    value = Node(True, _SYNTHETIC_FILE_INDEX, 0, 0)
                elif value.value in ('False', 'false'):
                    value = Node(False, _SYNTHETIC_FILE_INDEX, 0, 0)
                else:
                    raise ValueError()
            elif not (expected_type == list or
                      expected_type == dict or
                      isinstance(value.value, (list, dict))):
                # Scalar to scalar conversion, by calling the expected type
                value = Node(expected_type(value.value), _SYNTHETIC_FILE_INDEX, 0, 0)
            else:
                raise ValueError()
        except (ValueError, TypeError):
            # Every failed or refused conversion ends up here, and is
            # reported with the provenance of the offending value
            provenance = node_get_provenance(node, key=key, indices=indices)
            if indices:
                path = [key, *["[{:d}]".format(i) for i in indices]]
                path = "".join(path)
            else:
                path = key
            raise LoadError(LoadErrorReason.INVALID_DATA,
                            "{}: Value of '{}' is not of the expected type '{}'"
                            .format(provenance, path, expected_type.__name__))

    # Now collapse lists, and scalars, to their value, leaving nodes as-is
    if type(value.value) is not dict:
        value = value.value

    # Trim it at the bud, let all loaded strings from yaml be stripped of whitespace
    if type(value) is str:
        value = value.strip()

    elif type(value) is list:
        # Now we create a fresh list which unwraps the str and list types
        # semi-recursively.
        value = __trim_list_provenance(value)

    return value


# __trim_list_provenance()
#
# Builds a fresh list out of a list of nodes, in which scalar entries are
# replaced by their plain value and list entries by a recursively trimmed
# plain list.  Dictionary entries are kept as nodes.
#
# Args:
#    value (list): A list of Node
#
# Returns:
#    (list): The new list
#
cdef list __trim_list_provenance(list value):
    cdef list trimmed = []
    cdef Node item

    for item in value:
        inner = item.value
        inner_type = type(inner)
        if inner_type is dict:
            # Mappings keep their node, and thus their provenance
            trimmed.append(item)
        elif inner_type is list:
            trimmed.append(__trim_list_provenance(inner))
        else:
            trimmed.append(inner)
    return trimmed


# node_set()
#
# Set an item within the node.  If using `indices` be aware that the entry must
# already exist, or else an error will be raised by the lookup.  Use
# `node_extend_list` to create entries before using `node_set`
#
# Plain lists are converted to synthetic list nodes, nodes are stored as
# they are, and any other value is wrapped into a new node which inherits
# the provenance of the entry it replaces, if there is one.
#
# Args:
#    node (Node): The node
#    key (str): The key name
#    value: The value
#    indices: Any indices to index into the list referenced by key, like in
#             `node_get` (must be a list of integers).  The list is only
#             read, never modified.
#
cpdef void node_set(Node node, object key, object value, list indices=None) except *:
    cdef int idx

    if type(value) is list:
        value = __new_node_from_list(value)

    if indices:
        # The last index addresses the slot to assign, the ones before it
        # lead to the list holding that slot.  Slice instead of popping so
        # that the caller's list is left untouched.
        node = <Node> (<dict> node.value)[key]
        key = indices[-1]
        for idx in indices[:-1]:
            node = <Node> (<list> node.value)[idx]
    if type(value) is Node:
        node.value[key] = value
    else:
        try:
            # Need to do this just in case we're modifying a list
            old_value = <Node> node.value[key]
        except KeyError:
            old_value = None
        if old_value is None:
            # A new entry: reference the parent, with a synthetic column
            node.value[key] = Node(value, node.file_index, node.line, next_synthetic_counter())
        else:
            # A replaced entry: keep the provenance of the previous value
            node.value[key] = Node(value, old_value.file_index, old_value.line, old_value.column)


# node_extend_list()
#
# Extend a list inside a node to a given length, using the passed
# default value to fill it out.  The list is created if the key does
# not exist yet, and is left alone if it is already long enough.
#
# Valid default values are:
#    Any string
#    An empty dict
#    An empty list
#
# Args:
#    node (node): The node
#    key (str): The list name in the node
#    length (int): The length to extend the list to
#    default (any): The default value to extend with.
def node_extend_list(Node node, str key, Py_ssize_t length, object default):
    assert type(default) is str or default in ([], {})

    cdef Node list_node = <Node> node.value.get(key)
    if list_node is None:
        list_node = node.value[key] = Node([], node.file_index, node.line, next_synthetic_counter())

    cdef list the_list = list_node.value
    def_type = type(default)

    file_index = node.file_index
    if the_list:
        # Continue numbering after the last existing entry.  Entries are
        # Node instances, which are not subscriptable, so the line has to
        # be read through its attribute.
        line_num = (<Node> the_list[-1]).line
    else:
        line_num = list_node.line

    while length > len(the_list):
        # Create a fresh container for every entry, so that the new
        # entries never share a list or dict
        if def_type is str:
            value = default
        elif def_type is list:
            value = []
        else:
            value = {}

        line_num += 1

        the_list.append(Node(value, file_index, line_num, next_synthetic_counter()))


# node_items()
#
# A convenience generator for iterating over loaded key/value
# tuples in a dictionary loaded from project YAML.
#
# Args:
#    node (Node): The dictionary node
#
# Yields:
#    (str): The key name
#    (anything): The value for the key: the node itself for dictionaries,
#                a plain list for lists, and the plain value otherwise
#
def node_items(Node node):
    cdef str name
    cdef Node child

    for name, child in node.value.items():
        payload = child.value
        if type(payload) is list:
            yield (name, __trim_list_provenance(payload))
        elif type(payload) is dict:
            # Mappings are handed out as nodes
            yield (name, child)
        else:
            yield (name, payload)


# node_keys()
#
# A convenience function for getting the keys of a dictionary
# loaded from project YAML.
#
# Args:
#    node (Node): The dictionary node
#
# Returns:
#    (list): A new list holding the key names
#
cpdef list node_keys(Node node):
    keys = node.value.keys()
    return list(keys)


# node_del()
#
# Removes a key from a dictionary node.
#
# Args:
#    node (Node): The dictionary node
#    key (str): The key we want to remove
#    safe (bool): Whether to silently ignore a missing key, instead of
#                 letting the KeyError propagate
#
# Raises:
#    KeyError, when the key is missing and `safe` is not set
#
cpdef void node_del(Node node, str key, bint safe=False) except *:
    try:
        del node.value[key]
    except KeyError:
        if safe:
            return
        raise


# is_node()
#
# A test method which returns whether or not the passed in value
# is a valid YAML node.  It is not valid to call this on a Node
# object which is not a Mapping.
#
# Args:
#    maybenode (any): The object to test for nodeness
#
# Returns:
#    (bool): Whether or not maybenode was a Node
#
def is_node(maybenode):
    # Anything which is not exactly a Node is simply not a node
    if type(maybenode) is not Node:
        return False
    # Handing in a Node which does not wrap a mapping is a programming error
    assert type(maybenode.value) is dict
    return True


# new_synthetic_file()
#
# Create a new synthetic mapping node, with an associated file entry
# (in _FILE_LIST) such that later tracking can correctly determine which
# file needs writing to in order to persist the changes.
#
# Args:
#    filename (str): The name of the synthetic file to create
#    project (Project): The optional project to associate this synthetic file with
#
# Returns:
#    (Node): An empty YAML mapping node, whose provenance is to this new
#            synthetic file
#
def new_synthetic_file(str filename, object project=None):
    # The entry is appended below, so its index is the current length of the list
    cdef Py_ssize_t index = len(_FILE_LIST)
    cdef Node toplevel = Node({}, index, 0, 0)

    # The file name doubles as the short name
    info = FileInfo(filename, filename, "<synthetic {}>".format(filename), toplevel, project)
    _FILE_LIST.append(info)
    return toplevel


# new_empty_node()
#
# Args:
#    ref_node (Node): Optional node whose provenance should be referenced
#
# Returns
#    (Node): A new empty YAML mapping node.  With a reference node it takes
#            over that node's file and line and gets a fresh synthetic
#            column, otherwise it is purely synthetic.
#
def new_empty_node(Node ref_node=None):
    if ref_node is None:
        return Node({}, _SYNTHETIC_FILE_INDEX, 0, 0)
    return Node({}, ref_node.file_index, ref_node.line, next_synthetic_counter())


# new_node_from_dict()
#
# Recursively wrap a plain dictionary into synthetic nodes.  Nested
# dicts and lists are converted recursively, any other value is
# stored as its str() representation.
#
# Args:
#   indict (dict): The input dictionary
#
# Returns:
#   (Node): A new synthetic YAML tree which represents this dictionary
#
cpdef Node new_node_from_dict(dict indict):
    cdef dict members = {}
    cdef str name
    cdef Node child

    for name, item in indict.items():
        item_type = type(item)
        if item_type is dict:
            child = new_node_from_dict(item)
        elif item_type is list:
            child = __new_node_from_list(item)
        else:
            # Scalars are always held as strings
            child = Node(str(item), _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter())
        members[name] = child

    return Node(members, _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter())


# Internal function to help new_node_from_dict() to handle lists.
#
# Nested dicts and lists are converted recursively, any other element
# is stored as its str() representation.
cdef Node __new_node_from_list(list inlist):
    cdef list elements = []
    cdef Node child

    for item in inlist:
        item_type = type(item)
        if item_type is dict:
            child = new_node_from_dict(item)
        elif item_type is list:
            child = __new_node_from_list(item)
        else:
            child = Node(str(item), _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter())
        elements.append(child)

    return Node(elements, _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter())


# _is_composite_list
#
# Checks if the given node is a Mapping with array composition
# directives.
#
# Args:
#    node (value): Any node
#
# Returns:
#    (bool): True if node was a Mapping containing only
#            list composition directives
#
# Raises:
#    (LoadError): If node was a mapping and contained a mix of
#                 list composition directives and other keys
#
# Note: the `except *` clause is required for the LoadError documented
# above to actually reach the caller.  Without an exception clause, a
# cdef function returning a C type cannot report Python exceptions under
# Cython 0.x: the error is printed and ignored.
#
cdef bint _is_composite_list(Node node) except *:
    cdef bint has_directives = False
    cdef bint has_keys = False
    cdef str key

    # Only mappings can carry composition directives
    if type(node.value) is not dict:
        return False

    for key in node_keys(node):
        if key in ('(>)', '(<)', '(=)'):  # pylint: disable=simplifiable-if-statement
            has_directives = True
        else:
            has_keys = True

        # Bail out as soon as both kinds of keys have been seen
        if has_keys and has_directives:
            provenance = node_get_provenance(node)
            raise LoadError(LoadErrorReason.INVALID_DATA,
                            "{}: Dictionary contains array composition directives and arbitrary keys"
                            .format(provenance))

    return has_directives


# _compose_composite_list()
#
# Composes a composite list (i.e. a dict with list composition directives)
# on top of a target list which is a composite list itself.
#
# When the source carries a "(=)" directive, the target's directives are
# replaced by (or emptied in favour of) the source's.  Otherwise the
# source prefix is placed in front of the target prefix and the source
# suffix is placed after the target suffix.
#
# Args:
#    target (Node): A composite list
#    source (Node): A composite list
#
cdef void _compose_composite_list(Node target, Node source):
    incoming = source.value
    existing = target.value

    replacement = incoming.get("(=)")
    prepend = incoming.get("(<)")
    append = incoming.get("(>)")

    if replacement is None:
        # Not clobbering, so prefix the prefix and suffix the suffix
        if prepend is not None:
            if "(<)" not in existing:
                existing["(<)"] = prepend
            else:
                # Insert back to front so the source order is preserved
                for item in reversed(prepend.value):
                    existing["(<)"].value.insert(0, item)
        if append is not None:
            if "(>)" not in existing:
                existing["(>)"] = append
            else:
                existing["(>)"].value.extend(append.value)
        return

    # We want to clobber the target list, which basically means
    # replacing the target's directives with our own
    existing["(=)"] = replacement

    if prepend is not None:
        existing["(<)"] = prepend
    elif "(<)" in existing:
        existing["(<)"].value.clear()

    if append is not None:
        existing["(>)"] = append
    elif "(>)" in existing:
        existing["(>)"].value.clear()


# _compose_list()
#
# Compose a composite list (a dict with composition directives) on top of a
# simple list.  The target list is modified in place: "(=)" replaces its
# content, then "(<)" is prepended and "(>)" is appended.
#
# Args:
#    target (Node): The target list to be composed into
#    source (Node): The composition list to be composed from
#
cdef void _compose_list(Node target, Node source):
    directives = source.value
    items = target.value

    replacement = directives.get("(=)")
    prepend = directives.get("(<)")
    append = directives.get("(>)")

    if replacement is not None:
        items.clear()
        items.extend(replacement.value)

    if prepend is not None:
        # Insert back to front so the prefix keeps its own order
        for item in reversed(prepend.value):
            items.insert(0, item)

    if append is not None:
        items.extend(append.value)


# composite_dict()
#
# Compose one mapping node onto another, modifying target in place.
#
# For every key of source:
#   * a list replaces a missing value, a list or a composite list
#   * a composite list (a dict of list composition directives) fills
#     empty space, is applied to a list, or is merged into another
#     composite list
#   * a dict is composed recursively into the target's dict, which is
#     created if missing
#   * anything else (a scalar) replaces a missing value or a scalar
#
# Any other combination is an error.
#
# Args:
#    target (Node): The target to compose into
#    source (Node): The source to compose from
#    path   (list): The path to the current composition node
#
# Raises: CompositeError
#
cpdef void composite_dict(Node target, Node source, list path=None) except *:
    cdef str k
    cdef Node v, target_value

    if path is None:
        path = []
    for k, v in source.value.items():
        # The key stays on the path when an error is raised below, so
        # that the CompositeError carries the path of the failure.
        path.append(k)
        if type(v.value) is list:
            # List clobbers anything list-like
            target_value = target.value.get(k)
            if not (target_value is None or
                    type(target_value.value) is list or
                    _is_composite_list(target_value)):
                raise CompositeError(path,
                                     "{}: List cannot overwrite {} at: {}"
                                     .format(node_get_provenance(source, k),
                                             k,
                                             node_get_provenance(target, k)))
            # Looks good, clobber it
            target.value[k] = v
        elif _is_composite_list(v):
            if k not in target.value:
                # Composite list clobbers empty space
                target.value[k] = v
            elif type(target.value[k].value) is list:
                # Composite list composes into a list
                _compose_list(target.value[k], v)
            elif _is_composite_list(target.value[k]):
                # Composite list merges into composite list
                _compose_composite_list(target.value[k], v)
            else:
                # Else composing on top of normal dict or a scalar, so raise...
                raise CompositeError(path,
                                     "{}: Cannot compose lists onto {}".format(
                                         node_get_provenance(v),
                                         node_get_provenance(target.value[k])))
        elif type(v.value) is dict:
            # We're composing a dict into target now
            if k not in target.value:
                # Target lacks a dict at that point, make a fresh one with
                # the same provenance as the incoming dict
                target.value[k] = Node({}, v.file_index, v.line, v.column)
            target_value = target.value[k]
            # It is the value we are about to recurse into which must be
            # a dict; checking the enclosing target's value would always
            # pass and let the recursion operate on a scalar or a list.
            if type(target_value.value) is not dict:
                raise CompositeError(path,
                                     "{}: Cannot compose dictionary onto {}".format(
                                         node_get_provenance(v),
                                         node_get_provenance(target_value)))
            composite_dict(target_value, v, path)
        else:
            target_value = target.value.get(k)
            if target_value is not None and type(target_value.value) is not str:
                raise CompositeError(path,
                                     "{}: Cannot compose scalar on non-scalar at {}".format(
                                         node_get_provenance(v),
                                         node_get_provenance(target.value[k])))
            target.value[k] = v
        path.pop()


# Like composite_dict(), but raises an all purpose LoadError for convenience
#
# Both nodes are asserted to be mappings.  A CompositeError raised
# while composing is converted into a LoadError with the reason
# ILLEGAL_COMPOSITE, prefixed with the provenance of source when
# there is one.
#
cpdef void composite(Node target, Node source) except *:
    assert type(source.value) is dict
    assert type(target.value) is dict

    try:
        composite_dict(target, source)
    except CompositeError as e:
        source_provenance = node_get_provenance(source)
        error_prefix = "{}: ".format(source_provenance) if source_provenance else ""
        message = "{}Failure composing {}: {}".format(error_prefix, e.path, e.message)
        raise LoadError(LoadErrorReason.ILLEGAL_COMPOSITE, message) from e


# Like composite(target, source), but where target overrides source instead.
#
# The target is first composited on top of source, then every top level
# entry of source is assigned into target.  Both nodes are modified.
#
def composite_and_move(Node target, Node source):
    cdef str key
    cdef Node value
    cdef list to_delete

    # Layer target on top of source so that target's values win ...
    composite(source, target)

    # ... note which of target's keys did not end up in source ...
    to_delete = [key for key in target.value.keys() if key not in source.value]

    # ... and bring the composited entries back into target.
    for key, value in source.value.items():
        target.value[key] = value

    for key in to_delete:
        del target.value[key]


# Types we can short-circuit in node_sanitize for speed.
__SANITIZE_SHORT_CIRCUIT_TYPES = (int, float, str, bool)


# node_sanitize()
#
# Returns an alphabetically ordered recursive copy
# of the source node with internal provenance information stripped.
#
# Only dicts are ordered, list elements are left in order.
#
# Args:
#    node (any): The node, or plain value, to sanitize
#    dict_type (type): The mapping type to create for dictionaries
#
# Returns:
#    (any): The sanitized copy; None, integers, floats, strings, booleans
#           and values of unrecognized types are returned as they are
#
cpdef object node_sanitize(object node, object dict_type=OrderedDict):
    # If we have an unwrappable node, unwrap it
    if type(node) is Node:
        node = node.value

    # Short-circuit None which occurs ca. twice per element
    if node is None:
        return None

    value_type = type(node)

    # Next short-circuit integers, floats, strings and booleans
    if value_type in __SANITIZE_SHORT_CIRCUIT_TYPES:
        return node

    # Lists keep their order, only their elements are sanitized
    if value_type is list:
        return [node_sanitize(item, dict_type=dict_type) for item in node]

    # Dictionaries are rebuilt with their keys in sorted order
    if value_type is dict:
        sanitized = dict_type()
        for key in sorted(node):
            sanitized[key] = node_sanitize(node[key], dict_type=dict_type)
        return sanitized

    # Sometimes we're handed tuples and we can't be sure what they contain
    # so we have to sanitize into them
    if value_type is tuple:
        return tuple([node_sanitize(item, dict_type=dict_type) for item in node])

    # Everything else just gets returned as-is.
    return node


# node_validate()
#
# Validate the node so as to ensure the user has not specified
# any keys which are unrecognized by buildstream (usually this
# means a typo which would otherwise not trigger an error).
#
# The first unrecognized key encountered is the one reported.
#
# Args:
#    node (Node): A dictionary loaded from YAML
#    valid_keys (list): A list of valid keys for the specified node
#
# Raises:
#    LoadError: In the case that the specified node contained
#               one or more invalid keys
#
cpdef void node_validate(Node node, list valid_keys) except *:
    cdef str key

    # A set gives cheap membership tests, see https://stackoverflow.com/a/23062482
    cdef set recognized = set(valid_keys)

    for key in node.value:
        if key in recognized:
            continue

        provenance = node_get_provenance(node, key=key)
        raise LoadError(LoadErrorReason.INVALID_DATA,
                        "{}: Unexpected key: {}".format(provenance, key))


# Node copying
#
# Unfortunately we copy nodes a *lot* and `isinstance()` is super-slow when
# things from collections.abc get involved.  The result is the following
# intricate but substantially faster group of tuples and the use of `in`.
#
# If any of the {node,list}_copy routines raise a ValueError
# then it's likely additional types need adding to these tuples.


# These types just have their value copied: node_copy() and _list_copy()
# place the existing Node for such a value into the copy as it is, and
# raise a ValueError for any value type which is neither a dict, a list,
# nor one of these.
__QUICK_TYPES = (str, bool)

# These are the directives used to compose lists, we need this because it's
# slightly faster during the node_final_assertions checks
__NODE_ASSERT_COMPOSITION_DIRECTIVES = ('(>)', '(<)', '(=)')


# node_copy()
#
# Make a deep copy of the given YAML node, preserving provenance.
#
# Nested dictionaries and lists are copied recursively, while the
# nodes holding values of the quick types are placed into the copy
# as they are.
#
# Args:
#    source (Node): The YAML node to copy
#
# Returns:
#    (Node): A deep copy of source with provenance preserved.
#
# Raises:
#    (ValueError): If a value of an unexpected type is encountered
#
cpdef Node node_copy(Node source):
    cdef dict members = {}
    cdef str key
    cdef Node value

    for key, value in source.value.items():
        value_type = type(value.value)
        if value_type in __QUICK_TYPES:
            members[key] = value
        elif value_type is dict:
            members[key] = node_copy(value)
        elif value_type is list:
            members[key] = _list_copy(value)
        else:
            raise ValueError("Unable to be quick about node_copy of {}".format(value_type))

    return Node(members, source.file_index, source.line, source.column)


# Internal function to help node_copy() but for lists.
#
# Nested dictionaries and lists are copied recursively, nodes holding
# values of the quick types are placed into the copy as they are, and
# anything else raises a ValueError.
cdef Node _list_copy(Node source):
    cdef list elements = []
    cdef Node item

    for item in source.value:
        item_type = type(item.value)

        if item_type in __QUICK_TYPES:
            elements.append(item)
        elif item_type is dict:
            elements.append(node_copy(item))
        elif item_type is list:
            elements.append(_list_copy(item))
        else:
            raise ValueError("Unable to be quick about list_copy of {}".format(item_type))

    return Node(elements, source.file_index, source.line, source.column)


# node_final_assertions()
#
# This must be called on a fully loaded and composited node,
# after all composition has completed.
#
# The whole tree below the node is checked, including dictionaries
# nested inside lists.
#
# Args:
#    node (Mapping): The final composited node
#
# Raises:
#    (LoadError): If any assertions fail
#
cpdef void node_final_assertions(Node node) except *:
    cdef str key
    cdef Node value

    for key, value in node.value.items():

        # List composition directives must not remain at this point: a
        # leftover directive indicates that the user intended to override
        # a list which never existed in the underlying data
        #
        if key in __NODE_ASSERT_COMPOSITION_DIRECTIVES:
            raise LoadError(LoadErrorReason.TRAILING_LIST_DIRECTIVE,
                            "{}: Attempt to override non-existing list"
                            .format(node_get_provenance(node, key)))

        # Descend into containers, scalars need no further checking
        child_type = type(value.value)
        if child_type is list:
            _list_final_assertions(value)
        elif child_type is dict:
            node_final_assertions(value)


# Helper function for node_final_assertions(), but for lists.
#
# Descends into every dictionary and list found in the given list node.
def _list_final_assertions(Node values):
    for item in values.value:
        item_type = type(item.value)

        if item_type is list:
            _list_final_assertions(item)
        elif item_type is dict:
            node_final_assertions(item)


# assert_symbol_name()
#
# A helper function to check if a loaded string is a valid symbol
# name and to raise a consistent LoadError if not. For strings which
# are required to be symbols.
#
# A valid symbol name is not empty, does not start with a digit and
# consists only of ASCII letters, digits, underscores and, when
# allowed, dashes.
#
# Args:
#    provenance (Provenance): The provenance of the loaded symbol, or None
#    symbol_name (str): The loaded symbol name
#    purpose (str): The purpose of the string, for an error message
#    allow_dashes (bool): Whether dashes are allowed for this symbol
#
# Raises:
#    LoadError: If the symbol_name is invalid
#
# Note that dashes are generally preferred for variable names and
# usage in YAML, but things such as option names which will be
# evaluated with jinja2 cannot use dashes.
def assert_symbol_name(ProvenanceInformation provenance, str symbol_name, str purpose, *, bint allow_dashes=True):
    cdef str allowed = string.digits + string.ascii_letters + '_'
    if allow_dashes:
        allowed += '-'

    # Nothing more to do for a well formed symbol name
    if (symbol_name and
            all(char in allowed for char in symbol_name) and
            symbol_name[0] not in string.digits):
        return

    detail = ("Symbol names must contain only alphanumeric characters, "
              "may not start with a digit, and may contain underscores")
    if allow_dashes:
        detail += " or dashes"

    message = "Invalid symbol name for {}: '{}'".format(purpose, symbol_name)
    if provenance is not None:
        message = "{}: {}".format(provenance, message)

    raise LoadError(LoadErrorReason.INVALID_SYMBOL_NAME,
                    message, detail=detail)


# node_find_target()
#
# Searches the given node tree for the given target node.
#
# This is typically used when trying to walk a path to a given node
# for the purpose of then modifying a similar tree of objects elsewhere
#
# If the key is provided, then we actually hunt for the node represented by
# target[key] and return its container, rather than hunting for target directly
#
# Nodes are matched by comparing their file index, line and column.
#
# Args:
#    node (Node): The node at the root of the tree to search
#    target (Node): The node you are looking for in that tree
#    key (str): Optional string key within target node
#
# Returns:
#    (list): A path from `node` to `target` or None if `target` is not in the subtree
cpdef list node_find_target(Node node, Node target, str key=None):
    cdef list path = []

    if key is not None:
        target = target.value[key]

    if not _walk_find_target(node, path, target):
        return None

    # Use the same test as for the lookup above, so that an empty
    # string key is stripped from the path like any other key.
    if key is not None:
        # Remove key from end of path
        path = path[:-1]
    return path


# Helper for node_find_target() which walks a value
#
# A node matches the target when its file index, line and column are
# all equal to the target's.  Otherwise the search continues into
# dictionaries and lists, and fails on anything else.
cdef bint _walk_find_target(Node node, list path, Node target):
    cdef bint same_origin = (node.file_index == target.file_index and
                             node.line == target.line and
                             node.column == target.column)
    if same_origin:
        return True

    value_type = type(node.value)
    if value_type is dict:
        return _walk_dict_node(node, path, target)
    if value_type is list:
        return _walk_list_node(node, path, target)
    return False


# Helper for node_find_target() which walks a list
#
# The index of the element being searched is kept at the end of path
# when the target is found below it, and removed again otherwise.
cdef bint _walk_list_node(Node node, list path, Node target):
    cdef int index
    cdef Node element

    for index, element in enumerate(node.value):
        path.append(index)
        if _walk_find_target(element, path, target):
            return True
        path.pop()

    return False


# Helper for node_find_target() which walks a mapping
#
# The key of the value being searched is kept at the end of path
# when the target is found below it, and removed again otherwise.
cdef bint _walk_dict_node(Node node, list path, Node target):
    cdef str name
    cdef Node member

    for name, member in node.value.items():
        path.append(name)
        if _walk_find_target(member, path, target):
            return True
        path.pop()

    return False


###############################################################################

# Roundtrip code

# Always represent things consistently:

yaml.RoundTripRepresenter.add_representer(OrderedDict,
                                          yaml.SafeRepresenter.represent_dict)

# Always parse things consistently: scalars carrying any of these
# tags are constructed with the string constructor.

for _scalar_tag in (u'tag:yaml.org,2002:int',
                    u'tag:yaml.org,2002:float',
                    u'tag:yaml.org,2002:bool',
                    u'tag:yaml.org,2002:null',
                    u'tag:yaml.org,2002:timestamp'):
    yaml.RoundTripConstructor.add_constructor(_scalar_tag,
                                              yaml.RoundTripConstructor.construct_yaml_str)

# Do not leave the loop variable behind in the module namespace
del _scalar_tag


# HardlineDumper
#
# This is a dumper used during roundtrip_dump which forces every scalar to be
# a plain string, in order to match the output format to the input format.
#
# If you discover something is broken, please add a test case to the roundtrip
# test in tests/internals/yaml/roundtrip-test.yaml
#
class HardlineDumper(yaml.RoundTripDumper):
    def __init__(self, *args, **kwargs):
        super(HardlineDumper, self).__init__(*args, **kwargs)

        # Register a catch-all string resolver for YAML 1.1, YAML 1.2
        # and for the None version, so that everything is resolved as
        # a plain string
        for yaml_version in ((1, 1), (1, 2), None):
            self.add_version_implicit_resolver(yaml_version,
                                               u'tag:yaml.org,2002:str',
                                               yaml.util.RegExp(r'.*'),
                                               None)


# roundtrip_load()
#
# Load a YAML file into memory in a form which allows roundtripping as best
# as ruamel permits.
#
# Note, the returned objects can be treated as Mappings and Lists and Strings
# but replacing content wholesale with plain dicts and lists may result
# in a loss of comments and formatting.
#
# Args:
#    filename (str): The file to load in
#    allow_missing (bool): Optionally set this to True to allow missing files,
#                          in which case an empty dictionary is returned
#
# Returns:
#    (Mapping): The loaded YAML mapping.
#
# Raises:
#    (LoadError): If the file is missing, or a directory, this is raised.
#                 Also if the YAML is malformed.
#
def roundtrip_load(filename, *, allow_missing=False):
    try:
        with open(filename, "r") as stream:
            text = stream.read()
        return roundtrip_load_data(text, filename=filename)
    except IsADirectoryError as e:
        raise LoadError(LoadErrorReason.LOADING_DIRECTORY,
                        "{} is a directory."
                        .format(filename)) from e
    except FileNotFoundError as e:
        if not allow_missing:
            raise LoadError(LoadErrorReason.MISSING_FILE,
                            "Could not find file at {}".format(filename)) from e

        # Missing files are always empty dictionaries
        return {}


# roundtrip_load_data()
#
# Parse the given contents as YAML, returning them as a roundtrippable data
# structure.
#
# A lack of content will be returned as an empty mapping.
#
# Args:
#    contents (str): The contents to be parsed as YAML
#    filename (str): Optional filename to be used in error reports
#
# Returns:
#    (Mapping): The loaded YAML mapping
#
# Raises:
#    (LoadError): Raised on invalid YAML, or YAML which parses to something other
#                 than a Mapping
#
def roundtrip_load_data(contents, *, filename=None):
    try:
        loaded = yaml.load(contents, yaml.RoundTripLoader, preserve_quotes=True)
    except (yaml.scanner.ScannerError, yaml.composer.ComposerError, yaml.parser.ParserError) as e:
        raise LoadError(LoadErrorReason.INVALID_YAML,
                        "Malformed YAML:\n\n{}\n\n{}\n".format(e.problem, e.problem_mark)) from e

    # Special case empty files at this point, we make them
    # empty mappings like the main Node loader
    if loaded is None:
        return {}

    if isinstance(loaded, Mapping):
        return loaded

    raise LoadError(LoadErrorReason.INVALID_YAML,
                    "YAML file has content of type '{}' instead of expected type 'dict': {}"
                    .format(type(loaded).__name__, filename))


# roundtrip_dump()
#
# Dumps the given contents as a YAML file.  Ideally the contents came from
# parsing with `roundtrip_load` or `roundtrip_load_data` so that they will be
# dumped in the same form as they came from.
#
# The contents are deep copied first and every value in the copy which is
# neither a string, a mapping nor a sequence is converted with str(), the
# object passed in is left untouched.
#
# If `file` is a string, it is the filename to write to, if `file` has a
# `write` method, it's treated as a stream, otherwise output is to stdout.
#
# Args:
#    contents (Mapping or list): The content to write out as YAML.
#    file (any): The file to write to
#
def roundtrip_dump(contents, file=None):
    assert type(contents) is not Node

    # Convert, in place, every non-container value of a mapping to a string
    def stringify_dict(thing):
        for k, v in thing.items():
            if type(v) is str:
                pass
            elif isinstance(v, Mapping):
                stringify_dict(v)
            elif isinstance(v, Sequence):
                stringify_list(v)
            else:
                thing[k] = str(v)

    # Convert, in place, every non-container element of a sequence to a string
    def stringify_list(thing):
        for i, v in enumerate(thing):
            if type(v) is str:
                pass
            elif isinstance(v, Mapping):
                stringify_dict(v)
            elif isinstance(v, Sequence):
                stringify_list(v)
            else:
                thing[i] = str(v)

    contents = deepcopy(contents)

    # A list is a documented kind of toplevel content, so it must not be
    # handed to the mapping walker which relies on `.items()`.
    if isinstance(contents, Sequence) and not isinstance(contents, (str, Mapping)):
        stringify_list(contents)
    else:
        # Mappings are handled as before, and anything else still fails
        # here in exactly the way it always has.
        stringify_dict(contents)

    with ExitStack() as stack:
        if type(file) is str:
            # Imported here rather than at the top of the module
            from . import utils
            f = stack.enter_context(utils.save_file_atomic(file, 'w'))
        elif hasattr(file, 'write'):
            f = file
        else:
            f = sys.stdout
        yaml.round_trip_dump(contents, f, Dumper=HardlineDumper)