Tarball conversion

author: Lorry <lorry@roadtrain.codethink.co.uk> 2012-08-22 15:47:16 +0100
committer: Lorry <lorry@roadtrain.codethink.co.uk> 2012-08-22 15:47:16 +0100
commit: 25335618bf8755ce6b116ee14f47f5a1f2c821e9 (patch)
tree: d889d7ab3f9f985d0c54c534cb8052bd2e6d7163 /bzrlib/xml_serializer.py
download: bzr-tarball-25335618bf8755ce6b116ee14f47f5a1f2c821e9.tar.gz
1 files changed, 439 insertions, 0 deletions
diff --git a/bzrlib/xml_serializer.py b/bzrlib/xml_serializer.py
new file mode 100644
index 0000000..6cb6556
--- /dev/null
+++ b/bzrlib/xml_serializer.py
@@ -0,0 +1,439 @@
+# Copyright (C) 2005-2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""XML externalization support."""
+
+from __future__ import absolute_import
+
+# "XML is like violence: if it doesn't solve your problem, you aren't
+# using enough of it." -- various
+
+# importing this module is fairly slow because it has to load several
+# ElementTree bits
+
+import re
+
+try:
+    import xml.etree.cElementTree as elementtree
+    ParseError = getattr(elementtree, "ParseError", SyntaxError)
+except ImportError:
+    # Fall back to pure python implementation if C extension is unavailable
+    import xml.etree.ElementTree as elementtree
+    try:
+        from xml.etree.ElementTree import ParseError
+    except ImportError:
+        from xml.parsers.expat import ExpatError as ParseError
+
+(ElementTree, SubElement, Element, XMLTreeBuilder, fromstring, tostring) = (
+    elementtree.ElementTree, elementtree.SubElement, elementtree.Element,
+    elementtree.XMLTreeBuilder, elementtree.fromstring, elementtree.tostring)
+
+
+from bzrlib import (
+    cache_utf8,
+    errors,
+    inventory,
+    lazy_regex,
+    serializer,
+    )
+
+
+class XMLSerializer(serializer.Serializer):
+    """Abstract XML object serialize/deserialize"""
+
+    squashes_xml_invalid_characters = True
+
+    def read_inventory_from_string(self, xml_string, revision_id=None,
+                                   entry_cache=None, return_from_cache=False):
+        """Read xml_string into an inventory object.
+
+        :param xml_string: The xml to read.
+        :param revision_id: If not-None, the expected revision id of the
+            inventory. Some serialisers use this to set the results' root
+            revision. This should be supplied for deserialising all
+            from-repository inventories so that xml5 inventories that were
+            serialised without a revision identifier can be given the right
+            revision id (but not for working tree inventories where users can
+            edit the data without triggering checksum errors or anything).
+        :param entry_cache: An optional cache of InventoryEntry objects. If
+            supplied we will look up entries via (file_id, revision_id) which
+            should map to a valid InventoryEntry (File/Directory/etc) object.
+        :param return_from_cache: Return entries directly from the cache,
+            rather than copying them first. This is only safe if the caller
+            promises not to mutate the returned inventory entries, but it can
+            make some operations significantly faster.
+        """
+        try:
+            return self._unpack_inventory(fromstring(xml_string), revision_id,
+                                          entry_cache=entry_cache,
+                                          return_from_cache=return_from_cache)
+        except ParseError, e:
+            raise errors.UnexpectedInventoryFormat(e)
+
+    def read_inventory(self, f, revision_id=None):
+        try:
+            try:
+                return self._unpack_inventory(self._read_element(f),
+                    revision_id=None)
+            finally:
+                f.close()
+        except ParseError, e:
+            raise errors.UnexpectedInventoryFormat(e)
+
+    def write_revision(self, rev, f):
+        self._write_element(self._pack_revision(rev), f)
+
+    def write_revision_to_string(self, rev):
+        return tostring(self._pack_revision(rev)) + '\n'
+
+    def read_revision(self, f):
+        return self._unpack_revision(self._read_element(f))
+
+    def read_revision_from_string(self, xml_string):
+        return self._unpack_revision(fromstring(xml_string))
+
+    def _write_element(self, elt, f):
+        ElementTree(elt).write(f, 'utf-8')
+        f.write('\n')
+
+    def _read_element(self, f):
+        return ElementTree().parse(f)
+
+
+def escape_invalid_chars(message):
+    """Escape the XML-invalid characters in a commit message.
+
+    :param message: Commit message to escape
+    :return: tuple with escaped message and number of characters escaped
+    """
+    if message is None:
+        return None, 0
+    # Python strings can include characters that can't be
+    # represented in well-formed XML; escape characters that
+    # aren't listed in the XML specification
+    # (http://www.w3.org/TR/REC-xml/#NT-Char).
+    return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
+            lambda match: match.group(0).encode('unicode_escape'),
+            message)
+
+
+def get_utf8_or_ascii(a_str, _encode_utf8=cache_utf8.encode):
+    """Return a cached version of the string.
+
+    cElementTree will return a plain string if the XML is plain ascii. It only
+    returns Unicode when it needs to. We want to work in utf-8 strings. So if
+    cElementTree returns a plain string, we can just return the cached version.
+    If it is Unicode, then we need to encode it.
+
+    :param a_str: An 8-bit string or Unicode as returned by
+                  cElementTree.Element.get()
+    :return: A utf-8 encoded 8-bit string.
+    """
+    # This is fairly optimized because we know what cElementTree does, this is
+    # not meant as a generic function for all cases. Because it is possible for
+    # an 8-bit string to not be ascii or valid utf8.
+    if a_str.__class__ is unicode:
+        return _encode_utf8(a_str)
+    else:
+        return intern(a_str)
+
+
+_utf8_re = lazy_regex.lazy_compile('[&<>\'\"]|[\x80-\xff]+')
+_unicode_re = lazy_regex.lazy_compile(u'[&<>\'\"\u0080-\uffff]')
+
+
+_xml_escape_map = {
+    "&":'&amp;',
+    "'":"&apos;", # FIXME: overkill
+    "\"":"&quot;",
+    "<":"&lt;",
+    ">":"&gt;",
+    }
+
+
+def _unicode_escape_replace(match, _map=_xml_escape_map):
+    """Replace a string of non-ascii, non XML safe characters with their escape
+
+    This will escape both Standard XML escapes, like <>"', etc.
+    As well as escaping non ascii characters, because ElementTree did.
+    This helps us remain compatible to older versions of bzr. We may change
+    our policy in the future, though.
+    """
+    # jam 20060816 Benchmarks show that try/KeyError is faster if you
+    # expect the entity to rarely miss. There is about a 10% difference
+    # in overall time. But if you miss frequently, then if None is much
+    # faster. For our use case, we *rarely* have a revision id, file id
+    # or path name that is unicode. So use try/KeyError.
+    try:
+        return _map[match.group()]
+    except KeyError:
+        return "&#%d;" % ord(match.group())
+
+
+def _utf8_escape_replace(match, _map=_xml_escape_map):
+    """Escape utf8 characters into XML safe ones.
+
+    This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
+    or it is handling characters with the high-bit set. For ascii characters,
+    we just lookup the replacement in the dictionary. For everything else, we
+    decode back into Unicode, and then use the XML escape code.
+    """
+    try:
+        return _map[match.group()]
+    except KeyError:
+        return ''.join('&#%d;' % ord(uni_chr)
+                       for uni_chr in match.group().decode('utf8'))
+
+
+_to_escaped_map = {}
+
+def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
+    """Encode the string into utf8, and escape invalid XML characters"""
+    # We frequently get entities we have not seen before, so it is better
+    # to check if None, rather than try/KeyError
+    text = _map.get(unicode_or_utf8_str)
+    if text is None:
+        if unicode_or_utf8_str.__class__ is unicode:
+            # The alternative policy is to do a regular UTF8 encoding
+            # and then escape only XML meta characters.
+            # Performance is equivalent once you use cache_utf8. *However*
+            # this makes the serialized texts incompatible with old versions
+            # of bzr. So no net gain. (Perhaps the read code would handle utf8
+            # better than entity escapes, but cElementTree seems to do just fine
+            # either way)
+            text = str(_unicode_re.sub(_unicode_escape_replace,
+                                       unicode_or_utf8_str)) + '"'
+        else:
+            # Plain strings are considered to already be in utf-8 so we do a
+            # slightly different method for escaping.
+            text = _utf8_re.sub(_utf8_escape_replace,
+                                unicode_or_utf8_str) + '"'
+        _map[unicode_or_utf8_str] = text
+    return text
+
+
+def _clear_cache():
+    """Clean out the unicode => escaped map"""
+    _to_escaped_map.clear()
+
+
+def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
+    elt_get = elt.get
+    file_id = elt_get('file_id')
+    revision = elt_get('revision')
+    # Check and see if we have already unpacked this exact entry
+    # Some timings for "repo.revision_trees(last_100_revs)"
+    #               bzr     mysql
+    #   unmodified  4.1s    40.8s
+    #   using lru   3.5s
+    #   using fifo  2.83s   29.1s
+    #   lru._cache  2.8s
+    #   dict        2.75s   26.8s
+    #   inv.add     2.5s    26.0s
+    #   no_copy     2.00s   20.5s
+    #   no_c,dict   1.95s   18.0s
+    # Note that a cache of 10k nodes is more than sufficient to hold all of
+    # the inventory for the last 100 revs for bzr, but not for mysql (20k
+    # is enough for mysql, which saves the same 2s as using a dict)
+
+    # Breakdown of mysql using time.clock()
+    #   4.1s    2 calls to element.get for file_id, revision_id
+    #   4.5s    cache_hit lookup
+    #   7.1s    InventoryFile.copy()
+    #   2.4s    InventoryDirectory.copy()
+    #   0.4s    decoding unique entries
+    #   1.6s    decoding entries after FIFO fills up
+    #   0.8s    Adding nodes to FIFO (including flushes)
+    #   0.1s    cache miss lookups
+    # Using an LRU cache
+    #   4.1s    2 calls to element.get for file_id, revision_id
+    #   9.9s    cache_hit lookup
+    #   10.8s   InventoryEntry.copy()
+    #   0.3s    cache miss lookus
+    #   1.2s    decoding entries
+    #   1.0s    adding nodes to LRU
+    if entry_cache is not None and revision is not None:
+        key = (file_id, revision)
+        try:
+            # We copy it, because some operations may mutate it
+            cached_ie = entry_cache[key]
+        except KeyError:
+            pass
+        else:
+            # Only copying directory entries drops us 2.85s => 2.35s
+            if return_from_cache:
+                if cached_ie.kind == 'directory':
+                    return cached_ie.copy()
+                return cached_ie
+            return cached_ie.copy()
+
+    kind = elt.tag
+    if not inventory.InventoryEntry.versionable_kind(kind):
+        raise AssertionError('unsupported entry kind %s' % kind)
+
+    file_id = get_utf8_or_ascii(file_id)
+    if revision is not None:
+        revision = get_utf8_or_ascii(revision)
+    parent_id = elt_get('parent_id')
+    if parent_id is not None:
+        parent_id = get_utf8_or_ascii(parent_id)
+
+    if kind == 'directory':
+        ie = inventory.InventoryDirectory(file_id,
+                                          elt_get('name'),
+                                          parent_id)
+    elif kind == 'file':
+        ie = inventory.InventoryFile(file_id,
+                                     elt_get('name'),
+                                     parent_id)
+        ie.text_sha1 = elt_get('text_sha1')
+        if elt_get('executable') == 'yes':
+            ie.executable = True
+        v = elt_get('text_size')
+        ie.text_size = v and int(v)
+    elif kind == 'symlink':
+        ie = inventory.InventoryLink(file_id,
+                                     elt_get('name'),
+                                     parent_id)
+        ie.symlink_target = elt_get('symlink_target')
+    elif kind == 'tree-reference':
+        file_id = elt.attrib['file_id']
+        name = elt.attrib['name']
+        parent_id = elt.attrib['parent_id']
+        revision = elt.get('revision')
+        reference_revision = elt.get('reference_revision')
+        ie = inventory.TreeReference(file_id, name, parent_id, revision,
+                                       reference_revision)
+    else:
+        raise errors.UnsupportedInventoryKind(kind)
+    ie.revision = revision
+    if revision is not None and entry_cache is not None:
+        # We cache a copy() because callers like to mutate objects, and
+        # that would cause the item in cache to mutate as well.
+        # This has a small effect on many-inventory performance, because
+        # the majority fraction is spent in cache hits, not misses.
+        entry_cache[key] = ie.copy()
+
+    return ie
+
+
+def unpack_inventory_flat(elt, format_num, unpack_entry,
+            entry_cache=None, return_from_cache=False):
+    """Unpack a flat XML inventory.
+
+    :param elt: XML element for the inventory
+    :param format_num: Expected format number
+    :param unpack_entry: Function for unpacking inventory entries
+    :return: An inventory
+    :raise UnexpectedInventoryFormat: When unexpected elements or data is
+        encountered
+    """
+    if elt.tag != 'inventory':
+        raise errors.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
+    format = elt.get('format')
+    if format != format_num:
+        raise errors.UnexpectedInventoryFormat('Invalid format version %r'
+                                               % format)
+    revision_id = elt.get('revision_id')
+    if revision_id is not None:
+        revision_id = cache_utf8.encode(revision_id)
+    inv = inventory.Inventory(root_id=None, revision_id=revision_id)
+    for e in elt:
+        ie = unpack_entry(e, entry_cache, return_from_cache)
+        inv.add(ie)
+    return inv
+
+
+def serialize_inventory_flat(inv, append, root_id, supported_kinds, working):
+    """Serialize an inventory to a flat XML file.
+
+    :param inv: Inventory to serialize
+    :param append: Function for writing a line of output
+    :param working: If True skip history data - text_sha1, text_size,
+        reference_revision, symlink_target.    self._check_revisions(inv)
+    """
+    entries = inv.iter_entries()
+    # Skip the root
+    root_path, root_ie = entries.next()
+    for path, ie in entries:
+        if ie.parent_id != root_id:
+            parent_str = ' parent_id="'
+            parent_id  = encode_and_escape(ie.parent_id)
+        else:
+            parent_str = ''
+            parent_id  = ''
+        if ie.kind == 'file':
+            if ie.executable:
+                executable = ' executable="yes"'
+            else:
+                executable = ''
+            if not working:
+                append('<file%s file_id="%s name="%s%s%s revision="%s '
+                    'text_sha1="%s" text_size="%d" />\n' % (
+                    executable, encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name), parent_str, parent_id,
+                    encode_and_escape(ie.revision), ie.text_sha1,
+                    ie.text_size))
+            else:
+                append('<file%s file_id="%s name="%s%s%s />\n' % (
+                    executable, encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name), parent_str, parent_id))
+        elif ie.kind == 'directory':
+            if not working:
+                append('<directory file_id="%s name="%s%s%s revision="%s '
+                    '/>\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id,
+                    encode_and_escape(ie.revision)))
+            else:
+                append('<directory file_id="%s name="%s%s%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id))
+        elif ie.kind == 'symlink':
+            if not working:
+                append('<symlink file_id="%s name="%s%s%s revision="%s '
+                    'symlink_target="%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id,
+                    encode_and_escape(ie.revision),
+                    encode_and_escape(ie.symlink_target)))
+            else:
+                append('<symlink file_id="%s name="%s%s%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id))
+        elif ie.kind == 'tree-reference':
+            if ie.kind not in supported_kinds:
+                raise errors.UnsupportedInventoryKind(ie.kind)
+            if not working:
+                append('<tree-reference file_id="%s name="%s%s%s '
+                    'revision="%s reference_revision="%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id,
+                    encode_and_escape(ie.revision),
+                    encode_and_escape(ie.reference_revision)))
+            else:
+                append('<tree-reference file_id="%s name="%s%s%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id))
+        else:
+            raise errors.UnsupportedInventoryKind(ie.kind)
+    append('</inventory>\n')
author	Lorry <lorry@roadtrain.codethink.co.uk>	2012-08-22 15:47:16 +0100
committer	Lorry <lorry@roadtrain.codethink.co.uk>	2012-08-22 15:47:16 +0100
commit	25335618bf8755ce6b116ee14f47f5a1f2c821e9 (patch)
tree	d889d7ab3f9f985d0c54c534cb8052bd2e6d7163 /bzrlib/xml_serializer.py
download	bzr-tarball-25335618bf8755ce6b116ee14f47f5a1f2c821e9.tar.gz