diff options
author | Lennart Regebro <regebro@gmail.com> | 2018-11-22 13:26:17 +0100 |
---|---|---|
committer | Lennart Regebro <regebro@gmail.com> | 2018-11-23 14:54:26 +0100 |
commit | 8c8e6136cd35f12ad0b90e8265eb13c5ea58e29b (patch) | |
tree | 6cf087a9a87e84bec3ed86150d7fad0b2de8d9c1 | |
parent | 2d92c1edc1dbf5c3eee7206011725453faa04b20 (diff) | |
download | python-lxml-8c8e6136cd35f12ad0b90e8265eb13c5ea58e29b.tar.gz |
New and improved namespace handling for the saxifier
-rw-r--r-- | CHANGES.txt | 13 | ||||
-rw-r--r-- | src/lxml/sax.py | 47 | ||||
-rw-r--r-- | src/lxml/tests/test_sax.py | 141 |
3 files changed, 145 insertions, 56 deletions
diff --git a/CHANGES.txt b/CHANGES.txt index 62005560..33f929aa 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -10,9 +10,10 @@ Features added * The module ``lxml.sax`` is compiled using Cython in order to speed it up. -* ElementTreeProducer no longer ignores the namespace prefixes that were available - in the element tree, and now only generates nsXX prefixes if undefined prefixes - are encountered. +* ElementTreeProducer now preserves the namespace prefixes. If two prefixes + point to the same URI, the first prefix in alphabetical order is used + for attributes. + 4.2.6 (2018-??-??) ================== @@ -3888,16 +3889,16 @@ Features added prefix to namespace URI mapping. This will create namespace prefix declarations on these elements and these prefixes will show up in XML serialization. - + Bugs fixed ---------- - + * Killed yet another memory management related bug: trees created using newDoc would not get a libxml2-level dictionary, which caused problems when deallocating these documents later if they contained a node that came from a document with a dictionary. -* Moving namespaced elements between documents was problematic as +* Moving namespaced elements between documents was problematic as references to the original document would remain. This has been fixed by applying xmlReconciliateNs() after each move operation. diff --git a/src/lxml/sax.py b/src/lxml/sax.py index 189a8b80..ac0e2b2e 100644 --- a/src/lxml/sax.py +++ b/src/lxml/sax.py @@ -179,19 +179,19 @@ class ElementTreeProducer(object): siblings.append(sibling) sibling = sibling.getprevious() for sibling in siblings[::-1]: - self._recursive_saxify(sibling, {}) + self._recursive_saxify(sibling) - self._recursive_saxify(element, {}) + self._recursive_saxify(element) if hasattr(element, 'getnext'): sibling = element.getnext() while getattr(sibling, 'tag', None) is ProcessingInstruction: - self._recursive_saxify(sibling, {}) + self._recursive_saxify(sibling) sibling = sibling.getnext() self._content_handler.endDocument() - def _recursive_saxify(self, element, prefixes): + def _recursive_saxify(self, element): content_handler = self._content_handler tag = element.tag if tag is Comment or tag is ProcessingInstruction: @@ -202,14 +202,14 @@ class ElementTreeProducer(object): content_handler.characters(element.tail) return - # Get a new copy in this call, so changes doesn't propagate upwards - prefixes = prefixes.copy() + # Get a new copy in this call, so changes don't propagate upwards new_prefixes = [] - for prefix, ns_uri in element.nsmap.items(): - if prefixes.get(prefix) != ns_uri: - # New or updated namespace - new_prefixes.append( (prefix, ns_uri) ) - prefixes[prefix] = ns_uri + parent_nsmap = getattr(element.getparent(), 'nsmap', {}) + if element.nsmap != parent_nsmap: + # There has been updates to the namespace + for prefix, ns_uri in element.nsmap.items(): + if parent_nsmap.get(prefix) != ns_uri: + new_prefixes.append( (prefix, ns_uri) ) build_qname = self._build_qname attribs = element.items() @@ -220,13 +220,13 @@ class ElementTreeProducer(object): attr_ns_tuple = _getNsTag(attr_ns_name) attr_values[attr_ns_tuple] = value attr_qnames[attr_ns_tuple] = build_qname( - attr_ns_tuple[0], attr_ns_tuple[1], prefixes, None) + attr_ns_tuple[0], attr_ns_tuple[1], element.nsmap, -1) sax_attributes = self._attr_class(attr_values, attr_qnames) else: sax_attributes = self._empty_attributes ns_uri, local_name = _getNsTag(tag) - qname = build_qname(ns_uri, local_name, prefixes, element.prefix) + qname = build_qname(ns_uri, local_name, element.nsmap, element.prefix) for prefix, uri in new_prefixes: content_handler.startPrefixMapping(prefix, uri) @@ -235,22 +235,31 @@ class ElementTreeProducer(object): if element.text: content_handler.characters(element.text) for child in element: - self._recursive_saxify(child, prefixes) + self._recursive_saxify(child) content_handler.endElementNS((ns_uri, local_name), qname) for prefix, uri in new_prefixes: content_handler.endPrefixMapping(prefix) if element.tail: content_handler.characters(element.tail) - def _build_qname(self, ns_uri, local_name, prefixes, preferred): + def _build_qname(self, ns_uri, local_name, prefixes, preferred_prefix): if ns_uri is None: return local_name - if preferred in prefixes and prefixes[preferred] == ns_uri: - prefix = preferred + if prefixes.get(preferred_prefix) == ns_uri: + prefix = preferred_prefix else: - # Pick the first matching prefix - prefix = [pfx for pfx, uri in prefixes.items() if uri == ns_uri][0] + # Pick the first matching prefix: + for pfx in sorted(prefixes, key=str): + if prefixes[pfx] == ns_uri: + prefix = pfx + if pfx is None and preferred_prefix == -1: + # If preferred_prefix is -1, that's a flag to say + # that we want a prefix, any prefix, and only + # accept the default prefix if no other is + # available + continue + break if prefix is None: # Default namespace diff --git a/src/lxml/tests/test_sax.py b/src/lxml/tests/test_sax.py index 5084f183..adc5e736 100644 --- a/src/lxml/tests/test_sax.py +++ b/src/lxml/tests/test_sax.py @@ -13,6 +13,7 @@ if this_dir not in sys.path: from common_imports import HelperTestCase, make_doctest, BytesIO, _bytes from lxml import sax from xml.dom import pulldom +from xml.sax.handler import ContentHandler class ETreeSaxTestCase(HelperTestCase): @@ -157,37 +158,6 @@ class ETreeSaxTestCase(HelperTestCase): self.assertEqual(0, len(root)) - def test_element_sax_ns_prefix(self): - # The name of the prefix should be preserved - tree = self.parse('<a:a xmlns:a="blaA"><b/><c:c xmlns:c="blaC">' - '<d/></c:c></a:a>') - a = tree.getroot() - - self.assertEqual(b'<a:a xmlns:a="blaA"><b/><c:c xmlns:c="blaC">' - b'<d/></c:c></a:a>', - self._saxify_serialize(a)) - - def test_element_sax_default_ns_prefix(self): - # Default prefixes should also not get a generated prefix - tree = self.parse('<a xmlns="blaA"><b/><c:c xmlns:c="blaC">' - '<d/></c:c></a>') - a = tree.getroot() - - self.assertEqual(b'<a xmlns="blaA"><b/><c:c xmlns:c="blaC">' - b'<d/></c:c></a>', - self._saxify_serialize(a)) - - def test_element_sax_unknown_ns_prefix(self): - # Make an element with an unregister prefix - tree = self.parse('<a xmlns="blaA"><b/><c:c xmlns:c="blaC">' - '<d/></c:c></a>') - a = tree.getroot() - a.append(a.makeelement('{blaE}e')) - - self.assertEqual(b'<a xmlns="blaA"><b/><c:c xmlns:c="blaC">' - b'<d/></c:c><ns0:e xmlns:ns0="blaE"/></a>', - self._saxify_serialize(a)) - def test_etree_sax_handler_default_ns(self): handler = sax.ElementTreeContentHandler() handler.startDocument() @@ -327,9 +297,118 @@ class ETreeSaxTestCase(HelperTestCase): return f.getvalue().replace(_bytes('\n'), _bytes('')) +class SimpleContentHandler(ContentHandler, object): + """A SAX content handler that just stores the events""" + + def __init__(self): + self.sax_events = [] + super(SimpleContentHandler, self).__init__() + + def startDocument(self): + self.sax_events.append(('startDocument',)) + + def endDocument(self): + self.sax_events.append(('endDocument',)) + + def startPrefixMapping(self, prefix, uri): + self.sax_events.append(('startPrefixMapping', prefix, uri)) + + def endPrefixMapping(self, prefix): + self.sax_events.append(('endPrefixMapping', prefix)) + + def startElement(self, name, attrs): + self.sax_events.append(('startElement', name, dict(attrs))) + + def endElement(self, name): + self.sax_events.append(('endElement', name)) + + def startElementNS(self, name, qname, attrs): + self.sax_events.append(('startElementNS', name, qname, attrs._qnames)) + + def endElementNS(self, name, qname): + self.sax_events.append(('endElementNS', name, qname)) + + def characters(self, content): + self.sax_events.append(('characters', content)) + + def ignorableWhitespace(self, whitespace): + self.sax_events.append(('ignorableWhitespace', whitespace)) + + def processingInstruction(self, target, data): + self.sax_events.append(('processingInstruction', target, data)) + + def skippedEntity(self, name): + self.sax_events.append(('skippedEntity', name)) + + +class NSPrefixSaxTestCase(HelperTestCase): + """Testing that namespaces generate the right SAX events""" + + def _saxify(self, tree): + handler = SimpleContentHandler() + sax.ElementTreeProducer(tree, handler).saxify() + return handler.sax_events + + def test_element_sax_ns_prefix(self): + # The name of the prefix should be preserved, if the uri is unique + tree = self.parse('<a:a xmlns:a="blaA" xmlns:c="blaC">' + '<d a:attr="value" c:attr="value" /></a:a>') + a = tree.getroot() + + self.assertEqual( + [('startElementNS', ('blaA', 'a'), 'a:a', {}), + ('startElementNS', (None, 'd'), 'd', + {('blaA', 'attr'): 'a:attr', ('blaC', 'attr'): 'c:attr'}), + ('endElementNS', (None, 'd'), 'd'), + ('endElementNS', ('blaA', 'a'), 'a:a'), + ], + self._saxify(a)[3:7]) + + def test_element_sax_default_ns_prefix(self): + # Default prefixes should also not get a generated prefix + tree = self.parse('<a xmlns="blaA"><b attr="value" /></a>') + a = tree.getroot() + + self.assertEqual( + [('startDocument',), + # NS prefix should be None: + ('startPrefixMapping', None, 'blaA'), + ('startElementNS', ('blaA', 'a'), 'a', {}), + # Attribute prefix should be None: + ('startElementNS', ('blaA', 'b'), 'b', {(None, 'attr'): 'attr'}), + ('endElementNS', ('blaA', 'b'), 'b'), + ('endElementNS', ('blaA', 'a'), 'a'), + # Prefix should be None again: + ('endPrefixMapping', None), + ('endDocument',)], + self._saxify(a)) + + # Except for attributes, if there is both a default namespace + # and a named namespace with the same uri + tree = self.parse('<a xmlns="bla" xmlns:a="bla">' + '<b a:attr="value" /></a>') + a = tree.getroot() + + self.assertEqual( + ('startElementNS', ('bla', 'b'), 'b', {('bla', 'attr'): 'a:attr'}), + self._saxify(a)[4]) + + def test_element_sax_twin_ns_prefix(self): + # Make an element with an doubly registered uri + tree = self.parse('<a xmlns:b="bla" xmlns:c="bla">' + '<d c:attr="attr" /></a>') + a = tree.getroot() + + self.assertEqual( + # It should get the b prefix in this case + ('startElementNS', (None, 'd'), 'd', {('bla', 'attr'): 'b:attr'}), + self._saxify(a)[4]) + + def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(ETreeSaxTestCase)]) + suite.addTests([unittest.makeSuite(NSPrefixSaxTestCase)]) suite.addTests( [make_doctest('../../../doc/sax.txt')]) return suite |