summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLennart Regebro <regebro@gmail.com>2018-11-22 13:26:17 +0100
committerLennart Regebro <regebro@gmail.com>2018-11-23 14:54:26 +0100
commit8c8e6136cd35f12ad0b90e8265eb13c5ea58e29b (patch)
tree6cf087a9a87e84bec3ed86150d7fad0b2de8d9c1
parent2d92c1edc1dbf5c3eee7206011725453faa04b20 (diff)
downloadpython-lxml-8c8e6136cd35f12ad0b90e8265eb13c5ea58e29b.tar.gz
New and improved namespace handling for the saxifier
-rw-r--r--CHANGES.txt13
-rw-r--r--src/lxml/sax.py47
-rw-r--r--src/lxml/tests/test_sax.py141
3 files changed, 145 insertions, 56 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index 62005560..33f929aa 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,9 +10,10 @@ Features added
* The module ``lxml.sax`` is compiled using Cython in order to speed it up.
-* ElementTreeProducer no longer ignores the namespace prefixes that were available
- in the element tree, and now only generates nsXX prefixes if undefined prefixes
- are encountered.
+* ElementTreeProducer now preserves the namespace prefixes. If two prefixes
+ point to the same URI, the first prefix in alphabetical order is used
+ for attributes.
+
4.2.6 (2018-??-??)
==================
@@ -3888,16 +3889,16 @@ Features added
prefix to namespace URI mapping. This will create namespace
prefix declarations on these elements and these prefixes will show up
in XML serialization.
-
+
Bugs fixed
----------
-
+
* Killed yet another memory management related bug: trees created
using newDoc would not get a libxml2-level dictionary, which caused
problems when deallocating these documents later if they contained a
node that came from a document with a dictionary.
-* Moving namespaced elements between documents was problematic as
+* Moving namespaced elements between documents was problematic as
references to the original document would remain. This has been fixed
by applying xmlReconciliateNs() after each move operation.
diff --git a/src/lxml/sax.py b/src/lxml/sax.py
index 189a8b80..ac0e2b2e 100644
--- a/src/lxml/sax.py
+++ b/src/lxml/sax.py
@@ -179,19 +179,19 @@ class ElementTreeProducer(object):
siblings.append(sibling)
sibling = sibling.getprevious()
for sibling in siblings[::-1]:
- self._recursive_saxify(sibling, {})
+ self._recursive_saxify(sibling)
- self._recursive_saxify(element, {})
+ self._recursive_saxify(element)
if hasattr(element, 'getnext'):
sibling = element.getnext()
while getattr(sibling, 'tag', None) is ProcessingInstruction:
- self._recursive_saxify(sibling, {})
+ self._recursive_saxify(sibling)
sibling = sibling.getnext()
self._content_handler.endDocument()
- def _recursive_saxify(self, element, prefixes):
+ def _recursive_saxify(self, element):
content_handler = self._content_handler
tag = element.tag
if tag is Comment or tag is ProcessingInstruction:
@@ -202,14 +202,14 @@ class ElementTreeProducer(object):
content_handler.characters(element.tail)
return
- # Get a new copy in this call, so changes doesn't propagate upwards
- prefixes = prefixes.copy()
+ # Collect the prefix mappings this element adds or changes relative to its parent
new_prefixes = []
- for prefix, ns_uri in element.nsmap.items():
- if prefixes.get(prefix) != ns_uri:
- # New or updated namespace
- new_prefixes.append( (prefix, ns_uri) )
- prefixes[prefix] = ns_uri
+ parent_nsmap = getattr(element.getparent(), 'nsmap', {})
+ if element.nsmap != parent_nsmap:
+ # There have been updates to the namespace mapping
+ for prefix, ns_uri in element.nsmap.items():
+ if parent_nsmap.get(prefix) != ns_uri:
+ new_prefixes.append( (prefix, ns_uri) )
build_qname = self._build_qname
attribs = element.items()
@@ -220,13 +220,13 @@ class ElementTreeProducer(object):
attr_ns_tuple = _getNsTag(attr_ns_name)
attr_values[attr_ns_tuple] = value
attr_qnames[attr_ns_tuple] = build_qname(
- attr_ns_tuple[0], attr_ns_tuple[1], prefixes, None)
+ attr_ns_tuple[0], attr_ns_tuple[1], element.nsmap, -1)
sax_attributes = self._attr_class(attr_values, attr_qnames)
else:
sax_attributes = self._empty_attributes
ns_uri, local_name = _getNsTag(tag)
- qname = build_qname(ns_uri, local_name, prefixes, element.prefix)
+ qname = build_qname(ns_uri, local_name, element.nsmap, element.prefix)
for prefix, uri in new_prefixes:
content_handler.startPrefixMapping(prefix, uri)
@@ -235,22 +235,31 @@ class ElementTreeProducer(object):
if element.text:
content_handler.characters(element.text)
for child in element:
- self._recursive_saxify(child, prefixes)
+ self._recursive_saxify(child)
content_handler.endElementNS((ns_uri, local_name), qname)
for prefix, uri in new_prefixes:
content_handler.endPrefixMapping(prefix)
if element.tail:
content_handler.characters(element.tail)
- def _build_qname(self, ns_uri, local_name, prefixes, preferred):
+ def _build_qname(self, ns_uri, local_name, prefixes, preferred_prefix):
if ns_uri is None:
return local_name
- if preferred in prefixes and prefixes[preferred] == ns_uri:
- prefix = preferred
+ if prefixes.get(preferred_prefix) == ns_uri:
+ prefix = preferred_prefix
else:
- # Pick the first matching prefix
- prefix = [pfx for pfx, uri in prefixes.items() if uri == ns_uri][0]
+ # Pick the first matching prefix:
+ for pfx in sorted(prefixes, key=str):
+ if prefixes[pfx] == ns_uri:
+ prefix = pfx
+ if pfx is None and preferred_prefix == -1:
+ # If preferred_prefix is -1, that's a flag to say
+ # that we want a prefix, any prefix, and only
+ # accept the default prefix if no other is
+ # available
+ continue
+ break
if prefix is None:
# Default namespace
diff --git a/src/lxml/tests/test_sax.py b/src/lxml/tests/test_sax.py
index 5084f183..adc5e736 100644
--- a/src/lxml/tests/test_sax.py
+++ b/src/lxml/tests/test_sax.py
@@ -13,6 +13,7 @@ if this_dir not in sys.path:
from common_imports import HelperTestCase, make_doctest, BytesIO, _bytes
from lxml import sax
from xml.dom import pulldom
+from xml.sax.handler import ContentHandler
class ETreeSaxTestCase(HelperTestCase):
@@ -157,37 +158,6 @@ class ETreeSaxTestCase(HelperTestCase):
self.assertEqual(0,
len(root))
- def test_element_sax_ns_prefix(self):
- # The name of the prefix should be preserved
- tree = self.parse('<a:a xmlns:a="blaA"><b/><c:c xmlns:c="blaC">'
- '<d/></c:c></a:a>')
- a = tree.getroot()
-
- self.assertEqual(b'<a:a xmlns:a="blaA"><b/><c:c xmlns:c="blaC">'
- b'<d/></c:c></a:a>',
- self._saxify_serialize(a))
-
- def test_element_sax_default_ns_prefix(self):
- # Default prefixes should also not get a generated prefix
- tree = self.parse('<a xmlns="blaA"><b/><c:c xmlns:c="blaC">'
- '<d/></c:c></a>')
- a = tree.getroot()
-
- self.assertEqual(b'<a xmlns="blaA"><b/><c:c xmlns:c="blaC">'
- b'<d/></c:c></a>',
- self._saxify_serialize(a))
-
- def test_element_sax_unknown_ns_prefix(self):
- # Make an element with an unregister prefix
- tree = self.parse('<a xmlns="blaA"><b/><c:c xmlns:c="blaC">'
- '<d/></c:c></a>')
- a = tree.getroot()
- a.append(a.makeelement('{blaE}e'))
-
- self.assertEqual(b'<a xmlns="blaA"><b/><c:c xmlns:c="blaC">'
- b'<d/></c:c><ns0:e xmlns:ns0="blaE"/></a>',
- self._saxify_serialize(a))
-
def test_etree_sax_handler_default_ns(self):
handler = sax.ElementTreeContentHandler()
handler.startDocument()
@@ -327,9 +297,118 @@ class ETreeSaxTestCase(HelperTestCase):
return f.getvalue().replace(_bytes('\n'), _bytes(''))
+class SimpleContentHandler(ContentHandler, object):
+ """A SAX content handler that just stores the events"""
+
+ def __init__(self):
+ self.sax_events = []
+ super(SimpleContentHandler, self).__init__()
+
+ def startDocument(self):
+ self.sax_events.append(('startDocument',))
+
+ def endDocument(self):
+ self.sax_events.append(('endDocument',))
+
+ def startPrefixMapping(self, prefix, uri):
+ self.sax_events.append(('startPrefixMapping', prefix, uri))
+
+ def endPrefixMapping(self, prefix):
+ self.sax_events.append(('endPrefixMapping', prefix))
+
+ def startElement(self, name, attrs):
+ self.sax_events.append(('startElement', name, dict(attrs)))
+
+ def endElement(self, name):
+ self.sax_events.append(('endElement', name))
+
+ def startElementNS(self, name, qname, attrs):
+ self.sax_events.append(('startElementNS', name, qname, attrs._qnames))
+
+ def endElementNS(self, name, qname):
+ self.sax_events.append(('endElementNS', name, qname))
+
+ def characters(self, content):
+ self.sax_events.append(('characters', content))
+
+ def ignorableWhitespace(self, whitespace):
+ self.sax_events.append(('ignorableWhitespace', whitespace))
+
+ def processingInstruction(self, target, data):
+ self.sax_events.append(('processingInstruction', target, data))
+
+ def skippedEntity(self, name):
+ self.sax_events.append(('skippedEntity', name))
+
+
+class NSPrefixSaxTestCase(HelperTestCase):
+ """Testing that namespaces generate the right SAX events"""
+
+ def _saxify(self, tree):
+ handler = SimpleContentHandler()
+ sax.ElementTreeProducer(tree, handler).saxify()
+ return handler.sax_events
+
+ def test_element_sax_ns_prefix(self):
+ # The name of the prefix should be preserved, if the uri is unique
+ tree = self.parse('<a:a xmlns:a="blaA" xmlns:c="blaC">'
+ '<d a:attr="value" c:attr="value" /></a:a>')
+ a = tree.getroot()
+
+ self.assertEqual(
+ [('startElementNS', ('blaA', 'a'), 'a:a', {}),
+ ('startElementNS', (None, 'd'), 'd',
+ {('blaA', 'attr'): 'a:attr', ('blaC', 'attr'): 'c:attr'}),
+ ('endElementNS', (None, 'd'), 'd'),
+ ('endElementNS', ('blaA', 'a'), 'a:a'),
+ ],
+ self._saxify(a)[3:7])
+
+ def test_element_sax_default_ns_prefix(self):
+ # Default prefixes should also not get a generated prefix
+ tree = self.parse('<a xmlns="blaA"><b attr="value" /></a>')
+ a = tree.getroot()
+
+ self.assertEqual(
+ [('startDocument',),
+ # NS prefix should be None:
+ ('startPrefixMapping', None, 'blaA'),
+ ('startElementNS', ('blaA', 'a'), 'a', {}),
+ # Attribute prefix should be None:
+ ('startElementNS', ('blaA', 'b'), 'b', {(None, 'attr'): 'attr'}),
+ ('endElementNS', ('blaA', 'b'), 'b'),
+ ('endElementNS', ('blaA', 'a'), 'a'),
+ # Prefix should be None again:
+ ('endPrefixMapping', None),
+ ('endDocument',)],
+ self._saxify(a))
+
+ # Except for attributes, if there is both a default namespace
+ # and a named namespace with the same uri
+ tree = self.parse('<a xmlns="bla" xmlns:a="bla">'
+ '<b a:attr="value" /></a>')
+ a = tree.getroot()
+
+ self.assertEqual(
+ ('startElementNS', ('bla', 'b'), 'b', {('bla', 'attr'): 'a:attr'}),
+ self._saxify(a)[4])
+
+ def test_element_sax_twin_ns_prefix(self):
+ # Make an element with a doubly registered URI
+ tree = self.parse('<a xmlns:b="bla" xmlns:c="bla">'
+ '<d c:attr="attr" /></a>')
+ a = tree.getroot()
+
+ self.assertEqual(
+ # It should get the b prefix in this case
+ ('startElementNS', (None, 'd'), 'd', {('bla', 'attr'): 'b:attr'}),
+ self._saxify(a)[4])
+
+
def test_suite():
suite = unittest.TestSuite()
suite.addTests([unittest.makeSuite(ETreeSaxTestCase)])
+ suite.addTests([unittest.makeSuite(NSPrefixSaxTestCase)])
suite.addTests(
[make_doctest('../../../doc/sax.txt')])
return suite