summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/builder/_lxml.py 52
-rw-r--r-- bs4/element.py 3
-rw-r--r-- bs4/testing.py 5
4 files changed, 36 insertions, 27 deletions
diff --git a/NEWS.txt b/NEWS.txt
index dadba80..3e9d015 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,9 @@
* Fix a bug in the html5lib treebuilder which sometimes created
disconnected trees. [bug=1039527]
+* Fix a bug in the lxml treebuilder which crashed when a tag included
+ an attribute from the predefined "xml:" namespace. [bug=1065617]
+
= 4.1.3 (20120820) =
* Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 4c070b8..f718ed1 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -28,6 +28,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
CHUNK_SIZE = 512
+ # This namespace mapping is specified in the XML Namespace
+ # standard.
+ DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+
@property
def default_parser(self):
# This can either return a parser object or a class, which
@@ -45,7 +49,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
- self.nsmaps = None
+ self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
@@ -85,22 +89,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.parser.close()
def close(self):
- self.nsmaps = None
+ self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
- if len(nsmap) == 0 and self.nsmaps != None:
- # There are no new namespaces for this tag, but namespaces
- # are in play, so we need a separate tag stack to know
- # when they end.
+ if len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
- if self.nsmaps is None:
- self.nsmaps = []
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
@@ -111,20 +113,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
- if self.nsmaps is not None and len(self.nsmaps) > 0:
- # Namespaces are in play. Find any attributes that came in
- # from lxml with namespaces attached to their names, and
- # turn then into NamespacedAttribute objects.
- new_attrs = {}
- for attr, value in attrs.items():
- namespace, attr = self._getNsTag(attr)
- if namespace is None:
- new_attrs[attr] = value
- else:
- nsprefix = self._prefix_for_namespace(namespace)
- attr = NamespacedAttribute(nsprefix, attr, namespace)
- new_attrs[attr] = value
- attrs = new_attrs
+ # Namespaces are in play. Find any attributes that came in
+ # from lxml with namespaces attached to their names, and
+ # turn them into NamespacedAttribute objects.
+ new_attrs = {}
+ for attr, value in attrs.items():
+ namespace, attr = self._getNsTag(attr)
+ if namespace is None:
+ new_attrs[attr] = value
+ else:
+ nsprefix = self._prefix_for_namespace(namespace)
+ attr = NamespacedAttribute(nsprefix, attr, namespace)
+ new_attrs[attr] = value
+ attrs = new_attrs
namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace)
@@ -137,6 +138,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace]
+ return None
def end(self, name):
self.soup.endData()
@@ -149,14 +151,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
- if self.nsmaps != None:
+ if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
- if len(self.nsmaps) == 0:
- # Namespaces are no longer in play, so don't bother keeping
- # track of the namespace stack.
- self.nsmaps = None
def pi(self, target, data):
pass
diff --git a/bs4/element.py b/bs4/element.py
index 26422fd..594ef78 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -26,6 +26,9 @@ class NamespacedAttribute(unicode):
def __new__(cls, prefix, name, namespace=None):
if name is None:
obj = unicode.__new__(cls, prefix)
+ elif prefix is None:
+ # Not really namespaced.
+ obj = unicode.__new__(cls, name)
else:
obj = unicode.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
diff --git a/bs4/testing.py b/bs4/testing.py
index 0f052eb..1a92af4 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -501,6 +501,11 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
+ def test_namespaced_attributes_xml_namespace(self):
+ markup = '<foo xml:lang="fr">bar</foo>'
+ soup = self.soup(markup)
+ self.assertEqual(unicode(soup.foo), markup)
+
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""