-rw-r--r--  NEWS.txt                 3
-rw-r--r--  TODO.txt                 5
-rw-r--r--  bs4/dammit.py           48
-rw-r--r--  bs4/tests/test_soup.py   8
4 files changed, 44 insertions(+), 20 deletions(-)
diff --git a/NEWS.txt b/NEWS.txt
index 772b9fa..d39c6df 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,9 @@
* Made encoding substitution in <meta> tags completely transparent (no
more %SOUP-ENCODING%).
+* Fixed a bug in decoding data that contained a byte-order mark, such
+ as data encoded in UTF-16LE. [bug=988980]
+
* Fixed a bug that made the HTMLParser treebuilder generate XML
definitions ending with two question marks instead of
one. [bug=984258]
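In practice, the fix means bytes carrying a byte-order mark now round-trip cleanly through UnicodeDammit. A minimal sketch of the repaired behavior, reusing the exact bytes from the new test at the end of this diff:

```python
from bs4.dammit import UnicodeDammit

# UTF-16LE serialization of "<a>áé</a>", led by the \xff\xfe byte-order mark.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'

dammit = UnicodeDammit(data)
print(dammit.unicode_markup)     # <a>áé</a>
print(dammit.original_encoding)  # utf-16le
```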
diff --git a/TODO.txt b/TODO.txt
index c9f9baa..b8dbfd2 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -11,6 +11,11 @@ which worries me and has resulted in a number of bugs.
markup_attr_map can be optimized since it's always a map now.
+Upon encountering UTF-16LE data or some other uncommon serialization
+of Unicode, UnicodeDammit will convert the data to Unicode, then
+encode it as UTF-8. This is wasteful because it will just get decoded
+back to Unicode.
+
CDATA
-----
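To make the TODO concrete, here is the double conversion it describes, sketched in plain Python with no bs4 involved; the intermediate UTF-8 hop exists only to be undone:

```python
raw = u'<a>áé</a>'.encode('utf-16le')  # uncommon serialization arrives as bytes

# The BOM-sniffing path decodes once to Unicode...
once = raw.decode('utf-16le')
# ...then immediately re-encodes as UTF-8...
hop = once.encode('utf-8')
# ...and the main conversion loop decodes that UTF-8 straight back.
twice = hop.decode('utf-8')

assert once == twice  # the encode/decode pair was pure overhead
```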
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 824c4c0..ec62b99 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -187,16 +187,24 @@ class UnicodeDammit:
self.original_encoding = None
return
- self.markup, document_encoding, sniffed_encoding = \
- self._detectEncoding(markup, is_html)
+ new_markup, document_encoding, sniffed_encoding = \
+ self._detectEncoding(markup, is_html)
+ self.markup = new_markup
u = None
- for proposed_encoding in (
- override_encodings + [document_encoding, sniffed_encoding]):
- if proposed_encoding is not None:
- u = self._convert_from(proposed_encoding)
- if u:
- break
+ if new_markup != markup:
+ # _detectEncoding modified the markup, then converted it to
+ # Unicode and then to UTF-8. So convert it from UTF-8.
+ u = self._convert_from("utf8")
+ self.original_encoding = sniffed_encoding
+
+ if not u:
+ for proposed_encoding in (
+ override_encodings + [document_encoding, sniffed_encoding]):
+ if proposed_encoding is not None:
+ u = self._convert_from(proposed_encoding)
+ if u:
+ break
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
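Restated outside the class, the new control flow is: trust the sniffing pass if it rewrote the markup, otherwise fall back to the candidate-encoding loop. A sketch, where detect_encoding and convert_from stand in for the private _detectEncoding and _convert_from methods (hypothetical names, not bs4 API):

```python
def convert_to_unicode(markup, override_encodings, detect_encoding, convert_from):
    new_markup, document_encoding, sniffed_encoding = detect_encoding(markup)

    u = None
    if new_markup != markup:
        # The sniffer already decoded the bytes and re-encoded them as
        # UTF-8, so the rewritten markup is UTF-8 no matter how it arrived.
        u = convert_from(new_markup, 'utf8')
    if not u:
        # Otherwise try each candidate encoding in priority order:
        # caller overrides first, then the declared and sniffed encodings.
        for proposed in override_encodings + [document_encoding, sniffed_encoding]:
            if proposed is not None:
                u = convert_from(new_markup, proposed)
                if u:
                    break
    return u
```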
@@ -305,44 +313,44 @@ class UnicodeDammit:
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
- if xml_data[:4] == '\x4c\x6f\xa7\x94':
+ if xml_data[:4] == b'\x4c\x6f\xa7\x94':
# EBCDIC
xml_data = self._ebcdic_to_ascii(xml_data)
- elif xml_data[:4] == '\x00\x3c\x00\x3f':
+ elif xml_data[:4] == b'\x00\x3c\x00\x3f':
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
- and (xml_data[2:4] != '\x00\x00'):
+ elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
+ and (xml_data[2:4] != b'\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
- elif xml_data[:4] == '\x3c\x00\x3f\x00':
+ elif xml_data[:4] == b'\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
- (xml_data[2:4] != '\x00\x00'):
+ elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
+ (xml_data[2:4] != b'\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
- elif xml_data[:4] == '\x00\x00\x00\x3c':
+ elif xml_data[:4] == b'\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
- elif xml_data[:4] == '\x3c\x00\x00\x00':
+ elif xml_data[:4] == b'\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
- elif xml_data[:4] == '\x00\x00\xfe\xff':
+ elif xml_data[:4] == b'\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
- elif xml_data[:4] == '\xff\xfe\x00\x00':
+ elif xml_data[:4] == b'\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
- elif xml_data[:3] == '\xef\xbb\xbf':
+ elif xml_data[:3] == b'\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
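Every change in this hunk is the same one-character fix: the comparisons were against native str literals, which can never equal a byte string on Python 3, so no BOM branch could ever match. The chain of signatures can also be read as a prefix table; a condensed sketch of the same checks, with the EBCDIC branch omitted (sniff_bom is a hypothetical helper, not bs4 code):

```python
# Longest prefixes first, so the UTF-32 BOMs win over the UTF-16 ones.
BOM_TABLE = [
    (b'\x00\x00\xfe\xff', 'utf-32be', 4),  # UTF-32BE with BOM
    (b'\xff\xfe\x00\x00', 'utf-32le', 4),  # UTF-32LE with BOM
    (b'\x00\x00\x00\x3c', 'utf-32be', 0),  # UTF-32BE, no BOM ('<')
    (b'\x3c\x00\x00\x00', 'utf-32le', 0),  # UTF-32LE, no BOM ('<')
    (b'\xef\xbb\xbf',     'utf-8',    3),  # UTF-8 with BOM
    (b'\x00\x3c\x00\x3f', 'utf-16be', 0),  # UTF-16BE, no BOM ('<?')
    (b'\x3c\x00\x3f\x00', 'utf-16le', 0),  # UTF-16LE, no BOM ('<?')
    (b'\xfe\xff',         'utf-16be', 2),  # UTF-16BE with BOM
    (b'\xff\xfe',         'utf-16le', 2),  # UTF-16LE with BOM
]

def sniff_bom(xml_data):
    """Return (sniffed_encoding, data with any BOM stripped)."""
    for prefix, encoding, strip in BOM_TABLE:
        if xml_data.startswith(prefix):
            return encoding, xml_data[strip:]
    return None, xml_data
```

With the test document below, sniff_bom would return 'utf-16le' and the payload with its two BOM bytes stripped.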
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 94f325e..bb97e52 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -278,6 +278,14 @@ class TestUnicodeDammit(unittest.TestCase):
finally:
bs4.dammit.chardet = chardet
+ def test_sniffed_xml_encoding(self):
+ # A document written in UTF-16LE will be converted by a different
+ # code path that sniffs the byte-order mark.
+ data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
+ dammit = UnicodeDammit(data)
+ self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
+ self.assertEqual("utf-16le", dammit.original_encoding)
+
class TestNamedspacedAttribute(SoupTest):
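For reference, the test bytes can be checked without bs4: \xff\xfe is the UTF-16LE BOM, and each following byte pair is one little-endian code unit. Plain Python confirms what the assertions expect:

```python
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'

# The utf-16 codec consumes the BOM and picks the byte order itself.
print(data.decode('utf-16'))           # <a>áé</a>
# Decoding as explicit utf-16-le keeps the BOM as U+FEFF at the front.
print(repr(data.decode('utf-16-le')))  # '\ufeff<a>áé</a>'
```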