diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | TODO.txt | 5 | ||||
-rw-r--r-- | bs4/dammit.py | 48 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 8 |
4 files changed, 44 insertions, 20 deletions
@@ -3,6 +3,9 @@ * Made encoding substitution in <meta> tags completely transparent (no more %SOUP-ENCODING%). +* Fixed a bug in decoding data that contained a byte-order mark, such + as data encoded in UTF-16LE. [bug=988980] + * Fixed a bug that made the HTMLParser treebuilder generate XML definitions ending with two question marks instead of one. [bug=984258] @@ -11,6 +11,11 @@ which worries me and has resulted in a number of bugs. markup_attr_map can be optimized since it's always a map now. +Upon encountering UTF-16LE data or some other uncommon serialization +of Unicode, UnicodeDammit will convert the data to Unicode, then +encode it as UTF-8. This is wasteful because it will just get decoded +back to Unicode. + CDATA ----- diff --git a/bs4/dammit.py b/bs4/dammit.py index 824c4c0..ec62b99 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -187,16 +187,24 @@ class UnicodeDammit: self.original_encoding = None return - self.markup, document_encoding, sniffed_encoding = \ - self._detectEncoding(markup, is_html) + new_markup, document_encoding, sniffed_encoding = \ + self._detectEncoding(markup, is_html) + self.markup = new_markup u = None - for proposed_encoding in ( - override_encodings + [document_encoding, sniffed_encoding]): - if proposed_encoding is not None: - u = self._convert_from(proposed_encoding) - if u: - break + if new_markup != markup: + # _detectEncoding modified the markup, then converted it to + # Unicode and then to UTF-8. So convert it from UTF-8. 
+ u = self._convert_from("utf8") + self.original_encoding = sniffed_encoding + + if not u: + for proposed_encoding in ( + override_encodings + [document_encoding, sniffed_encoding]): + if proposed_encoding is not None: + u = self._convert_from(proposed_encoding) + if u: + break # If no luck and we have auto-detection library, try that: if not u and chardet and not isinstance(self.markup, unicode): @@ -305,44 +313,44 @@ class UnicodeDammit: """Given a document, tries to detect its XML encoding.""" xml_encoding = sniffed_xml_encoding = None try: - if xml_data[:4] == '\x4c\x6f\xa7\x94': + if xml_data[:4] == b'\x4c\x6f\xa7\x94': # EBCDIC xml_data = self._ebcdic_to_ascii(xml_data) - elif xml_data[:4] == '\x00\x3c\x00\x3f': + elif xml_data[:4] == b'\x00\x3c\x00\x3f': # UTF-16BE sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ - and (xml_data[2:4] != '\x00\x00'): + elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ + and (xml_data[2:4] != b'\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x3f\x00': + elif xml_data[:4] == b'\x3c\x00\x3f\x00': # UTF-16LE sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ - (xml_data[2:4] != '\x00\x00'): + elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ + (xml_data[2:4] != b'\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\x00\x3c': + elif xml_data[:4] == b'\x00\x00\x00\x3c': # UTF-32BE sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x00\x00': + elif xml_data[:4] == b'\x3c\x00\x00\x00': # UTF-32LE 
sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\xfe\xff': + elif xml_data[:4] == b'\x00\x00\xfe\xff': # UTF-32BE with BOM sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\xff\xfe\x00\x00': + elif xml_data[:4] == b'\xff\xfe\x00\x00': # UTF-32LE with BOM sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == '\xef\xbb\xbf': + elif xml_data[:3] == b'\xef\xbb\xbf': # UTF-8 with BOM sniffed_xml_encoding = 'utf-8' xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 94f325e..bb97e52 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -278,6 +278,14 @@ class TestUnicodeDammit(unittest.TestCase): finally: bs4.dammit.chardet = chardet + def test_sniffed_xml_encoding(self): + # A document written in UTF-16LE will be converted by a different + # code path that sniffs the byte order markers. + data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' + dammit = UnicodeDammit(data) + self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) + self.assertEqual("utf-16le", dammit.original_encoding) + class TestNamedspacedAttribute(SoupTest): |