diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-02 19:12:07 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-02 19:12:07 -0400 |
commit | 847a8e08e21de9036783feeecd8de93b112f3868 (patch) | |
tree | 8c9bb1d87d864ed246f7373449d5c03cd3fb4b54 /bs4 | |
parent | 04b7d588f86eeea3af8c28f2010a78e7a5b0b176 (diff) | |
download | beautifulsoup4-847a8e08e21de9036783feeecd8de93b112f3868.tar.gz |
Turns out we had two bits of code to strip byte-order marks.
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/builder/_lxml.py | 2 | ||||
-rw-r--r-- | bs4/dammit.py | 77 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 5 |
3 files changed, 46 insertions, 38 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 92ace07..fa5d498 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -97,7 +97,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector(markup, try_encodings, is_html) for encoding in detector.encodings: - yield (markup, encoding, document_declared_encoding, False) + yield (detector.markup, encoding, document_declared_encoding, False) def feed(self, markup): if isinstance(markup, bytes): diff --git a/bs4/dammit.py b/bs4/dammit.py index a5558d7..9ea432f 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -213,12 +213,13 @@ class EncodingDetector: 5. Windows-1252. """ def __init__(self, markup, override_encodings=None, is_html=False): - self.markup = markup self.override_encodings = override_encodings or [] self.chardet_encoding = None self.is_html = is_html self.declared_encoding = None - self.sniffed_encoding = None + + # First order of business: strip a byte-order mark. + self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) def _usable(self, encoding, tried): if encoding is not None: @@ -236,15 +237,21 @@ class EncodingDetector: if self._usable(e, tried): yield e + # Did the document originally start with a byte-order mark + # that indicated its encoding? + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding + + # Look within the document for an XML or HTML encoding + # declaration. if self.declared_encoding is None: - # Look within the document for an XML or HTML encoding - # declaration. self.declared_encoding = self.find_declared_encoding( self.markup, self.is_html) - if self._usable(self.declared_encoding, tried): yield self.declared_encoding + # Use third-party character set detection to guess at the + # encoding. if self.chardet_encoding is None: self.chardet_encoding = chardet_dammit(self.markup) if self._usable(self.chardet_encoding, tried): @@ -256,6 +263,29 @@ class EncodingDetector: yield e @classmethod + def strip_byte_order_mark(cls, data): + """If a byte-order mark is present, strip it and return the encoding it implies.""" + encoding = None + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == b'\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == b'\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + return data, encoding + + @classmethod def find_declared_encoding(cls, markup, is_html=False): """Given a document, tries to find its declared encoding. @@ -298,18 +328,21 @@ class UnicodeDammit: self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False + self.is_html = is_html self.detector = EncodingDetector(markup, override_encodings, is_html) - if markup == '' or isinstance(markup, unicode): + + # Is the data in Unicode to begin with? + if isinstance(markup, unicode) or markup == '': self.markup = markup self.unicode_markup = unicode(markup) - self.original_encoding = None - return - self.markup = markup + # As a first step, the encoding detector may strip a byte-order mark. + self.markup = self.detector.markup u = None for encoding in self.detector.encodings: + markup = self.detector.markup u = self._convert_from(encoding) if u is not None: break @@ -382,27 +415,7 @@ class UnicodeDammit: def _to_unicode(self, data, encoding, errors="strict"): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' - - # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16be' - data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16le' - data = data[2:] - elif data[:3] == '\xef\xbb\xbf': - encoding = 'utf-8' - data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': - encoding = 'utf-32be' - data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': - encoding = 'utf-32le' - data = data[4:] - newdata = unicode(data, encoding, errors) - return newdata + return unicode(data, encoding, errors) @property def declared_html_encoding(self): @@ -410,10 +423,6 @@ class UnicodeDammit: return None return self.detector.declared_encoding - @property - def is_html(self): - return self.detector.is_html - def find_codec(self, charset): value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) or (charset and self._codec(charset.replace("-", ""))) diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index c275228..0b69318 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -306,9 +306,8 @@ class TestUnicodeDammit(unittest.TestCase): logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet - def test_sniffed_xml_encoding(self): - # A document written in UTF-16LE will be converted by a different - # code path that sniffs the byte order markers. + def test_byte_order_mark_removed(self): + # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) |