Turns out we had two bits of code to strip byte-order marks.

author: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-02 19:12:07 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-02 19:12:07 -0400
commit: 847a8e08e21de9036783feeecd8de93b112f3868 (patch)
tree: 8c9bb1d87d864ed246f7373449d5c03cd3fb4b54 /bs4
parent: 04b7d588f86eeea3af8c28f2010a78e7a5b0b176 (diff)
download: beautifulsoup4-847a8e08e21de9036783feeecd8de93b112f3868.tar.gz
3 files changed, 46 insertions, 38 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 92ace07..fa5d498 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -97,7 +97,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         try_encodings = [user_specified_encoding, document_declared_encoding]
         detector = EncodingDetector(markup, try_encodings, is_html)
         for encoding in detector.encodings:
-            yield (markup, encoding, document_declared_encoding, False)
+            yield (detector.markup, encoding, document_declared_encoding, False)
 
     def feed(self, markup):
         if isinstance(markup, bytes):
diff --git a/bs4/dammit.py b/bs4/dammit.py
index a5558d7..9ea432f 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -213,12 +213,13 @@ class EncodingDetector:
     5. Windows-1252.
     """
     def __init__(self, markup, override_encodings=None, is_html=False):
-        self.markup = markup
         self.override_encodings = override_encodings or []
         self.chardet_encoding = None
         self.is_html = is_html
         self.declared_encoding = None
-        self.sniffed_encoding = None
+
+        # First order of business: strip a byte-order mark.
+        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
 
     def _usable(self, encoding, tried):
         if encoding is not None:
@@ -236,15 +237,21 @@ class EncodingDetector:
             if self._usable(e, tried):
                 yield e
 
+        # Did the document originally start with a byte-order mark
+        # that indicated its encoding?
+        if self._usable(self.sniffed_encoding, tried):
+            yield self.sniffed_encoding
+
+        # Look within the document for an XML or HTML encoding
+        # declaration.
         if self.declared_encoding is None:
-            # Look within the document for an XML or HTML encoding
-            # declaration.
             self.declared_encoding = self.find_declared_encoding(
                 self.markup, self.is_html)
-
         if self._usable(self.declared_encoding, tried):
             yield self.declared_encoding
 
+        # Use third-party character set detection to guess at the
+        # encoding.
         if self.chardet_encoding is None:
             self.chardet_encoding = chardet_dammit(self.markup)
         if self._usable(self.chardet_encoding, tried):
@@ -256,6 +263,29 @@ class EncodingDetector:
                 yield e
 
     @classmethod
+    def strip_byte_order_mark(cls, data):
+        """Strip a leading byte-order mark; return (data, implied encoding or None)."""
+        encoding = None
+        # Compare against bytes literals throughout: a str literal never equals bytes on Python 3.
+        if len(data) >= 4 and data[:2] == b'\xfe\xff' and data[2:4] != b'\x00\x00':
+            encoding = 'utf-16be'
+            data = data[2:]
+        # FF FE 00 00 is the UTF-32LE mark, so it must not be claimed as UTF-16LE here.
+        elif len(data) >= 4 and data[:2] == b'\xff\xfe' and data[2:4] != b'\x00\x00':
+            encoding = 'utf-16le'
+            data = data[2:]
+        elif data[:3] == b'\xef\xbb\xbf':
+            encoding = 'utf-8'
+            data = data[3:]
+        elif data[:4] == b'\x00\x00\xfe\xff':
+            encoding = 'utf-32be'
+            data = data[4:]
+        elif data[:4] == b'\xff\xfe\x00\x00':
+            encoding = 'utf-32le'
+            data = data[4:]
+        return data, encoding
+
+    @classmethod
     def find_declared_encoding(cls, markup, is_html=False):
         """Given a document, tries to find its declared encoding.
 
@@ -298,18 +328,21 @@ class UnicodeDammit:
         self.smart_quotes_to = smart_quotes_to
         self.tried_encodings = []
         self.contains_replacement_characters = False
+        self.is_html = is_html
 
         self.detector = EncodingDetector(markup, override_encodings, is_html)
-        if markup == '' or isinstance(markup, unicode):
+
+        # Is the data in Unicode to begin with?
+        if isinstance(markup, unicode) or markup == '':
             self.markup = markup
             self.unicode_markup = unicode(markup)
-            self.original_encoding = None
-            return
 
-        self.markup = markup
+        # As a first step, the encoding detector may strip a byte-order mark.
+        self.markup = self.detector.markup
 
         u = None
         for encoding in self.detector.encodings:
+            markup = self.detector.markup
             u = self._convert_from(encoding)
             if u is not None:
                 break
@@ -382,27 +415,7 @@ class UnicodeDammit:
     def _to_unicode(self, data, encoding, errors="strict"):
         '''Given a string and its encoding, decodes the string into Unicode.
         %encoding is a string recognized by encodings.aliases'''
-
-        # strip Byte Order Mark (if present)
-        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
-               and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16be'
-            data = data[2:]
-        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
-                 and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16le'
-            data = data[2:]
-        elif data[:3] == '\xef\xbb\xbf':
-            encoding = 'utf-8'
-            data = data[3:]
-        elif data[:4] == '\x00\x00\xfe\xff':
-            encoding = 'utf-32be'
-            data = data[4:]
-        elif data[:4] == '\xff\xfe\x00\x00':
-            encoding = 'utf-32le'
-            data = data[4:]
-        newdata = unicode(data, encoding, errors)
-        return newdata
+        return unicode(data, encoding, errors)
 
     @property
     def declared_html_encoding(self):
@@ -410,10 +423,6 @@ class UnicodeDammit:
             return None
         return self.detector.declared_encoding
 
-    @property
-    def is_html(self):
-        return self.detector.is_html
-
     def find_codec(self, charset):
         value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
                or (charset and self._codec(charset.replace("-", "")))
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index c275228..0b69318 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -306,9 +306,8 @@ class TestUnicodeDammit(unittest.TestCase):
             logging.disable(logging.NOTSET)
             bs4.dammit.chardet_dammit = chardet
 
-    def test_sniffed_xml_encoding(self):
-        # A document written in UTF-16LE will be converted by a different
-        # code path that sniffs the byte order markers.
+    def test_byte_order_mark_removed(self):
+        # A document written in UTF-16LE will have its byte order marker stripped.
         data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
         dammit = UnicodeDammit(data)
         self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
author	Leonard Richardson <leonard.richardson@canonical.com>	2013-06-02 19:12:07 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2013-06-02 19:12:07 -0400
commit	847a8e08e21de9036783feeecd8de93b112f3868 (patch)
tree	8c9bb1d87d864ed246f7373449d5c03cd3fb4b54 /bs4
parent	04b7d588f86eeea3af8c28f2010a78e7a5b0b176 (diff)
download	beautifulsoup4-847a8e08e21de9036783feeecd8de93b112f3868.tar.gz