4 files changed, 44 insertions, 20 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 772b9fa..d39c6df 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -3,6 +3,9 @@
 * Made encoding substitution in <meta> tags completely transparent (no
   more %SOUP-ENCODING%).
 
+* Fixed a bug in decoding data that contained a byte-order mark, such
+  as data encoded in UTF-16LE. [bug=988980]
+
 * Fixed a bug that made the HTMLParser treebuilder generate XML
   definitions ending with two question marks instead of
   one. [bug=984258]
diff --git a/TODO.txt b/TODO.txt
index c9f9baa..b8dbfd2 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -11,6 +11,11 @@ which worries me and has resulted in a number of bugs.
 
 markup_attr_map can be optimized since it's always a map now.
 
+Upon encountering UTF-16LE data or some other uncommon serialization
+of Unicode, UnicodeDammit will convert the data to Unicode, then
+encode it as UTF-8. This is wasteful because it will just get decoded
+back to Unicode.
+
 CDATA
 -----
 
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 824c4c0..ec62b99 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -187,16 +187,24 @@ class UnicodeDammit:
             self.original_encoding = None
             return
 
-        self.markup, document_encoding, sniffed_encoding = \
-                     self._detectEncoding(markup, is_html)
+        new_markup, document_encoding, sniffed_encoding = \
+            self._detectEncoding(markup, is_html)
+        self.markup = new_markup
 
         u = None
-        for proposed_encoding in (
-            override_encodings + [document_encoding, sniffed_encoding]):
-            if proposed_encoding is not None:
-                u = self._convert_from(proposed_encoding)
-                if u:
-                    break
+        if new_markup != markup:
+            # _detectEncoding modified the markup: it decoded it to
+            # Unicode and re-encoded it as UTF-8. So convert it from UTF-8.
+            u = self._convert_from("utf8")
+            self.original_encoding = sniffed_encoding
+
+        if not u:
+            for proposed_encoding in (
+                override_encodings + [document_encoding, sniffed_encoding]):
+                if proposed_encoding is not None:
+                    u = self._convert_from(proposed_encoding)
+                    if u:
+                        break
 
         # If no luck and we have auto-detection library, try that:
         if not u and chardet and not isinstance(self.markup, unicode):
@@ -305,44 +313,44 @@ class UnicodeDammit:
         """Given a document, tries to detect its XML encoding."""
         xml_encoding = sniffed_xml_encoding = None
         try:
-            if xml_data[:4] == '\x4c\x6f\xa7\x94':
+            if xml_data[:4] == b'\x4c\x6f\xa7\x94':
                 # EBCDIC
                 xml_data = self._ebcdic_to_ascii(xml_data)
-            elif xml_data[:4] == '\x00\x3c\x00\x3f':
+            elif xml_data[:4] == b'\x00\x3c\x00\x3f':
                 # UTF-16BE
                 sniffed_xml_encoding = 'utf-16be'
                 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
-                     and (xml_data[2:4] != '\x00\x00'):
+            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
+                     and (xml_data[2:4] != b'\x00\x00'):
                 # UTF-16BE with BOM
                 sniffed_xml_encoding = 'utf-16be'
                 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x3f\x00':
+            elif xml_data[:4] == b'\x3c\x00\x3f\x00':
                 # UTF-16LE
                 sniffed_xml_encoding = 'utf-16le'
                 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
-                     (xml_data[2:4] != '\x00\x00'):
+            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
+                     (xml_data[2:4] != b'\x00\x00'):
                 # UTF-16LE with BOM
                 sniffed_xml_encoding = 'utf-16le'
                 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\x00\x3c':
+            elif xml_data[:4] == b'\x00\x00\x00\x3c':
                 # UTF-32BE
                 sniffed_xml_encoding = 'utf-32be'
                 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\x3c\x00\x00\x00':
+            elif xml_data[:4] == b'\x3c\x00\x00\x00':
                 # UTF-32LE
                 sniffed_xml_encoding = 'utf-32le'
                 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-            elif xml_data[:4] == '\x00\x00\xfe\xff':
+            elif xml_data[:4] == b'\x00\x00\xfe\xff':
                 # UTF-32BE with BOM
                 sniffed_xml_encoding = 'utf-32be'
                 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == '\xff\xfe\x00\x00':
+            elif xml_data[:4] == b'\xff\xfe\x00\x00':
                 # UTF-32LE with BOM
                 sniffed_xml_encoding = 'utf-32le'
                 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-            elif xml_data[:3] == '\xef\xbb\xbf':
+            elif xml_data[:3] == b'\xef\xbb\xbf':
                 # UTF-8 with BOM
                 sniffed_xml_encoding = 'utf-8'
                 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 94f325e..bb97e52 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -278,6 +278,14 @@ class TestUnicodeDammit(unittest.TestCase):
         finally:
             bs4.dammit.chardet = chardet
 
+    def test_sniffed_xml_encoding(self):
+        # A document written in UTF-16LE will be converted by a different
+        # code path that sniffs the byte-order mark.
+        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
+        dammit = UnicodeDammit(data)
+        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
+        self.assertEqual("utf-16le", dammit.original_encoding)
+
 
 class TestNamedspacedAttribute(SoupTest):