Merged in big encoding-detection refactoring branch.

author: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-02 22:19:37 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-02 22:19:37 -0400
commit: b42a4ece63de739ad7a37973a4e10af23346ffd1 (patch)
tree: a65794b5422a1e12a8ddf943c9afd0e0f798f6c4
parent: b8b0711b903509e4b88e878fb6ca3731738ca99e (diff)
parent: 847a8e08e21de9036783feeecd8de93b112f3868 (diff)
download: beautifulsoup4-b42a4ece63de739ad7a37973a4e10af23346ffd1.tar.gz
12 files changed, 296 insertions, 231 deletions
diff --git a/NEWS.txt b/NEWS.txt
index bb90d04..3d0846f 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,21 @@
+= 4.3.0 (Unreleased) =
+
+* Instead of converting incoming data to Unicode and feeding it to the
+  lxml tree builder, Beautiful Soup now makes successive guesses at
+  the encoding of the incoming data, and tells lxml to parse the data
+  as that encoding. This improves performance and avoids an issue in
+  which lxml was refusing to parse strings because they were Unicode
+  strings.
+
+  This required a major overhaul of the tree builder architecture. If
+  you wrote your own tree builder and didn't tell me, you'll need to
+  modify your prepare_markup() method.
+
+* The UnicodeDammit code that makes guesses at encodings has been
+  split into its own class, EncodingDetector. A lot of apparently
+  redundant code has been removed from Unicode, Dammit, and some
+  undocumented features have also been removed.
+
 = 4.2.1 (20130531) =
 
 * The default XML formatter will now replace ampersands even if they
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 03b2416..7b5964a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -26,7 +26,7 @@ __all__ = ['BeautifulSoup']
 import re
 import warnings
 
-from .builder import builder_registry
+from .builder import builder_registry, ParserRejectedMarkup
 from .dammit import UnicodeDammit
 from .element import (
     CData,
@@ -160,18 +160,17 @@ class BeautifulSoup(Tag):
 
         self.parse_only = parse_only
 
-        self.reset()
-
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
-        (self.markup, self.original_encoding, self.declared_html_encoding,
-         self.contains_replacement_characters) = (
-            self.builder.prepare_markup(markup, from_encoding))
-
-        try:
-            self._feed()
-        except StopParsing:
-            pass
+        for (self.markup, self.original_encoding, self.declared_html_encoding,
+         self.contains_replacement_characters) in (
+            self.builder.prepare_markup(markup, from_encoding)):
+            self.reset()
+            try:
+                self._feed()
+                break
+            except ParserRejectedMarkup, e:
+                pass
 
         # Clear out the markup and remove the builder's circular
         # reference to this object.
@@ -353,7 +352,6 @@ class BeautifulStoneSoup(BeautifulSoup):
 class StopParsing(Exception):
     pass
 
-
 class FeatureNotFound(ValueError):
     pass
 
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index bae453e..e59dae2 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -296,6 +296,9 @@ def register_treebuilders_from(module):
             # Register the builder while we're at it.
             this_module.builder_registry.register(obj)
 
+class ParserRejectedMarkup(Exception):
+    pass
+
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index e439ac8..3bbc9a9 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -27,7 +27,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
     def prepare_markup(self, markup, user_specified_encoding):
         # Store the user-specified encoding for use later on.
         self.user_specified_encoding = user_specified_encoding
-        return markup, None, None, False
+        yield (markup, None, None, False)
 
     # These methods are defined by Beautiful Soup.
     def feed(self, markup):
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 65ee618..4b80f79 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -135,13 +135,14 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         replaced with REPLACEMENT CHARACTER).
         """
         if isinstance(markup, unicode):
-            return markup, None, None, False
+            yield (markup, None, None, False)
+            return
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
         dammit = UnicodeDammit(markup, try_encodings, is_html=True)
-        return (dammit.markup, dammit.original_encoding,
-                dammit.declared_html_encoding,
-                dammit.contains_replacement_characters)
+        yield (dammit.markup, dammit.original_encoding,
+               dammit.declared_html_encoding,
+               dammit.contains_replacement_characters)
 
     def feed(self, markup):
         args, kwargs = self.parser_args
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index be35d70..fa5d498 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -13,9 +13,10 @@ from bs4.builder import (
     HTML,
     HTMLTreeBuilder,
     PERMISSIVE,
+    ParserRejectedMarkup,
     TreeBuilder,
     XML)
-from bs4.dammit import UnicodeDammit
+from bs4.dammit import EncodingDetector
 
 LXML = 'lxml'
 
@@ -33,22 +34,30 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     # standard.
     DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
 
-    @property
-    def default_parser(self):
+    def default_parser(self, encoding):
         # This can either return a parser object or a class, which
         # will be instantiated with default arguments.
-        return etree.XMLParser(target=self, strip_cdata=False, recover=True)
+        if self._default_parser is not None:
+            return self._default_parser
+        return etree.XMLParser(
+            target=self, strip_cdata=False, recover=True, encoding=encoding)
+
+    def parser_for(self, encoding):
+        # Use the default parser.
+        parser = self.default_parser(encoding)
+
+        if isinstance(parser, collections.Callable):
+            # Instantiate the parser with default arguments
+            parser = parser(target=self, strip_cdata=False, encoding=encoding)
+        return parser
 
     def __init__(self, parser=None, empty_element_tags=None):
+        # TODO: Issue a warning if parser is present but not a
+        # callable, since that means there's no way to create new
+        # parsers for different encodings.
+        self._default_parser = parser
         if empty_element_tags is not None:
             self.empty_element_tags = set(empty_element_tags)
-        if parser is None:
-            # Use the default parser.
-            parser = self.default_parser
-        if isinstance(parser, collections.Callable):
-            # Instantiate the parser with default arguments
-            parser = parser(target=self, strip_cdata=False)
-        self.parser = parser
         self.soup = None
         self.nsmaps = [self.DEFAULT_NSMAPS]
 
@@ -63,33 +72,53 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None):
         """
-        :return: A 3-tuple (markup, original encoding, encoding
-        declared within markup).
+        :yield: A series of 4-tuples.
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+        Each 4-tuple represents a strategy for parsing the document.
         """
         if isinstance(markup, unicode):
-            return markup, None, None, False
+            # We were given Unicode. Maybe lxml can parse Unicode on
+            # this system?
+            yield markup, None, document_declared_encoding, False
 
+        if isinstance(markup, unicode):
+            # No, apparently not. Convert the Unicode to UTF-8 and
+            # tell lxml to parse it as UTF-8.
+            yield (markup.encode("utf8"), "utf8",
+                   document_declared_encoding, False)
+
+        # Instead of using UnicodeDammit to convert the bytestring to
+        # Unicode using different encodings, use EncodingDetector to
+        # iterate over the encodings, and tell lxml to try to parse
+        # the document as each one in turn.
+        is_html = not self.is_xml
         try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
-        return (dammit.markup, dammit.original_encoding,
-                dammit.declared_html_encoding,
-                dammit.contains_replacement_characters)
+        detector = EncodingDetector(markup, try_encodings, is_html)
+        for encoding in detector.encodings:
+            yield (detector.markup, encoding, document_declared_encoding, False)
 
     def feed(self, markup):
         if isinstance(markup, bytes):
             markup = BytesIO(markup)
         elif isinstance(markup, unicode):
             markup = StringIO(markup)
+
         # Call feed() at least once, even if the markup is empty,
         # or the parser won't be initialized.
         data = markup.read(self.CHUNK_SIZE)
-        self.parser.feed(data)
-        while data != '':
-            # Now call feed() on the rest of the data, chunk by chunk.
-            data = markup.read(self.CHUNK_SIZE)
-            if data != '':
-                self.parser.feed(data)
-        self.parser.close()
+        try:
+            self.parser = self.parser_for(self.soup.original_encoding)
+            self.parser.feed(data)
+            while len(data) != 0:
+                # Now call feed() on the rest of the data, chunk by chunk.
+                data = markup.read(self.CHUNK_SIZE)
+                if len(data) != 0:
+                    self.parser.feed(data)
+            self.parser.close()
+        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+            raise ParserRejectedMarkup(str(e))
 
     def close(self):
         self.nsmaps = [self.DEFAULT_NSMAPS]
@@ -186,13 +215,18 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
     features = [LXML, HTML, FAST, PERMISSIVE]
     is_xml = False
 
-    @property
-    def default_parser(self):
+    def default_parser(self, encoding):
         return etree.HTMLParser
 
     def feed(self, markup):
-        self.parser.feed(markup)
-        self.parser.close()
+        encoding = self.soup.original_encoding
+        try:
+            self.parser = self.parser_for(encoding)
+            self.parser.feed(markup)
+            self.parser.close()
+        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+            raise ParserRejectedMarkup(str(e))
+
 
     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
diff --git a/bs4/dammit.py b/bs4/dammit.py
index a733cad..9ea432f 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -11,6 +11,7 @@ import codecs
 from htmlentitydefs import codepoint2name
 import re
 import logging
+import string
 
 # Import a library to autodetect character encodings.
 chardet_type = None
@@ -175,7 +176,6 @@ class EntitySubstitution(object):
             value = cls.quoted_attribute_value(value)
         return value
 
-
     @classmethod
     def substitute_html(cls, s):
         """Replace certain Unicode characters with named HTML entities.
@@ -192,6 +192,118 @@ class EntitySubstitution(object):
             cls._substitute_html_entity, s)
 
 
+class EncodingDetector:
+    """Suggests a number of possible encodings for a bytestring.
+
+    Order of precedence:
+
+    1. Encodings you specifically tell EncodingDetector to try first
+    (the override_encodings argument to the constructor).
+
+    2. An encoding declared within the bytestring itself, either in an
+    XML declaration (if the bytestring is to be interpreted as an XML
+    document), or in a <meta> tag (if the bytestring is to be
+    interpreted as an HTML document.)
+
+    3. An encoding detected through textual analysis by chardet,
+    cchardet, or a similar external library.
+
+    4. UTF-8.
+
+    5. Windows-1252.
+    """
+    def __init__(self, markup, override_encodings=None, is_html=False):
+        self.override_encodings = override_encodings or []
+        self.chardet_encoding = None
+        self.is_html = is_html
+        self.declared_encoding = None
+
+        # First order of business: strip a byte-order mark.
+        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
+
+    def _usable(self, encoding, tried):
+        if encoding is not None:
+            encoding = encoding.lower()
+            if encoding not in tried:
+                tried.add(encoding)
+                return True
+        return False
+
+    @property
+    def encodings(self):
+        """Yield a number of encodings that might work for this markup."""
+        tried = set()
+        for e in self.override_encodings:
+            if self._usable(e, tried):
+                yield e
+
+        # Did the document originally start with a byte-order mark
+        # that indicated its encoding?
+        if self._usable(self.sniffed_encoding, tried):
+            yield self.sniffed_encoding
+
+        # Look within the document for an XML or HTML encoding
+        # declaration.
+        if self.declared_encoding is None:
+            self.declared_encoding = self.find_declared_encoding(
+                self.markup, self.is_html)
+        if self._usable(self.declared_encoding, tried):
+            yield self.declared_encoding
+
+        # Use third-party character set detection to guess at the
+        # encoding.
+        if self.chardet_encoding is None:
+            self.chardet_encoding = chardet_dammit(self.markup)
+        if self._usable(self.chardet_encoding, tried):
+            yield self.chardet_encoding
+
+        # As a last-ditch effort, try utf-8 and windows-1252.
+        for e in ('utf-8', 'windows-1252'):
+            if self._usable(e, tried):
+                yield e
+
+    @classmethod
+    def strip_byte_order_mark(cls, data):
+        """If a byte-order mark is present, strip it and return the encoding it implies."""
+        encoding = None
+        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
+               and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16be'
+            data = data[2:]
+        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
+                 and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16le'
+            data = data[2:]
+        elif data[:3] == b'\xef\xbb\xbf':
+            encoding = 'utf-8'
+            data = data[3:]
+        elif data[:4] == b'\x00\x00\xfe\xff':
+            encoding = 'utf-32be'
+            data = data[4:]
+        elif data[:4] == b'\xff\xfe\x00\x00':
+            encoding = 'utf-32le'
+            data = data[4:]
+        return data, encoding
+
+    @classmethod
+    def find_declared_encoding(cls, markup, is_html=False):
+        """Given a document, tries to find its declared encoding.
+
+        An XML encoding is declared at the beginning of the document.
+
+        An HTML encoding is declared in a <meta> tag.
+        """
+        declared_encoding = None
+        declared_encoding_match = xml_encoding_re.match(markup)
+        if not declared_encoding_match and is_html:
+            declared_encoding_match = html_meta_re.search(markup)
+        if declared_encoding_match is not None:
+            declared_encoding = declared_encoding_match.groups()[0].decode(
+                'ascii')
+        if declared_encoding:
+            return declared_encoding.lower()
+        return None
+
 class UnicodeDammit:
     """A class for detecting the encoding of a *ML document and
     converting it to a Unicode string. If the source encoding is
@@ -213,55 +325,35 @@ class UnicodeDammit:
 
     def __init__(self, markup, override_encodings=[],
                  smart_quotes_to=None, is_html=False):
-        self.declared_html_encoding = None
         self.smart_quotes_to = smart_quotes_to
         self.tried_encodings = []
         self.contains_replacement_characters = False
+        self.is_html = is_html
 
-        if markup == '' or isinstance(markup, unicode):
+        self.detector = EncodingDetector(markup, override_encodings, is_html)
+
+        # Is the data in Unicode to begin with?
+        if isinstance(markup, unicode) or markup == '':
             self.markup = markup
             self.unicode_markup = unicode(markup)
-            self.original_encoding = None
-            return
 
-        new_markup, document_encoding, sniffed_encoding = \
-            self._detectEncoding(markup, is_html)
-        self.markup = new_markup
+        # As a first step, the encoding detector may strip a byte-order mark.
+        self.markup = self.detector.markup
 
         u = None
-        if new_markup != markup:
-            # _detectEncoding modified the markup, then converted it to
-            # Unicode and then to UTF-8. So convert it from UTF-8.
-            u = self._convert_from("utf8")
-            self.original_encoding = sniffed_encoding
+        for encoding in self.detector.encodings:
+            markup = self.detector.markup
+            u = self._convert_from(encoding)
+            if u is not None:
+                break
 
         if not u:
-            for proposed_encoding in (
-                override_encodings + [document_encoding, sniffed_encoding]):
-                if proposed_encoding is not None:
-                    u = self._convert_from(proposed_encoding)
-                    if u:
-                        break
-
-        # If no luck and we have auto-detection library, try that:
-        if not u and not isinstance(self.markup, unicode):
-            u = self._convert_from(chardet_dammit(self.markup))
-
-        # As a last resort, try utf-8 and windows-1252:
-        if not u:
-            for proposed_encoding in ("utf-8", "windows-1252"):
-                u = self._convert_from(proposed_encoding)
-                if u:
-                    break
+            # None of the encodings worked. As an absolute last resort,
+            # try them again with character replacement.
 
-        # As an absolute last resort, try the encodings again with
-        # character replacement.
-        if not u:
-            for proposed_encoding in (
-                override_encodings + [
-                    document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
-                if proposed_encoding != "ascii":
-                    u = self._convert_from(proposed_encoding, "replace")
+            for encoding in self.detector.encodings:
+                if encoding != "ascii":
+                    u = self._convert_from(encoding, "replace")
                 if u is not None:
                     logging.warning(
                             "Some characters could not be decoded, and were "
@@ -269,8 +361,9 @@ class UnicodeDammit:
                     self.contains_replacement_characters = True
                     break
 
-        # We could at this point force it to ASCII, but that would
-        # destroy so much data that I think giving up is better
+        # If none of that worked, we could at this point force it to
+        # ASCII, but that would destroy so much data that I think
+        # giving up is better.
         self.unicode_markup = u
         if not u:
             self.original_encoding = None
@@ -301,7 +394,7 @@ class UnicodeDammit:
         # Convert smart quotes to HTML if coming from an encoding
         # that might have them.
         if (self.smart_quotes_to is not None
-            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
+            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
             smart_quotes_re = b"([\x80-\x9f])"
             smart_quotes_compiled = re.compile(smart_quotes_re)
             markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
@@ -322,99 +415,24 @@ class UnicodeDammit:
     def _to_unicode(self, data, encoding, errors="strict"):
         '''Given a string and its encoding, decodes the string into Unicode.
         %encoding is a string recognized by encodings.aliases'''
+        return unicode(data, encoding, errors)
 
-        # strip Byte Order Mark (if present)
-        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
-               and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16be'
-            data = data[2:]
-        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
-                 and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16le'
-            data = data[2:]
-        elif data[:3] == '\xef\xbb\xbf':
-            encoding = 'utf-8'
-            data = data[3:]
-        elif data[:4] == '\x00\x00\xfe\xff':
-            encoding = 'utf-32be'
-            data = data[4:]
-        elif data[:4] == '\xff\xfe\x00\x00':
-            encoding = 'utf-32le'
-            data = data[4:]
-        newdata = unicode(data, encoding, errors)
-        return newdata
-
-    def _detectEncoding(self, xml_data, is_html=False):
-        """Given a document, tries to detect its XML encoding."""
-        xml_encoding = sniffed_xml_encoding = None
-        try:
-            if xml_data[:4] == b'\x4c\x6f\xa7\x94':
-                # EBCDIC
-                xml_data = self._ebcdic_to_ascii(xml_data)
-            elif xml_data[:4] == b'\x00\x3c\x00\x3f':
-                # UTF-16BE
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
-                     and (xml_data[2:4] != b'\x00\x00'):
-                # UTF-16BE with BOM
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-            elif xml_data[:4] == b'\x3c\x00\x3f\x00':
-                # UTF-16LE
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
-                     (xml_data[2:4] != b'\x00\x00'):
-                # UTF-16LE with BOM
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-            elif xml_data[:4] == b'\x00\x00\x00\x3c':
-                # UTF-32BE
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == b'\x3c\x00\x00\x00':
-                # UTF-32LE
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-            elif xml_data[:4] == b'\x00\x00\xfe\xff':
-                # UTF-32BE with BOM
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == b'\xff\xfe\x00\x00':
-                # UTF-32LE with BOM
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-            elif xml_data[:3] == b'\xef\xbb\xbf':
-                # UTF-8 with BOM
-                sniffed_xml_encoding = 'utf-8'
-                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
-            else:
-                sniffed_xml_encoding = 'ascii'
-                pass
-        except:
-            xml_encoding_match = None
-        xml_encoding_match = xml_encoding_re.match(xml_data)
-        if not xml_encoding_match and is_html:
-            xml_encoding_match = html_meta_re.search(xml_data)
-        if xml_encoding_match is not None:
-            xml_encoding = xml_encoding_match.groups()[0].decode(
-                'ascii').lower()
-            if is_html:
-                self.declared_html_encoding = xml_encoding
-            if sniffed_xml_encoding and \
-               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
-                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
-                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
-                                 'utf16', 'u16')):
-                xml_encoding = sniffed_xml_encoding
-        return xml_data, xml_encoding, sniffed_xml_encoding
+    @property
+    def declared_html_encoding(self):
+        if not self.is_html:
+            return None
+        return self.detector.declared_encoding
 
     def find_codec(self, charset):
-        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
-               or (charset and self._codec(charset.replace("-", ""))) \
-               or (charset and self._codec(charset.replace("-", "_"))) \
+        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
+               or (charset and self._codec(charset.replace("-", "")))
+               or (charset and self._codec(charset.replace("-", "_")))
+               or (charset and charset.lower())
                or charset
+                )
+        if value:
+            return value.lower()
+        return None
 
     def _codec(self, charset):
         if not charset:
@@ -427,32 +445,6 @@ class UnicodeDammit:
             pass
         return codec
 
-    EBCDIC_TO_ASCII_MAP = None
-
-    def _ebcdic_to_ascii(self, s):
-        c = self.__class__
-        if not c.EBCDIC_TO_ASCII_MAP:
-            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
-                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
-                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
-                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
-                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
-                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
-                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
-                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
-                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
-                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
-                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
-                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
-                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
-                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
-                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
-                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
-                    250,251,252,253,254,255)
-            import string
-            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
-            ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
-        return s.translate(c.EBCDIC_TO_ASCII_MAP)
 
     # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
     MS_CHARS = {b'\x80': ('euro', '20AC'),
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index 25fda5c..f9bff28 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -61,14 +61,14 @@ def diagnose(data):
 
         print "-" * 80
 
-def lxml_trace(data, html=True):
+def lxml_trace(data, html=True, **kwargs):
     """Print out the lxml events that occur during parsing.
 
     This lets you see how lxml parses a document when no Beautiful
     Soup code is running.
     """
     from lxml import etree
-    for event, element in etree.iterparse(StringIO(data), html=html):
+    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
         print("%s, %4s, %s" % (event, element.tag, element.text))
 
 class AnnouncingParser(HTMLParser):
diff --git a/bs4/testing.py b/bs4/testing.py
index 23b26f1..fd4495a 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -281,6 +281,14 @@ class HTMLTreeBuilderSmokeTest(object):
     # to detect any differences between them.
     #
 
+    def test_can_parse_unicode_document(self):
+        # A seemingly innocuous document... but it's in Unicode! And
+        # it contains characters that can't be represented in the
+        # encoding found in the  declaration! The horror!
+        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+
     def test_soupstrainer(self):
         """Parsers should be able to work with SoupStrainers."""
         strainer = SoupStrainer("b")
@@ -484,6 +492,11 @@ class XMLTreeBuilderSmokeTest(object):
         encoded = soup.encode()
         self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
 
+    def test_can_parse_unicode_document(self):
+        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+
     def test_popping_namespaced_tag(self):
         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
         soup = self.soup(markup)
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 80458de..27cb2d9 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -4,14 +4,16 @@ import re
 import warnings
 
 try:
-    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
-    LXML_PRESENT = True
     import lxml.etree
+    LXML_PRESENT = True
     LXML_VERSION = lxml.etree.LXML_VERSION
 except ImportError, e:
     LXML_PRESENT = False
     LXML_VERSION = (0,)
 
+if LXML_PRESENT:
+    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
+
 from bs4 import (
     BeautifulSoup,
     BeautifulStoneSoup,
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index b127716..0b69318 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -15,7 +15,10 @@ from bs4.element import (
     NamespacedAttribute,
     )
 import bs4.dammit
-from bs4.dammit import EntitySubstitution, UnicodeDammit
+from bs4.dammit import (
+    EntitySubstitution,
+    UnicodeDammit,
+)
 from bs4.testing import (
     SoupTest,
     skipIf,
@@ -156,13 +159,23 @@ class TestEncodingConversion(SoupTest):
 
     def test_ascii_in_unicode_out(self):
         # ASCII input is converted to Unicode. The original_encoding
-        # attribute is set.
-        ascii = b"<foo>a</foo>"
-        soup_from_ascii = self.soup(ascii)
-        unicode_output = soup_from_ascii.decode()
-        self.assertTrue(isinstance(unicode_output, unicode))
-        self.assertEqual(unicode_output, self.document_for(ascii.decode()))
-        self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii")
+        # attribute is set to 'utf-8', a superset of ASCII.
+        chardet = bs4.dammit.chardet_dammit
+        logging.disable(logging.WARNING)
+        try:
+            def noop(str):
+                return None
+            # Disable chardet, which will realize that the ASCII is ASCII.
+            bs4.dammit.chardet_dammit = noop
+            ascii = b"<foo>a</foo>"
+            soup_from_ascii = self.soup(ascii)
+            unicode_output = soup_from_ascii.decode()
+            self.assertTrue(isinstance(unicode_output, unicode))
+            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
+            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
+        finally:
+            logging.disable(logging.NOTSET)
+            bs4.dammit.chardet_dammit = chardet
 
     def test_unicode_in_unicode_out(self):
         # Unicode input is left alone. The original_encoding attribute
@@ -192,7 +205,7 @@ class TestEncodingConversion(SoupTest):
         self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
 
 class TestUnicodeDammit(unittest.TestCase):
-    """Standalone tests of Unicode, Dammit."""
+    """Standalone tests of UnicodeDammit."""
 
     def test_smart_quotes_to_unicode(self):
         markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -293,9 +306,8 @@ class TestUnicodeDammit(unittest.TestCase):
             logging.disable(logging.NOTSET)
             bs4.dammit.chardet_dammit = chardet
 
-    def test_sniffed_xml_encoding(self):
-        # A document written in UTF-16LE will be converted by a different
-        # code path that sniffs the byte order markers.
+    def test_byte_order_mark_removed(self):
+        # A document written in UTF-16LE will have its byte order marker stripped.
         data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
         dammit = UnicodeDammit(data)
         self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
diff --git a/doc/source/index.rst b/doc/source/index.rst
index a91854c..1b38df7 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2478,9 +2478,11 @@ become Unicode::
  dammit.original_encoding
  # 'utf-8'
 
-The more data you give Unicode, Dammit, the more accurately it will
-guess. If you have your own suspicions as to what the encoding might
-be, you can pass them in as a list::
+Unicode, Dammit's guesses will get a lot more accurate if you install
+the ``chardet`` or ``cchardet`` Python libraries. The more data you
+give Unicode, Dammit, the more accurately it will guess. If you have
+your own suspicions as to what the encoding might be, you can pass
+them in as a list::
 
  dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
  print(dammit.unicode_markup)
@@ -2823,16 +2825,6 @@ significantly faster using lxml than using html.parser or html5lib.
 You can speed up encoding detection significantly by installing the
 `cchardet <http://pypi.python.org/pypi/cchardet/>`_ library.
 
-Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by
-doing a byte-by-byte examination of the file. This slows Beautiful
-Soup to a crawl. My tests indicate that this only happened on 2.x
-versions of Python, and that it happened most often with documents
-using Russian or Chinese encodings. If this is happening to you, you
-can fix it by installing cchardet, or by using Python 3 for your
-script. If you happen to know a document's encoding, you can pass
-it into the ``BeautifulSoup`` constructor as ``from_encoding``, and
-bypass encoding detection altogether.
-
 `Parsing only part of a document`_ won't save you much time parsing
 the document, but it can save a lot of memory, and it'll make
 `searching` the document much faster.
author	Leonard Richardson <leonard.richardson@canonical.com>	2013-06-02 22:19:37 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2013-06-02 22:19:37 -0400
commit	b42a4ece63de739ad7a37973a4e10af23346ffd1 (patch)
tree	a65794b5422a1e12a8ddf943c9afd0e0f798f6c4
parent	b8b0711b903509e4b88e878fb6ca3731738ca99e (diff)
parent	847a8e08e21de9036783feeecd8de93b112f3868 (diff)
download	beautifulsoup4-b42a4ece63de739ad7a37973a4e10af23346ffd1.tar.gz