diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-02 22:19:37 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-02 22:19:37 -0400 |
commit | b42a4ece63de739ad7a37973a4e10af23346ffd1 (patch) | |
tree | a65794b5422a1e12a8ddf943c9afd0e0f798f6c4 | |
parent | b8b0711b903509e4b88e878fb6ca3731738ca99e (diff) | |
parent | 847a8e08e21de9036783feeecd8de93b112f3868 (diff) | |
download | beautifulsoup4-b42a4ece63de739ad7a37973a4e10af23346ffd1.tar.gz |
Merged in big encoding-detection refactoring branch.
-rw-r--r-- | NEWS.txt | 18 | ||||
-rw-r--r-- | bs4/__init__.py | 22 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 3 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 2 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 9 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 92 | ||||
-rw-r--r-- | bs4/dammit.py | 304 | ||||
-rw-r--r-- | bs4/diagnose.py | 4 | ||||
-rw-r--r-- | bs4/testing.py | 13 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 36 | ||||
-rw-r--r-- | doc/source/index.rst | 18 |
12 files changed, 296 insertions, 231 deletions
@@ -1,3 +1,21 @@ += 4.3.0 (Unreleased) = + +* Instead of converting incoming data to Unicode and feeding it to the + lxml tree builder, Beautiful Soup now makes successive guesses at + the encoding of the incoming data, and tells lxml to parse the data + as that encoding. This improves performance and avoids an issue in + which lxml was refusing to parse strings because they were Unicode + strings. + + This required a major overhaul of the tree builder architecture. If + you wrote your own tree builder and didn't tell me, you'll need to + modify your prepare_markup() method. + +* The UnicodeDammit code that makes guesses at encodings has been + split into its own class, EncodingDetector. A lot of apparently + redundant code has been removed from Unicode, Dammit, and some + undocumented features have also been removed. + = 4.2.1 (20130531) = * The default XML formatter will now replace ampersands even if they diff --git a/bs4/__init__.py b/bs4/__init__.py index 03b2416..7b5964a 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -26,7 +26,7 @@ __all__ = ['BeautifulSoup'] import re import warnings -from .builder import builder_registry +from .builder import builder_registry, ParserRejectedMarkup from .dammit import UnicodeDammit from .element import ( CData, @@ -160,18 +160,17 @@ class BeautifulSoup(Tag): self.parse_only = parse_only - self.reset() - if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - (self.markup, self.original_encoding, self.declared_html_encoding, - self.contains_replacement_characters) = ( - self.builder.prepare_markup(markup, from_encoding)) - - try: - self._feed() - except StopParsing: - pass + for (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) in ( + self.builder.prepare_markup(markup, from_encoding)): + self.reset() + try: + self._feed() + break + except ParserRejectedMarkup, e: + pass # Clear out the markup and remove the builder's circular # reference to this object. @@ -353,7 +352,6 @@ class BeautifulStoneSoup(BeautifulSoup): class StopParsing(Exception): pass - class FeatureNotFound(ValueError): pass diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index bae453e..e59dae2 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -296,6 +296,9 @@ def register_treebuilders_from(module): # Register the builder while we're at it. this_module.builder_registry.register(obj) +class ParserRejectedMarkup(Exception): + pass + # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want lxml # to take precedence over html5lib, because it's faster. And we only diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index e439ac8..3bbc9a9 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -27,7 +27,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. self.user_specified_encoding = user_specified_encoding - return markup, None, None, False + yield (markup, None, None, False) # These methods are defined by Beautiful Soup. def feed(self, markup): diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 65ee618..4b80f79 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -135,13 +135,14 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): - return markup, None, None, False + yield (markup, None, None, False) + return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) def feed(self, markup): args, kwargs = self.parser_args diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index be35d70..fa5d498 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -13,9 +13,10 @@ from bs4.builder import ( HTML, HTMLTreeBuilder, PERMISSIVE, + ParserRejectedMarkup, TreeBuilder, XML) -from bs4.dammit import UnicodeDammit +from bs4.dammit import EncodingDetector LXML = 'lxml' @@ -33,22 +34,30 @@ class LXMLTreeBuilderForXML(TreeBuilder): # standard. DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} - @property - def default_parser(self): + def default_parser(self, encoding): # This can either return a parser object or a class, which # will be instantiated with default arguments. - return etree.XMLParser(target=self, strip_cdata=False, recover=True) + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, collections.Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False, encoding=encoding) + return parser def __init__(self, parser=None, empty_element_tags=None): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. + self._default_parser = parser if empty_element_tags is not None: self.empty_element_tags = set(empty_element_tags) - if parser is None: - # Use the default parser. - parser = self.default_parser - if isinstance(parser, collections.Callable): - # Instantiate the parser with default arguments - parser = parser(target=self, strip_cdata=False) - self.parser = parser self.soup = None self.nsmaps = [self.DEFAULT_NSMAPS] @@ -63,33 +72,53 @@ class LXMLTreeBuilderForXML(TreeBuilder): def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ - :return: A 3-tuple (markup, original encoding, encoding - declared within markup). + :yield: A series of 4-tuples. + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for parsing the document. """ if isinstance(markup, unicode): - return markup, None, None, False + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + if isinstance(markup, unicode): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) + detector = EncodingDetector(markup, try_encodings, is_html) + for encoding in detector.encodings: + yield (detector.markup, encoding, document_declared_encoding, False) def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) elif isinstance(markup, unicode): markup = StringIO(markup) + # Call feed() at least once, even if the markup is empty, # or the parser won't be initialized. data = markup.read(self.CHUNK_SIZE) - self.parser.feed(data) - while data != '': - # Now call feed() on the rest of the data, chunk by chunk. - data = markup.read(self.CHUNK_SIZE) - if data != '': - self.parser.feed(data) - self.parser.close() + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError), e: + raise ParserRejectedMarkup(str(e)) def close(self): self.nsmaps = [self.DEFAULT_NSMAPS] @@ -186,13 +215,18 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): features = [LXML, HTML, FAST, PERMISSIVE] is_xml = False - @property - def default_parser(self): + def default_parser(self, encoding): return etree.HTMLParser def feed(self, markup): - self.parser.feed(markup) - self.parser.close() + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError), e: + raise ParserRejectedMarkup(str(e)) + def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" diff --git a/bs4/dammit.py b/bs4/dammit.py index a733cad..9ea432f 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -11,6 +11,7 @@ import codecs from htmlentitydefs import codepoint2name import re import logging +import string # Import a library to autodetect character encodings. chardet_type = None @@ -175,7 +176,6 @@ class EntitySubstitution(object): value = cls.quoted_attribute_value(value) return value - @classmethod def substitute_html(cls, s): """Replace certain Unicode characters with named HTML entities. @@ -192,6 +192,118 @@ class EntitySubstitution(object): cls._substitute_html_entity, s) +class EncodingDetector: + """Suggests a number of possible encodings for a bytestring. + + Order of precedence: + + 1. Encodings you specifically tell EncodingDetector to try first + (the override_encodings argument to the constructor). + + 2. An encoding declared within the bytestring itself, either in an + XML declaration (if the bytestring is to be interpreted as an XML + document), or in a <meta> tag (if the bytestring is to be + interpreted as an HTML document.) + + 3. An encoding detected through textual analysis by chardet, + cchardet, or a similar external library. + + 4. UTF-8. + + 5. Windows-1252. + """ + def __init__(self, markup, override_encodings=None, is_html=False): + self.override_encodings = override_encodings or [] + self.chardet_encoding = None + self.is_html = is_html + self.declared_encoding = None + + # First order of business: strip a byte-order mark. + self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) + + def _usable(self, encoding, tried): + if encoding is not None: + encoding = encoding.lower() + if encoding not in tried: + tried.add(encoding) + return True + return False + + @property + def encodings(self): + """Yield a number of encodings that might work for this markup.""" + tried = set() + for e in self.override_encodings: + if self._usable(e, tried): + yield e + + # Did the document originally start with a byte-order mark + # that indicated its encoding? + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding + + # Look within the document for an XML or HTML encoding + # declaration. + if self.declared_encoding is None: + self.declared_encoding = self.find_declared_encoding( + self.markup, self.is_html) + if self._usable(self.declared_encoding, tried): + yield self.declared_encoding + + # Use third-party character set detection to guess at the + # encoding. + if self.chardet_encoding is None: + self.chardet_encoding = chardet_dammit(self.markup) + if self._usable(self.chardet_encoding, tried): + yield self.chardet_encoding + + # As a last-ditch effort, try utf-8 and windows-1252. + for e in ('utf-8', 'windows-1252'): + if self._usable(e, tried): + yield e + + @classmethod + def strip_byte_order_mark(cls, data): + """If a byte-order mark is present, strip it and return the encoding it implies.""" + encoding = None + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == b'\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == b'\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + return data, encoding + + @classmethod + def find_declared_encoding(cls, markup, is_html=False): + """Given a document, tries to find its declared encoding. + + An XML encoding is declared at the beginning of the document. + + An HTML encoding is declared in a <meta> tag. + """ + declared_encoding = None + declared_encoding_match = xml_encoding_re.match(markup) + if not declared_encoding_match and is_html: + declared_encoding_match = html_meta_re.search(markup) + if declared_encoding_match is not None: + declared_encoding = declared_encoding_match.groups()[0].decode( + 'ascii') + if declared_encoding: + return declared_encoding.lower() + return None + class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is @@ -213,55 +325,35 @@ class UnicodeDammit: def __init__(self, markup, override_encodings=[], smart_quotes_to=None, is_html=False): - self.declared_html_encoding = None self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False + self.is_html = is_html - if markup == '' or isinstance(markup, unicode): + self.detector = EncodingDetector(markup, override_encodings, is_html) + + # Is the data in Unicode to begin with? + if isinstance(markup, unicode) or markup == '': self.markup = markup self.unicode_markup = unicode(markup) - self.original_encoding = None - return - new_markup, document_encoding, sniffed_encoding = \ - self._detectEncoding(markup, is_html) - self.markup = new_markup + # As a first step, the encoding detector may strip a byte-order mark. + self.markup = self.detector.markup u = None - if new_markup != markup: - # _detectEncoding modified the markup, then converted it to - # Unicode and then to UTF-8. So convert it from UTF-8. - u = self._convert_from("utf8") - self.original_encoding = sniffed_encoding + for encoding in self.detector.encodings: + markup = self.detector.markup + u = self._convert_from(encoding) + if u is not None: + break if not u: - for proposed_encoding in ( - override_encodings + [document_encoding, sniffed_encoding]): - if proposed_encoding is not None: - u = self._convert_from(proposed_encoding) - if u: - break - - # If no luck and we have auto-detection library, try that: - if not u and not isinstance(self.markup, unicode): - u = self._convert_from(chardet_dammit(self.markup)) - - # As a last resort, try utf-8 and windows-1252: - if not u: - for proposed_encoding in ("utf-8", "windows-1252"): - u = self._convert_from(proposed_encoding) - if u: - break + # None of the encodings worked. As an absolute last resort, + # try them again with character replacement. - # As an absolute last resort, try the encodings again with - # character replacement. - if not u: - for proposed_encoding in ( - override_encodings + [ - document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): - if proposed_encoding != "ascii": - u = self._convert_from(proposed_encoding, "replace") + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") if u is not None: logging.warning( "Some characters could not be decoded, and were " @@ -269,8 +361,9 @@ class UnicodeDammit: self.contains_replacement_characters = True break - # We could at this point force it to ASCII, but that would - # destroy so much data that I think giving up is better + # If none of that worked, we could at this point force it to + # ASCII, but that would destroy so much data that I think + # giving up is better. self.unicode_markup = u if not u: self.original_encoding = None @@ -301,7 +394,7 @@ class UnicodeDammit: # Convert smart quotes to HTML if coming from an encoding # that might have them. if (self.smart_quotes_to is not None - and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): + and proposed in self.ENCODINGS_WITH_SMART_QUOTES): smart_quotes_re = b"([\x80-\x9f])" smart_quotes_compiled = re.compile(smart_quotes_re) markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) @@ -322,99 +415,24 @@ class UnicodeDammit: def _to_unicode(self, data, encoding, errors="strict"): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' + return unicode(data, encoding, errors) - # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16be' - data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16le' - data = data[2:] - elif data[:3] == '\xef\xbb\xbf': - encoding = 'utf-8' - data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': - encoding = 'utf-32be' - data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': - encoding = 'utf-32le' - data = data[4:] - newdata = unicode(data, encoding, errors) - return newdata - - def _detectEncoding(self, xml_data, is_html=False): - """Given a document, tries to detect its XML encoding.""" - xml_encoding = sniffed_xml_encoding = None - try: - if xml_data[:4] == b'\x4c\x6f\xa7\x94': - # EBCDIC - xml_data = self._ebcdic_to_ascii(xml_data) - elif xml_data[:4] == b'\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ - and (xml_data[2:4] != b'\x00\x00'): - # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == b'\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ - (xml_data[2:4] != b'\x00\x00'): - # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == b'\x00\x00\x00\x3c': - # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == b'\x3c\x00\x00\x00': - # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == b'\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == b'\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == b'\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') - else: - sniffed_xml_encoding = 'ascii' - pass - except: - xml_encoding_match = None - xml_encoding_match = xml_encoding_re.match(xml_data) - if not xml_encoding_match and is_html: - xml_encoding_match = html_meta_re.search(xml_data) - if xml_encoding_match is not None: - xml_encoding = xml_encoding_match.groups()[0].decode( - 'ascii').lower() - if is_html: - self.declared_html_encoding = xml_encoding - if sniffed_xml_encoding and \ - (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', - 'utf-16', 'utf-32', 'utf_16', 'utf_32', - 'utf16', 'u16')): - xml_encoding = sniffed_xml_encoding - return xml_data, xml_encoding, sniffed_xml_encoding + @property + def declared_html_encoding(self): + if not self.is_html: + return None + return self.detector.declared_encoding def find_codec(self, charset): - return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ - or (charset and self._codec(charset.replace("-", ""))) \ - or (charset and self._codec(charset.replace("-", "_"))) \ + value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) + or (charset and self._codec(charset.replace("-", ""))) + or (charset and self._codec(charset.replace("-", "_"))) + or (charset and charset.lower()) or charset + ) + if value: + return value.lower() + return None def _codec(self, charset): if not charset: @@ -427,32 +445,6 @@ class UnicodeDammit: pass return codec - EBCDIC_TO_ASCII_MAP = None - - def _ebcdic_to_ascii(self, s): - c = self.__class__ - if not c.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) - import string - c.EBCDIC_TO_ASCII_MAP = string.maketrans( - ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) - return s.translate(c.EBCDIC_TO_ASCII_MAP) # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. MS_CHARS = {b'\x80': ('euro', '20AC'), diff --git a/bs4/diagnose.py b/bs4/diagnose.py index 25fda5c..f9bff28 100644 --- a/bs4/diagnose.py +++ b/bs4/diagnose.py @@ -61,14 +61,14 @@ def diagnose(data): print "-" * 80 -def lxml_trace(data, html=True): +def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. This lets you see how lxml parses a document when no Beautiful Soup code is running. """ from lxml import etree - for event, element in etree.iterparse(StringIO(data), html=html): + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): print("%s, %4s, %s" % (event, element.tag, element.text)) class AnnouncingParser(HTMLParser): diff --git a/bs4/testing.py b/bs4/testing.py index 23b26f1..fd4495a 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -281,6 +281,14 @@ class HTMLTreeBuilderSmokeTest(object): # to detect any differences between them. # + def test_can_parse_unicode_document(self): + # A seemingly innocuous document... but it's in Unicode! And + # it contains characters that can't be represented in the + # encoding found in the declaration! The horror! + markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' + soup = self.soup(markup) + self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) + def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" strainer = SoupStrainer("b") @@ -484,6 +492,11 @@ class XMLTreeBuilderSmokeTest(object): encoded = soup.encode() self.assertTrue(b"< < hey > >" in encoded) + def test_can_parse_unicode_document(self): + markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' + soup = self.soup(markup) + self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) + def test_popping_namespaced_tag(self): markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' soup = self.soup(markup) diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 80458de..27cb2d9 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -4,14 +4,16 @@ import re import warnings try: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML - LXML_PRESENT = True import lxml.etree + LXML_PRESENT = True LXML_VERSION = lxml.etree.LXML_VERSION except ImportError, e: LXML_PRESENT = False LXML_VERSION = (0,) +if LXML_PRESENT: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + from bs4 import ( BeautifulSoup, BeautifulStoneSoup, diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index b127716..0b69318 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -15,7 +15,10 @@ from bs4.element import ( NamespacedAttribute, ) import bs4.dammit -from bs4.dammit import EntitySubstitution, UnicodeDammit +from bs4.dammit import ( + EntitySubstitution, + UnicodeDammit, +) from bs4.testing import ( SoupTest, skipIf, @@ -156,13 +159,23 @@ class TestEncodingConversion(SoupTest): def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding - # attribute is set. - ascii = b"<foo>a</foo>" - soup_from_ascii = self.soup(ascii) - unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) - self.assertEqual(unicode_output, self.document_for(ascii.decode())) - self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii") + # attribute is set to 'utf-8', a superset of ASCII. + chardet = bs4.dammit.chardet_dammit + logging.disable(logging.WARNING) + try: + def noop(str): + return None + # Disable chardet, which will realize that the ASCII is ASCII. + bs4.dammit.chardet_dammit = noop + ascii = b"<foo>a</foo>" + soup_from_ascii = self.soup(ascii) + unicode_output = soup_from_ascii.decode() + self.assertTrue(isinstance(unicode_output, unicode)) + self.assertEqual(unicode_output, self.document_for(ascii.decode())) + self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") + finally: + logging.disable(logging.NOTSET) + bs4.dammit.chardet_dammit = chardet def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute @@ -192,7 +205,7 @@ class TestEncodingConversion(SoupTest): self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) class TestUnicodeDammit(unittest.TestCase): - """Standalone tests of Unicode, Dammit.""" + """Standalone tests of UnicodeDammit.""" def test_smart_quotes_to_unicode(self): markup = b"<foo>\x91\x92\x93\x94</foo>" @@ -293,9 +306,8 @@ class TestUnicodeDammit(unittest.TestCase): logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet - def test_sniffed_xml_encoding(self): - # A document written in UTF-16LE will be converted by a different - # code path that sniffs the byte order markers. + def test_byte_order_mark_removed(self): + # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) diff --git a/doc/source/index.rst b/doc/source/index.rst index a91854c..1b38df7 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2478,9 +2478,11 @@ become Unicode:: dammit.original_encoding # 'utf-8' -The more data you give Unicode, Dammit, the more accurately it will -guess. If you have your own suspicions as to what the encoding might -be, you can pass them in as a list:: +Unicode, Dammit's guesses will get a lot more accurate if you install +the ``chardet`` or ``cchardet`` Python libraries. The more data you +give Unicode, Dammit, the more accurately it will guess. If you have +your own suspicions as to what the encoding might be, you can pass +them in as a list:: dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"]) print(dammit.unicode_markup) @@ -2823,16 +2825,6 @@ significantly faster using lxml than using html.parser or html5lib. You can speed up encoding detection significantly by installing the `cchardet <http://pypi.python.org/pypi/cchardet/>`_ library. -Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by -doing a byte-by-byte examination of the file. This slows Beautiful -Soup to a crawl. My tests indicate that this only happened on 2.x -versions of Python, and that it happened most often with documents -using Russian or Chinese encodings. If this is happening to you, you -can fix it by installing cchardet, or by using Python 3 for your -script. If you happen to know a document's encoding, you can pass -it into the ``BeautifulSoup`` constructor as ``from_encoding``, and -bypass encoding detection altogether. - `Parsing only part of a document`_ won't save you much time parsing the document, but it can save a lot of memory, and it'll make `searching` the document much faster. |