diff options
author | scoder <stefan_ml@behnel.de> | 2017-08-12 16:35:50 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-08-12 16:35:50 +0200 |
commit | 8fb482d6b046b95e701cb1539e9cda4ffcca0520 (patch) | |
tree | a75e6aa1d0d033dab8fc5401bce6ada2e2c15649 /src/lxml/html/html5parser.py | |
parent | 798a6ea71dc3b59b973ea74e1fad9e2b768a8531 (diff) | |
parent | 656984a011298de68703e87cfb828878d809c882 (diff) | |
download | python-lxml-8fb482d6b046b95e701cb1539e9cda4ffcca0520.tar.gz |
Merge pull request #232 from ondergetekende/1654544
Fix LP1654544
Diffstat (limited to 'src/lxml/html/html5parser.py')
-rw-r--r-- | src/lxml/html/html5parser.py | 43 |
1 files changed, 38 insertions, 5 deletions
```diff
diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
index ed70b340..8c958269 100644
--- a/src/lxml/html/html5parser.py
+++ b/src/lxml/html/html5parser.py
@@ -1,15 +1,14 @@
 """
 An interface to html5lib that mimics the lxml.html interface.
 """
-
+import functools
 import sys
 import string
 
 from html5lib import HTMLParser as _HTMLParser
 from html5lib.treebuilders.etree_lxml import TreeBuilder
-
 from lxml import etree
-from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
 
 # python3 compatibility
 try:
@@ -25,7 +24,41 @@ try:
 except ImportError:
     from urllib.parse import urlparse
 
-class HTMLParser(_HTMLParser):
+
+def _dodgeUseChardet(fn):
+    # html5lib does not accept useChardet as an argument, if it
+    # detected the html argument would produce unicode objects.
+    # However, there is no reasonable way to predict if html5lib will
+    # detect the argument to be unicode (all of that code is private),
+    # so we'll have to settle for a retry.
+
+    # this decorator wraps around the a method, which is retried when html5lib
+    # complains about the useChardet argument
+    @functools.wraps(fn)
+    def inner(*args, **kwargs):
+        try:
+            return fn(*args, **kwargs)
+        except TypeError as exception:
+            if "'useChardet'" not in str(exception):
+                # Some other issue caused the exception. Tell the caller
+                raise
+            kwargs.pop('useChardet')
+            return fn(*args, **kwargs)
+    return inner
+
+
+class _DodgeUseChardetMixin:
+
+    @_dodgeUseChardet
+    def parse(self, *args, **kwargs):
+        return super(_DodgeUseChardetMixin, self).parse(*args, **kwargs)
+
+    @_dodgeUseChardet
+    def parseFragment(self, *args, **kwargs):
+        return super(_DodgeUseChardetMixin, self).parseFragment(*args, **kwargs)
+
+
+class HTMLParser(_DodgeUseChardetMixin, _HTMLParser):
     """An html5lib HTML parser with lxml as tree."""
 
     def __init__(self, strict=False, **kwargs):
@@ -37,7 +70,7 @@ try:
 except ImportError:
     pass
 else:
-    class XHTMLParser(_XHTMLParser):
+    class XHTMLParser(_DodgeUseChardetMixin, _XHTMLParser):
         """An html5lib XHTML Parser with lxml as tree."""
 
         def __init__(self, strict=False, **kwargs):
```