summaryrefslogtreecommitdiff
path: root/src/lxml/html/html5parser.py
diff options
context:
space:
mode:
authorscoder <stefan_ml@behnel.de>2017-08-12 16:35:50 +0200
committerGitHub <noreply@github.com>2017-08-12 16:35:50 +0200
commit8fb482d6b046b95e701cb1539e9cda4ffcca0520 (patch)
treea75e6aa1d0d033dab8fc5401bce6ada2e2c15649 /src/lxml/html/html5parser.py
parent798a6ea71dc3b59b973ea74e1fad9e2b768a8531 (diff)
parent656984a011298de68703e87cfb828878d809c882 (diff)
downloadpython-lxml-8fb482d6b046b95e701cb1539e9cda4ffcca0520.tar.gz
Merge pull request #232 from ondergetekende/1654544
Fix LP1654544
Diffstat (limited to 'src/lxml/html/html5parser.py')
-rw-r--r--src/lxml/html/html5parser.py43
1 files changed, 38 insertions, 5 deletions
diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
index ed70b340..8c958269 100644
--- a/src/lxml/html/html5parser.py
+++ b/src/lxml/html/html5parser.py
@@ -1,15 +1,14 @@
"""
An interface to html5lib that mimics the lxml.html interface.
"""
-
+import functools
import sys
import string
from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
-
from lxml import etree
-from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
# python3 compatibility
try:
@@ -25,7 +24,41 @@ try:
except ImportError:
from urllib.parse import urlparse
-class HTMLParser(_HTMLParser):
+
+def _dodgeUseChardet(fn):
+ # html5lib does not accept useChardet as an argument, if it
+ # detected the html argument would produce unicode objects.
+ # However, there is no reasonable way to predict if html5lib will
+ # detect the argument to be unicode (all of that code is private),
+ # so we'll have to settle for a retry.
+
+ # this decorator wraps around the a method, which is retried when html5lib
+ # complains about the useChardet argument
+ @functools.wraps(fn)
+ def inner(*args, **kwargs):
+ try:
+ return fn(*args, **kwargs)
+ except TypeError as exception:
+ if "'useChardet'" not in str(exception):
+ # Some other issue caused the exception. Tell the caller
+ raise
+ kwargs.pop('useChardet')
+ return fn(*args, **kwargs)
+ return inner
+
+
class _DodgeUseChardetMixin(object):
    """Mixin that applies the ``useChardet`` retry to the parse methods.

    ``parse`` and ``parseFragment`` forward all arguments to the next class
    in the MRO and are wrapped with ``_dodgeUseChardet``.  List this mixin
    before the html5lib parser class in the bases so these overrides are
    the ones that get called.

    The class derives from ``object`` explicitly: this module still
    supports Python 2, where a class without bases is a classic class and
    ``super()`` refuses to work with it.
    """

    @_dodgeUseChardet
    def parse(self, *args, **kwargs):
        return super(_DodgeUseChardetMixin, self).parse(*args, **kwargs)

    @_dodgeUseChardet
    def parseFragment(self, *args, **kwargs):
        return super(_DodgeUseChardetMixin, self).parseFragment(
            *args, **kwargs)
+
+
+class HTMLParser(_DodgeUseChardetMixin, _HTMLParser):
"""An html5lib HTML parser with lxml as tree."""
def __init__(self, strict=False, **kwargs):
@@ -37,7 +70,7 @@ try:
except ImportError:
pass
else:
- class XHTMLParser(_XHTMLParser):
+ class XHTMLParser(_DodgeUseChardetMixin, _XHTMLParser):
"""An html5lib XHTML Parser with lxml as tree."""
def __init__(self, strict=False, **kwargs):