summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Koert van der Veer <koert@ondergetekende.nl> 2017-03-16 09:13:18 +0100
committer Koert van der Veer <koert@ondergetekende.nl> 2017-03-16 09:13:18 +0100
commit 82a354a2ad2a1ecafc729e619bd7875646250fcc (patch)
tree f7a2e39328818b486149a98b9d1bf048e00f412b
parent 329fe90232ab31e1e2a30abab784c145bbeb75ec (diff)
download python-lxml-82a354a2ad2a1ecafc729e619bd7875646250fcc.tar.gz
Perform full-document detection on decoded bytes.
Closes #1673355
-rw-r--r-- src/lxml/html/html5parser.py 9
-rw-r--r-- src/lxml/html/tests/test_html5parser.py 6
2 files changed, 14 insertions, 1 deletions
diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
index 7188c7ea..ba9d41b3 100644
--- a/src/lxml/html/html5parser.py
+++ b/src/lxml/html/html5parser.py
@@ -147,7 +147,14 @@ def fromstring(html, guess_charset=True, parser=None):
guess_charset=guess_charset)
# document starts with doctype or <html>, full document!
- start = html[:50].lstrip().lower()
+ start = html[:50]
+ if hasattr(start, 'decode'):
+ # In python3, we may have been presented with a bytes object.
+ # Decode in ascii, that also covers latin-1 and utf-8 for the
+ # characters we need
+ start = start.decode('ascii', 'replace')
+
+ start = start.lstrip().lower()
if start.startswith('<html') or start.startswith('<!doctype'):
return doc
diff --git a/src/lxml/html/tests/test_html5parser.py b/src/lxml/html/tests/test_html5parser.py
index fad45dc4..667c68d2 100644
--- a/src/lxml/html/tests/test_html5parser.py
+++ b/src/lxml/html/tests/test_html5parser.py
@@ -233,6 +233,12 @@ class Test_fromstring(unittest.TestCase):
self.assertEqual(self.call_it('<!DOCTYPE html>', parser=parser),
'the doc')
+ def test_returns_whole_doc_if_input_is_encoded(self):
+ parser = DummyParser(root='the doc')
+ input = '<!DOCTYPE html>'.encode('ascii')
+ self.assertEqual(self.call_it(input, parser=parser),
+ 'the doc')
+
def test_returns_whole_doc_if_head_not_empty(self, use_ns=True):
E = HTMLElementMaker(namespaceHTMLElements=use_ns)
root = E.html(E.head(E.title()))