diff options
author | Koert van der Veer <koert@ondergetekende.nl> | 2017-03-16 09:13:18 +0100 |
---|---|---|
committer | Koert van der Veer <koert@ondergetekende.nl> | 2017-03-16 09:13:18 +0100 |
commit | 82a354a2ad2a1ecafc729e619bd7875646250fcc (patch) | |
tree | f7a2e39328818b486149a98b9d1bf048e00f412b | |
parent | 329fe90232ab31e1e2a30abab784c145bbeb75ec (diff) | |
download | python-lxml-82a354a2ad2a1ecafc729e619bd7875646250fcc.tar.gz |
Perform full-document detection on decoded bytes.
Closes #1673355
-rw-r--r-- | src/lxml/html/html5parser.py | 9 | ||||
-rw-r--r-- | src/lxml/html/tests/test_html5parser.py | 6 |
2 files changed, 14 insertions, 1 deletion
```diff
diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
index 7188c7ea..ba9d41b3 100644
--- a/src/lxml/html/html5parser.py
+++ b/src/lxml/html/html5parser.py
@@ -147,7 +147,14 @@ def fromstring(html, guess_charset=True, parser=None):
                               guess_charset=guess_charset)
 
     # document starts with doctype or <html>, full document!
-    start = html[:50].lstrip().lower()
+    start = html[:50]
+    if hasattr(start, 'decode'):
+        # In python3, we may have been presented with a bytes object.
+        # Decode in ascii, that also covers latin-1 and utf-8 for the
+        # characters we need
+        start = start.decode('ascii', 'replace')
+
+    start = start.lstrip().lower()
     if start.startswith('<html') or start.startswith('<!doctype'):
         return doc
 
diff --git a/src/lxml/html/tests/test_html5parser.py b/src/lxml/html/tests/test_html5parser.py
index fad45dc4..667c68d2 100644
--- a/src/lxml/html/tests/test_html5parser.py
+++ b/src/lxml/html/tests/test_html5parser.py
@@ -233,6 +233,12 @@ class Test_fromstring(unittest.TestCase):
         self.assertEqual(self.call_it('<!DOCTYPE html>', parser=parser),
                          'the doc')
 
+    def test_returns_whole_doc_if_input_is_encoded(self):
+        parser = DummyParser(root='the doc')
+        input = '<!DOCTYPE html>'.encode('ascii')
+        self.assertEqual(self.call_it(input, parser=parser),
+                         'the doc')
+
     def test_returns_whole_doc_if_head_not_empty(self, use_ns=True):
         E = HTMLElementMaker(namespaceHTMLElements=use_ns)
         root = E.html(E.head(E.title()))
```