summary | refs | log | tree | commit | diff
path: root/bs4/tests/test_lxml.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
committer Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
commit 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (patch)
tree 30826a9a744be8f2194c469484618a0326d0488e /bs4/tests/test_lxml.py
parent 81f853622f808fba7cd89d02ec524abc8588f196 (diff)
download beautifulsoup4-37e4159cb49d2f7c8fdafa0268adca5a1e2017e4.tar.gz
Correctly handle invalid HTML numeric character entities like &#147;
which reference code points that are not Unicode code points. Note that this is only fixed when Beautiful Soup is used with the html.parser parser -- html5lib already worked and I couldn't fix it with lxml. [bug=1782933]
Diffstat (limited to 'bs4/tests/test_lxml.py')
-rw-r--r-- bs4/tests/test_lxml.py 6
1 files changed, 6 insertions, 0 deletions
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index a05870b..23cbaef 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
+ def test_entities_in_original_document_encoding(self):
+ # We can't implement this case correctly because by the time we
+ # hear about markup like "&#147;", it's been (incorrectly) converted into
+ # a string like u'\x93'
+ pass
+
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.