Correctly handle invalid HTML numeric character entities like &#147; which reference code points that are not Unicode code points. Note that this is only fixed when Beautiful Soup is used with the html.parser parser -- html5lib already worked and I couldn't fix it with lxml. [bug=1782933]
author: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-28 16:58:23 -0400
commit: 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (patch)
tree: 30826a9a744be8f2194c469484618a0326d0488e /bs4/tests/test_lxml.py
parent: 81f853622f808fba7cd89d02ec524abc8588f196 (diff)
download: beautifulsoup4-37e4159cb49d2f7c8fdafa0268adca5a1e2017e4.tar.gz
1 files changed, 6 insertions, 0 deletions
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index a05870b..23cbaef 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertSoupEquals(
             "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
 
+    def test_entities_in_original_document_encoding(self):
+        # We can't implement this case correctly because by the time we
+        # hear about markup like "&#147;", it's been (incorrectly) converted into
+        # a string like u'\x93'
+        pass
+        
     # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
     # test if an old version of lxml is installed.
author	Leonard Richardson <leonardr@segfault.org>	2018-07-28 16:58:23 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2018-07-28 16:58:23 -0400
commit	37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (patch)
tree	30826a9a744be8f2194c469484618a0326d0488e /bs4/tests/test_lxml.py
parent	81f853622f808fba7cd89d02ec524abc8588f196 (diff)
download	beautifulsoup4-37e4159cb49d2f7c8fdafa0268adca5a1e2017e4.tar.gz