summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Stefan Behnel <stefan_ml@behnel.de> 2020-10-18 10:06:46 +0200
committer Stefan Behnel <stefan_ml@behnel.de> 2020-10-18 10:06:46 +0200
commit 89e7aad6e7ff9ecd88678ff25f885988b184b26e (patch)
tree 53a3df42867838fe1bf4f040b3498cc030c50552 /src
parent 264f90376927fa370536f3b3e9f393d148b28ed3 (diff)
download python-lxml-89e7aad6e7ff9ecd88678ff25f885988b184b26e.tar.gz
Prevent combinations of <noscript> and <style> from sneaking JavaScript through the HTML cleaner.
Diffstat (limited to 'src')
-rw-r--r-- src/lxml/html/clean.py | 3
-rw-r--r-- src/lxml/html/tests/test_clean.py | 10
2 files changed, 13 insertions, 0 deletions
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index d43b9baf..7b51981d 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -536,6 +536,9 @@ class Cleaner(object):
return True
if 'expression(' in style:
return True
+ if '</noscript' in style:
+ # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+ return True
return False
def clean_html(self, html):
diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py
index 44773379..3c8ee252 100644
--- a/src/lxml/html/tests/test_clean.py
+++ b/src/lxml/html/tests/test_clean.py
@@ -103,6 +103,16 @@ class CleanerTest(unittest.TestCase):
'<p><span>Cy<!-- xx -->an</span><!-- XXX --></p>',
cleaner.clean_html(html))
+ def test_sneaky_noscript_in_style(self):
+ # This gets parsed as <noscript> -> <style>"...</noscript>..."</style>
+ # thus passing the </noscript> through into the output.
+ html = '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+ s = lxml.html.fragment_fromstring(html)
+
+ self.assertEqual(
+ b'<noscript><style>/* deleted */</style></noscript>',
+ lxml.html.tostring(clean_html(s)))
+
def test_suite():
suite = unittest.TestSuite()