diff options
| author | Stefan Behnel <stefan_ml@behnel.de> | 2020-10-18 10:06:46 +0200 |
|---|---|---|
| committer | Stefan Behnel <stefan_ml@behnel.de> | 2020-10-18 10:06:46 +0200 |
| commit | 89e7aad6e7ff9ecd88678ff25f885988b184b26e (patch) | |
| tree | 53a3df42867838fe1bf4f040b3498cc030c50552 /src | |
| parent | 264f90376927fa370536f3b3e9f393d148b28ed3 (diff) | |
| download | python-lxml-89e7aad6e7ff9ecd88678ff25f885988b184b26e.tar.gz | |
Prevent combinations of <noscript> and <style> from sneaking JavaScript through the HTML cleaner.
Diffstat (limited to 'src')
| -rw-r--r-- | src/lxml/html/clean.py | 3 | ||||
| -rw-r--r-- | src/lxml/html/tests/test_clean.py | 10 |
2 files changed, 13 insertions, 0 deletions
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index d43b9baf..7b51981d 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -536,6 +536,9 @@ class Cleaner(object):
             return True
         if 'expression(' in style:
             return True
+        if '</noscript' in style:
+            # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+            return True
         return False
 
     def clean_html(self, html):
diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py
index 44773379..3c8ee252 100644
--- a/src/lxml/html/tests/test_clean.py
+++ b/src/lxml/html/tests/test_clean.py
@@ -103,6 +103,16 @@ class CleanerTest(unittest.TestCase):
             '<p><span>Cy<!-- xx -->an</span><!-- XXX --></p>',
             cleaner.clean_html(html))
 
+    def test_sneaky_noscript_in_style(self):
+        # This gets parsed as <noscript> -> <style>"...</noscript>..."</style>
+        # thus passing the </noscript> through into the output.
+        html = '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+        s = lxml.html.fragment_fromstring(html)
+
+        self.assertEqual(
+            b'<noscript><style>/* deleted */</style></noscript>',
+            lxml.html.tostring(clean_html(s)))
+
 
 def test_suite():
     suite = unittest.TestSuite()
