diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2021-12-11 12:19:21 +0100 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2021-12-11 12:19:21 +0100 |
commit | 69a747356655158fdf9abaecea5feafb3bd6b5f5 (patch) | |
tree | 631424923bed89c1bb7565c0a5cc24ca94b8acdf | |
parent | 54d2985a36184a4b36017a6000fa4d11411f7292 (diff) | |
download | python-lxml-69a747356655158fdf9abaecea5feafb3bd6b5f5.tar.gz |
Cleaner: cover some more cases where scripts could sneak through in specially crafted style content.
-rw-r--r-- | src/lxml/html/clean.py | 20 | ||||
-rw-r--r-- | src/lxml/html/tests/test_clean.py | 65 |
2 files changed, 73 insertions, 12 deletions
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index dd3a28ad..e6b0543c 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -76,22 +76,20 @@ _looks_like_tag_content = re.compile( # All kinds of schemes besides just javascript: that can cause # execution: _find_image_dataurls = re.compile( - r'^data:image/(.+);base64,', re.I).findall -_is_possibly_malicious_scheme = re.compile( + r'data:image/(.+);base64,', re.I).findall +_possibly_malicious_schemes = re.compile( r'(javascript|jscript|livescript|vbscript|data|about|mocha):', re.I).findall # SVG images can contain script content -_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).findall +_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search -def _is_javascript_scheme(s): - is_image_url = False +def _has_javascript_scheme(s): + safe_image_urls = 0 for image_type in _find_image_dataurls(s): - is_image_url = True if _is_unsafe_image_type(image_type): return True - if is_image_url: - return False - return bool(_is_possibly_malicious_scheme(s)) + safe_image_urls += 1 + return len(_possibly_malicious_schemes(s)) > safe_image_urls _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub @@ -522,7 +520,7 @@ class Cleaner(object): def _remove_javascript_link(self, link): # links like "j a v a s c r i p t:" might be interpreted in IE new = _substitute_whitespace('', unquote_plus(link)) - if _is_javascript_scheme(new): + if _has_javascript_scheme(new): # FIXME: should this be None to delete? return '' return link @@ -544,7 +542,7 @@ class Cleaner(object): style = style.replace('\\', '') style = _substitute_whitespace('', style) style = style.lower() - if 'javascript:' in style: + if _has_javascript_scheme(style): return True if 'expression(' in style: return True diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py index a05d9673..aec87cd9 100644 --- a/src/lxml/html/tests/test_clean.py +++ b/src/lxml/html/tests/test_clean.py @@ -126,7 +126,7 @@ class CleanerTest(unittest.TestCase): lxml.html.tostring(clean_html(s))) def test_sneaky_import_in_style(self): - # Prevent "@@importimport" -> "@import" replacement. + # Prevent "@@importimport" -> "@import" replacement etc. style_codes = [ "@@importimport(extstyle.css)", "@ @ import import(extstyle.css)", @@ -134,6 +134,11 @@ class CleanerTest(unittest.TestCase): "@@ import import(extstyle.css)", "@ @import import(extstyle.css)", "@@importimport()", + "@@importimport() ()", + "@/* ... */import()", + "@im/* ... */port()", + "@ @import/* ... */import()", + "@ /* ... */ import()", ] for style_code in style_codes: html = '<style>%s</style>' % style_code @@ -145,6 +150,41 @@ class CleanerTest(unittest.TestCase): cleaned, "%s -> %s" % (style_code, cleaned)) + def test_sneaky_schemes_in_style(self): + style_codes = [ + "javasjavascript:cript:", + "javascriptjavascript::", + "javascriptjavascript:: :", + "vbjavascript:cript:", + ] + for style_code in style_codes: + html = '<style>%s</style>' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'<style>/* deleted */</style>', + cleaned, + "%s -> %s" % (style_code, cleaned)) + + def test_sneaky_urls_in_style(self): + style_codes = [ + "url(data:image/svg+xml;base64,...)", + "url(javasjavascript:cript:)", + "url(javasjavascript:cript: ::)", + "url(vbjavascript:cript:)", + "url(vbjavascript:cript: :)", + ] + for style_code in style_codes: + html = '<style>%s</style>' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'<style>url()</style>', + cleaned, + "%s -> %s" % (style_code, cleaned)) + def test_svg_data_links(self): # Remove SVG images with potentially insecure content. svg = b'<svg onload="alert(123)" />' @@ -188,6 +228,29 @@ class CleanerTest(unittest.TestCase): cleaned, "%s -> %s" % (url, cleaned)) + def test_image_data_links_in_style(self): + data = b'123' + data_b64 = base64.b64encode(data).decode('ASCII') + urls = [ + "data:image/jpeg;base64," + data_b64, + "data:image/apng;base64," + data_b64, + "data:image/png;base64," + data_b64, + "data:image/gif;base64," + data_b64, + "data:image/webp;base64," + data_b64, + "data:image/bmp;base64," + data_b64, + "data:image/tiff;base64," + data_b64, + "data:image/x-icon;base64," + data_b64, + ] + for url in urls: + html = '<style> url(%s) </style>' % url + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + html.encode("UTF-8"), + cleaned, + "%s -> %s" % (url, cleaned)) + def test_formaction_attribute_in_button_input(self): # The formaction attribute overrides the form's action and should be # treated as a malicious link attribute |