diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2021-12-12 15:14:42 +0100 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2021-12-12 15:14:42 +0100 |
commit | 99f6eb5495d5fe742958669ca3661524c037e177 (patch) | |
tree | 5dbb481390a75e304fdd32abedf7fad4b29fca7d | |
parent | add0d3d85eebc1ce7357352910c04e0e8a82f138 (diff) | |
parent | a3eacbc0dcf1de1c822ec29fb7d090a4b1712a9c (diff) | |
download | python-lxml-99f6eb5495d5fe742958669ca3661524c037e177.tar.gz |
Merge branch 'lxml-4.6'
-rw-r--r-- | CHANGES.txt | 13 | ||||
-rw-r--r-- | doc/main.txt | 10 | ||||
-rw-r--r-- | src/lxml/html/clean.py | 31 | ||||
-rw-r--r-- | src/lxml/html/tests/test_clean.py | 128 |
4 files changed, 167 insertions, 15 deletions
diff --git a/CHANGES.txt b/CHANGES.txt index d17f03d5..b1e49946 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -31,6 +31,19 @@ Other changes * Wheels include libxml2 2.9.12+ and libxslt 1.1.34 (also on Windows). +4.6.5 (2021-12-12) +================== + +Bugs fixed +---------- + +* A vulnerability (GHSL-2021-1038) in the HTML cleaner allowed sneaking script + content through SVG images. + +* A vulnerability (GHSL-2021-1037) in the HTML cleaner allowed sneaking script + content through CSS imports and other crafted constructs. + + 4.6.4 (2021-11-01) ================== diff --git a/doc/main.txt b/doc/main.txt index 75fedd5e..55e32d54 100644 --- a/doc/main.txt +++ b/doc/main.txt @@ -159,8 +159,8 @@ Index <http://pypi.python.org/pypi/lxml/>`_ (PyPI). It has the source that compiles on various platforms. The source distribution is signed with `this key <pubkey.asc>`_. -The latest version is `lxml 4.6.4`_, released 2021-11-01 -(`changes for 4.6.4`_). `Older versions <#old-versions>`_ +The latest version is `lxml 4.6.5`_, released 2021-12-12 +(`changes for 4.6.5`_). `Older versions <#old-versions>`_ are listed below. Please take a look at the @@ -256,7 +256,9 @@ See the websites of lxml .. and the `latest in-development version <http://lxml.de/dev/>`_. -.. _`PDF documentation`: lxmldoc-4.6.4.pdf +.. _`PDF documentation`: lxmldoc-4.6.5.pdf + +* `lxml 4.6.5`_, released 2021-12-12 (`changes for 4.6.5`_) * `lxml 4.6.4`_, released 2021-11-01 (`changes for 4.6.4`_) @@ -284,6 +286,7 @@ See the websites of lxml * `older releases <http://lxml.de/4.3/#old-versions>`_ +.. _`lxml 4.6.5`: /files/lxml-4.6.5.tgz .. _`lxml 4.6.4`: /files/lxml-4.6.4.tgz .. _`lxml 4.6.3`: /files/lxml-4.6.3.tgz .. _`lxml 4.6.2`: /files/lxml-4.6.2.tgz @@ -297,6 +300,7 @@ See the websites of lxml .. _`lxml 4.4.1`: /files/lxml-4.4.1.tgz .. _`lxml 4.4.0`: /files/lxml-4.4.0.tgz +.. _`changes for 4.6.5`: /changes-4.6.5.html .. _`changes for 4.6.4`: /changes-4.6.4.html .. _`changes for 4.6.3`: /changes-4.6.3.html .. _`changes for 4.6.2`: /changes-4.6.2.html diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index 0494357e..e6b0543c 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -75,18 +75,23 @@ _looks_like_tag_content = re.compile( # All kinds of schemes besides just javascript: that can cause # execution: -_is_image_dataurl = re.compile( - r'^data:image/.+;base64', re.I).search -_is_possibly_malicious_scheme = re.compile( - r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):', - re.I).search -def _is_javascript_scheme(s): - if _is_image_dataurl(s): - return None - return _is_possibly_malicious_scheme(s) +_find_image_dataurls = re.compile( + r'data:image/(.+);base64,', re.I).findall +_possibly_malicious_schemes = re.compile( + r'(javascript|jscript|livescript|vbscript|data|about|mocha):', + re.I).findall +# SVG images can contain script content +_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search + +def _has_javascript_scheme(s): + safe_image_urls = 0 + for image_type in _find_image_dataurls(s): + if _is_unsafe_image_type(image_type): + return True + safe_image_urls += 1 + return len(_possibly_malicious_schemes(s)) > safe_image_urls _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub -# FIXME: should data: be blocked? # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx _conditional_comment_re = re.compile( @@ -515,7 +520,7 @@ class Cleaner(object): def _remove_javascript_link(self, link): # links like "j a v a s c r i p t:" might be interpreted in IE new = _substitute_whitespace('', unquote_plus(link)) - if _is_javascript_scheme(new): + if _has_javascript_scheme(new): # FIXME: should this be None to delete? return '' return link @@ -537,10 +542,12 @@ class Cleaner(object): style = style.replace('\\', '') style = _substitute_whitespace('', style) style = style.lower() - if 'javascript:' in style: + if _has_javascript_scheme(style): return True if 'expression(' in style: return True + if '@import' in style: + return True if '</noscript' in style: # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">' return True diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py index 45c2e83a..aec87cd9 100644 --- a/src/lxml/html/tests/test_clean.py +++ b/src/lxml/html/tests/test_clean.py @@ -1,3 +1,5 @@ +import base64 +import gzip import unittest from lxml.tests.common_imports import make_doctest @@ -123,6 +125,132 @@ class CleanerTest(unittest.TestCase): b'<math><style>/* deleted */</style></math>', lxml.html.tostring(clean_html(s))) + def test_sneaky_import_in_style(self): + # Prevent "@@importimport" -> "@import" replacement etc. + style_codes = [ + "@@importimport(extstyle.css)", + "@ @ import import(extstyle.css)", + "@ @ importimport(extstyle.css)", + "@@ import import(extstyle.css)", + "@ @import import(extstyle.css)", + "@@importimport()", + "@@importimport() ()", + "@/* ... */import()", + "@im/* ... */port()", + "@ @import/* ... */import()", + "@ /* ... */ import()", + ] + for style_code in style_codes: + html = '<style>%s</style>' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'<style>/* deleted */</style>', + cleaned, + "%s -> %s" % (style_code, cleaned)) + + def test_sneaky_schemes_in_style(self): + style_codes = [ + "javasjavascript:cript:", + "javascriptjavascript::", + "javascriptjavascript:: :", + "vbjavascript:cript:", + ] + for style_code in style_codes: + html = '<style>%s</style>' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'<style>/* deleted */</style>', + cleaned, + "%s -> %s" % (style_code, cleaned)) + + def test_sneaky_urls_in_style(self): + style_codes = [ + "url(data:image/svg+xml;base64,...)", + "url(javasjavascript:cript:)", + "url(javasjavascript:cript: ::)", + "url(vbjavascript:cript:)", + "url(vbjavascript:cript: :)", + ] + for style_code in style_codes: + html = '<style>%s</style>' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'<style>url()</style>', + cleaned, + "%s -> %s" % (style_code, cleaned)) + + def test_svg_data_links(self): + # Remove SVG images with potentially insecure content. + svg = b'<svg onload="alert(123)" />' + svgz = gzip.compress(svg) + svg_b64 = base64.b64encode(svg).decode('ASCII') + svgz_b64 = base64.b64encode(svgz).decode('ASCII') + urls = [ + "data:image/svg+xml;base64," + svg_b64, + "data:image/svg+xml-compressed;base64," + svgz_b64, + ] + for url in urls: + html = '<img src="%s">' % url + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'<img src="">', + cleaned, + "%s -> %s" % (url, cleaned)) + + def test_image_data_links(self): + data = b'123' + data_b64 = base64.b64encode(data).decode('ASCII') + urls = [ + "data:image/jpeg;base64," + data_b64, + "data:image/apng;base64," + data_b64, + "data:image/png;base64," + data_b64, + "data:image/gif;base64," + data_b64, + "data:image/webp;base64," + data_b64, + "data:image/bmp;base64," + data_b64, + "data:image/tiff;base64," + data_b64, + "data:image/x-icon;base64," + data_b64, + ] + for url in urls: + html = '<img src="%s">' % url + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + html.encode("UTF-8"), + cleaned, + "%s -> %s" % (url, cleaned)) + + def test_image_data_links_in_style(self): + data = b'123' + data_b64 = base64.b64encode(data).decode('ASCII') + urls = [ + "data:image/jpeg;base64," + data_b64, + "data:image/apng;base64," + data_b64, + "data:image/png;base64," + data_b64, + "data:image/gif;base64," + data_b64, + "data:image/webp;base64," + data_b64, + "data:image/bmp;base64," + data_b64, + "data:image/tiff;base64," + data_b64, + "data:image/x-icon;base64," + data_b64, + ] + for url in urls: + html = '<style> url(%s) </style>' % url + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + html.encode("UTF-8"), + cleaned, + "%s -> %s" % (url, cleaned)) + def test_formaction_attribute_in_button_input(self): # The formaction attribute overrides the form's action and should be # treated as a malicious link attribute |