diff options
-rw-r--r-- | CHANGES.txt | 11 | ||||
-rw-r--r-- | src/lxml/html/clean.py | 22 | ||||
-rw-r--r-- | src/lxml/html/tests/test_clean.py | 10 | ||||
-rw-r--r-- | src/lxml/html/tests/test_clean.txt | 18 |
4 files changed, 50 insertions, 11 deletions
diff --git a/CHANGES.txt b/CHANGES.txt index 7afec7e2..e3b77140 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,17 @@ lxml changelog ============== +4.6.2 (2020-11-26) +================== + +Bugs fixed +---------- + +* A vulnerability (CVE-2020-27783) was discovered in the HTML Cleaner by Yaniv Nizry, + which allowed JavaScript to pass through. The cleaner now removes more sneaky + "style" content. + + 4.6.1 (2020-10-18) ================== diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index 7b51981d..0fa1544c 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -61,12 +61,15 @@ __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', # This is an IE-specific construct you can have in a stylesheet to # run some Javascript: -_css_javascript_re = re.compile( - r'expression\s*\(.*?\)', re.S|re.I) +_replace_css_javascript = re.compile( + r'expression\s*\(.*?\)', re.S|re.I).sub # Do I have to worry about @\nimport? -_css_import_re = re.compile( - r'@\s*import', re.I) +_replace_css_import = re.compile( + r'@\s*import', re.I).sub + +_looks_like_tag_content = re.compile( + r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=', re.ASCII).search # All kinds of schemes besides just javascript: that can cause # execution: @@ -304,8 +307,8 @@ class Cleaner(object): if not self.inline_style: for el in _find_styled_elements(doc): old = el.get('style') - new = _css_javascript_re.sub('', old) - new = _css_import_re.sub('', new) + new = _replace_css_javascript('', old) + new = _replace_css_import('', new) if self._has_sneaky_javascript(new): # Something tricky is going on... del el.attrib['style'] @@ -317,9 +320,9 @@ class Cleaner(object): el.drop_tree() continue old = el.text or '' - new = _css_javascript_re.sub('', old) + new = _replace_css_javascript('', old) # The imported CSS can do anything; we just can't allow: - new = _css_import_re.sub('', old) + new = _replace_css_import('', new) if self._has_sneaky_javascript(new): # Something tricky is going on... el.text = '/* deleted */' @@ -539,6 +542,9 @@ class Cleaner(object): if '</noscript' in style: # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">' return True + if _looks_like_tag_content(style): + # e.g. '<math><style><img src=x onerror=alert(1)></style></math>' + return True return False def clean_html(self, html): diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py index 3c8ee252..0e669f98 100644 --- a/src/lxml/html/tests/test_clean.py +++ b/src/lxml/html/tests/test_clean.py @@ -113,6 +113,16 @@ class CleanerTest(unittest.TestCase): b'<noscript><style>/* deleted */</style></noscript>', lxml.html.tostring(clean_html(s))) + def test_sneaky_js_in_math_style(self): + # This gets parsed as <math> -> <style>"..."</style> + # thus passing any tag/script/whatever content through into the output. + html = '<math><style><img src=x onerror=alert(1)></style></math>' + s = lxml.html.fragment_fromstring(html) + + self.assertEqual( + b'<math><style>/* deleted */</style></math>', + lxml.html.tostring(clean_html(s))) + def test_suite(): suite = unittest.TestSuite() diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt index 275be07c..18e6c7e6 100644 --- a/src/lxml/html/tests/test_clean.txt +++ b/src/lxml/html/tests/test_clean.txt @@ -104,7 +104,11 @@ >>> print(Cleaner(page_structure=False, comments=False).clean_html(doc)) <html> <head> - <style>/* deleted */</style> + <style> + body {background-image: url()}; + div {background-image: url()}; + div {color: }; + </style> </head> <body> <!-- I am interpreted for EVIL! --> @@ -126,7 +130,11 @@ >>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc)) <html> <head> - <style>/* deleted */</style> + <style> + body {background-image: url()}; + div {background-image: url()}; + div {color: }; + </style> </head> <body> <a href="">a link</a> @@ -190,7 +198,11 @@ <link rel="alternate" type="text/rss" src="evil-rss"> <link rel="alternate" type="text/rss" href="http://example.com"> <link rel="stylesheet" type="text/rss" href="http://example.com"> - <style>/* deleted */</style> + <style> + body {background-image: url()}; + div {background-image: url()}; + div {color: }; + </style> </head> <body> <a href="">a link</a> |