diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2018-09-09 16:44:17 +0200 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2018-09-09 16:44:17 +0200 |
commit | 6be1d081b49c97cfd7b3fbd934a193b668629109 (patch) | |
tree | fa61fc92594058ee327af7beb58eca221e5d4cae | |
parent | 1f534e2b957c0ea537c42d87fc262cb7069f0b1c (diff) | |
download | python-lxml-6be1d081b49c97cfd7b3fbd934a193b668629109.tar.gz |
Fix: make the cleaner also remove javascript URLs that use escaping.
-rw-r--r-- | src/lxml/html/clean.py | 5 | ||||
-rw-r--r-- | src/lxml/html/tests/test_clean.txt | 6 |
2 files changed, 6 insertions, 5 deletions
```diff
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index adc3f450..11da2958 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -8,9 +8,10 @@ import re
 import copy
 try:
     from urlparse import urlsplit
+    from urllib import unquote_plus
 except ImportError:
     # Python 3
-    from urllib.parse import urlsplit
+    from urllib.parse import urlsplit, unquote_plus
 from lxml import etree
 from lxml.html import defs
 from lxml.html import fromstring, XHTML_NAMESPACE
@@ -482,7 +483,7 @@ class Cleaner(object):
 
     def _remove_javascript_link(self, link):
         # links like "j a v a s c r i p t:" might be interpreted in IE
-        new = _substitute_whitespace('', link)
+        new = _substitute_whitespace('', unquote_plus(link))
         if _is_javascript_scheme(new):
             # FIXME: should this be None to delete?
             return ''
diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt
index c78ab4f1..2824f64c 100644
--- a/src/lxml/html/tests/test_clean.txt
+++ b/src/lxml/html/tests/test_clean.txt
@@ -18,7 +18,7 @@
 ... <body onload="evil_function()">
 ... <!-- I am interpreted for EVIL! -->
 ... <a href="javascript:evil_function()">a link</a>
-... <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t:evil_function()">a control char link</a>
+... <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t%20:evil_function()">a control char link</a>
 ... <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
 ... <a href="#" onclick="evil_function()">another link</a>
 ... <p onclick="evil_function()">a paragraph</p>
@@ -51,7 +51,7 @@
 <body onload="evil_function()">
 <!-- I am interpreted for EVIL! -->
 <a href="javascript:evil_function()">a link</a>
-<a href="javascrip t:evil_function()">a control char link</a>
+<a href="javascrip t%20:evil_function()">a control char link</a>
 <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
 <a href="#" onclick="evil_function()">another link</a>
 <p onclick="evil_function()">a paragraph</p>
@@ -84,7 +84,7 @@
 <body onload="evil_function()">
 <!-- I am interpreted for EVIL! -->
 <a href="javascript:evil_function()">a link</a>
-<a href="javascrip%20t:evil_function()">a control char link</a>
+<a href="javascrip%20t%20:evil_function()">a control char link</a>
 <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
 <a href="#" onclick="evil_function()">another link</a>
 <p onclick="evil_function()">a paragraph</p>
```