summary refs log tree commit diff
diff options
context:
space:
mode:
author Stefan Behnel <stefan_ml@behnel.de> 2018-09-09 16:44:17 +0200
committer Stefan Behnel <stefan_ml@behnel.de> 2018-09-09 16:44:17 +0200
commit 6be1d081b49c97cfd7b3fbd934a193b668629109 (patch)
tree fa61fc92594058ee327af7beb58eca221e5d4cae
parent 1f534e2b957c0ea537c42d87fc262cb7069f0b1c (diff)
download python-lxml-6be1d081b49c97cfd7b3fbd934a193b668629109.tar.gz
Fix: make the cleaner also remove javascript URLs that use escaping.
-rw-r--r-- src/lxml/html/clean.py 5
-rw-r--r-- src/lxml/html/tests/test_clean.txt 6
2 files changed, 6 insertions, 5 deletions
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index adc3f450..11da2958 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -8,9 +8,10 @@ import re
import copy
try:
from urlparse import urlsplit
+ from urllib import unquote_plus
except ImportError:
# Python 3
- from urllib.parse import urlsplit
+ from urllib.parse import urlsplit, unquote_plus
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, XHTML_NAMESPACE
@@ -482,7 +483,7 @@ class Cleaner(object):
def _remove_javascript_link(self, link):
# links like "j a v a s c r i p t:" might be interpreted in IE
- new = _substitute_whitespace('', link)
+ new = _substitute_whitespace('', unquote_plus(link))
if _is_javascript_scheme(new):
# FIXME: should this be None to delete?
return ''
diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt
index c78ab4f1..2824f64c 100644
--- a/src/lxml/html/tests/test_clean.txt
+++ b/src/lxml/html/tests/test_clean.txt
@@ -18,7 +18,7 @@
... <body onload="evil_function()">
... <!-- I am interpreted for EVIL! -->
... <a href="javascript:evil_function()">a link</a>
-... <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t:evil_function()">a control char link</a>
+... <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t%20:evil_function()">a control char link</a>
... <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
... <a href="#" onclick="evil_function()">another link</a>
... <p onclick="evil_function()">a paragraph</p>
@@ -51,7 +51,7 @@
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<a href="javascript:evil_function()">a link</a>
- <a href="javascrip t:evil_function()">a control char link</a>
+ <a href="javascrip t%20:evil_function()">a control char link</a>
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>
@@ -84,7 +84,7 @@
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
<a href="javascript:evil_function()">a link</a>
- <a href="javascrip%20t:evil_function()">a control char link</a>
+ <a href="javascrip%20t%20:evil_function()">a control char link</a>
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
<a href="#" onclick="evil_function()">another link</a>
<p onclick="evil_function()">a paragraph</p>