diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2016-07-24 09:58:51 +0200 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2016-07-24 09:58:51 +0200 |
commit | d3b73e561628998a8738b23268f4f8bf5ca98706 (patch) | |
tree | 2b694fa787af8096c98d00ee51a17590ab05e9ad | |
parent | 0a43d0b52ea35e33564b4e08b3e50f4f890f4758 (diff) | |
download | python-lxml-d3b73e561628998a8738b23268f4f8bf5ca98706.tar.gz |
make the "inline_style" option in Cleaner default to (but independent of) the "style" option
-rw-r--r-- | src/lxml/html/clean.py | 14 |
1 file changed, 9 insertions, 5 deletions
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index 47a32749..f0da1cd6 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -95,6 +95,7 @@ _find_external_links = etree.XPath(
      "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
     namespaces={'x':XHTML_NAMESPACE})
 
+
 class Cleaner(object):
     """
     Instances cleans the document of each of the possible offending
@@ -115,7 +116,7 @@ class Cleaner(object):
         Removes any style tags.
 
     ``inline_style``
-        Removes any style attributes.
+        Removes any style attributes. Defaults to the value of the ``style`` option.
 
     ``links``:
         Removes any ``<link>`` tags
@@ -194,7 +195,7 @@ class Cleaner(object):
     javascript = True
     comments = True
     style = False
-    inline_style = False
+    inline_style = None
     links = True
     meta = True
     page_structure = True
@@ -219,6 +220,8 @@ class Cleaner(object):
                 raise TypeError(
                     "Unknown parameter: %s=%r" % (name, value))
             setattr(self, name, value)
+        if self.inline_style is None and 'inline_style' not in kw:
+            self.inline_style = self.style
 
     # Used to lookup the primary URL for a given tag that is up for
     # removal:
@@ -284,9 +287,9 @@ class Cleaner(object):
                             del attrib[aname]
             doc.rewrite_links(self._remove_javascript_link,
                               resolve_base_href=False)
-            if not self.style:
-                # If we're deleting style then we don't have to remove JS links
-                # from styles, otherwise...
+            # If we're deleting style then we don't have to remove JS links
+            # from styles, otherwise...
+            if not self.inline_style:
                 for el in _find_styled_elements(doc):
                     old = el.get('style')
                     new = _css_javascript_re.sub('', old)
@@ -296,6 +299,7 @@ class Cleaner(object):
                         del el.attrib['style']
                     elif new != old:
                         el.set('style', new)
+            if not self.style:
                 for el in list(doc.iter('style')):
                     if el.get('type', '').lower().strip() == 'text/javascript':
                         el.drop_tree()