>>> import re
>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest
>>> doc = '''
...
... ... ... ... ... ... ... ... ... a link ... a control char link ... data ... another link ...a paragraph
... ... ... ... ... spam spam SPAM! ... Author ... Text ... ... ... '''
>>> print(re.sub('[\x00-\x07\x0E]', '', doc))
a link a control char link data another linka paragraph
spam spam SPAM! Author Text
>>> print(tostring(fromstring(doc)).decode("utf-8"))
a link a control char link data another linka paragraph
spam spam SPAM! Author Text
>>> print(Cleaner(page_structure=False, comments=False).clean_html(doc))
a link a control char link data another linka paragraph
a paragraph
of EVIL! Password: spam spam SPAM! Author Text
>>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
a link a control char link data another linka paragraph
a paragraph
of EVIL! Password: spam spam SPAM! Author Text
>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
a link a control char link data another linka paragraph