>>> import re
>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest

>>> doc = '''
...
...
...
...
...
...
...
...
...
... a link
... a control char link
... data
... another link
... a paragraph
... secret EVIL!
... of EVIL!
...
...
... Password:
...
... spam spam SPAM!
...
... Text
...
...
... '''

>>> print(re.sub('[\x00-\x07\x0E]', '', doc))
a link a control char link data another link

a paragraph

secret EVIL!
of EVIL!
Password:
spam spam SPAM! Text

>>> print(tostring(fromstring(doc)).decode("utf-8"))
a link a control char link data another link

a paragraph

secret EVIL!
of EVIL!
Password:
spam spam SPAM! Text

>>> print(Cleaner(page_structure=False, comments=False).clean_html(doc))
a link a control char link data another link

a paragraph

secret EVIL!
of EVIL! Password: spam spam SPAM! Text

>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
a link a control char link data another link

a paragraph

secret EVIL!
of EVIL! Password: spam spam SPAM! Text

>>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
a link a control char link data another link

a paragraph

secret EVIL!
of EVIL! Password: spam spam SPAM! Author Text

>>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
a link a control char link data another link

a paragraph

secret EVIL!
of EVIL! Password: spam spam SPAM! Author Text

>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
a link a control char link data another link

a paragraph

secret EVIL!
of EVIL! Password: spam spam SPAM! Text