>>> import re
>>> from lxml.html import fromstring, tostring
>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest
>>> doc = ''' ... ... ... ... ... ... ... ... ... ... a link ... a control char link ... data ... another link ...

a paragraph

... ...

... ... ... spam spam SPAM! ... Author ... Text ...

... ... '''
>>> print(re.sub('[\x00-\x07\x0E]', '', doc))
a link a control char link data another link

a paragraph

spam spam SPAM! Author Text

>>> print(tostring(fromstring(doc)).decode("utf-8"))
a link a control char link data another link

a paragraph

spam spam SPAM! Author Text

>>> print(Cleaner(page_structure=False, comments=False).clean_html(doc))
a link a control char link data another link

a paragraph

secret EVIL!

of EVIL! Password: spam spam SPAM! Author Text

>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
a link a control char link data another link

a paragraph

of EVIL! Password: spam spam SPAM! Author Text

>>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
a link a control char link data another link

a paragraph

secret EVIL!

of EVIL! Password: spam spam SPAM! Author Text

>>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
a link a control char link data another link

a paragraph

of EVIL! Password: spam spam SPAM! Author Text

>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
a link a control char link data another link

a paragraph

secret EVIL!

of EVIL! Password: spam spam SPAM! Author Text