diff options
Diffstat (limited to 'examples/htmlStripper.py')
-rw-r--r-- | examples/htmlStripper.py | 23 |
1 files changed, 17 insertions, 6 deletions
diff --git a/examples/htmlStripper.py b/examples/htmlStripper.py
index bd99b77..6a209fa 100644
--- a/examples/htmlStripper.py
+++ b/examples/htmlStripper.py
@@ -7,8 +7,16 @@
 # Copyright (c) 2006, 2016, Paul McGuire
 #
 from urllib.request import urlopen
-from pyparsing import (makeHTMLTags, commonHTMLEntity, replaceHTMLEntity,
-    htmlComment, anyOpenTag, anyCloseTag, LineEnd, replaceWith)
+from pyparsing import (
+    makeHTMLTags,
+    commonHTMLEntity,
+    replaceHTMLEntity,
+    htmlComment,
+    anyOpenTag,
+    anyCloseTag,
+    LineEnd,
+    replaceWith,
+)
 
 scriptOpen, scriptClose = makeHTMLTags("script")
 scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose
@@ -16,15 +24,18 @@ commonHTMLEntity.setParseAction(replaceHTMLEntity)
 
 # get some HTML
 targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
-with urlopen( targetURL ) as targetPage:
+with urlopen(targetURL) as targetPage:
     targetHTML = targetPage.read().decode("UTF-8")
 
 # first pass, strip out tags and translate entities
-firstPass = (htmlComment | scriptBody | commonHTMLEntity |
-             anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML)
+firstPass = (
+    (htmlComment | scriptBody | commonHTMLEntity | anyOpenTag | anyCloseTag)
+    .suppress()
+    .transformString(targetHTML)
+)
 
 # first pass leaves many blank lines, collapse these down
-repeatedNewlines = LineEnd()*(2,)
+repeatedNewlines = LineEnd() * (2,)
 repeatedNewlines.setParseAction(replaceWith("\n\n"))
 secondPass = repeatedNewlines.transformString(firstPass)
 