From fa3a163cacd0516f5282424478a0f30f54fe13a4 Mon Sep 17 00:00:00 2001 From: ptmcg Date: Sun, 14 Aug 2016 08:17:00 +0000 Subject: Update to new pyparsing and Python features git-svn-id: svn://svn.code.sf.net/p/pyparsing/code/trunk@417 9bf210a0-9d2d-494c-87cf-cfb32e7dff7b --- src/examples/htmlStripper.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/examples/htmlStripper.py b/src/examples/htmlStripper.py index 0b0f459..1d7a0f0 100644 --- a/src/examples/htmlStripper.py +++ b/src/examples/htmlStripper.py @@ -4,32 +4,25 @@ # Sample code for stripping HTML markup tags and scripts from # HTML source files. # -# Copyright (c) 2006, Paul McGuire +# Copyright (c) 2006, 2016, Paul McGuire # -from pyparsing import * +from contextlib import closing import urllib.request, urllib.parse, urllib.error +from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity, replaceHTMLEntity, + htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith) -removeText = replaceWith("") scriptOpen,scriptClose = makeHTMLTags("script") scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose -scriptBody.setParseAction(removeText) - -anyTag,anyClose = makeHTMLTags(Word(alphas,alphanums+":_")) -anyTag.setParseAction(removeText) -anyClose.setParseAction(removeText) -htmlComment.setParseAction(removeText) - commonHTMLEntity.setParseAction(replaceHTMLEntity) # get some HTML targetURL = "http://wiki.python.org/moin/PythonDecoratorLibrary" -targetPage = urllib.request.urlopen( targetURL ) -targetHTML = targetPage.read() -targetPage.close() +with closing(urllib.request.urlopen( targetURL )) as targetPage: + targetHTML = targetPage.read().decode("UTF-8") # first pass, strip out tags and translate entities firstPass = (htmlComment | scriptBody | commonHTMLEntity | - anyTag | anyClose ).transformString(targetHTML) + anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML) # first pass leaves many blank lines, collapse these down repeatedNewlines = LineEnd() + OneOrMore(LineEnd()) -- cgit v1.2.1