summary refs log tree commit diff
diff options
context:
space:
mode:
author ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b> 2016-08-14 08:17:00 +0000
committer ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b> 2016-08-14 08:17:00 +0000
commit fa3a163cacd0516f5282424478a0f30f54fe13a4 (patch)
tree 3212b1930c55419f8367d548f794596447e03112
parent ec420e9735ecc2a2870f8d86e3e9657245e6bfeb (diff)
download pyparsing-fa3a163cacd0516f5282424478a0f30f54fe13a4.tar.gz
Update to new pyparsing and Python features
git-svn-id: svn://svn.code.sf.net/p/pyparsing/code/trunk@417 9bf210a0-9d2d-494c-87cf-cfb32e7dff7b
-rw-r--r-- src/examples/htmlStripper.py 21
1 files changed, 7 insertions, 14 deletions
diff --git a/src/examples/htmlStripper.py b/src/examples/htmlStripper.py
index 0b0f459..1d7a0f0 100644
--- a/src/examples/htmlStripper.py
+++ b/src/examples/htmlStripper.py
@@ -4,32 +4,25 @@
# Sample code for stripping HTML markup tags and scripts from
# HTML source files.
#
-# Copyright (c) 2006, Paul McGuire
+# Copyright (c) 2006, 2016, Paul McGuire
#
-from pyparsing import *
+from contextlib import closing
import urllib.request, urllib.parse, urllib.error
+from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity, replaceHTMLEntity,
+ htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)
-removeText = replaceWith("")
scriptOpen,scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose
-scriptBody.setParseAction(removeText)
-
-anyTag,anyClose = makeHTMLTags(Word(alphas,alphanums+":_"))
-anyTag.setParseAction(removeText)
-anyClose.setParseAction(removeText)
-htmlComment.setParseAction(removeText)
-
commonHTMLEntity.setParseAction(replaceHTMLEntity)
# get some HTML
targetURL = "http://wiki.python.org/moin/PythonDecoratorLibrary"
-targetPage = urllib.request.urlopen( targetURL )
-targetHTML = targetPage.read()
-targetPage.close()
+with closing(urllib.request.urlopen( targetURL )) as targetPage:
+ targetHTML = targetPage.read().decode("UTF-8")
# first pass, strip out tags and translate entities
firstPass = (htmlComment | scriptBody | commonHTMLEntity |
- anyTag | anyClose ).transformString(targetHTML)
+ anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML)
# first pass leaves many blank lines, collapse these down
repeatedNewlines = LineEnd() + OneOrMore(LineEnd())