Add example files to SVN

author: Paul McGuire <ptmcg@austin.rr.com> 2012-10-02 04:55:56 +0000
committer: Paul McGuire <ptmcg@austin.rr.com> 2012-10-02 04:55:56 +0000
commit: a7f9dda0668bfce4fba51df1bf2976b4a93a8bd5 (patch)
tree: 57ea8bcf2e66532a36c833a7bc57cff9d5d0e4dd /src/examples/htmlStripper.py
parent: f5d2b716ffb57b65660a7ee0bbf04332dfb29620 (diff)
download: pyparsing-git-a7f9dda0668bfce4fba51df1bf2976b4a93a8bd5.tar.gz
1 file changed, 39 insertions, 0 deletions
diff --git a/src/examples/htmlStripper.py b/src/examples/htmlStripper.py
new file mode 100644
index 0000000..502acc5
--- /dev/null
+++ b/src/examples/htmlStripper.py
@@ -0,0 +1,39 @@
+#
+# htmlStripper.py
+#
+#  Sample code for stripping HTML markup tags, comments, and scripts
+#  from HTML source, and for translating common HTML entities.
+#
+# Copyright (c) 2006, Paul McGuire
+#
+from pyparsing import *
+import urllib
+
+removeText = replaceWith("")
+scriptOpen,scriptClose = makeHTMLTags("script")
+scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose
+scriptBody.setParseAction(removeText)
+
+anyTag,anyClose = makeHTMLTags(Word(alphas,alphanums+":_"))
+anyTag.setParseAction(removeText)
+anyClose.setParseAction(removeText)
+htmlComment.setParseAction(removeText)
+
+commonHTMLEntity.setParseAction(replaceHTMLEntity)
+
+# get some HTML
+targetURL = "http://wiki.python.org/moin/PythonDecoratorLibrary"
+targetPage = urllib.urlopen( targetURL )
+targetHTML = targetPage.read()
+targetPage.close()
+
+# first pass: strip out comments, scripts, and tags, and translate entities
+firstPass = (htmlComment | scriptBody | commonHTMLEntity | 
+             anyTag | anyClose ).transformString(targetHTML)
+
+# the first pass leaves many blank lines; collapse each run of them into one
+repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
+repeatedNewlines.setParseAction(replaceWith("\n\n"))
+secondPass = repeatedNewlines.transformString(firstPass)
+
+print secondPass
+\ No newline at end of file
author	Paul McGuire <ptmcg@austin.rr.com>	2012-10-02 04:55:56 +0000
committer	Paul McGuire <ptmcg@austin.rr.com>	2012-10-02 04:55:56 +0000
commit	a7f9dda0668bfce4fba51df1bf2976b4a93a8bd5 (patch)
tree	57ea8bcf2e66532a36c833a7bc57cff9d5d0e4dd /src/examples/htmlStripper.py
parent	f5d2b716ffb57b65660a7ee0bbf04332dfb29620 (diff)
download	pyparsing-git-a7f9dda0668bfce4fba51df1bf2976b4a93a8bd5.tar.gz