diff options
author | Paul McGuire <ptmcg@users.noreply.github.com> | 2018-01-06 23:38:53 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-01-06 23:38:53 -0600 |
commit | 430c5ad767cc946e9da7cd5f4673a4e3bd135a3c (patch) | |
tree | 5a7df11e0fd52ab320b0ef3e670e260f315ca9ae /examples/htmlStripper.py | |
parent | f1d12567a8da4d254e6d62bb0d650c87c7d0bb89 (diff) | |
parent | d953150a6db3ac247a64b047edc2df7156f3e56b (diff) | |
download | pyparsing-git-430c5ad767cc946e9da7cd5f4673a4e3bd135a3c.tar.gz |
Merge pull request #1 from cngkaygusuz/master
Add Scrutinizer-CI configuration and other niceties
Diffstat (limited to 'examples/htmlStripper.py')
-rw-r--r-- | examples/htmlStripper.py | 32 |
1 files changed, 32 insertions, 0 deletions
diff --git a/examples/htmlStripper.py b/examples/htmlStripper.py
new file mode 100644
index 0000000..1d7a0f0
--- /dev/null
+++ b/examples/htmlStripper.py
@@ -0,0 +1,32 @@
+#
+# htmlStripper.py
+#
+# Sample code for stripping HTML markup tags and scripts from
+# HTML source files.
+#
+# Copyright (c) 2006, 2016, Paul McGuire
+#
+from contextlib import closing
+import urllib.request, urllib.parse, urllib.error
+from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity, replaceHTMLEntity,
+ htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)
+
+scriptOpen,scriptClose = makeHTMLTags("script")
+scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose  # a whole <script>...</script> element
+commonHTMLEntity.setParseAction(replaceHTMLEntity)  # an entity match is replaced by its translation
+
+# fetch the page to be stripped and decode the response bytes as UTF-8 text
+targetURL = "http://wiki.python.org/moin/PythonDecoratorLibrary"
+with closing(urllib.request.urlopen( targetURL )) as targetPage:
+ targetHTML = targetPage.read().decode("UTF-8")
+
+# first pass: drop comments, scripts and tags; entities stay outside suppress() so they get translated
+firstPass = ((htmlComment | scriptBody | anyOpenTag | anyCloseTag).suppress() |
+ commonHTMLEntity).transformString(targetHTML)
+
+# second pass: the first pass leaves many blank lines; collapse runs of 2+ line ends to one blank line
+repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
+repeatedNewlines.setParseAction(replaceWith("\n\n"))
+secondPass = repeatedNewlines.transformString(firstPass)
+
+print(secondPass)
\ No newline at end of file |