Move src to root

author: Cengiz Kaygusuz <cngkaygusuz@gmail.com> 2017-11-20 20:46:39 -0500
committer: Cengiz Kaygusuz <cngkaygusuz@gmail.com> 2017-11-20 20:46:39 -0500
commit: 27e183a78c8062ed7c2bbb91655a5e56cd697bba (patch)
tree: 88fd355a0cc6da4c130582e092d702836596cbb2 /examples/urlExtractor.py
parent: 4ba589cf13588e90992e23deb5a9784340efd2cc (diff)
download: pyparsing-git-27e183a78c8062ed7c2bbb91655a5e56cd697bba.tar.gz
1 files changed, 33 insertions, 0 deletions
diff --git a/examples/urlExtractor.py b/examples/urlExtractor.py
new file mode 100644
index 0000000..2c66d78
--- /dev/null
+++ b/examples/urlExtractor.py
@@ -0,0 +1,33 @@
+# URL extractor
+# Copyright 2004, Paul McGuire
+from pyparsing import makeHTMLTags, SkipTo, pyparsing_common
+import urllib.request
+from contextlib import closing
+import pprint
+
+# Expressions for the opening <a ...> tag (its attributes, e.g. href, are named results) and </a>.
+linkOpenTag, linkCloseTag = makeHTMLTags('a')
+
+# Link text: everything up to </a>, with nested tags removed and whitespace runs collapsed.
+linkBody = SkipTo(linkCloseTag)
+linkBody.setParseAction(pyparsing_common.stripHTMLTags)
+linkBody.addParseAction(lambda toks: ' '.join(toks[0].split()))
+
+link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()
+
+# Go get some HTML with some links in it.  Decode with the charset the server
+# declares (falling back to UTF-8) so a non-UTF-8 page does not abort the script.
+with closing(urllib.request.urlopen("http://www.yahoo.com")) as serverListPage:
+    charset = serverListPage.headers.get_content_charset() or "UTF-8"
+    htmlText = serverListPage.read().decode(charset, errors="replace")
+
+# scanString is a generator that loops through the input htmlText, and for each
+# match yields the tokens and start and end locations.  Only the tokens are kept,
+# and the page is scanned once so both reports below share the same matches.
+matches = [toks for toks, _start, _end in link.scanString(htmlText)]
+for toks in matches:
+    print(toks.asList())
+
+# Map each link's text to its href; a later link with the same text replaces an earlier one.
+pprint.pprint({toks.body: toks.href for toks in matches})
+
author	Cengiz Kaygusuz <cngkaygusuz@gmail.com>	2017-11-20 20:46:39 -0500
committer	Cengiz Kaygusuz <cngkaygusuz@gmail.com>	2017-11-20 20:46:39 -0500
commit	27e183a78c8062ed7c2bbb91655a5e56cd697bba (patch)
tree	88fd355a0cc6da4c130582e092d702836596cbb2 /examples/urlExtractor.py
parent	4ba589cf13588e90992e23deb5a9784340efd2cc (diff)
download	pyparsing-git-27e183a78c8062ed7c2bbb91655a5e56cd697bba.tar.gz