diff options
author | Paul McGuire <ptmcg@users.noreply.github.com> | 2018-01-06 23:38:53 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-01-06 23:38:53 -0600 |
commit | 430c5ad767cc946e9da7cd5f4673a4e3bd135a3c (patch) | |
tree | 5a7df11e0fd52ab320b0ef3e670e260f315ca9ae /examples/urlExtractor.py | |
parent | f1d12567a8da4d254e6d62bb0d650c87c7d0bb89 (diff) | |
parent | d953150a6db3ac247a64b047edc2df7156f3e56b (diff) | |
download | pyparsing-git-430c5ad767cc946e9da7cd5f4673a4e3bd135a3c.tar.gz |
Merge pull request #1 from cngkaygusuz/master
Add Scrutinizer-CI configuration and other niceties
Diffstat (limited to 'examples/urlExtractor.py')
-rw-r--r-- | examples/urlExtractor.py | 33 |
1 files changed, 33 insertions, 0 deletions
diff --git a/examples/urlExtractor.py b/examples/urlExtractor.py new file mode 100644 index 0000000..2c66d78 --- /dev/null +++ b/examples/urlExtractor.py @@ -0,0 +1,33 @@ +# URL extractor
+# Copyright 2004, Paul McGuire
+from pyparsing import makeHTMLTags, SkipTo, pyparsing_common
+import urllib.request
+from contextlib import closing
+import pprint
+
+linkOpenTag, linkCloseTag = makeHTMLTags('a')
+
+linkBody = SkipTo(linkCloseTag)
+linkBody.setParseAction(pyparsing_common.stripHTMLTags)
+linkBody.addParseAction(lambda toks: ' '.join(toks[0].strip().split()))
+
+link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()
+
+# Go get some HTML with some links in it.
+with closing(urllib.request.urlopen("http://www.yahoo.com")) as serverListPage:
+ htmlText = serverListPage.read().decode("UTF-8")
+
+# scanString is a generator that loops through the input htmlText, and for each
+# match yields the tokens and start and end locations (for this application, we are
+# not interested in the start and end values).
+for toks,strt,end in link.scanString(htmlText):
+ print(toks.asList())
+
+# Create dictionary from list comprehension, assembled from each pair of tokens returned
+# from a matched URL.
+pprint.pprint(
+ dict((toks.body, toks.href) for toks,strt,end in link.scanString(htmlText))
+ )
+
+
+
|