# URL extractor
# Copyright 2004, Paul McGuire
#
# Provenance: examples/urlExtractor.py as added by commit
# 27e183a78c8062ed7c2bbb91655a5e56cd697bba ("Move src to root",
# Cengiz Kaygusuz, Mon, 20 Nov 2017 20:46:39 -0500).
#
# Downloads a web page, finds every <a ...>...</a> element in it with
# pyparsing, prints the parsed tokens of each link, and finally pretty-prints
# a dictionary mapping each link's visible text to its href.
from pyparsing import makeHTMLTags, SkipTo, pyparsing_common
import urllib.request
from contextlib import closing
import pprint

# Expressions matching the opening and closing tags of an HTML anchor.
linkOpenTag, linkCloseTag = makeHTMLTags('a')

# The link body is everything up to the closing tag.  Nested markup is removed
# first, then runs of whitespace are collapsed to single spaces.
linkBody = SkipTo(linkCloseTag)
linkBody.setParseAction(pyparsing_common.stripHTMLTags)
linkBody.addParseAction(lambda toks: ' '.join(toks[0].strip().split()))

# A complete link; the cleaned-up text is available under the results name
# "body", and the closing tag is left out of the returned tokens.
link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.
with closing(urllib.request.urlopen("http://www.yahoo.com")) as serverListPage:
    # Honour the charset declared in the Content-Type header and only fall
    # back to UTF-8 when none is given; undecodable bytes are replaced so a
    # single stray byte does not abort the whole example.
    charset = serverListPage.headers.get_content_charset() or "UTF-8"
    htmlText = serverListPage.read().decode(charset, errors="replace")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).  The page is scanned once and the
# matches are kept, so both reports below work from the same results.
matches = [toks for toks, _start, _end in link.scanString(htmlText)]

for toks in matches:
    print(toks.asList())

# Create dictionary assembled from each pair of tokens returned from a matched
# URL (link text -> href); a later link with the same text replaces an earlier one.
pprint.pprint({toks.body: toks.href for toks in matches})