summaryrefslogtreecommitdiff
path: root/src/examples/urlExtractor.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/examples/urlExtractor.py')
-rw-r--r--src/examples/urlExtractor.py42
1 files changed, 42 insertions, 0 deletions
diff --git a/src/examples/urlExtractor.py b/src/examples/urlExtractor.py
new file mode 100644
index 0000000..2e5f7be
--- /dev/null
+++ b/src/examples/urlExtractor.py
@@ -0,0 +1,42 @@
+# URL extractor
+# Copyright 2004, Paul McGuire
+from pyparsing import Literal,Suppress,CharsNotIn,CaselessLiteral,\
+ Word,dblQuotedString,alphanums,SkipTo
+import urllib
+import pprint
+
+# Define the pyparsing grammar for a URL, that is:
+# URLlink ::= <a href= URL>linkText</a>
+# URL ::= doubleQuotedString | alphanumericWordPath
+# Note that whitespace may appear just about anywhere in the link. Note also
+# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
+# pyparsing skips over whitespace between tokens.
+linkOpenTag = (Literal("<") + "a" + "href" + "=").suppress() + \
+ ( dblQuotedString | Word(alphanums+"/") ) + \
+ Suppress(">")
+linkCloseTag = Literal("<") + "/" + CaselessLiteral("a") + ">"
+link = linkOpenTag + SkipTo(linkCloseTag) + linkCloseTag.suppress()
+
+# Go get some HTML with some links in it.
+serverListPage = urllib.urlopen( "http://www.yahoo.com" )
+htmlText = serverListPage.read()
+serverListPage.close()
+
+# scanString is a generator that loops through the input htmlText, and for each
+# match yields the tokens and start and end locations (for this application, we are
+# not interested in the start and end values).
+for toks,strt,end in link.scanString(htmlText):
+ print toks.asList()
+
+# Rerun scanString, but this time create a dict of text:URL key-value pairs.
+# Need to reverse the tokens returned by link, using a parse action.
+link.setParseAction( lambda st,loc,toks: [ toks[1], toks[0] ] )
+
+# Create dictionary from list comprehension, assembled from each pair of tokens returned
+# from a matched URL.
+pprint.pprint(
+ dict( [ toks for toks,strt,end in link.scanString(htmlText) ] )
+ )
+
+
+