1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsing import SkipTo, makeHTMLTags
import urllib.request, urllib.parse, urllib.error
import pprint
# Define the pyparsing grammar for a URL, that is:
# URLlink ::= <a href= URL>linkText</a>
# URL ::= doubleQuotedString | alphanumericWordPath
# Note that whitespace may appear just about anywhere in the link. Note also
# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
# pyparsing skips over whitespace between tokens.
# Grammar for one anchor element: the opening <a ...> tag (whose attributes,
# including href, become named results), everything up to the closing tag
# captured under the results name "body", and the closing tag itself matched
# but dropped from the output.
linkOpenTag, linkCloseTag = makeHTMLTags("a")
link = linkOpenTag + SkipTo(linkCloseTag)("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.  The response is used as a context
# manager so the connection is released even if the read fails.
with urllib.request.urlopen("https://www.google.com/") as serverListPage:
    # read() returns bytes; pyparsing scans text, so decode using the charset
    # the server declared, falling back to UTF-8 when none is given.
    charset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(charset, errors="replace")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we
# are not interested in the start and end values).  Scan once and keep the
# tokens so both reports below work from the same set of matches.
matches = [toks for toks, _start, _end in link.scanString(htmlText)]

for toks in matches:
    print(toks.startA.href, "->", toks.body)

# Create a dictionary mapping each link's body text to its href.  Links that
# share the same body text collapse to a single entry (the last one wins).
pprint.pprint({toks.body: toks.startA.href for toks in matches})
|