Diffstat (limited to 'examples/urlExtractorNew.py')
-rw-r--r-- | examples/urlExtractorNew.py | 62
1 file changed, 31 insertions, 31 deletions
diff --git a/examples/urlExtractorNew.py b/examples/urlExtractorNew.py
index a21b2ab..d876eea 100644
--- a/examples/urlExtractorNew.py
+++ b/examples/urlExtractorNew.py
@@ -1,31 +1,31 @@
-# URL extractor
-# Copyright 2004, Paul McGuire
-from pyparsing import SkipTo, makeHTMLTags
-import urllib.request, urllib.parse, urllib.error
-import pprint
-
-# Define the pyparsing grammar for a URL, that is:
-# URLlink ::= <a href= URL>linkText</a>
-# URL ::= doubleQuotedString | alphanumericWordPath
-# Note that whitespace may appear just about anywhere in the link. Note also
-# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
-# pyparsing skips over whitespace between tokens.
-linkOpenTag,linkCloseTag = makeHTMLTags("a")
-link = linkOpenTag + SkipTo(linkCloseTag)("body") + linkCloseTag.suppress()
-
-# Go get some HTML with some links in it.
-serverListPage = urllib.request.urlopen( "https://www.google.com/" )
-htmlText = serverListPage.read()
-serverListPage.close()
-
-# scanString is a generator that loops through the input htmlText, and for each
-# match yields the tokens and start and end locations (for this application, we are
-# not interested in the start and end values).
-for toks,strt,end in link.scanString(htmlText):
-    print(toks.startA.href,"->",toks.body)
-
-# Create dictionary from list comprehension, assembled from each pair of tokens returned
-# from a matched URL.
-pprint.pprint(
-    { toks.body:toks.startA.href for toks,strt,end in link.scanString(htmlText) }
-    )
+# URL extractor
+# Copyright 2004, Paul McGuire
+from pyparsing import makeHTMLTags
+from contextlib import closing
+import urllib.request, urllib.parse, urllib.error
+import pprint
+
+# Define the pyparsing grammar for a URL, that is:
+# URLlink ::= <a href= URL>linkText</a>
+# URL ::= doubleQuotedString | alphanumericWordPath
+# Note that whitespace may appear just about anywhere in the link. Note also
+# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
+# pyparsing skips over whitespace between tokens.
+linkOpenTag, linkCloseTag = makeHTMLTags("a")
+link = linkOpenTag + linkOpenTag.tag_body("body") + linkCloseTag.suppress()
+
+# Go get some HTML with some links in it.
+with closing(urllib.request.urlopen("https://www.cnn.com/")) as serverListPage:
+    htmlText = serverListPage.read()
+
+# scanString is a generator that loops through the input htmlText, and for each
+# match yields the tokens and start and end locations (for this application, we are
+# not interested in the start and end values).
+for toks, strt, end in link.scanString(htmlText):
+    print(toks.startA.href, "->", toks.body)
+
+# Create dictionary from list comprehension, assembled from each pair of tokens returned
+# from a matched URL.
+pprint.pprint(
+    {toks.body: toks.startA.href for toks, strt, end in link.scanString(htmlText)}
+    )
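
For reference, a minimal standalone sketch of the same anchor-scraping pattern used by the updated example, run against a small inline HTML snippet so it needs no network access; the sample HTML and the aOpen/aClose names are illustrative only, not part of the committed file:

# Offline sketch of the pattern in examples/urlExtractorNew.py.
# makeHTMLTags("a") builds expressions for <a ...> and </a>, and the open
# tag's tag_body expression (pyparsing 3.x) skips to the matching close tag.
from pyparsing import makeHTMLTags

# Illustrative sample input (assumption: stands in for the downloaded page)
sampleHtml = """
<a href="https://www.cnn.com/">CNN</a>
<a href="https://github.com/pyparsing/pyparsing">pyparsing on GitHub</a>
"""

aOpen, aClose = makeHTMLTags("a")
link = aOpen + aOpen.tag_body("body") + aClose.suppress()

# scanString yields (tokens, start, end) for every match found in the text
for toks, _, _ in link.scanString(sampleHtml):
    print(toks.body, "->", toks.startA.href)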