Fixed dict structure in makeHTMLTags expressions, and added a tag_body attribute to the generated start expression, giving easy access to a SkipTo(closeTag) that will parse the tag's body text; also cleaned up some code and removed duplication among the examples

author: Paul McGuire <ptmcg@austin.rr.com> 2019-04-06 23:44:02 -0500
committer: Paul McGuire <ptmcg@austin.rr.com> 2019-04-06 23:44:02 -0500
commit: a2439508ba5c94546db98593cfa676de9b59babe (patch)
tree: 80b02178820811c09b4befc9a9b5efb092813466 /examples/urlExtractorNew.py
parent: 832986ffccac943b363da43795c335eafc31b5da (diff)
download: pyparsing-git-a2439508ba5c94546db98593cfa676de9b59babe.tar.gz
1 files changed, 31 insertions, 31 deletions
diff --git a/examples/urlExtractorNew.py b/examples/urlExtractorNew.py
index a21b2ab..d876eea 100644
--- a/examples/urlExtractorNew.py
+++ b/examples/urlExtractorNew.py
@@ -1,31 +1,31 @@
-# URL extractor
-# Copyright 2004, Paul McGuire
-from pyparsing import SkipTo, makeHTMLTags
-import urllib.request, urllib.parse, urllib.error
-import pprint
-
-# Define the pyparsing grammar for a URL, that is:
-#    URLlink ::= <a href= URL>linkText</a>
-#    URL ::= doubleQuotedString | alphanumericWordPath
-# Note that whitespace may appear just about anywhere in the link.  Note also
-# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
-# pyparsing skips over whitespace between tokens.
-linkOpenTag,linkCloseTag = makeHTMLTags("a")
-link = linkOpenTag + SkipTo(linkCloseTag)("body") + linkCloseTag.suppress()
-
-# Go get some HTML with some links in it.
-serverListPage = urllib.request.urlopen( "https://www.google.com/" )
-htmlText = serverListPage.read()
-serverListPage.close()
-
-# scanString is a generator that loops through the input htmlText, and for each
-# match yields the tokens and start and end locations (for this application, we are
-# not interested in the start and end values).
-for toks,strt,end in link.scanString(htmlText):
-    print(toks.startA.href,"->",toks.body)
-
-# Create dictionary from list comprehension, assembled from each pair of tokens returned
-# from a matched URL.
-pprint.pprint(
-    {  toks.body:toks.startA.href for toks,strt,end in link.scanString(htmlText)  }
-    )
+# URL extractor
+# Copyright 2004, Paul McGuire
+from pyparsing import makeHTMLTags
+from contextlib import closing
+import urllib.request, urllib.parse, urllib.error
+import pprint
+
+# Define the pyparsing grammar for a URL, that is:
+#    URLlink ::= <a href= URL>linkText</a>
+#    URL ::= doubleQuotedString | alphanumericWordPath
+# Note that whitespace may appear just about anywhere in the link.  Note also
+# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
+# pyparsing skips over whitespace between tokens.
+linkOpenTag, linkCloseTag = makeHTMLTags("a")
+link = linkOpenTag + linkOpenTag.tag_body("body") + linkCloseTag.suppress()
+
+# Go get some HTML with some links in it.
+with closing(urllib.request.urlopen("https://www.cnn.com/")) as serverListPage:
+    htmlText = serverListPage.read()
+
+# scanString is a generator that loops through the input htmlText, and for each
+# match yields the tokens and start and end locations (for this application, we are
+# not interested in the start and end values).
+for toks, strt, end in link.scanString(htmlText):
+    print(toks.startA.href, "->", toks.body)
+
+# Create dictionary using a dict comprehension, assembled from each pair of tokens returned
+# from a matched URL.
+pprint.pprint(
+    {toks.body: toks.startA.href for toks, strt, end in link.scanString(htmlText)}
+    )
author	Paul McGuire <ptmcg@austin.rr.com>	2019-04-06 23:44:02 -0500
committer	Paul McGuire <ptmcg@austin.rr.com>	2019-04-06 23:44:02 -0500
commit	a2439508ba5c94546db98593cfa676de9b59babe (patch)
tree	80b02178820811c09b4befc9a9b5efb092813466 /examples/urlExtractorNew.py
parent	832986ffccac943b363da43795c335eafc31b5da (diff)
download	pyparsing-git-a2439508ba5c94546db98593cfa676de9b59babe.tar.gz