From 7ab1d7c84daba103692f77698ccbd67af03773d9 Mon Sep 17 00:00:00 2001 From: Paul McGuire Date: Sun, 7 Aug 2016 14:54:01 +0000 Subject: Update to current pyparsing features --- src/examples/urlExtractor.py | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) (limited to 'src/examples/urlExtractor.py') diff --git a/src/examples/urlExtractor.py b/src/examples/urlExtractor.py index 7c90bd7..2c66d78 100644 --- a/src/examples/urlExtractor.py +++ b/src/examples/urlExtractor.py @@ -1,26 +1,21 @@ # URL extractor # Copyright 2004, Paul McGuire -from pyparsing import Literal,Suppress,CharsNotIn,CaselessLiteral,\ - Word,dblQuotedString,alphanums,SkipTo -import urllib.request, urllib.parse, urllib.error +from pyparsing import makeHTMLTags, SkipTo, pyparsing_common +import urllib.request +from contextlib import closing import pprint -# Define the pyparsing grammar for a URL, that is: -# URLlink ::= linkText -# URL ::= doubleQuotedString | alphanumericWordPath -# Note that whitespace may appear just about anywhere in the link. Note also -# that it is not necessary to explicitly show this in the pyparsing grammar; by default, -# pyparsing skips over whitespace between tokens. -linkOpenTag = (Literal("<") + "a" + "href" + "=").suppress() + \ - ( dblQuotedString | Word(alphanums+"/") ) + \ - Suppress(">") -linkCloseTag = Literal("<") + "/" + CaselessLiteral("a") + ">" -link = linkOpenTag + SkipTo(linkCloseTag) + linkCloseTag.suppress() +linkOpenTag, linkCloseTag = makeHTMLTags('a') + +linkBody = SkipTo(linkCloseTag) +linkBody.setParseAction(pyparsing_common.stripHTMLTags) +linkBody.addParseAction(lambda toks: ' '.join(toks[0].strip().split())) + +link = linkOpenTag + linkBody("body") + linkCloseTag.suppress() # Go get some HTML with some links in it. -serverListPage = urllib.request.urlopen( "http://www.yahoo.com" ) -htmlText = serverListPage.read() -serverListPage.close() +with closing(urllib.request.urlopen("http://www.yahoo.com")) as serverListPage: + htmlText = serverListPage.read().decode("UTF-8") # scanString is a generator that loops through the input htmlText, and for each # match yields the tokens and start and end locations (for this application, we are @@ -28,14 +23,10 @@ serverListPage.close() for toks,strt,end in link.scanString(htmlText): print(toks.asList()) -# Rerun scanString, but this time create a dict of text:URL key-value pairs. -# Need to reverse the tokens returned by link, using a parse action. -link.setParseAction( lambda st,loc,toks: [ toks[1], toks[0] ] ) - # Create dictionary from list comprehension, assembled from each pair of tokens returned # from a matched URL. pprint.pprint( - dict( [ toks for toks,strt,end in link.scanString(htmlText) ] ) + dict((toks.body, toks.href) for toks,strt,end in link.scanString(htmlText)) ) -- cgit v1.2.1