author     Guido van Rossum <guido@python.org>  2014-01-09 14:45:48 -0800
committer  Guido van Rossum <guido@python.org>  2014-01-09 14:45:48 -0800
commit     ce6aea21d015cd5c59d46b2346be17827c588ee0 (patch)
tree       8078c0d2cec702d205d7a2ab46159918d130100c /examples
parent     b955402407ebf067ca8ad9c579b69a238687ad97 (diff)
download   trollius-ce6aea21d015cd5c59d46b2346be17827c588ee0.tar.gz
Refactor Logger: No more VPrinter base class.
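In short, the commit drops the VPrinter mix-in (which every class inherited for its vprint()/vvprint()/vvvprint() methods) in favour of a single shared Logger object that main() now builds once and passes explicitly to Crawler, ConnectionPool, Request, Response and Fetcher. A condensed sketch of the new class, copied from the diff below, with illustrative usage lines added (the level value 2 is just an example; in the script it comes from -v/-q via args.level):

import sys

class Logger:

    def __init__(self, level):
        self.level = level

    def _log(self, n, args):
        # Print to stderr only when the configured level reaches n.
        if self.level >= n:
            print(*args, file=sys.stderr, flush=True)

    def log(self, n, *args):
        self._log(n, args)

    def __call__(self, n, *args):
        # Calling the instance is shorthand for log().
        self._log(n, args)

log = Logger(2)                  # e.g. verbosity from '-vv'
log(1, 'shown at -v and above')  # printed: 2 >= 1
log(3, 'needs -vvv')             # suppressed: 2 < 3

The numeric first argument replaces the old name-encoded levels (vprint = 1, vvprint = 2, vvvprint = 3), so call sites state their verbosity threshold directly.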
Diffstat (limited to 'examples')
-rw-r--r--  examples/crawl.py  189
1 file changed, 85 insertions(+), 104 deletions(-)
diff --git a/examples/crawl.py b/examples/crawl.py
index 044c7d3..6716815 100644
--- a/examples/crawl.py
+++ b/examples/crawl.py
@@ -3,8 +3,8 @@
 """A simple web crawler."""
 
 # TODO:
-# - Make VPrinter a sub-object, not a base class.
-# - More organized logging (with task ID?). Use logging module.
+# - More organized logging (with task ID or URL?).
+# - Use logging module for Logger.
 # - KeyboardInterrupt in HTML parsing may hang or report unretrieved error.
 # - Support gzip encoding.
 # - Close connection if HTTP/1.0 response.
@@ -59,13 +59,12 @@ ARGS.add_argument(
 ARGS.add_argument(
     '--lenient', action='store_false', dest='strict',
     default=False, help='Lenient host matching')
-
 ARGS.add_argument(
-    '-v', '--verbose', action='count', dest='verbose',
+    '-v', '--verbose', action='count', dest='level',
     default=1, help='Verbose logging (repeat for more verbose)')
 ARGS.add_argument(
-    '-q', '--quiet', action='store_const', const=0, dest='verbose',
-    help='Quiet logging (opposite of --verbose)')
+    '-q', '--quiet', action='store_const', const=0, dest='level',
+    default=1, help='Quiet logging (opposite of --verbose)')
 
 
 ESCAPES = [('quot', '"'),
@@ -92,39 +91,23 @@ def fix_url(url):
     return url
 
 
-class VPrinter:
-    """Mix-in class defining vprint() which is like print() if verbose > 0.
-
-    The output goes to stderr. Only positional arguments are
-    supported. There are also methods vvprint(), vvvprint()
-    etc. which print() only if verbose > larger values.
-
-    The verbose instance variable is public.
+class Logger:
 
-    TODO: This should probably be a shared object rather than a mix-in class.
-    """
-
-    def __init__(self, verbose):
-        self.verbose = verbose
+    def __init__(self, level):
+        self.level = level
 
-    def _nvprint(self, n, args):
-        if self.verbose >= n:
+    def _log(self, n, args):
+        if self.level >= n:
             print(*args, file=sys.stderr, flush=True)
 
-    def nvprint(self, n, *args):
-        self._nvprint(n, args)
-
-    def vprint(self, *args):
-        self._nvprint(1, args)
+    def log(self, n, *args):
+        self._log(n, args)
 
-    def vvprint(self, *args):
-        self._nvprint(2, args)
+    def __call__(self, n, *args):
+        self._log(n, args)
 
-    def vvvprint(self, *args):
-        self._nvprint(3, args)
-
-class ConnectionPool(VPrinter):
+
+class ConnectionPool:
     """A connection pool.
 
     To open a connection, use reserve(). To recycle it, use unreserve().
@@ -139,8 +122,8 @@ class ConnectionPool(VPrinter):
     There are limits to both the overal pool and the per-key pool.
     """
 
-    def __init__(self, max_pool=10, max_tasks=5, verbose=0):
-        VPrinter.__init__(self, verbose)
+    def __init__(self, log, max_pool=10, max_tasks=5):
+        self.log = log
         self.max_pool = max_pool  # Overall limit.
         self.max_tasks = max_tasks  # Per-key limit.
         self.loop = asyncio.get_event_loop()
@@ -162,9 +145,9 @@
         try:
             ipaddrs = yield from self.loop.getaddrinfo(host, port)
         except Exception as exc:
-            self.vprint('Exception %r for (%r, %r)' % (exc, host, port))
+            self.log(0, 'Exception %r for (%r, %r)' % (exc, host, port))
             raise
-        self.vprint('* %s resolves to %s' %
+        self.log(1, '* %s resolves to %s' %
                     (host, ', '.join(ip[4][0] for ip in ipaddrs)))
 
         # Look for a reusable connection.
@@ -179,11 +162,11 @@
                 del self.connections[key]
                 reader, writer = pair
                 if reader._eof:
-                    self.vprint('(cached connection closed for %s)' %
+                    self.log(1, '(cached connection closed for %s)' %
                                 repr(key))
                     writer.close()  # Just in case.
                 else:
-                    self.vprint('* Reusing pooled connection', key,
+                    self.log(1, '* Reusing pooled connection', key,
                                 'FD =', writer._transport._sock.fileno())
                     return key, reader, writer
 
@@ -194,9 +177,9 @@
         if peername:
             host, port, *_ = peername
         else:
-            self.vprint('NO PEERNAME???', host, port, ssl)
+            self.log(1, 'NO PEERNAME???', host, port, ssl)
         key = host, port, ssl
-        self.vprint('* New connection', key,
+        self.log(1, '* New connection', key,
                     'FD =', writer._transport._sock.fileno())
         return key, reader, writer
 
@@ -216,7 +199,7 @@
         # Close oldest connection(s) for this key if limit reached.
         while len(pairs) > self.max_tasks:
             pair = pairs.pop(0)
-            self.vprint('closing oldest connection for', key)
+            self.log(1, 'closing oldest connection for', key)
             self.queue.remove((key, pair))
             reader, writer = pair
             writer.close()
@@ -224,7 +207,7 @@
         # Close oldest overall connection(s) if limit reached.
         while len(self.queue) > self.max_pool:
             key, pair = self.queue.pop(0)
-            self.vprint('closing oldest connection', key)
+            self.log(1, 'closing oldest connection', key)
             pairs = self.connections.get(key)
             p = pairs.pop(0)
             assert pair == p, (key, pair, p, pairs)
@@ -232,15 +215,15 @@
             writer.close()
 
 
-class Request(VPrinter):
+class Request:
     """HTTP request.
 
     Use connect() to open a connection; send_request() to send the
     request; get_response() to receive the response headers.
     """
 
-    def __init__(self, url, pool, verbose=0):
-        VPrinter.__init__(self, verbose)
+    def __init__(self, log, url, pool):
+        self.log = log
         self.url = url
         self.pool = pool
         self.parts = urllib.parse.urlparse(self.url)
@@ -266,13 +249,13 @@
     @asyncio.coroutine
     def connect(self):
         """Open a connection to the server."""
-        self.vprint('* Connecting to %s:%s using %s for %s' %
+        self.log(1, '* Connecting to %s:%s using %s for %s' %
                     (self.hostname, self.port,
                      'ssl' if self.ssl else 'tcp',
                      self.url))
         self.key, self.reader, self.writer = \
             yield from self.pool.reserve(self.hostname, self.port, self.ssl)
-        self.vprint('* Connected to %s' %
+        self.log(1, '* Connected to %s' %
                     (self.writer.get_extra_info('peername'),))
 
     def recycle_connection(self):
@@ -295,7 +278,7 @@
 
         Used for the request line and headers.
         """
-        self.vvprint('>', line)
+        self.log(2, '>', line)
         self.writer.write(line.encode('latin-1') + b'\r\n')
 
     @asyncio.coroutine
@@ -317,12 +300,12 @@
     @asyncio.coroutine
     def get_response(self):
         """Receive the response."""
-        response = Response(self.reader, self.verbose)
+        response = Response(self.log, self.reader)
         yield from response.read_headers()
         return response
 
 
-class Response(VPrinter):
+class Response:
     """HTTP response.
 
     Call read_headers() to receive the request headers. Then check
@@ -330,8 +313,8 @@
     self.status when it arrives, and read the headers with self.headers.
     Finally call read() to receive the body.
     """
 
-    def __init__(self, reader, verbose=0):
-        VPrinter.__init__(self, verbose)
+    def __init__(self, log, reader):
+        self.log = log
         self.reader = reader
         self.http_version = None  # 'HTTP/1.1'
         self.status = None  # 200
@@ -342,7 +325,7 @@
     def getline(self):
         """Read one line from the connection."""
         line = (yield from self.reader.readline()).decode('latin-1').rstrip()
-        self.vvprint('<', line)
+        self.log(2, '<', line)
         return line
 
     @asyncio.coroutine
@@ -351,7 +334,7 @@
         status_line = yield from self.getline()
         status_parts = status_line.split(None, 2)
         if len(status_parts) != 3:
-            self.vprint('bad status_line', repr(status_line))
+            self.log(0, 'bad status_line', repr(status_line))
             raise BadStatusLine(status_line)
         self.http_version, status, self.reason = status_parts
         self.status = int(status)
@@ -386,10 +369,10 @@
         blocks = []
         nread = 0
         while nread < nbytes:
-            self.vvvprint('reading block', len(blocks),
-                          'with', nbytes - nread, 'bytes remaining')
+            self.log(3, 'reading block', len(blocks),
+                     'with', nbytes - nread, 'bytes remaining')
             block = yield from self.reader.read(nbytes-nread)
-            self.vvvprint('read', len(block), 'bytes')
+            self.log(3, 'read', len(block), 'bytes')
             if not block:
                 raise EOFError('EOF with %d more bytes expected' %
                                (nbytes - nread))
@@ -410,18 +393,18 @@
                 break
         if nbytes is None:
             if self.get_header('transfer-encoding').lower() == 'chunked':
-                self.vvprint('parsing chunked response')
+                self.log(2, 'parsing chunked response')
                 blocks = []
                 while True:
                     size_header = yield from self.reader.readline()
                     if not size_header:
-                        self.vprint('premature end of chunked response')
+                        self.log(0, 'premature end of chunked response')
                         break
-                    self.vvvprint('size_header =', repr(size_header))
+                    self.log(3, 'size_header =', repr(size_header))
                     parts = size_header.split(b';')
                     size = int(parts[0], 16)
                     if size:
-                        self.vvvprint('reading chunk of', size, 'bytes')
+                        self.log(3, 'reading chunk of', size, 'bytes')
                         block = yield from self.readexactly(size)
                         assert len(block) == size, (len(block), size)
                         blocks.append(block)
@@ -430,10 +413,10 @@
                     if not size:
                         break
                 body = b''.join(blocks)
-                self.vprint('chunked response had', len(body),
+                self.log(1, 'chunked response had', len(body),
                             'bytes in', len(blocks), 'blocks')
             else:
-                self.vvvprint('reading until EOF')
+                self.log(3, 'reading until EOF')
                 body = yield from self.reader.read()
                 # TODO: Should make sure not to recycle the connection
                 # in this case.
@@ -442,7 +425,7 @@
         return body
 
 
-class Fetcher(VPrinter):
+class Fetcher:
     """Logic and state for one URL.
 
     When found in crawler.busy, this represents a URL to be fetched or
@@ -456,8 +439,8 @@
     Call fetch() to do the fetching, then report() to print the results.
     """
 
-    def __init__(self, url, crawler, max_redirect=10, max_tries=4, verbose=0):
-        VPrinter.__init__(self, verbose)
+    def __init__(self, log, url, crawler, max_redirect=10, max_tries=4):
+        self.log = log
         self.url = url
         self.crawler = crawler
         # We don't loop resolving redirects here -- we just use this
@@ -490,8 +473,7 @@
             self.tries += 1
             self.request = None
             try:
-                self.request = Request(self.url, self.crawler.pool,
-                                       self.verbose)
+                self.request = Request(self.log, self.url, self.crawler.pool)
                 yield from self.request.connect()
                 yield from self.request.send_request()
                 self.response = yield from self.request.get_response()
@@ -502,11 +484,11 @@
                     self.request.recycle_connection()
                     self.request = None
                 if self.tries > 1:
-                    self.vprint('try', self.tries, 'for', self.url, 'success')
+                    self.log(1, 'try', self.tries, 'for', self.url, 'success')
                 break
             except (BadStatusLine, OSError) as exc:
                 self.exceptions.append(exc)
-                self.vprint('try', self.tries, 'for', self.url,
+                self.log(1, 'try', self.tries, 'for', self.url,
                             'raised', repr(exc))
                 ##import pdb; pdb.set_trace()
                 # Don't reuse the connection in this case.
@@ -515,17 +497,17 @@
                 self.request.close()
         else:
             # We never broke out of the while loop, i.e. all tries failed.
-            self.vprint('no success for', self.url,
+            self.log(0, 'no success for', self.url,
                         'in', self.max_tries, 'tries')
             return
         next_url = self.response.get_redirect_url()
         if next_url:
             self.next_url = urllib.parse.urljoin(self.url, next_url)
             if self.max_redirect > 0:
-                self.vprint('redirect to', self.next_url, 'from', self.url)
+                self.log(1, 'redirect to', self.next_url, 'from', self.url)
                 self.crawler.add_url(self.next_url, self.max_redirect-1)
             else:
-                self.vprint('redirect limit reached for', self.next_url,
+                self.log(0, 'redirect limit reached for', self.next_url,
                             'from', self.url)
         else:
             if self.response.status == 200:
@@ -536,10 +518,11 @@
                 self.encoding = self.pdict.get('charset', 'utf-8')
                 if self.ctype == 'text/html':
                     body = self.body.decode(self.encoding, 'replace')
+                    # Replace href with (?:href|src) to follow image links.
                     self.urls = set(re.findall(r'(?i)href=["\']?([^\s"\'<>]+)',
                                                body))
                     if self.urls:
-                        self.vprint('got', len(self.urls),
+                        self.log(1, 'got', len(self.urls),
                                     'distinct urls from', self.url)
                     self.new_urls = set()
                     for url in self.urls:
@@ -616,7 +599,7 @@
             print('  %-20s %10d' % (key, count), file=file)
 
 
-class Crawler(VPrinter):
+class Crawler:
     """Crawl a set of URLs.
 
     This manages three disjoint sets of URLs (todo, busy, done). The
@@ -624,12 +607,12 @@
     the redirect limit, while the values in busy and done are Fetcher instances.
     """
-    def __init__(self,
+    def __init__(self, log,
                  roots,
                  exclude=None, strict=True,  # What to crawl.
                  max_redirect=10, max_tries=4,  # Per-url limits.
                  max_tasks=10, max_pool=10,  # Global limits.
-                 verbose=0):
-        VPrinter.__init__(self, verbose)
+                 ):
+        self.log = log
         self.roots = roots
         self.exclude = exclude
         self.strict = strict
@@ -640,7 +623,7 @@
         self.todo = {}
         self.busy = {}
         self.done = {}
-        self.pool = ConnectionPool(max_pool, max_tasks, self.verbose)
+        self.pool = ConnectionPool(self.log, max_pool, max_tasks)
        self.root_domains = set()
         for root in roots:
             parts = urllib.parse.urlparse(root)
@@ -719,17 +702,17 @@
             return False
         parts = urllib.parse.urlparse(url)
         if parts.scheme not in ('http', 'https'):
-            self.vvprint('skipping non-http scheme in', url)
+            self.log(2, 'skipping non-http scheme in', url)
             return False
         host, port = urllib.parse.splitport(parts.netloc)
         if not self.host_okay(host):
-            self.vvprint('skipping non-root host in', url)
+            self.log(2, 'skipping non-root host in', url)
             return False
         if max_redirect is None:
             max_redirect = self.max_redirect
         if url in self.todo or url in self.busy or url in self.done:
             return False
-        self.vprint('adding', url, max_redirect)
+        self.log(1, 'adding', url, max_redirect)
         self.todo[url] = max_redirect
         return True
@@ -740,11 +723,11 @@
         while self.todo or self.busy:
             if self.todo:
                 url, max_redirect = self.todo.popitem()
-                fetcher = Fetcher(url,
+                fetcher = Fetcher(self.log, url,
                                   crawler=self,
                                   max_redirect=max_redirect,
                                   max_tries=self.max_tries,
-                                  verbose=self.verbose)
+                                  )
                 self.busy[url] = fetcher
                 fetcher.task = asyncio.Task(self.fetch(fetcher))
             else:
@@ -761,10 +744,9 @@
         with (yield from self.governor):
             try:
                 yield from fetcher.fetch()  # Fetcher gonna fetch.
-            except Exception:
+            finally:
                 # Force GC of the task, so the error is logged.
                 fetcher.task = None
-                raise
         with (yield from self.termination):
             self.done[url] = fetcher
             del self.busy[url]
@@ -780,27 +762,24 @@
         else:
             speed = 0
         stats = Stats()
-        if self.verbose > 0:
-            print('*** Report ***', file=file)
-            try:
-                show = []
-                show.extend(self.done.items())
-                show.extend(self.busy.items())
-                show.sort()
-                for url, fetcher in show:
-                    fetcher.report(stats, file=file)
-            except KeyboardInterrupt:
-                print('\nInterrupted', file=file)
+        print('*** Report ***', file=file)
+        try:
+            show = []
+            show.extend(self.done.items())
+            show.extend(self.busy.items())
+            show.sort()
+            for url, fetcher in show:
+                fetcher.report(stats, file=file)
+        except KeyboardInterrupt:
+            print('\nInterrupted', file=file)
         print('Finished', len(self.done),
               'urls in %.3f secs' % dt,
               '(max_tasks=%d)' % self.max_tasks,
               '(%.3f urls/sec/task)' % speed,
               file=file)
         stats.report(file=file)
-        if self.todo:
-            print('Todo:', len(self.todo), file=file)
-        if self.busy:
-            print('Busy:', len(self.busy), file=file)
+        print('Todo:', len(self.todo), file=file)
+        print('Busy:', len(self.busy), file=file)
         print('Done:', len(self.done), file=file)
         print('Date:', time.ctime(), 'local time', file=file)
@@ -815,6 +794,8 @@
         print('Use --help for command line help')
         return
 
+    log = Logger(args.level)
+
     if args.iocp:
         from asyncio.windows_events import ProactorEventLoop
         loop = ProactorEventLoop()
@@ -827,14 +808,14 @@
 
     roots = {fix_url(root) for root in args.roots}
 
-    crawler = Crawler(roots,
-                      exclude=args.exclude,
+    crawler = Crawler(log,
+                      roots, exclude=args.exclude,
                       strict=args.strict,
                       max_redirect=args.max_redirect,
                       max_tries=args.max_tries,
                       max_tasks=args.max_tasks,
                       max_pool=args.max_pool,
-                      verbose=args.verbose)
+                      )
     try:
         loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
     except KeyboardInterrupt:
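A note on reading the diff: the conversion is not a purely mechanical rename. Progress messages map vprint to log(1, ...), vvprint to log(2, ...) and vvvprint to log(3, ...), but error paths (DNS failure, bad status line, premature end of a chunked response, exhausted retries, redirect limit reached) are promoted to log(0, ...), so they still print under --quiet, which stores level 0. Likewise, report() loses its "if self.verbose > 0:" guard, so the final summary always prints. A small sketch of the quiet-mode behaviour, reusing the Logger sketch above (messages are illustrative):

log = Logger(0)                        # --quiet: store_const sets level to 0
log(0, 'no success for', 'http://x/')  # error (0 >= 0): still printed
log(1, 'adding', 'http://x/', 10)      # progress (0 < 1): suppressed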