diff options
author | Kjetil Jacobsen <kjetilja@gmail.com> | 2002-08-17 13:15:47 +0000 |
---|---|---|
committer | Kjetil Jacobsen <kjetilja@gmail.com> | 2002-08-17 13:15:47 +0000 |
commit | 796bb1faa052ce9405333f54434b5c99e987ede2 (patch) | |
tree | 6042f7a895f9adcc5628512420db1f20d1788f27 /examples/retriever.py | |
parent | b916921ed4753527e03b03d3919ec16b454d042f (diff) | |
download | pycurl-796bb1faa052ce9405333f54434b5c99e987ede2.tar.gz |
added, the same as crawler.py was previously
Diffstat (limited to 'examples/retriever.py')
-rw-r--r-- | examples/retriever.py | 60 |
1 files changed, 60 insertions, 0 deletions
# $Id$
"""examples/retriever.py -- fetch a list of URLs with a pool of threads.

Usage: retriever.py <file with URLs to fetch> <number of workers>

Every non-blank line of the input file is treated as a URL.  The document
fetched for the URL on line N is stored in a file named ``N`` in the
current directory.  One dot is written to stdout per processed URL.
"""

import sys
import threading

try:
    import Queue  # Python 2
except ImportError:
    import queue as Queue  # Python 3 renamed the module

import pycurl


class WorkerThread(threading.Thread):
    """Thread that drains ``(url, number)`` jobs from a shared queue.

    ``iq`` is the input queue.  It must be completely filled before the
    thread is started, because the thread exits as soon as it finds the
    queue empty.
    """

    def __init__(self, iq):
        threading.Thread.__init__(self)
        self.iq = iq

    def run(self):
        """Download queued URLs until the queue is empty."""
        while True:
            try:
                url, no = self.iq.get_nowait()
            except Queue.Empty:
                # No work left; let the thread terminate.
                break
            # Binary mode: the payload is arbitrary bytes, not text.
            with open(str(no), 'wb') as f:
                self.curl = pycurl.Curl()
                try:
                    self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
                    self.curl.setopt(pycurl.MAXREDIRS, 5)
                    # libcurl must not use signals when several threads
                    # run transfers concurrently.
                    self.curl.setopt(pycurl.NOSIGNAL, 1)
                    self.curl.setopt(pycurl.URL, url)
                    self.curl.setopt(pycurl.WRITEDATA, f)
                    self.curl.perform()
                except pycurl.error as exc:
                    # Best effort: report the failure and carry on with the
                    # next URL instead of silently swallowing the error.
                    sys.stderr.write('\n%s: %s\n' % (url, exc))
                finally:
                    # Release the handle before the output file is closed.
                    self.curl.close()
            sys.stdout.write('.')
            sys.stdout.flush()


# Read list of URLs from file specified on commandline
try:
    with open(sys.argv[1]) as url_file:
        urls = url_file.readlines()
    num_workers = int(sys.argv[2])
except (IndexError, IOError, ValueError):
    # File or number of workers was not specified (or is unusable),
    # show usage string
    print("Usage: %s <file with URLs to fetch> <number of workers>" % sys.argv[0])
    raise SystemExit

# Initialize thread array and the file number used to store documents
threads = []
fileno = 0
iq = Queue.Queue()

# Fill the work input queue with URLs.  The file number is the 1-based line
# number, so numbering is unaffected by the blank lines that are skipped.
for fileno, url in enumerate(urls, 1):
    # readlines() keeps the trailing newline, which is not part of the URL.
    url = url.strip()
    if not url:
        continue
    iq.put((url, fileno))

# Start a bunch of threads
for num_threads in range(num_workers):
    t = WorkerThread(iq)
    t.start()
    threads.append(t)

# Wait for all threads to finish
for thread in threads:
    thread.join()