author Kjetil Jacobsen <kjetilja@gmail.com> 2002-08-17 13:15:47 +0000
committer Kjetil Jacobsen <kjetilja@gmail.com> 2002-08-17 13:15:47 +0000
commit 796bb1faa052ce9405333f54434b5c99e987ede2 (patch)
tree 6042f7a895f9adcc5628512420db1f20d1788f27 /examples/retriever.py
parent b916921ed4753527e03b03d3919ec16b454d042f (diff)
download pycurl-796bb1faa052ce9405333f54434b5c99e987ede2.tar.gz
added, the same as crawler.py was previously
Diffstat (limited to 'examples/retriever.py')
-rw-r--r-- examples/retriever.py | 60
1 file changed, 60 insertions(+), 0 deletions(-)
diff --git a/examples/retriever.py b/examples/retriever.py
new file mode 100644
index 0000000..455586b
--- /dev/null
+++ b/examples/retriever.py
@@ -0,0 +1,60 @@
+# $Id$
+
+import sys, threading, Queue
+import pycurl
+
+
+class WorkerThread(threading.Thread):
+ def __init__(self, iq):
+ threading.Thread.__init__(self)
+ self.iq = iq
+
+ def run(self):
+ while 1:
+ try:
+ url, no = self.iq.get_nowait()
+ except:
+ break
+ f = open(str(no), 'w')
+ self.curl = pycurl.Curl()
+ self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
+ self.curl.setopt(pycurl.MAXREDIRS, 5)
+ self.curl.setopt(pycurl.URL, url)
+ self.curl.setopt(pycurl.WRITEDATA, f)
+ try:
+ self.curl.perform()
+ except:
+ pass
+ f.close()
+ self.curl.close()
+ sys.stdout.write('.')
+ sys.stdout.flush()
+
+# Read list of URLs from file specified on commandline
+try:
+ urls = open(sys.argv[1]).readlines()
+ num_workers = int(sys.argv[2])
+except:
+ # File or number of workers was not specified, show usage string
+ print "Usage: %s <file with URLs to fetch> <number of workers>" % sys.argv[0]
+ raise SystemExit
+
+# Initialize thread array and the file number used to store documents
+threads = []
+fileno = 0
+iq = Queue.Queue()
+
+# Fill the work input queue with URLs
+for url in urls:
+ fileno = fileno + 1
+ iq.put((url, fileno))
+
+# Start a bunch of threads
+for num_threads in range(num_workers):
+ t = WorkerThread(iq)
+ t.start()
+ threads.append(t)
+
+# Wait for all threads to finish
+for thread in threads:
+ thread.join()