author Kjetil Jacobsen <kjetilja@gmail.com> 2002-08-17 13:15:47 +0000
committer Kjetil Jacobsen <kjetilja@gmail.com> 2002-08-17 13:15:47 +0000
commit 796bb1faa052ce9405333f54434b5c99e987ede2 (patch)
tree 6042f7a895f9adcc5628512420db1f20d1788f27 /examples/retriever.py
parent b916921ed4753527e03b03d3919ec16b454d042f (diff)
download pycurl-796bb1faa052ce9405333f54434b5c99e987ede2.tar.gz
added, the same as crawler.py was previously
Diffstat (limited to 'examples/retriever.py')
-rw-r--r-- examples/retriever.py | 60
1 file changed, 60 insertions(+), 0 deletions(-)
diff --git a/examples/retriever.py b/examples/retriever.py
new file mode 100644
index 0000000..455586b
--- /dev/null
+++ b/examples/retriever.py
@@ -0,0 +1,60 @@
+# $Id$
+
+import sys, threading, Queue
+import pycurl
+
+
+class WorkerThread(threading.Thread):
+ def __init__(self, iq):
+ threading.Thread.__init__(self)
+ self.iq = iq
+
+ def run(self):
+ while 1:
+ try:
+ url, no = self.iq.get_nowait()
+ except:
+ break
+ f = open(str(no), 'w')
+ self.curl = pycurl.Curl()
+ self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
+ self.curl.setopt(pycurl.MAXREDIRS, 5)
+ self.curl.setopt(pycurl.URL, url)
+ self.curl.setopt(pycurl.WRITEDATA, f)
+ try:
+ self.curl.perform()
+ except:
+ pass
+ f.close()
+ self.curl.close()
+ sys.stdout.write('.')
+ sys.stdout.flush()
+
+# Read list of URLs from file specified on commandline
+try:
+ urls = open(sys.argv[1]).readlines()
+ num_workers = int(sys.argv[2])
+except:
+ # File or number of workers was not specified, show usage string
+ print "Usage: %s <file with URLs to fetch> <number of workers>" % sys.argv[0]
+ raise SystemExit
+
+# Initialize thread array and the file number used to store documents
+threads = []
+fileno = 0
+iq = Queue.Queue()
+
+# Fill the work input queue with URLs
+for url in urls:
+ fileno = fileno + 1
+ iq.put((url, fileno))
+
+# Start a bunch of threads
+for num_threads in range(num_workers):
+ t = WorkerThread(iq)
+ t.start()
+ threads.append(t)
+
+# Wait for all threads to finish
+for thread in threads:
+ thread.join()