author    Martin Panter <vadmium+py@gmail.com>  2016-10-22 03:21:36 +0000
committer Martin Panter <vadmium+py@gmail.com>  2016-10-22 03:21:36 +0000
commit    73a0af3545f0e9b62ac149367c0cfbdbfe40afe4 (patch)
tree      9db0bac0a71ca7296ff714745c0f4eee0b07f6a5 /Lib/urllib/robotparser.py
parent    6f5021addeb1d9ddd3610409d2461ec79870f9a5 (diff)
parent    6fec8a2a83b0f93a80bb83e76dceeda1cf9d00dc (diff)
download  cpython-73a0af3545f0e9b62ac149367c0cfbdbfe40afe4.tar.gz
Issue #28435: Merge urllib test fixes from 3.5 into 3.6
Diffstat (limited to 'Lib/urllib/robotparser.py')
-rw-r--r--  Lib/urllib/robotparser.py | 43
1 file changed, 41 insertions, 2 deletions
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 8b69fd985e..9dab4c1c3a 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -10,7 +10,9 @@
     http://www.robotstxt.org/norobots-rfc.txt
 """
 
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
 
 __all__ = ["RobotFileParser"]
 
@@ -120,10 +122,29 @@ class RobotFileParser:
                     if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
                         state = 2
+                elif line[0] == "crawl-delay":
+                    if state != 0:
+                        # before trying to convert to int we need to make
+                        # sure that robots.txt has valid syntax otherwise
+                        # it will crash
+                        if line[1].strip().isdigit():
+                            entry.delay = int(line[1])
+                        state = 2
+                elif line[0] == "request-rate":
+                    if state != 0:
+                        numbers = line[1].split('/')
+                        # check if all values are sane
+                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
+                            and numbers[1].strip().isdigit()):
+                            req_rate = collections.namedtuple('req_rate',
+                                                              'requests seconds')
+                            entry.req_rate = req_rate
+                            entry.req_rate.requests = int(numbers[0])
+                            entry.req_rate.seconds = int(numbers[1])
+                        state = 2
         if state == 2:
             self._add_entry(entry)
 
-
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
         if self.disallow_all:
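
The two new branches validate a directive's value before converting it, so a malformed robots.txt degrades gracefully instead of raising. A minimal sketch of that behavior (the robots.txt lines below are hypothetical, not part of this commit):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.parse([
    "User-agent: *",
    "Crawl-delay: soon",     # not digits: isdigit() fails, delay stays None
    "Request-rate: 3/20/1",  # split('/') yields 3 parts, so it is skipped
    "Request-rate: 3/20",    # well-formed: requests=3, seconds=20
    "Disallow: /private/",
])

Note that state = 2 is assigned even when a value is rejected, so a bad Crawl-delay still counts as a rule line for entry-grouping purposes.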
@@ -153,6 +174,22 @@ class RobotFileParser:
         # agent not found ==> access granted
         return True
 
+    def crawl_delay(self, useragent):
+        if not self.mtime():
+            return None
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.delay
+        return self.default_entry.delay
+
+    def request_rate(self, useragent):
+        if not self.mtime():
+            return None
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.req_rate
+        return self.default_entry.req_rate
+
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
 
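
With those accessors in place, callers can read per-agent politeness settings directly from the parser. A short usage sketch (agent names and values are made up for illustration):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.parse([
    "User-agent: figtree",
    "Crawl-delay: 3",
    "Request-rate: 9/30",
    "Disallow: /tmp",
    "",
    "User-agent: *",
    "Crawl-delay: 1",
    "Disallow: /cgi-bin",
])
print(rp.crawl_delay("figtree"))       # 3
rate = rp.request_rate("figtree")
print(rate.requests, rate.seconds)     # 9 30
print(rp.crawl_delay("SomeOtherBot"))  # 1, from the "*" default entry

Both methods return None before read() or parse() has run (mtime() is falsy), and fall back to the default "*" entry when no specific entry applies.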
@@ -180,6 +217,8 @@ class Entry:
     def __init__(self):
         self.useragents = []
         self.rulelines = []
+        self.delay = None
+        self.req_rate = None
 
     def __str__(self):
         ret = []
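
A quirk in the request-rate branch above: entry.req_rate is assigned the namedtuple class itself, with requests and seconds then set as class attributes on it, rather than an instance. Attribute access works either way, but the conventional form would build one immutable instance per entry; a sketch of that alternative (illustration only, not what this patch does):

import collections

RequestRate = collections.namedtuple('RequestRate', 'requests seconds')
rate = RequestRate(requests=9, seconds=30)  # one instance per parsed entry
print(rate.requests, rate.seconds)          # 9 30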