diff options
author | Martin Panter <vadmium+py@gmail.com> | 2016-10-22 03:21:36 +0000 |
---|---|---|
committer | Martin Panter <vadmium+py@gmail.com> | 2016-10-22 03:21:36 +0000 |
commit | 73a0af3545f0e9b62ac149367c0cfbdbfe40afe4 (patch) | |
tree | 9db0bac0a71ca7296ff714745c0f4eee0b07f6a5 /Lib/urllib/robotparser.py | |
parent | 6f5021addeb1d9ddd3610409d2461ec79870f9a5 (diff) | |
parent | 6fec8a2a83b0f93a80bb83e76dceeda1cf9d00dc (diff) | |
download | cpython-73a0af3545f0e9b62ac149367c0cfbdbfe40afe4.tar.gz |
Issue #28435: Merge urllib test fixes from 3.5 into 3.6
Diffstat (limited to 'Lib/urllib/robotparser.py')
-rw-r--r-- | Lib/urllib/robotparser.py | 43 |
1 files changed, 41 insertions, 2 deletions
```diff
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 8b69fd985e..9dab4c1c3a 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -10,7 +10,9 @@
     http://www.robotstxt.org/norobots-rfc.txt
 """
 
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
 
 __all__ = ["RobotFileParser"]
 
@@ -120,10 +122,29 @@ class RobotFileParser:
                     if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
                         state = 2
+                elif line[0] == "crawl-delay":
+                    if state != 0:
+                        # before trying to convert to int we need to make
+                        # sure that robots.txt has valid syntax otherwise
+                        # it will crash
+                        if line[1].strip().isdigit():
+                            entry.delay = int(line[1])
+                        state = 2
+                elif line[0] == "request-rate":
+                    if state != 0:
+                        numbers = line[1].split('/')
+                        # check if all values are sane
+                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
+                            and numbers[1].strip().isdigit()):
+                            req_rate = collections.namedtuple('req_rate',
+                                                              'requests seconds')
+                            entry.req_rate = req_rate
+                            entry.req_rate.requests = int(numbers[0])
+                            entry.req_rate.seconds = int(numbers[1])
+                        state = 2
         if state == 2:
             self._add_entry(entry)
 
-
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
         if self.disallow_all:
@@ -153,6 +174,22 @@ class RobotFileParser:
         # agent not found ==> access granted
         return True
 
+    def crawl_delay(self, useragent):
+        if not self.mtime():
+            return None
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.delay
+        return self.default_entry.delay
+
+    def request_rate(self, useragent):
+        if not self.mtime():
+            return None
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.req_rate
+        return self.default_entry.req_rate
+
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
 
@@ -180,6 +217,8 @@ class Entry:
     def __init__(self):
         self.useragents = []
         self.rulelines = []
+        self.delay = None
+        self.req_rate = None
 
     def __str__(self):
         ret = []
```