summaryrefslogtreecommitdiff
path: root/Lib/urllib/robotparser.py
diff options
context:
space:
mode:
authorJeremy Hylton <jeremy@alum.mit.edu>2008-06-18 20:49:58 +0000
committerJeremy Hylton <jeremy@alum.mit.edu>2008-06-18 20:49:58 +0000
commit671edc4bdfe7652384d23d440268bb862414c4d2 (patch)
tree4e1cb7e1dd7b200ceb3662b94ab4080bb131f08c /Lib/urllib/robotparser.py
parentbde7b5495aae4e98abf5c61c45505bf69cbfaf70 (diff)
downloadcpython-671edc4bdfe7652384d23d440268bb862414c4d2.tar.gz
Make a new urllib package .
It consists of code from urllib, urllib2, urlparse, and robotparser. The old modules have all been removed. The new package has five submodules: urllib.parse, urllib.request, urllib.response, urllib.error, and urllib.robotparser. The urllib.request.urlopen() function uses the url opener from urllib2. Note that the unittests have not been renamed for the beta, but they will be renamed in the future. Joint work with Senthil Kumaran.
Diffstat (limited to 'Lib/urllib/robotparser.py')
-rw-r--r--Lib/urllib/robotparser.py191
1 files changed, 191 insertions, 0 deletions
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
new file mode 100644
index 0000000000..a91df8d815
--- /dev/null
+++ b/Lib/urllib/robotparser.py
@@ -0,0 +1,191 @@
+""" robotparser.py
+
+ Copyright (C) 2000 Bastian Kleineidam
+
+ You can choose between two licenses when using this package:
+ 1) GNU GPLv2
+ 2) PSF license for Python 2.2
+
+ The robots.txt Exclusion Protocol is implemented as specified in
+ http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
+"""
+
+import urllib.parse, urllib.request
+
+__all__ = ["RobotFileParser"]
+
+class RobotFileParser:
+ """ This class provides a set of methods to read, parse and answer
+ questions about a single robots.txt file.
+
+ """
+
+ def __init__(self, url=''):
+ self.entries = []
+ self.default_entry = None
+ self.disallow_all = False
+ self.allow_all = False
+ self.set_url(url)
+ self.last_checked = 0
+
+ def mtime(self):
+ """Returns the time the robots.txt file was last fetched.
+
+ This is useful for long-running web spiders that need to
+ check for new robots.txt files periodically.
+
+ """
+ return self.last_checked
+
+ def modified(self):
+ """Sets the time the robots.txt file was last fetched to the
+ current time.
+
+ """
+ import time
+ self.last_checked = time.time()
+
+ def set_url(self, url):
+ """Sets the URL referring to a robots.txt file."""
+ self.url = url
+ self.host, self.path = urllib.parse.urlparse(url)[1:3]
+
+ def read(self):
+ """Reads the robots.txt URL and feeds it to the parser."""
+ try:
+ f = urllib.request.urlopen(self.url)
+ except urllib.error.HTTPError as err:
+ if err.code in (401, 403):
+ self.disallow_all = True
+ elif err.code >= 400:
+ self.allow_all = True
+ else:
+ self.parse(f.read().splitlines())
+
+ def _add_entry(self, entry):
+ if "*" in entry.useragents:
+ # the default entry is considered last
+ self.default_entry = entry
+ else:
+ self.entries.append(entry)
+
+ def parse(self, lines):
+ """Parse the input lines from a robots.txt file.
+
+ We allow that a user-agent: line is not preceded by
+ one or more blank lines.
+ """
+ state = 0
+ entry = Entry()
+
+ for line in lines:
+ if not line:
+ if state == 1:
+ entry = Entry()
+ state = 0
+ elif state == 2:
+ self._add_entry(entry)
+ entry = Entry()
+ state = 0
+ # remove optional comment and strip line
+ i = line.find('#')
+ if i >= 0:
+ line = line[:i]
+ line = line.strip()
+ if not line:
+ continue
+ line = line.split(':', 1)
+ if len(line) == 2:
+ line[0] = line[0].strip().lower()
+ line[1] = urllib.parse.unquote(line[1].strip())
+ if line[0] == "user-agent":
+ if state == 2:
+ self._add_entry(entry)
+ entry = Entry()
+ entry.useragents.append(line[1])
+ state = 1
+ elif line[0] == "disallow":
+ if state != 0:
+ entry.rulelines.append(RuleLine(line[1], False))
+ state = 2
+ elif line[0] == "allow":
+ if state != 0:
+ entry.rulelines.append(RuleLine(line[1], True))
+ if state == 2:
+ self.entries.append(entry)
+
+
+ def can_fetch(self, useragent, url):
+ """using the parsed robots.txt decide if useragent can fetch url"""
+ if self.disallow_all:
+ return False
+ if self.allow_all:
+ return True
+ # search for given user agent matches
+ # the first match counts
+ url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
+ for entry in self.entries:
+ if entry.applies_to(useragent):
+ return entry.allowance(url)
+ # try the default entry last
+ if self.default_entry:
+ return self.default_entry.allowance(url)
+ # agent not found ==> access granted
+ return True
+
+ def __str__(self):
+ return ''.join([str(entry) + "\n" for entry in self.entries])
+
+
+class RuleLine:
+ """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
+ (allowance==False) followed by a path."""
+ def __init__(self, path, allowance):
+ if path == '' and not allowance:
+ # an empty value means allow all
+ allowance = True
+ self.path = urllib.parse.quote(path)
+ self.allowance = allowance
+
+ def applies_to(self, filename):
+ return self.path == "*" or filename.startswith(self.path)
+
+ def __str__(self):
+ return (self.allowance and "Allow" or "Disallow") + ": " + self.path
+
+
+class Entry:
+ """An entry has one or more user-agents and zero or more rulelines"""
+ def __init__(self):
+ self.useragents = []
+ self.rulelines = []
+
+ def __str__(self):
+ ret = []
+ for agent in self.useragents:
+ ret.extend(["User-agent: ", agent, "\n"])
+ for line in self.rulelines:
+ ret.extend([str(line), "\n"])
+ return ''.join(ret)
+
+ def applies_to(self, useragent):
+ """check if this entry applies to the specified agent"""
+ # split the name token and make it lower case
+ useragent = useragent.split("/")[0].lower()
+ for agent in self.useragents:
+ if agent == '*':
+ # we have the catch-all agent
+ return True
+ agent = agent.lower()
+ if agent in useragent:
+ return True
+ return False
+
+ def allowance(self, filename):
+ """Preconditions:
+ - our agent applies to this entry
+ - filename is URL decoded"""
+ for line in self.rulelines:
+ if line.applies_to(filename):
+ return line.allowance
+ return True