diff options
author | Jeremy Hylton <jeremy@alum.mit.edu> | 2008-07-18 20:59:44 +0000 |
---|---|---|
committer | Jeremy Hylton <jeremy@alum.mit.edu> | 2008-07-18 20:59:44 +0000 |
commit | d60f968ad383f9bc966bfeffb77479bba8b2ddd1 (patch) | |
tree | 723e5033864c5d3a05ebfccb922bf28bafebd19f /Lib/urllib/robotparser.py | |
parent | 792978733ea56ab7da3cf8f0678c8b214d157974 (diff) | |
download | cpython-d60f968ad383f9bc966bfeffb77479bba8b2ddd1.tar.gz |
Bug 3347: robotparser failed because it didn't convert bytes to string.
The solution is to convert bytes to text via UTF-8. I'm not entirely
sure whether this is safe, but it looks like robots.txt is expected to be
ASCII.
Diffstat (limited to 'Lib/urllib/robotparser.py')
-rw-r--r-- | Lib/urllib/robotparser.py | 8 |
1 file changed, 6 insertions, 2 deletions
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index a91df8d815..c55fb5082f 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -60,7 +60,8 @@ class RobotFileParser: elif err.code >= 400: self.allow_all = True else: - self.parse(f.read().splitlines()) + raw = f.read() + self.parse(raw.decode("utf-8").splitlines()) def _add_entry(self, entry): if "*" in entry.useragents: @@ -123,7 +124,10 @@ class RobotFileParser: return True # search for given user agent matches # the first match counts - url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/" + url = urllib.parse.quote( + urllib.parse.urlparse(urllib.parse.unquote(url))[2]) + if not url: + url = "/" for entry in self.entries: if entry.applies_to(useragent): return entry.allowance(url) |