diff options
author | Senthil Kumaran <senthil@uthcode.com> | 2013-05-29 05:57:21 -0700 |
---|---|---|
committer | Senthil Kumaran <senthil@uthcode.com> | 2013-05-29 05:57:21 -0700 |
commit | 5ab81807b8d6f7e077a211ffa6a3e061fdb361ab (patch) | |
tree | fcea77dc5f4fa6b9d0ced8c819572a548a149ed8 /Lib | |
parent | 94d3481a5942af5da0bc3708c17d76329a219094 (diff) | |
parent | c9c1dc401720cff3d98faceee70ad26e25aea598 (diff) | |
download | cpython-5ab81807b8d6f7e077a211ffa6a3e061fdb361ab.tar.gz |
merge from 3.3
#17403: urllib.robotparser normalizes the URLs before adding them to the
rule line. This helps in handling certain types of invalid URLs in a
conservative manner. Patch contributed by Mher Movsisyan.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/test/test_robotparser.py | 12 | ||||
-rw-r--r-- | Lib/urllib/robotparser.py | 1 |
2 files changed, 13 insertions, 0 deletions
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 8c09e7452c..d1dfd9eeec 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -234,6 +234,18 @@ bad = ['/some/path']
 
 RobotTest(15, doc, good, bad)
 
+# 16. Empty query (issue #17403). Normalizing the url first.
+doc = """
+User-agent: *
+Allow: /some/path?
+Disallow: /another/path?
+"""
+
+good = ['/some/path?']
+bad = ['/another/path?']
+
+RobotTest(16, doc, good, bad)
+
 
 class NetworkTestCase(unittest.TestCase):
 
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 75be4af409..978ba58d84 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -157,6 +157,7 @@ class RuleLine:
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
+        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
         self.path = urllib.parse.quote(path)
         self.allowance = allowance
 