diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2002-02-28 15:24:47 +0000 |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2002-02-28 15:24:47 +0000 |
commit | 9b88d5b5189efe447f3e5d06e6f6a58d4bd0825a (patch) | |
tree | e879fcab7a2b9f247dbf7d61868298b9cd5939b0 /Lib/test | |
parent | 71868bb47084702656ab5b39cda2217e0ff7c258 (diff) | |
download | cpython-9b88d5b5189efe447f3e5d06e6f6a58d4bd0825a.tar.gz |
Correct various errors:
- Use substring search, not re search for user-agent and paths.
- Consider * entry last. Unquote, then requote URLs.
- Treat empty Disallow as "allow everything".
Add test cases. Fixes #523041
Diffstat (limited to 'Lib/test')
-rw-r--r-- | Lib/test/test_robotparser.py | 141 |
1 files changed, 141 insertions, 0 deletions
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py new file mode 100644 index 0000000000..a0107250aa --- /dev/null +++ b/Lib/test/test_robotparser.py @@ -0,0 +1,141 @@ +import unittest, StringIO, robotparser, test_support + +class RobotTestCase(unittest.TestCase): + def __init__(self, index, parser, url, good, agent): + unittest.TestCase.__init__(self) + if good: + self.str = "RobotTest(%d, good, %s)" % (index, url) + else: + self.str = "RobotTest(%d, bad, %s)" % (index, url) + self.parser = parser + self.url = url + self.good = good + self.agent = agent + + def runTest(self): + if isinstance(self.url, tuple): + agent, url = self.url + else: + url = self.url + agent = self.agent + if self.good: + self.failUnless(self.parser.can_fetch(agent, url)) + else: + self.failIf(self.parser.can_fetch(agent, url)) + + def __str__(self): + return self.str + +tests = unittest.TestSuite() + +def RobotTest(index, robots_txt, good_urls, bad_urls, + agent="test_robotparser"): + + lines = StringIO.StringIO(robots_txt).readlines() + parser = robotparser.RobotFileParser() + parser.parse(lines) + for url in good_urls: + tests.addTest(RobotTestCase(index, parser, url, 1, agent)) + for url in bad_urls: + tests.addTest(RobotTestCase(index, parser, url, 0, agent)) + +# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002) + +# 1. +doc = """ +User-agent: * +Disallow: /cyberworld/map/ # This is an infinite virtual URL space +Disallow: /tmp/ # these will soon disappear +Disallow: /foo.html +""" + +good = ['/','/test.html'] +bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html'] + +RobotTest(1, doc, good, bad) + +# 2. +doc = """ +# robots.txt for http://www.example.com/ + +User-agent: * +Disallow: /cyberworld/map/ # This is an infinite virtual URL space + +# Cybermapper knows where to go. +User-agent: cybermapper +Disallow: + +""" + +good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')] +bad = ['/cyberworld/map/index.html'] + +RobotTest(2, doc, good, bad) + +# 3. +doc = """ +# go away +User-agent: * +Disallow: / +""" + +good = [] +bad = ['/cyberworld/map/index.html','/','/tmp/'] + +RobotTest(3, doc, good, bad) + +# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002) + +# 4. +doc = """ +User-agent: figtree +Disallow: /tmp +Disallow: /a%3cd.html +Disallow: /a%2fb.html +Disallow: /%7ejoe/index.html +""" + +good = [] # XFAIL '/a/b.html' +bad = ['/tmp','/tmp.html','/tmp/a.html', + '/a%3cd.html','/a%3Cd.html','/a%2fb.html', + '/~joe/index.html' + ] + +RobotTest(4, doc, good, bad, 'figtree') +RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04') + +# 6. +doc = """ +User-agent: * +Disallow: /tmp/ +Disallow: /a%3Cd.html +Disallow: /a/b.html +Disallow: /%7ejoe/index.html +""" + +good = ['/tmp',] # XFAIL: '/a%2fb.html' +bad = ['/tmp/','/tmp/a.html', + '/a%3cd.html','/a%3Cd.html',"/a/b.html", + '/%7Ejoe/index.html'] + +RobotTest(6, doc, good, bad) + +# From bug report #523041 + +# 7. +doc = """ +User-Agent: * +Disallow: /. +""" + +good = ['/foo.html'] +bad = [] # Bug report says "/" should be denied, but that is not in the RFC + +RobotTest(7, doc, good, bad) + +def test_main(): + test_support.run_suite(tests) + +if __name__=='__main__': + test_support.Verbose = 1 + test_support.run_suite(tests) |