1
import unittest, StringIO, robotparser
2
from test import test_support
4
# Test case that checks a single URL against a robots.txt parser.
# NOTE(review): this excerpt has lost its indentation and several interior
# lines; the bare numbers between statements look like line-number residue
# from extraction. The comments here describe only the lines that survive.
class RobotTestCase(unittest.TestCase):
5
# Takes the test index, a parser object, the URL under test, a `good`
# flag and an agent string.
def __init__(self, index, parser, url, good, agent):
6
unittest.TestCase.__init__(self)
8
# Two labels are built for self.str, one tagged "good" and one "bad";
# presumably the `good` argument selects between them, but the branching
# lines are not visible here -- confirm against the full file.
self.str = "RobotTest(%d, good, %s)" % (index, url)
10
self.str = "RobotTest(%d, bad, %s)" % (index, url)
17
# A tuple-valued self.url gets its own branch; the branch body is not
# visible (presumably it unpacks an (agent, url) pair -- confirm).
if isinstance(self.url, tuple):
23
# Fails the test unless the parser reports that `agent` may fetch `url`.
self.failUnless(self.parser.can_fetch(agent, url))
25
# Fails the test if the parser reports that `agent` may fetch `url`.
self.failIf(self.parser.can_fetch(agent, url))
30
# Module-level suite; RobotTest() adds its RobotTestCase instances to it and
# the run_suite() calls near the end of the file execute it.
tests = unittest.TestSuite()
32
# Registers test cases for one robots.txt document in the module-level
# `tests` suite.
# NOTE(review): the loop headers and whatever hands `lines` to the parser are
# not visible in this excerpt; the bare numbers are line-number residue.
def RobotTest(index, robots_txt, good_urls, bad_urls,
33
agent="test_robotparser"):
35
# Turn the robots.txt text into a list of lines.
lines = StringIO.StringIO(robots_txt).readlines()
36
parser = robotparser.RobotFileParser()
39
# `url` is not bound in the visible lines -- presumably a loop variable over
# good_urls. The 1 is passed as RobotTestCase's `good` argument.
tests.addTest(RobotTestCase(index, parser, url, 1, agent))
41
# Same as above with `good` set to 0 -- presumably looping over bad_urls.
tests.addTest(RobotTestCase(index, parser, url, 0, agent))
43
# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)
48
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
49
Disallow: /tmp/ # these will soon disappear
53
# Test 1: paths passed to RobotTest as good_urls.
good = [
    '/',
    '/test.html',
]
# Test 1: paths passed to RobotTest as bad_urls.
# NOTE(review): the rule covering '/foo.html' is not visible in this excerpt.
bad = [
    '/cyberworld/map/index.html',
    '/tmp/xxx',
    '/foo.html',
]

RobotTest(index=1, robots_txt=doc, good_urls=good, bad_urls=bad)
60
# robots.txt for http://www.example.com/
63
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
65
# Cybermapper knows where to go.
66
User-agent: cybermapper
71
# Test 2: paths passed to RobotTest as good_urls. The tuple entry is handled
# by RobotTestCase's isinstance(self.url, tuple) branch; presumably it pairs
# an agent name with a path -- confirm against the full class.
good = [
    '/',
    '/test.html',
    ('cybermapper', '/cyberworld/map/index.html'),
]
# The same path is listed as bad when checked under the default agent.
bad = [
    '/cyberworld/map/index.html',
]

RobotTest(index=2, robots_txt=doc, good_urls=good, bad_urls=bad)
84
# Test 3: paths passed to RobotTest as bad_urls.
# NOTE(review): `doc` and `good` for this case are assigned on lines that are
# not visible in this excerpt.
bad = [
    '/cyberworld/map/index.html',
    '/',
    '/tmp/',
]

RobotTest(index=3, robots_txt=doc, good_urls=good, bad_urls=bad)
88
# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)
96
Disallow: /%7ejoe/index.html
99
# No good URLs are asserted for these rules; the trailing note marks
# '/a/b.html' as XFAIL (presumably a known parser failure -- confirm).
good = [] # XFAIL '/a/b.html'
100
# NOTE(review): the end of this list is not visible in this excerpt.
bad = ['/tmp','/tmp.html','/tmp/a.html',
101
'/a%3cd.html','/a%3Cd.html','/a%2fb.html',
105
# The same document and URL lists are checked under two agent strings: a bare
# lowercase name and a longer mixed-case string that includes a version.
RobotTest(4, doc, good, bad, 'figtree')
106
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')
112
Disallow: /a%3Cd.html
114
Disallow: /%7ejoe/index.html
117
# Test 6: '/tmp' without a trailing slash is passed as a good URL, while
# '/tmp/' and a path beneath it are passed as bad URLs.
good = [
    '/tmp',
]  # XFAIL: '/a%2fb.html'
bad = [
    '/tmp/',
    '/tmp/a.html',
    '/a%3cd.html',
    '/a%3Cd.html',
    "/a/b.html",
    '/%7Ejoe/index.html',
]

RobotTest(index=6, robots_txt=doc, good_urls=good, bad_urls=bad)
124
# From bug report #523041
133
# Bug report says "/" should be denied, but that is not in the RFC, so no
# bad URLs are passed for this case.
# NOTE(review): `doc` and `good` for this case are assigned on lines that are
# not visible in this excerpt.
bad = []

RobotTest(index=7, robots_txt=doc, good_urls=good, bad_urls=bad)
138
# Runs every case collected in `tests` through test_support.run_suite.
# NOTE(review): in this excerpt the call sits at top level; any enclosing
# definition is not visible.
test_support.run_suite(tests)
140
if __name__ == '__main__':
    # When run as a script, request verbose output and then run the suite.
    # The flag test_support consults is the lowercase module attribute
    # `verbose`; assigning `Verbose` only created an unused attribute and
    # had no effect on the runner.
    test_support.verbose = 1
    test_support.run_suite(tests)