~ubuntu-branches/ubuntu/karmic/pypy/karmic

« back to all changes in this revision

Viewing changes to lib-python/2.4.1/test/test_robotparser.py

  • Committer: Bazaar Package Importer
  • Author(s): Alexandre Fayolle
  • Date: 2007-04-13 09:33:09 UTC
  • Revision ID: james.westby@ubuntu.com-20070413093309-yoojh4jcoocu2krz
Tags: upstream-1.0.0
Import upstream version 1.0.0

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
import unittest, StringIO, robotparser
 
2
from test import test_support
 
3
 
 
4
class RobotTestCase(unittest.TestCase):
 
5
    def __init__(self, index, parser, url, good, agent):
 
6
        unittest.TestCase.__init__(self)
 
7
        if good:
 
8
            self.str = "RobotTest(%d, good, %s)" % (index, url)
 
9
        else:
 
10
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
 
11
        self.parser = parser
 
12
        self.url = url
 
13
        self.good = good
 
14
        self.agent = agent
 
15
 
 
16
    def runTest(self):
 
17
        if isinstance(self.url, tuple):
 
18
            agent, url = self.url
 
19
        else:
 
20
            url = self.url
 
21
            agent = self.agent
 
22
        if self.good:
 
23
            self.failUnless(self.parser.can_fetch(agent, url))
 
24
        else:
 
25
            self.failIf(self.parser.can_fetch(agent, url))
 
26
 
 
27
    def __str__(self):
 
28
        return self.str
 
29
 
 
30
# Module-level suite; RobotTest() below appends one case per checked URL.
tests = unittest.TestSuite()
 
31
 
 
32
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    """Parse robots_txt and register one RobotTestCase in the module suite
    for every URL: good_urls must be fetchable, bad_urls must be blocked."""
    parser = robotparser.RobotFileParser()
    parser.parse(StringIO.StringIO(robots_txt).readlines())
    # Good URLs first, then bad ones, matching the original addTest order.
    for expected, urls in ((1, good_urls), (0, bad_urls)):
        for url in urls:
            tests.addTest(RobotTestCase(index, parser, url, expected, agent))
 
42
 
 
43
# Fixture data: each numbered section builds a robots.txt body plus the
# URLs expected to be allowed (good) and denied (bad), then registers the
# cases via RobotTest().  The literal strings ARE the test inputs.

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

# Case 5 reuses the same fixture with a longer agent string that should
# still match the "figtree" record.
RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)
 
136
 
 
137
def test_main():
    """regrtest entry point: run every case accumulated in 'tests'."""
    test_support.run_suite(tests)
 
139
 
 
140
if __name__ == '__main__':
    # Fix: the verbosity flag in test.test_support is lowercase 'verbose';
    # the original assigned 'Verbose', which only created an unused
    # attribute, so verbose output was silently never enabled.
    test_support.verbose = 1
    # Delegate to test_main() instead of duplicating its run_suite call.
    test_main()