~pythonregexp2.7/python/issue2636-11

« back to all changes in this revision

Viewing changes to Lib/robotparser.py

  • Committer: Jeffrey C. "The TimeHorse" Jacobs
  • Date: 2008-09-21 17:53:26 UTC
  • mfrom: (39025.1.14 Regexp-2.7)
  • Revision ID: darklord@timehorse.com-20080921175326-92vaej2hc3yuecxb
Merged in changes from the core Regexp branch.

Show diffs side-by-side

added

removed

Lines of Context:
55
55
        """Reads the robots.txt URL and feeds it to the parser."""
56
56
        opener = URLopener()
57
57
        f = opener.open(self.url)
58
 
        lines = []
59
 
        line = f.readline()
60
 
        while line:
61
 
            lines.append(line.strip())
62
 
            line = f.readline()
 
58
        lines = [line.strip() for line in f]
 
59
        f.close()
63
60
        self.errcode = opener.errcode
64
61
        if self.errcode in (401, 403):
65
62
            self.disallow_all = True
79
76
        """parse the input lines from a robots.txt file.
80
77
           We allow that a user-agent: line is not preceded by
81
78
           one or more blank lines."""
 
79
        # states:
 
80
        #   0: start state
 
81
        #   1: saw user-agent line
 
82
        #   2: saw an allow or disallow line
82
83
        state = 0
83
84
        linenumber = 0
84
85
        entry = Entry()
85
86
 
86
87
        for line in lines:
87
 
            linenumber = linenumber + 1
 
88
            linenumber += 1
88
89
            if not line:
89
90
                if state == 1:
90
91
                    entry = Entry()
117
118
                elif line[0] == "allow":
118
119
                    if state != 0:
119
120
                        entry.rulelines.append(RuleLine(line[1], True))
 
121
                        state = 2
120
122
        if state == 2:
121
123
            self.entries.append(entry)
122
124