1
--- urlgrabber-3.9.1/urlgrabber/grabber.py.orig 2010-07-02 21:24:12.000000000 -0400
2
+++ urlgrabber-3.9.1/urlgrabber/grabber.py 2010-07-02 20:30:25.000000000 -0400
4
(which can be set on default_grabber.throttle) is used. See
5
BANDWIDTH THROTTLING for more information.
10
- a positive float expressing the number of seconds to wait for socket
11
- operations. If the value is None or 0.0, socket operations will block
12
- forever. Setting this option causes urlgrabber to call the settimeout
13
- method on the Socket object used for the request. See the Python
14
- documentation on settimeout for more information.
15
- http://www.python.org/doc/current/lib/socket-objects.html
16
+ a positive integer expressing the number of seconds to wait before
17
+ timing out attempts to connect to a server. If the value is None
18
+ or 0, connection attempts will not time out. The timeout is passed
19
+ to the underlying pycurl object as its CONNECTTIMEOUT option, see
20
+ the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
21
+ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
30
+ # this part isn't going to do much - need to talk to gettext
32
+except ImportError, msg:
33
+ def _(st): return st
35
########################################################################
36
# functions for debugging output. These functions are here because they
37
# are also part of the module initialization.
41
self.cache_openers = True
45
self.http_headers = None
46
self.ftp_headers = None
47
@@ -1052,9 +1058,15 @@
48
self._reget_length = 0
49
self._prog_running = False
50
self._error = (None, None)
53
+ self._hdr_ended = False
58
+ """ Provide the geturl() method, which used to be obtained from
59
+ urllib.addinfourl via urllib.URLopener.* """
62
def __getattr__(self, name):
63
"""This effectively allows us to wrap at the instance level.
64
@@ -1085,9 +1097,14 @@
67
def _hdr_retrieve(self, buf):
71
+ self._hdr_ended = False
73
if self._over_max_size(cur=len(self._hdr_dump),
74
max_size=self.opts.max_header_size):
79
# we have to get the size before we do the progress obj start
80
@@ -1104,7 +1121,17 @@
86
+ if buf.lower().find('location') != -1:
87
+ location = ':'.join(buf.split(':')[1:])
88
+ location = location.strip()
89
+ self.scheme = urlparse.urlsplit(location)[0]
92
+ if len(self._hdr_dump) != 0 and buf == '\r\n':
93
+ self._hdr_ended = True
94
+ if DEBUG: DEBUG.info('header ended:')
97
except KeyboardInterrupt:
98
return pycurl.READFUNC_ABORT
99
@@ -1113,8 +1140,10 @@
101
return self._parsed_hdr
102
statusend = self._hdr_dump.find('\n')
103
+ statusend += 1 # ridiculous as it may seem.
105
hdrfp.write(self._hdr_dump[statusend:])
107
self._parsed_hdr = mimetools.Message(hdrfp)
108
return self._parsed_hdr
110
@@ -1136,6 +1165,7 @@
111
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
112
self.curl_obj.setopt(pycurl.FAILONERROR, True)
113
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
114
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
117
self.curl_obj.setopt(pycurl.VERBOSE, True)
118
@@ -1148,9 +1178,11 @@
123
- timeout = int(opts.timeout)
124
- self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
125
+ if hasattr(opts, 'timeout'):
126
+ timeout = int(opts.timeout or 0)
127
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
128
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
129
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
132
if self.scheme == 'https':
133
@@ -1276,7 +1308,7 @@
137
- msg = _("client cert cannot be verified or client cert incorrect")
138
+ msg = _("Peer cert cannot be verified or peer cert invalid")
139
err = URLGrabError(14, msg)
142
@@ -1291,7 +1323,12 @@
145
elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
146
- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
147
+ if self.scheme in ['http', 'https']:
148
+ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
149
+ elif self.scheme in ['ftp']:
150
+ msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
152
+ msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
154
msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
156
@@ -1299,6 +1336,12 @@
162
+ msg = self._error[1]
163
+ err = URLGrabError(14, msg)
168
self.curl_obj = _curl_cache
169
@@ -1446,9 +1489,23 @@
171
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
173
- os.utime(self.filename, (mod_time, mod_time))
175
+ os.utime(self.filename, (mod_time, mod_time))
177
+ err = URLGrabError(16, _(\
178
+ 'error setting timestamp on file %s from %s, OSError: %s')
179
+ % (self.filename, self.url, e))
183
- self.fo = open(self.filename, 'r')
185
+ self.fo = open(self.filename, 'r')
187
+ err = URLGrabError(16, _(\
188
+ 'error opening file from %s, IOError: %s') % (self.url, e))
193
#self.fo = open(self._temp_name, 'r')
195
@@ -1532,11 +1589,14 @@
196
def _over_max_size(self, cur, max_size=None):
199
- max_size = self.size
200
- if self.opts.size: # if we set an opts size use that, no matter what
201
- max_size = self.opts.size
202
+ if not self.opts.size:
203
+ max_size = self.size
205
+ max_size = self.opts.size
207
if not max_size: return False # if we have None for all of the Max then this is dumb
208
- if cur > max_size + max_size*.10:
210
+ if cur > int(float(max_size) * 1.10):
212
msg = _("Downloaded more than max size for %s: %s > %s") \
213
% (self.url, cur, max_size)
214
@@ -1582,9 +1642,21 @@
215
self.opts.progress_obj.end(self._amount_read)
220
+ """ Provide the geturl() method, which used to be obtained from
221
+ urllib.addinfourl via urllib.URLopener.* """
224
_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
226
+def reset_curl_obj():
227
+ """To make sure curl has reread the network/dns info we force a reload"""
229
+ _curl_cache.close()
230
+ _curl_cache = pycurl.Curl()
235
#####################################################################
236
# DEPRECATED FUNCTIONS