#   This library is free software; you can redistribute it and/or
#   modify it under the terms of the GNU Lesser General Public
#   License as published by the Free Software Foundation; either
#   version 2.1 of the License, or (at your option) any later version.
#
#   This library is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public
#   License along with this library; if not, write to the
#      Free Software Foundation, Inc.,
#      59 Temple Place, Suite 330,
#      Boston, MA  02111-1307  USA
#
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko

"""A high-level cross-protocol url-grabber.

GENERAL ARGUMENTS (kwargs)

  Where possible, the module-level default is indicated, and legal
  values for a given option are enumerated in a comment.

  copy_local = 0   [0|1]

    ignored except for file:// urls, in which case it specifies
    whether urlgrab should still make a copy of the file, or simply
    point to the existing copy.  The module level default for this
    option is 0.

  close_connection = 0   [0|1]

    tells URLGrabber to close the connection after a file has been
    transferred.  This is ignored unless the download happens with the
    http keepalive handler (keepalive=1).  Otherwise, the connection
    is left open for further use.  The module level default for this
    option is 0 (keepalive connections will not be closed).

  keepalive = 1   [0|1]

    specifies whether keepalive should be used for HTTP/1.1 servers
    that support it.  The module level default for this option is 1
    (keepalive is enabled).

  progress_obj = None

    a class instance that supports the following methods:
      po.start(filename, url, basename, length, text)
      # length will be None if unknown
      po.update(read) # read == bytes read so far
      po.end(read)
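
    For example, a minimal progress object might look like this (an
    illustrative sketch, not part of urlgrabber):

      class SimpleMeter:
          def start(self, filename, url, basename, length, text):
              self.text = text or basename
          def update(self, read):
              print '%s: %s bytes read' % (self.text, read)
          def end(self, read):
              print '%s: done (%s bytes)' % (self.text, read)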

  text = None

    specifies alternative text to be passed to the progress meter
    object.  If not given, the default progress meter will use the
    basename of the file.

  throttle = 1.0

    a number - if it's an int, it's the bytes/second throttle limit.
    If it's a float, it is first multiplied by bandwidth.  If throttle
    == 0, throttling is disabled.  If None, the module-level default
    (which can be set on default_grabber.throttle) is used.  See
    BANDWIDTH THROTTLING for more information.

  timeout = None

    a positive float expressing the number of seconds to wait for socket
    operations.  If the value is None or 0.0, socket operations will block
    forever.  Setting this option causes urlgrabber to call the settimeout
    method on the Socket object used for the request.  See the Python
    documentation on settimeout for more information.
    http://www.python.org/doc/current/lib/socket-objects.html

  bandwidth = 0

    the nominal max bandwidth in bytes/second.  If throttle is a float
    and bandwidth == 0, throttling is disabled.  If None, the
    module-level default (which can be set on
    default_grabber.bandwidth) is used.  See BANDWIDTH THROTTLING for
    more information.

  range = None

    a tuple of the form (first_byte, last_byte) describing a byte
    range to retrieve.  Either or both of the values may be set to
    None.  If first_byte is None, byte offset 0 is assumed.  If
    last_byte is None, the last byte available is assumed.  Note that
    the range specification is python-like in that (0,10) will yield
    the first 10 bytes of the file.

    If set to None, no range will be used.

  reget = None   [None|'simple'|'check_timestamp']

    whether to attempt to reget a partially-downloaded file.  Reget
    only applies to .urlgrab and (obviously) only if there is a
    partially downloaded file.  Reget has two modes:

      'simple' -- the local file will always be trusted.  If there
        are 100 bytes in the local file, then the download will always
        begin 100 bytes into the requested file.

      'check_timestamp' -- the timestamp of the server file will be
        compared to the timestamp of the local file.  ONLY if the
        local file is newer than or the same age as the server file
        will reget be used.  If the server file is newer, or the
        timestamp is not returned, the entire file will be fetched.

    NOTE: urlgrabber can do very little to verify that the partial
    file on disk is identical to the beginning of the remote file.
    You may want to either employ a custom "checkfunc" or simply avoid
    using reget in situations where corruption is a concern.
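
    For example, to resume an interrupted download while trusting
    whatever is already on disk (the url and filename are illustrative):

      g = URLGrabber(reget='simple')
      g.urlgrab('http://foo.com/some/big/file', '/tmp/bigfile')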

  user_agent = 'urlgrabber/VERSION'

    a string, usually of the form 'AGENT/VERSION' that is provided to
    HTTP servers in the User-agent header.  The module level default
    for this option is "urlgrabber/VERSION".

  http_headers = None

    a tuple of 2-tuples, each containing a header and value.  These
    will be used for http and https requests only.  For example, you
    can do
      http_headers = (('Pragma', 'no-cache'),)

  ftp_headers = None

    this is just like http_headers, but will be used for ftp requests.

  proxies = None

    a dictionary that maps protocol schemes to proxy hosts.  For
    example, to use a proxy server on host "foo" port 3128 for http
    and https URLs:
      proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
    note that proxy authentication information may be provided using
    normal URL constructs:
      proxies={ 'http' : 'http://user:password@foo:3128' }
    Lastly, if proxies is None, the default environment settings will
    be used.

  prefix = None

    a url prefix that will be prepended to all requested urls.  For
    example:
      g = URLGrabber(prefix='http://foo.com/mirror/')
      g.urlgrab('some/file.txt')
      ## this will fetch 'http://foo.com/mirror/some/file.txt'
    This option exists primarily to allow identical behavior to
    MirrorGroup (and derived) instances.  Note: a '/' will be inserted
    if necessary, so you cannot specify a prefix that ends with a
    partial file or directory name.

  opener = None

    Overrides the default urllib2.OpenerDirector provided to urllib2
    when making requests.  This option exists so that the urllib2
    handler chain may be customized.  Note that the range, reget,
    proxy, and keepalive features require that custom handlers be
    provided to urllib2 in order to function properly.  If an opener
    option is provided, no attempt is made by urlgrabber to ensure
    chain integrity.  You are responsible for ensuring that any
    extension handlers are present if said features are required.

  cache_openers = True

    controls whether urllib2 openers should be cached and reused, or
    whether they should be created each time.  There's a modest
    overhead in recreating them, but it's slightly safer to do so if
    you're modifying the handlers between calls.

  data = None

    Only relevant for the HTTP family (and ignored for other
    protocols), this allows HTTP POSTs.  When the data kwarg is
    present (and not None), an HTTP request will automatically become
    a POST rather than GET.  This is done by direct passthrough to
    urllib2.  If you use this, you may also want to set the
    'Content-length' and 'Content-type' headers with the http_headers
    option.  Note that python 2.2 handles the case of these headers
    badly and if you do not use the proper case (shown here), your
    values will be overridden with the defaults.
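
    For example, an urlencoded POST might look like this (illustrative;
    the url and form data are made up):

      g.urlgrab('http://foo.com/cgi-bin/query', 'out.html',
                data='q=urlgrabber&max=20',
                http_headers=(('Content-type',
                               'application/x-www-form-urlencoded'),))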

  urlparser = URLParser()

    The URLParser class handles pre-processing of URLs, including
    auth-handling for user/pass encoded in http urls, file handling
    (that is, filenames not sent as a URL), and URL quoting.  If you
    want to override any of this behavior, you can pass in a
    replacement instance.  See also the 'quote' option.

  quote = None

    Whether or not to quote the path portion of a url.
      quote = 1    ->  quote the URLs (they're not quoted yet)
      quote = 0    ->  do not quote them (they're already quoted)
      quote = None ->  guess what to do

    This option only affects proper urls like 'file:///etc/passwd'; it
    does not affect 'raw' filenames like '/etc/passwd'.  The latter
    will always be quoted as they are converted to URLs.  Also, only
    the path part of a url is quoted.  If you need more fine-grained
    control, you should probably subclass URLParser and pass it in via
    the 'urlparser' option.

  ssl_ca_cert = None

    this option can be used if M2Crypto is available and will be
    ignored otherwise.  If provided, it will be used to create an SSL
    context.  If both ssl_ca_cert and ssl_context are provided, then
    ssl_context will be ignored and a new context will be created from
    ssl_ca_cert.

  ssl_context = None

    this option can be used if M2Crypto is available and will be
    ignored otherwise.  If provided, this SSL context will be used.
    If both ssl_ca_cert and ssl_context are provided, then ssl_context
    will be ignored and a new context will be created from
    ssl_ca_cert.


RETRY RELATED ARGUMENTS

  retry = None

    the number of times to retry the grab before bailing.  If this is
    zero, it will retry forever.  This was intentional... really, it
    was :).  If this value is not supplied or is supplied but is None,
    retrying does not occur.

  retrycodes = [-1,2,4,5,6,7]

    a sequence of errorcodes (values of e.errno) for which it should
    retry.  See the doc on URLGrabError for more details on this.  You
    might consider modifying a copy of the default codes rather than
    building yours from scratch so that if the list is extended in the
    future (or one code is split into two) you can still enjoy the
    benefits of the default list.  You can do that with something like
    this:

      retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
      if 12 not in retrycodes:
          retrycodes.append(12)

  checkfunc = None

    a function to do additional checks.  This defaults to None, which
    means no additional checking.  The function should simply return
    on a successful check.  It should raise URLGrabError on an
    unsuccessful check.  Raising of any other exception will be
    considered immediate failure and no retries will occur.

    If it raises URLGrabError, the error code will determine the retry
    behavior.  Negative error numbers are reserved for use by these
    passed in functions, so you can use many negative numbers for
    different types of failure.  By default, -1 results in a retry,
    but this can be customized with retrycodes.

    If you simply pass in a function, it will be given exactly one
    argument: a CallbackObject instance with the .url attribute
    defined and either .filename (for urlgrab) or .data (for urlread).
    For urlgrab, .filename is the name of the local file.  For
    urlread, .data is the actual string data.  If you need other
    arguments passed to the callback (program state of some sort), you
    can do so like this:

      checkfunc=(function, ('arg1', 2), {'kwarg': 3})

    if the downloaded file has filename /tmp/stuff, then this will
    result in this call (for urlgrab):

      function(obj, 'arg1', 2, kwarg=3)
      # obj.filename = '/tmp/stuff'
      # obj.url = 'http://foo.com/stuff'

    NOTE: both the "args" tuple and "kwargs" dict must be present if
    you use this syntax, but either (or both) can be empty.
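
    For example, a checker that rejects suspiciously small downloads
    might look like this (an illustrative sketch; the minimum size is
    arbitrary):

      def size_check(obj, min_size):
          if os.path.getsize(obj.filename) < min_size:
              raise URLGrabError(-1, 'file too small, retrying')

      g.urlgrab(url, filename, checkfunc=(size_check, (10,), {}))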

  failure_callback = None

    The callback that gets called during retries when an attempt to
    fetch a file fails.  The syntax for specifying the callback is
    identical to checkfunc, except for the attributes defined in the
    CallbackObject instance.  The attributes for failure_callback are:

      exception = the raised exception
      url       = the url we're trying to fetch
      tries     = the number of tries so far (including this one)
      retry     = the value of the retry option

    The callback is present primarily to inform the calling program of
    the failure, but if it raises an exception (including the one it's
    passed) that exception will NOT be caught and will therefore cause
    future retries to be aborted.

    The callback is called for EVERY failure, including the last one.
    On the last try, the callback can raise an alternate exception,
    but it cannot (without severe trickiness) prevent the exception
    from being raised.
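
    For example, a simple logging callback might look like this (an
    illustrative sketch):

      def log_failure(obj):
          print 'try %s/%s failed for %s: %s' % \
                (obj.tries, obj.retry, obj.url, obj.exception)

      g.urlgrab(url, filename, retry=3, failure_callback=log_failure)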

  interrupt_callback = None

    This callback is called if KeyboardInterrupt is received at any
    point in the transfer.  Basically, this callback can have three
    impacts on the fetch process based on the way it exits:

      1) raise no exception: the current fetch will be aborted, but
         any further retries will still take place

      2) raise a URLGrabError: if you're using a MirrorGroup, then
         this will prompt a failover to the next mirror according to
         the behavior of the MirrorGroup subclass.  It is recommended
         that you raise URLGrabError with code 15, 'user abort'.  If
         you are NOT using a MirrorGroup subclass, then this is the
         same as (3).

      3) raise some other exception (such as KeyboardInterrupt), which
         will not be caught at either the grabber or mirror levels.
         That is, it will be raised up all the way to the caller.

    This callback is very similar to failure_callback.  They are
    passed the same arguments, so you could use the same function for
    both.

BANDWIDTH THROTTLING

  urlgrabber supports throttling via two values: throttle and
  bandwidth.  Between the two, you can either specify an absolute
  throttle threshold or specify a threshold as a fraction of maximum
  available bandwidth.

  throttle is a number - if it's an int, it's the bytes/second
  throttle limit.  If it's a float, it is first multiplied by
  bandwidth.  If throttle == 0, throttling is disabled.  If None, the
  module-level default (which can be set with set_throttle) is used.

  bandwidth is the nominal max bandwidth in bytes/second.  If throttle
  is a float and bandwidth == 0, throttling is disabled.  If None, the
  module-level default (which can be set with set_bandwidth) is used.

  THROTTLING EXAMPLES:

  Let's say you have a 100 Mbps connection.  This is (about) 10^8 bits
  per second, or 12,500,000 Bytes per second.  You have a number of
  throttling options:

  *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float

     This will limit urlgrab to use half of your available bandwidth.

  *) set_throttle(6250000) # throttle is an int

     This will also limit urlgrab to use half of your available
     bandwidth, regardless of what bandwidth is set to.

  *) set_bandwidth(6250000); set_throttle(1.0) # float

     Use half your bandwidth

  *) set_bandwidth(6250000); set_throttle(2.0) # float

     Use up to 12,500,000 Bytes per second (your nominal max bandwidth)

  *) set_bandwidth(6250000); set_throttle(0) # throttle = 0

     Disable throttling - this is more efficient than a very large
     throttle setting.

  *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0

     Disable throttling - this is the default when the module is loaded.

SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)

  While this is flexible, it's not extremely obvious to the user.  I
  suggest you implement a float throttle as a percent to make the
  distinction between absolute and relative throttling very explicit.

  Also, you may want to convert the units to something more convenient
  than bytes/second, such as kbps or kB/s, etc.
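
  For example, such a wrapper might look like this (an illustrative
  sketch; set_throttle_percent is not part of urlgrabber):

    def set_throttle_percent(percent):
        # percent runs 0-100; a float throttle is relative to bandwidth
        set_throttle(percent / 100.0)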
"""

# $Id: grabber.py,v 1.52 2006/12/12 19:08:46 mstenner Exp $

import os
import os.path
import sys
import urlparse
import rfc822
import time
import string
import urllib
import urllib2
import socket
import thread
from stat import * # S_* and ST_*

import sslfactory

########################################################################
#                     MODULE INITIALIZATION
########################################################################
try:
    exec('from ' + (__name__.split('.'))[0] + ' import __version__')
except:
    __version__ = '???'

auth_handler = urllib2.HTTPBasicAuthHandler( \
     urllib2.HTTPPasswordMgrWithDefaultRealm())

try:
    from i18n import _
except ImportError, msg:
    def _(st): return st

try:
    from httplib import HTTPException
except ImportError, msg:
    HTTPException = None

try:
    # This is a convenient way to make keepalive optional.
    # Just rename the module so it can't be imported.
    import keepalive
    from keepalive import HTTPHandler, HTTPSHandler
    have_keepalive = True
    keepalive_http_handler = HTTPHandler()
except ImportError, msg:
    have_keepalive = False
    keepalive_http_handler = None

try:
    # add in range support conditionally too
    import byterange
    from byterange import HTTPRangeHandler, HTTPSRangeHandler, \
         FileRangeHandler, FTPRangeHandler, range_tuple_normalize, \
         range_tuple_to_header, RangeError
except ImportError, msg:
    range_handlers = ()
    RangeError = None
    have_range = 0
else:
    range_handlers = (HTTPRangeHandler(), HTTPSRangeHandler(),
                      FileRangeHandler(), FTPRangeHandler())
    have_range = 1


# check whether socket timeout support is available (Python >= 2.3)
try:
    TimeoutError = socket.timeout
    have_socket_timeout = True
except AttributeError:
    TimeoutError = None
    have_socket_timeout = False

########################################################################
# functions for debugging output.  These functions are here because they
# are also part of the module initialization.
DEBUG = None
def set_logger(DBOBJ):
    """Set the DEBUG object.  This is called by _init_default_logger when
    the environment variable URLGRABBER_DEBUG is set, but can also be
    called by a calling program.  Basically, if the calling program uses
    the logging module and would like to incorporate urlgrabber logging,
    then it can do so this way.  It's probably not necessary as most
    internal logging is only for debugging purposes.

    The passed-in object should be a logging.Logger instance.  It will
    be pushed into the keepalive and byterange modules if they're
    being used.  The mirror module pulls this object in on import, so
    you will need to manually push into it.  In fact, you may find it
    tidier to simply push your logging object (or objects) into each
    of these modules independently.
    """
    global DEBUG
    DEBUG = DBOBJ
    if have_keepalive and keepalive.DEBUG is None:
        keepalive.DEBUG = DBOBJ
    if have_range and byterange.DEBUG is None:
        byterange.DEBUG = DBOBJ
    if sslfactory.DEBUG is None:
        sslfactory.DEBUG = DBOBJ
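
# An example of pushing in a logger from a calling program (a sketch;
# the logger name is up to the caller):
#
#   import logging
#   logging.basicConfig()
#   set_logger(logging.getLogger('my-app.urlgrabber'))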

def _init_default_logger(logspec=None):
    '''Examines the environment variable URLGRABBER_DEBUG and creates
    a logging object (logging.logger) based on the contents.  It takes
    the form

      URLGRABBER_DEBUG=level,filename

    where "level" can be either an integer or a log level from the
    logging module (DEBUG, INFO, etc).  If the integer is zero or
    less, logging will be disabled.  Filename is the filename where
    logs will be sent.  If it is "-", then stdout will be used.  If
    the filename is empty or missing, stderr will be used.  If the
    variable cannot be processed or the logging module cannot be
    imported (python < 2.3) then logging will be disabled.  Here are
    some examples:

      URLGRABBER_DEBUG=1,debug.txt   # log everything to debug.txt
      URLGRABBER_DEBUG=WARNING,-     # log warning and higher to stdout
      URLGRABBER_DEBUG=INFO          # log info and higher to stderr

    This function is called during module initialization.  It is not
    intended to be called from outside.  The only reason it is a
    function at all is to keep the module-level namespace tidy and to
    collect the code into a nice block.'''
    try:
        if logspec is None:
            logspec = os.environ['URLGRABBER_DEBUG']
        dbinfo = logspec.split(',')
        import logging
        level = logging._levelNames.get(dbinfo[0], None)
        if level is None: level = int(dbinfo[0])
        if level < 1: raise ValueError()

        formatter = logging.Formatter('%(asctime)s %(message)s')
        if len(dbinfo) > 1: filename = dbinfo[1]
        else: filename = ''
        if filename == '': handler = logging.StreamHandler(sys.stderr)
        elif filename == '-': handler = logging.StreamHandler(sys.stdout)
        else: handler = logging.FileHandler(filename)
        handler.setFormatter(formatter)
        DBOBJ = logging.getLogger('urlgrabber')
        DBOBJ.addHandler(handler)
        DBOBJ.setLevel(level)
    except (KeyError, ImportError, ValueError):
        DBOBJ = None
    set_logger(DBOBJ)

def _log_package_state():
    if not DEBUG: return
    DEBUG.info('urlgrabber version  = %s' % __version__)
    DEBUG.info('have_m2crypto       = %s' % sslfactory.have_m2crypto)
    DEBUG.info('trans function "_"  = %s' % _)
    DEBUG.info('have_keepalive      = %s' % have_keepalive)
    DEBUG.info('have_range          = %s' % have_range)
    DEBUG.info('have_socket_timeout = %s' % have_socket_timeout)

_init_default_logger()
_log_package_state()

########################################################################
#                 END MODULE INITIALIZATION
########################################################################


class URLGrabError(IOError):
    """
    URLGrabError error codes:

      URLGrabber error codes (0 -- 255)
        0    - everything looks good (you should never see this)
        1    - malformed url
        2    - local file doesn't exist
        3    - request for non-file local file (dir, etc)
        4    - IOError on fetch
        5    - OSError on fetch
        6    - no content length header when we expected one
        7    - HTTPException
        8    - Exceeded read limit (for urlread)
        9    - Requested byte range not satisfiable.
        10   - Byte range requested, but range support unavailable
        11   - Illegal reget mode
        12   - Socket timeout
        13   - malformed proxy url
        14   - HTTPError (includes .code and .exception attributes)
        15   - user abort
        16   - error writing to local file

      MirrorGroup error codes (256 -- 511)
        256  - No more mirrors left to try

      Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
        [ this range reserved for application-specific error codes ]

      Retry codes (< 0)
        -1   - retry the download, unknown reason

    Note: to test which group a code is in, you can simply do integer
    division by 256: e.errno / 256

    Negative codes are reserved for use by functions passed in to
    retrygrab with checkfunc.  The value -1 is built in as a generic
    retry code and is already included in the retrycodes list.
    Therefore, you can create a custom check function that simply
    returns -1 and the fetch will be re-tried.  For more customized
    retries, you can use other negative numbers and include them in
    retrycodes.  This is nice for outputting useful messages about
    what failed.

    You can use these error codes like so:
      try: urlgrab(url)
      except URLGrabError, e:
          if e.errno == 3: ...
          # or
          print e.strerror
          # or simply
          print e  #### print '[Errno %i] %s' % (e.errno, e.strerror)
    """
    pass

class CallbackObject:
    """Container for returned callback data.

    This is currently a dummy class into which urlgrabber can stuff
    information for passing to callbacks.  This way, the prototype for
    all callbacks is the same, regardless of the data that will be
    passed back.  Any function that accepts a callback function as an
    argument SHOULD document what it will define in this object.

    It is possible that this class will have some greater
    functionality in the future.
    """
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
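
# For example, attributes are simply set from keyword args (a sketch):
#   obj = CallbackObject(url='http://foo.com/stuff', filename='/tmp/stuff')
#   obj.url       -> 'http://foo.com/stuff'
#   obj.filename  -> '/tmp/stuff'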

def urlgrab(url, filename=None, **kwargs):
    """grab the file at <url> and make a local copy at <filename>
    If filename is none, the basename of the url is used.
    urlgrab returns the filename of the local file, which may be different
    from the passed-in filename if the copy_local kwarg == 0.

    See module documentation for a description of possible kwargs.
    """
    return default_grabber.urlgrab(url, filename, **kwargs)

def urlopen(url, **kwargs):
    """open the url and return a file object
    If a progress object or throttle specifications exist, then
    a special file object will be returned that supports them.
    The file object can be treated like any other file object.

    See module documentation for a description of possible kwargs.
    """
    return default_grabber.urlopen(url, **kwargs)

def urlread(url, limit=None, **kwargs):
    """read the url into a string, up to 'limit' bytes
    If the limit is exceeded, an exception will be thrown.  Note that urlread
    is NOT intended to be used as a way of saying "I want the first N bytes"
    but rather 'read the whole file into memory, but don't use too much'

    See module documentation for a description of possible kwargs.
    """
    return default_grabber.urlread(url, limit, **kwargs)
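
# Example use of the module-level convenience functions (a sketch; the
# url is illustrative):
#
#   try:
#       fname = urlgrab('http://foo.com/file.txt', '/tmp/file.txt')
#       data  = urlread('http://foo.com/file.txt', limit=1024*1024)
#   except URLGrabError, e:
#       print e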

class URLParser:
    """Process the URLs before passing them to urllib2.

    This class does several things:

      * translate a "raw" file to a proper file: url
      * handle any http or https auth that's encoded within the url
      * quote the url

    Only the "parse" method is called directly, and it calls sub-methods.

    An instance of this class is held in the options object, which
    means that it's easy to change the behavior by sub-classing and
    passing the replacement in.  It need only have a method like:

        url, parts = urlparser.parse(url, opts)
    """
    def parse(self, url, opts):
        """parse the url and return the (modified) url and its parts

        Note: a raw file WILL be quoted when it's converted to a URL.
        However, other urls (ones which come with a proper scheme) may
        or may not be quoted according to opts.quote

          opts.quote = 1     --> quote it
          opts.quote = 0     --> do not quote it
          opts.quote = None  --> guess
        """
        quote = opts.quote

        if opts.prefix:
            url = self.add_prefix(url, opts.prefix)

        parts = urlparse.urlparse(url)
        (scheme, host, path, parm, query, frag) = parts

        if not scheme or (len(scheme) == 1 and scheme in string.letters):
            # if a scheme isn't specified, we guess that it's "file:"
            if url[0] not in '/\\': url = os.path.abspath(url)
            url = 'file:' + urllib.pathname2url(url)
            parts = urlparse.urlparse(url)
            quote = 0 # pathname2url quotes, so we won't do it again

        if scheme in ['http', 'https']:
            parts = self.process_http(parts)

        if quote is None:
            quote = self.guess_should_quote(parts)
        if quote:
            parts = self.quote(parts)

        url = urlparse.urlunparse(parts)
        return url, parts

    def add_prefix(self, url, prefix):
        if prefix[-1] == '/' or url[0] == '/':
            url = prefix + url
        else:
            url = prefix + '/' + url
        return url

    def process_http(self, parts):
        (scheme, host, path, parm, query, frag) = parts

        if '@' in host and auth_handler:
            try:
                user_pass, host = host.split('@', 1)
                if ':' in user_pass:
                    user, password = user_pass.split(':', 1)
            except ValueError, e:
                raise URLGrabError(1, _('Bad URL: %s') % \
                                   urlparse.urlunparse(parts))
            if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password)
            auth_handler.add_password(None, host, user, password)

        return (scheme, host, path, parm, query, frag)

    def quote(self, parts):
        """quote the URL

        This method quotes ONLY the path part.  If you need to quote
        other parts, you should override this and pass in your derived
        class.  The other alternative is to quote other parts before
        passing into urlgrabber.
        """
        (scheme, host, path, parm, query, frag) = parts
        path = urllib.quote(path)
        return (scheme, host, path, parm, query, frag)

    hexvals = '0123456789ABCDEF'
    def guess_should_quote(self, parts):
        """
        Guess whether we should quote a path.  This amounts to
        guessing whether it's already quoted.

          find ' '   ->  1
          find '%'   ->  1
          find '%XX' ->  0
          else       ->  1
        """
        (scheme, host, path, parm, query, frag) = parts
        if ' ' in path:
            return 1
        ind = string.find(path, '%')
        if ind > -1:
            while ind > -1:
                if len(path) < ind+3:
                    return 1
                code = path[ind+1:ind+3].upper()
                if code[0] not in self.hexvals or \
                       code[1] not in self.hexvals:
                    return 1
                ind = string.find(path, '%', ind+1)
            return 0
        return 1
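
# For instance, parsing a raw filename yields a quoted file: url (a
# sketch; the path is illustrative):
#
#   url, parts = URLParser().parse('/etc/hosts', default_grabber.opts)
#   # url   -> 'file:///etc/hosts'
#   # parts -> ('file', '', '/etc/hosts', '', '', '')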

class URLGrabberOptions:
    """Class to ease kwargs handling."""

    def __init__(self, delegate=None, **kwargs):
        """Initialize URLGrabberOptions object.
        Set default values for all options and then update options specified
        in kwargs.
        """
        self.delegate = delegate
        if delegate is None:
            self._set_defaults()
        self._set_attributes(**kwargs)

    def __getattr__(self, name):
        if self.delegate and hasattr(self.delegate, name):
            return getattr(self.delegate, name)
        raise AttributeError, name

    def raw_throttle(self):
        """Calculate raw throttle value from throttle and bandwidth
        values.
        """
        if self.throttle <= 0:
            return 0
        elif type(self.throttle) == type(0):
            return float(self.throttle)
        else: # throttle is a float
            return self.bandwidth * self.throttle
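
    # e.g. with throttle=0.5 and bandwidth=12500000, raw_throttle()
    # returns 6250000.0 (half the nominal max bandwidth)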

    def derive(self, **kwargs):
        """Create a derived URLGrabberOptions instance.
        This method creates a new instance and overrides the
        options specified in kwargs.
        """
        return URLGrabberOptions(delegate=self, **kwargs)
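
    # For example (a sketch): per-call options delegate to the base
    # instance for anything not overridden:
    #   base = URLGrabberOptions(retry=3)
    #   per_call = base.derive(timeout=30.0)
    #   per_call.retry   -> 3 (via the delegate)
    #   per_call.timeout -> 30.0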

    def _set_attributes(self, **kwargs):
        """Update object attributes with those provided in kwargs."""
        self.__dict__.update(kwargs)
        if have_range and kwargs.has_key('range'):
            # normalize the supplied range value
            self.range = range_tuple_normalize(self.range)
        if not self.reget in [None, 'simple', 'check_timestamp']:
            raise URLGrabError(11, _('Illegal reget mode: %s') \
                               % (self.reget, ))

    def _set_defaults(self):
        """Set all options to their default values.
        When adding new options, make sure a default is
        provided here.
        """
        self.progress_obj = None
        self.throttle = 1.0
        self.bandwidth = 0
        self.retry = None
        self.retrycodes = [-1,2,4,5,6,7]
        self.checkfunc = None
        self.copy_local = 0
        self.close_connection = 0
        self.range = None
        self.user_agent = 'urlgrabber/%s' % __version__
        self.keepalive = 1
        self.proxies = None
        self.reget = None
        self.failure_callback = None
        self.interrupt_callback = None
        self.prefix = None
        self.opener = None
        self.cache_openers = True
        self.timeout = None
        self.text = None
        self.http_headers = None
        self.ftp_headers = None
        self.data = None
        self.urlparser = URLParser()
        self.quote = None
        self.ssl_ca_cert = None
        self.ssl_context = None

    def format(self, indent='  '):
        keys = self.__dict__.keys()
        if self.delegate is not None:
            keys.remove('delegate')
        keys.sort()
        s = '{\n'
        for k in keys:
            s = s + indent + '%-15s: %s,\n' % \
                (repr(k), repr(self.__dict__[k]))
        if self.delegate:
            df = self.delegate.format(indent + '  ')
            s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
        s = s + indent[:-2] + '}'
        return s

class URLGrabber:
    """Provides easy opening of URLs with a variety of options.

    All options are specified as kwargs.  Options may be specified when
    the class is created and may be overridden on a per request basis.

    New objects inherit default values from default_grabber.
    """

    def __init__(self, **kwargs):
        self.opts = URLGrabberOptions(**kwargs)

    def _retry(self, opts, func, *args):
        tries = 0
        while 1:
            # there are only two ways out of this loop.  The second has
            # several "sub-ways"
            #   1) via the return in the "try" block
            #   2) by some exception being raised
            #      a) an exception is raised that we don't "except"
            #      b) a callback raises ANY exception
            #      c) we're not retry-ing or have run out of retries
            #      d) the URLGrabError code is not in retrycodes
            # beware of infinite loops :)
            tries = tries + 1
            exception = None
            retrycode = None
            callback = None
            if DEBUG: DEBUG.info('attempt %i/%s: %s',
                                 tries, opts.retry, args[0])
            try:
                r = apply(func, (opts,) + args, {})
                if DEBUG: DEBUG.info('success')
                return r
            except URLGrabError, e:
                exception = e
                callback = opts.failure_callback
                retrycode = e.errno
            except KeyboardInterrupt, e:
                exception = e
                callback = opts.interrupt_callback

            if DEBUG: DEBUG.info('exception: %s', exception)
            if callback:
                if DEBUG: DEBUG.info('calling callback: %s', callback)
                cb_func, cb_args, cb_kwargs = self._make_callback(callback)
                obj = CallbackObject(exception=exception, url=args[0],
                                     tries=tries, retry=opts.retry)
                cb_func(obj, *cb_args, **cb_kwargs)

            if (opts.retry is None) or (tries == opts.retry):
                if DEBUG: DEBUG.info('retries exceeded, re-raising')
                raise

            if (retrycode is not None) and (retrycode not in opts.retrycodes):
                if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
                                     retrycode, opts.retrycodes)
                raise

    def urlopen(self, url, **kwargs):
        """open the url and return a file object
        If a progress object or throttle value specified when this
        object was created, then a special file object will be
        returned that supports them.  The file object can be treated
        like any other file object.
        """
        opts = self.opts.derive(**kwargs)
        if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
        (url,parts) = opts.urlparser.parse(url, opts)
        def retryfunc(opts, url):
            return URLGrabberFileObject(url, filename=None, opts=opts)
        return self._retry(opts, retryfunc, url)

    def urlgrab(self, url, filename=None, **kwargs):
        """grab the file at <url> and make a local copy at <filename>
        If filename is none, the basename of the url is used.
        urlgrab returns the filename of the local file, which may be
        different from the passed-in filename if copy_local == 0.
        """
        opts = self.opts.derive(**kwargs)
        if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
        (url,parts) = opts.urlparser.parse(url, opts)
        (scheme, host, path, parm, query, frag) = parts
        if filename is None:
            filename = os.path.basename( urllib.unquote(path) )
        if scheme == 'file' and not opts.copy_local:
            # just return the name of the local file - don't make a
            # copy currently
            path = urllib.url2pathname(path)
            if host:
                path = os.path.normpath('//' + host + path)
            if not os.path.exists(path):
                raise URLGrabError(2,
                      _('Local file does not exist: %s') % (path, ))
            elif not os.path.isfile(path):
                raise URLGrabError(3,
                              _('Not a normal file: %s') % (path, ))
            else:
                return path

        def retryfunc(opts, url, filename):
            fo = URLGrabberFileObject(url, filename, opts)
            try:
                fo._do_grab()
                if not opts.checkfunc is None:
                    cb_func, cb_args, cb_kwargs = \
                             self._make_callback(opts.checkfunc)
                    obj = CallbackObject()
                    obj.filename = filename
                    obj.url = url
                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
            finally:
                fo.close()
            return filename

        return self._retry(opts, retryfunc, url, filename)

    def urlread(self, url, limit=None, **kwargs):
        """read the url into a string, up to 'limit' bytes
        If the limit is exceeded, an exception will be thrown.  Note
        that urlread is NOT intended to be used as a way of saying
        "I want the first N bytes" but rather 'read the whole file
        into memory, but don't use too much'
        """
        opts = self.opts.derive(**kwargs)
        if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
        (url,parts) = opts.urlparser.parse(url, opts)
        if limit is not None:
            # read one extra byte so we can tell (below) whether the
            # limit was exceeded
            limit = limit + 1

        def retryfunc(opts, url, limit):
            fo = URLGrabberFileObject(url, filename=None, opts=opts)
            s = ''
            try:
                # this is an unfortunate thing.  Some file-like objects
                # have a default "limit" of None, while the built-in (real)
                # file objects have -1.  They each break the other, so for
                # now, we just force the default if necessary.
                if limit is None: s = fo.read()
                else: s = fo.read(limit)

                if not opts.checkfunc is None:
                    cb_func, cb_args, cb_kwargs = \
                             self._make_callback(opts.checkfunc)
                    obj = CallbackObject()
                    obj.data = s
                    obj.url = url
                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
            finally:
                fo.close()
            return s

        s = self._retry(opts, retryfunc, url, limit)
        if limit and len(s) > limit:
            raise URLGrabError(8,
                        _('Exceeded limit (%i): %s') % (limit, url))
        return s

    def _make_callback(self, callback_obj):
        if callable(callback_obj):
            return callback_obj, (), {}
        else:
            return callback_obj[0], callback_obj[1], callback_obj[2]
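
    # _make_callback accepts either a bare callable or a
    # (func, args, kwargs) triple, e.g. (a sketch; myfunc is made up):
    #   self._make_callback(myfunc)                     -> myfunc, (), {}
    #   self._make_callback((myfunc, ('a',), {'b': 2})) -> myfunc, ('a',), {'b': 2}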

# create the default URLGrabber used by urlXXX functions.
# NOTE: actual defaults are set in URLGrabberOptions
default_grabber = URLGrabber()

class URLGrabberFileObject:
    """This is a file-object wrapper that supports progress objects
    and throttling.

    This exists to solve the following problem: lets say you want to
    drop-in replace a normal open with urlopen.  You want to use a
    progress meter and/or throttling, but how do you do that without
    rewriting your code?  Answer: urlopen will return a wrapped file
    object that does the progress meter and-or throttling internally.
    """

    def __init__(self, url, filename, opts):
        self.url = url
        self.filename = filename
        self.opts = opts
        self.fo = None
        self._rbuf = ''
        self._rbufsize = 1024*8
        self._ttime = time.time()
        self._tsize = 0
        self._amount_read = 0
        self._opener = None
        self._do_open()

    def __getattr__(self, name):
        """This effectively allows us to wrap at the instance level.
        Any attribute not found in _this_ object will be searched for
        in self.fo.  This includes methods."""
        if hasattr(self.fo, name):
            return getattr(self.fo, name)
        raise AttributeError, name

    def _get_opener(self):
        """Build a urllib2 OpenerDirector based on request options."""
        if self.opts.opener:
            return self.opts.opener
        elif self._opener is None:
            handlers = []
            need_keepalive_handler = (have_keepalive and self.opts.keepalive)
            need_range_handler = (range_handlers and \
                                  (self.opts.range or self.opts.reget))
            # if you specify a ProxyHandler when creating the opener
            # it _must_ come before all other handlers in the list or urllib2
            # chokes.
            if self.opts.proxies:
                handlers.append( _proxy_handler_cache.get(self.opts.proxies) )

            # -------------------------------------------------------
            # OK, these next few lines are a serious kludge to get
            # around what I think is a bug in python 2.2's
            # urllib2.  The basic idea is that default handlers
            # get applied first.  If you override one (like a
            # proxy handler), then the default gets pulled, but
            # the replacement goes on the end.  In the case of
            # proxies, this means the normal handler picks it up
            # first and the proxy isn't used.  Now, this probably
            # only happened with ftp or non-keepalive http, so not
            # many folks saw it.  The simple approach to fixing it
            # is just to make sure you override the other
            # conflicting defaults as well.  I would LOVE to see
            # these go away or be dealt with more elegantly.  The
            # problem isn't there after 2.2.  -MDS 2005/02/24
            if not need_keepalive_handler:
                handlers.append( urllib2.HTTPHandler() )
            if not need_range_handler:
                handlers.append( urllib2.FTPHandler() )
            # -------------------------------------------------------

            ssl_factory = _ssl_factory_cache.get( (self.opts.ssl_ca_cert,
                                                   self.opts.ssl_context) )
            if need_keepalive_handler:
                handlers.append(keepalive_http_handler)
                handlers.append(_https_handler_cache.get(ssl_factory))
            if need_range_handler:
                handlers.extend( range_handlers )
            handlers.append( auth_handler )
            if self.opts.cache_openers:
                self._opener = _opener_cache.get([ssl_factory,] + handlers)
            else:
                self._opener = _opener_cache.create([ssl_factory,] + handlers)
            # OK, I don't like to do this, but otherwise, we end up with
            # TWO user-agent headers.
            self._opener.addheaders = []
        return self._opener

    def _do_open(self):
        opener = self._get_opener()

        req = urllib2.Request(self.url, self.opts.data) # build request object
        self._add_headers(req) # add misc headers that we need
        self._build_range(req) # take care of reget and byterange stuff

        fo, hdr = self._make_request(req, opener)
        if self.reget_time and self.opts.reget == 'check_timestamp':
            # do this if we have a local file with known timestamp AND
            # we're in check_timestamp reget mode.
            fetch_again = 0
            try:
                modified_tuple = hdr.getdate_tz('last-modified')
                modified_stamp = rfc822.mktime_tz(modified_tuple)
                if modified_stamp > self.reget_time: fetch_again = 1
            except (TypeError,):
                fetch_again = 1

            if fetch_again:
                # the server version is newer than the (incomplete) local
                # version, so we should abandon the version we're getting
                # and fetch the whole thing again.
                fo.close()
                self.opts.reget = None
                del req.headers['Range']
                self._build_range(req)
                fo, hdr = self._make_request(req, opener)

        (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url)
        path = urllib.unquote(path)
        if not (self.opts.progress_obj or self.opts.raw_throttle() \
                or self.opts.timeout):
            # if we're not using the progress_obj, throttling, or timeout
            # we can get a performance boost by going directly to
            # the underlying fileobject for reads.
            self.read = fo.read
            if hasattr(fo, 'readline'):
                self.readline = fo.readline
        elif self.opts.progress_obj:
            try:
                length = int(hdr['Content-Length'])
                length = length + self._amount_read # Account for regets
            except (KeyError, ValueError, TypeError):
                length = None

            self.opts.progress_obj.start(str(self.filename),
                                         urllib.unquote(self.url),
                                         os.path.basename(path),
                                         length, text=self.opts.text)
            self.opts.progress_obj.update(0)
        (self.fo, self.hdr) = (fo, hdr)

    def _add_headers(self, req):
        if self.opts.user_agent:
            req.add_header('User-agent', self.opts.user_agent)
        try: req_type = req.get_type()
        except ValueError: req_type = None
        if self.opts.http_headers and req_type in ('http', 'https'):
            for h, v in self.opts.http_headers:
                req.add_header(h, v)
        if self.opts.ftp_headers and req_type == 'ftp':
            for h, v in self.opts.ftp_headers:
                req.add_header(h, v)

    def _build_range(self, req):
        self.reget_time = None
        self.append = 0
        reget_length = 0
        rt = None
        if have_range and self.opts.reget and type(self.filename) == type(''):
            # we have reget turned on and we're dumping to a file
            try:
                s = os.stat(self.filename)
            except OSError:
                pass
            else:
                self.reget_time = s[ST_MTIME]
                reget_length = s[ST_SIZE]

                # Set initial length when regetting
                self._amount_read = reget_length

                rt = reget_length, ''
                self.append = 1

        if self.opts.range:
            if not have_range:
                raise URLGrabError(10, _('Byte range requested but range '\
                                         'support unavailable'))
            rt = self.opts.range
            if rt[0]: rt = (rt[0] + reget_length, rt[1])

        if rt:
            header = range_tuple_to_header(rt)
            if header: req.add_header('Range', header)

    def _make_request(self, req, opener):
        try:
            if have_socket_timeout and self.opts.timeout:
                old_to = socket.getdefaulttimeout()
                socket.setdefaulttimeout(self.opts.timeout)
                try:
                    fo = opener.open(req)
                finally:
                    socket.setdefaulttimeout(old_to)
            else:
                fo = opener.open(req)
            hdr = fo.info()
        except ValueError, e:
            raise URLGrabError(1, _('Bad URL: %s') % (e, ))
        except RangeError, e:
            raise URLGrabError(9, str(e))
        except urllib2.HTTPError, e:
            new_e = URLGrabError(14, str(e))
            new_e.code = e.code
            new_e.exception = e
            raise new_e
        except IOError, e:
            if hasattr(e, 'reason') and have_socket_timeout and \
                   isinstance(e.reason, TimeoutError):
                raise URLGrabError(12, _('Timeout: %s') % (e, ))
            else:
                raise URLGrabError(4, _('IOError: %s') % (e, ))
        except OSError, e:
            raise URLGrabError(5, _('OSError: %s') % (e, ))
        except HTTPException, e:
            raise URLGrabError(7, _('HTTP Exception (%s): %s') % \
                            (e.__class__.__name__, e))
        else:
            return (fo, hdr)

    def _do_grab(self):
        """dump the file to self.filename."""
        if self.append: mode = 'ab'
        else: mode = 'wb'

        if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
                             (self.filename, mode))
        try:
            new_fo = open(self.filename, mode)
        except IOError, e:
            raise URLGrabError(16, _(\
              'error opening local file, IOError: %s') % (e, ))

        try:
            # if we have a known range, only try to read that much.
            (low, high) = self.opts.range
            amount = high - low
        except (TypeError, ValueError):
            amount = None

        bs = 1024*8
        size = 0

        if amount is not None: bs = min(bs, amount - size)
        block = self.read(bs)
        size = size + len(block)
        while block:
            try:
                new_fo.write(block)
            except IOError, e:
                raise URLGrabError(16, _(\
                  'error writing to local file, IOError: %s') % (e, ))
            if amount is not None: bs = min(bs, amount - size)
            block = self.read(bs)
            size = size + len(block)

        new_fo.close()
        try:
            modified_tuple = self.hdr.getdate_tz('last-modified')
            modified_stamp = rfc822.mktime_tz(modified_tuple)
            os.utime(self.filename, (modified_stamp, modified_stamp))
        except (TypeError,), e: pass

        return size

    def _fill_buffer(self, amt=None):
        """fill the buffer to contain at least 'amt' bytes by reading
        from the underlying file object.  If amt is None, then it will
        read until it gets nothing more.  It updates the progress meter
        and throttles after every self._rbufsize bytes."""
        # the _rbuf test is only in this first 'if' for speed.  It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                amt = amt - L
            else:
                return

        # if we've made it here, then we don't have enough in the buffer
        # and we need to read more.

        buf = [self._rbuf]
        bufsize = len(self._rbuf)
        while amt is None or amt:
            # first, delay if necessary for throttling reasons
            if self.opts.raw_throttle():
                diff = self._tsize/self.opts.raw_throttle() - \
                       (time.time() - self._ttime)
                if diff > 0: time.sleep(diff)
                self._ttime = time.time()

            # now read some data, up to self._rbufsize
            if amt is None: readamount = self._rbufsize
            else: readamount = min(amt, self._rbufsize)
            try:
                new = self.fo.read(readamount)
            except socket.error, e:
                raise URLGrabError(4, _('Socket Error: %s') % (e, ))
            except TimeoutError, e:
                raise URLGrabError(12, _('Timeout: %s') % (e, ))
            except IOError, e:
                raise URLGrabError(4, _('IOError: %s') %(e,))
            newsize = len(new)
            if not newsize: break # no more to read

            if amt: amt = amt - newsize
            buf.append(new)
            bufsize = bufsize + newsize
            self._tsize = newsize
            self._amount_read = self._amount_read + newsize
            if self.opts.progress_obj:
                self.opts.progress_obj.update(self._amount_read)

        self._rbuf = string.join(buf, '')
        return

    def read(self, amt=None):
        self._fill_buffer(amt)
        if amt is None:
            s, self._rbuf = self._rbuf, ''
        else:
            s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
        return s

    def readline(self, limit=-1):
        i = string.find(self._rbuf, '\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            L = len(self._rbuf)
            self._fill_buffer(L + self._rbufsize)
            if not len(self._rbuf) > L: break
            i = string.find(self._rbuf, '\n', L)

        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit

        s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return s

    def close(self):
        if self.opts.progress_obj:
            self.opts.progress_obj.end(self._amount_read)
        self.fo.close()
        if self.opts.close_connection:
            try: self.fo.close_connection()
            except: pass

#####################################################################

class NoDefault: pass

class ObjectCache:
    def __init__(self, name=None):
        self.name = name or self.__class__.__name__
        self._lock = thread.allocate_lock()
        self._cache = []

    def lock(self):
        self._lock.acquire()

    def unlock(self):
        self._lock.release()

    def get(self, key, create=None, found=None):
        for (k, v) in self._cache:
            if k == key:
                if DEBUG:
                    DEBUG.debug('%s: found key' % self.name)
                    DEBUG.debug('%s: key = %s' % (self.name, key))
                    DEBUG.debug('%s: val = %s' % (self.name, v))
                found = found or getattr(self, 'found', None)
                if found: v = found(key, v)
                return v
        if DEBUG:
            DEBUG.debug('%s: no key found' % self.name)
            DEBUG.debug('%s: key = %s' % (self.name, key))
        create = create or getattr(self, 'create', None)
        if create:
            value = create(key)
            if DEBUG:
                DEBUG.info('%s: new value created' % self.name)
                DEBUG.debug('%s: val = %s' % (self.name, value))
            self._cache.append( (key, value) )
            return value
        else:
            raise KeyError('key not found: %s' % key)

    def set(self, key, value):
        if DEBUG:
            DEBUG.info('%s: inserting key' % self.name)
            DEBUG.debug('%s: key = %s' % (self.name, key))
            DEBUG.debug('%s: val = %s' % (self.name, value))
        self._cache.append( (key, value) )

    def ts_get(self, key, create=None, found=None):
        self._lock.acquire()
        try:
            # return the value so thread-safe lookups actually yield it
            return self.get(key, create, found)
        finally:
            self._lock.release()

    def ts_set(self, key, value):
        self._lock.acquire()
        try:
            self.set(key, value)
        finally:
            self._lock.release()

class OpenerCache(ObjectCache):
    def found(self, factory_and_handlers, opener):
        for handler in factory_and_handlers[1:]:
            handler.add_parent(opener)
        return opener
    def create(self, factory_and_handlers):
        factory = factory_and_handlers[0]
        handlers = factory_and_handlers[1:]
        return factory.create_opener(*handlers)
_opener_cache = OpenerCache()

class ProxyHandlerCache(ObjectCache):
    def create(self, proxies):
        for k, v in proxies.items():
            utype, url = urllib.splittype(v)
            host, other = urllib.splithost(url)
            if (utype is None) or (host is None):
                raise URLGrabError(13, _('Bad proxy URL: %s') % v)
        return urllib2.ProxyHandler(proxies)
_proxy_handler_cache = ProxyHandlerCache()

class HTTPSHandlerCache(ObjectCache):
    def create(self, ssl_factory):
        return HTTPSHandler(ssl_factory)
_https_handler_cache = HTTPSHandlerCache()

class SSLFactoryCache(ObjectCache):
    def create(self, cert_and_context):
        return sslfactory.get_factory(*cert_and_context)
_ssl_factory_cache = SSLFactoryCache()

#####################################################################
# DEPRECATED FUNCTIONS
def set_throttle(new_throttle):
    """Deprecated. Use: default_grabber.throttle = new_throttle"""
    default_grabber.throttle = new_throttle

def set_bandwidth(new_bandwidth):
    """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
    default_grabber.bandwidth = new_bandwidth

def set_progress_obj(new_progress_obj):
    """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
    default_grabber.progress_obj = new_progress_obj

def set_user_agent(new_user_agent):
    """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
    default_grabber.user_agent = new_user_agent

def retrygrab(url, filename=None, copy_local=0, close_connection=0,
              progress_obj=None, throttle=None, bandwidth=None,
              numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
    """Deprecated. Use: urlgrab() with the retry arg instead"""
    kwargs = {'copy_local' : copy_local,
              'close_connection' : close_connection,
              'progress_obj' : progress_obj,
              'throttle' : throttle,
              'bandwidth' : bandwidth,
              'retry' : numtries,
              'retrycodes' : retrycodes,
              'checkfunc' : checkfunc
              }
    return urlgrab(url, filename, **kwargs)

#####################################################################
#  TESTING
def _main_test():
    try: url, filename = sys.argv[1:3]
    except ValueError:
        print 'usage:', sys.argv[0], \
              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
        sys.exit()

    kwargs = {}
    for a in sys.argv[3:]:
        k, v = string.split(a, '=', 1)
        kwargs[k] = int(v)

    set_throttle(1.0)
    set_bandwidth(32 * 1024)
    print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
                                                        default_grabber.bandwidth)

    try: from progress import text_progress_meter
    except ImportError, e: pass
    else: kwargs['progress_obj'] = text_progress_meter()

    try: name = apply(urlgrab, (url, filename), kwargs)
    except URLGrabError, e: print e
    else: print 'LOCAL FILE:', name

def _retry_test():
    try: url, filename = sys.argv[1:3]
    except ValueError:
        print 'usage:', sys.argv[0], \
              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
        sys.exit()

    kwargs = {}
    for a in sys.argv[3:]:
        k, v = string.split(a, '=', 1)
        kwargs[k] = int(v)

    try: from progress import text_progress_meter
    except ImportError, e: pass
    else: kwargs['progress_obj'] = text_progress_meter()

    def cfunc(filename, hello, there='foo'):
        print hello, there
        import random
        rnum = random.random()
        if rnum < .5:
            print 'forcing retry'
            raise URLGrabError(-1, 'forcing retry')
        if rnum < .75:
            print 'forcing failure'
            raise URLGrabError(-2, 'forcing immediate failure')
        print 'success'
        return

    kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
    try: name = apply(retrygrab, (url, filename), kwargs)
    except URLGrabError, e: print e
    else: print 'LOCAL FILE:', name

def _file_object_test(filename=None):
    import random, cStringIO, sys
    if filename is None:
        filename = __file__
    print 'using file "%s" for comparisons' % filename
    fo = open(filename)
    s_input = fo.read()
    fo.close()

    for testfunc in [_test_file_object_smallread,
                     _test_file_object_readall,
                     _test_file_object_readline,
                     _test_file_object_readlines]:
        fo_input = cStringIO.StringIO(s_input)
        fo_output = cStringIO.StringIO()
        wrapper = URLGrabberFileObject(fo_input, None, 0)
        print 'testing %-30s ' % testfunc.__name__,
        testfunc(wrapper, fo_output)
        s_output = fo_output.getvalue()
        if s_output == s_input: print 'passed'
        else: print 'FAILED'

def _test_file_object_smallread(wrapper, fo_output):
    while 1:
        s = wrapper.read(23)
        fo_output.write(s)
        if not s: return

def _test_file_object_readall(wrapper, fo_output):
    s = wrapper.read()
    fo_output.write(s)

def _test_file_object_readline(wrapper, fo_output):
    while 1:
        s = wrapper.readline()
        fo_output.write(s)
        if not s: return

def _test_file_object_readlines(wrapper, fo_output):
    li = wrapper.readlines()
    fo_output.write(string.join(li, ''))

if __name__ == '__main__':
    _main_test()
    _retry_test()
    _file_object_test('test')