1
# This library is free software; you can redistribute it and/or
2
# modify it under the terms of the GNU Lesser General Public
3
# License as published by the Free Software Foundation; either
4
# version 2.1 of the License, or (at your option) any later version.
6
# This library is distributed in the hope that it will be useful,
7
# but WITHOUT ANY WARRANTY; without even the implied warranty of
8
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9
# Lesser General Public License for more details.
11
# You should have received a copy of the GNU Lesser General Public
12
# License along with this library; if not, write to the
13
# Free Software Foundation, Inc.,
14
# 59 Temple Place, Suite 330,
15
# Boston, MA 02111-1307 USA
17
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
20
# $Id: byterange.py,v 1.12 2006/07/20 20:15:58 mstenner Exp $
31
from cStringIO import StringIO
32
except ImportError, msg:
33
from StringIO import StringIO
35
class RangeError(IOError):
    """Signal that a requested byte range cannot be satisfied.

    Derives from IOError, so callers that already trap IOError also
    catch this.
    """
39
class HTTPRangeHandler(urllib2.BaseHandler):
40
"""Handler that enables HTTP Range headers.
42
This was extremely simple. The Range header is a HTTP feature to
43
begin with so all this class does is tell urllib2 that the
44
"206 Partial Content" response from the HTTP server is what we
51
range_handler = byterange.HTTPRangeHandler()
52
opener = urllib2.build_opener(range_handler)
55
urllib2.install_opener(opener)
57
# create Request and set Range header
58
req = urllib2.Request('http://www.python.org/')
59
req.add_header('Range', 'bytes=30-50')
60
f = urllib2.urlopen(req)
63
# Called for a 206 status: wraps the response body and headers in an
# addinfourl object keyed to the request's full URL.
# NOTE(review): no use of `r` after its construction (and no return) is
# visible in this excerpt -- confirm the response object is returned.
def http_error_206(self, req, fp, code, msg, hdrs):
64
# 206 Partial Content Response
65
r = urllib.addinfourl(fp, hdrs, req.get_full_url())
70
# Called for a 416 status: always raises RangeError (an IOError subclass)
# rather than returning a response.
def http_error_416(self, req, fp, code, msg, hdrs):
71
# HTTP's Range Not Satisfiable error
72
raise RangeError('Requested Range Not Satisfiable')
74
class HTTPSRangeHandler(HTTPRangeHandler):
    """Range header support for HTTPS.

    Exposes https_error_206 / https_error_416 entry points that simply
    delegate to the http_error_* handlers inherited from
    HTTPRangeHandler, so both schemes behave identically.
    """

    def https_error_206(self, req, fp, code, msg, hdrs):
        """Handle "206 Partial Content" exactly like the HTTP handler.

        Returns whatever http_error_206 returns.
        """
        return self.http_error_206(req, fp, code, msg, hdrs)

    def https_error_416(self, req, fp, code, msg, hdrs):
        """Handle "416 Range Not Satisfiable" exactly like the HTTP handler.

        raise RangeError (via http_error_416).
        """
        # Delegate to the inherited *http* handler.  This method used to
        # call self.https_error_416 -- i.e. itself -- which recursed until
        # the interpreter's recursion limit was hit instead of raising
        # RangeError.
        return self.http_error_416(req, fp, code, msg, hdrs)
83
class RangeableFileObject:
84
"""File object wrapper to enable raw range handling.
85
This was implemented primarily for handling range
86
specifications for file:// urls. This object effectively makes
87
a file object look like it consists only of a range of bytes in
91
# expose 10 bytes, starting at byte position 20, from
93
>>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
94
# seek seeks within the range (to position 23 in this case)
96
# tell tells where you're at _within the range_ (position 3 in
99
# read EOFs if an attempt is made to read past the last
100
# byte in the range. the following will return only 7 bytes.
104
def __init__(self, fo, rangetup):
105
"""Create a RangeableFileObject.
106
fo -- a file like object. only the read() method need be
107
supported but supporting an optimized seek() is
109
rangetup -- a (firstbyte,lastbyte) tuple specifying the range
111
The file object provided is assumed to be at byte offset 0.
114
# Normalize the caller's tuple: firstbyte becomes an int, lastbyte an int
# or '' (see range_tuple_normalize).
# NOTE(review): range_tuple_normalize returns None for None and for a
# whole-file range (0,''); unpacking None here would raise TypeError --
# confirm callers never pass such a range.
(self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup)
116
# NOTE(review): the assignments of self.fo and self.realpos, which the
# other methods read, are not visible in this excerpt.
# Advance the wrapped object to the first byte of the range; _do_seek
# takes an offset relative to the current position.
self._do_seek(self.firstbyte)
118
def __getattr__(self, name):
119
"""This effectively allows us to wrap at the instance level.
120
Any attribute not found in _this_ object will be searched for
121
in self.fo. This includes methods."""
122
if hasattr(self.fo, name):
123
return getattr(self.fo, name)
124
raise AttributeError, name
127
"""Return the position within the range.
128
This is different from fo.seek in that position 0 is the
129
first byte position of the range tuple. For example, if
130
this object was created with a range tuple of (500,899),
131
tell() will return 0 when at byte position 500 of the file.
133
# realpos is the absolute offset in the wrapped object; subtracting the
# start of the range gives the range-relative position.
return (self.realpos - self.firstbyte)
135
def seek(self,offset,whence=0):
136
"""Seek within the byte range.
137
Positioning is identical to that described under tell().
139
assert whence in (0, 1, 2)
140
if whence == 0: # absolute seek
141
realoffset = self.firstbyte + offset
142
elif whence == 1: # relative seek
143
realoffset = self.realpos + offset
144
elif whence == 2: # absolute from end of file
145
# XXX: are we raising the right Error here?
146
raise IOError('seek from end of file not supported.')
148
# do not allow seek past lastbyte in range
149
if self.lastbyte and (realoffset >= self.lastbyte):
150
realoffset = self.lastbyte
152
self._do_seek(realoffset - self.realpos)
154
def read(self, size=-1):
155
"""Read within the range.
156
This method will limit the size read based on the range.
158
# Let _calc_read_size adjust the request to the range before reading.
size = self._calc_read_size(size)
159
rslt = self.fo.read(size)
160
# Advance the tracked absolute position by what was actually read, which
# may be less than what was asked for.
# NOTE(review): no return statement is visible in this excerpt -- confirm
# rslt is handed back to the caller.
self.realpos += len(rslt)
163
def readline(self, size=-1):
164
"""Read lines within the range.
165
This method will limit the size read based on the range.
167
# Same size adjustment as read(); the result is passed to the wrapped
# object's readline() as its size limit.
size = self._calc_read_size(size)
168
rslt = self.fo.readline(size)
169
# Advance the tracked absolute position by the length of the line read.
# NOTE(review): no return statement is visible in this excerpt -- confirm
# rslt is handed back to the caller.
self.realpos += len(rslt)
172
def _calc_read_size(self, size):
173
"""Handles calculating the amount of data to read based on
178
# NOTE(review): the rest of the docstring, the outer conditions guarding
# the statements below, and the return are not visible in this excerpt;
# the comments here describe only the visible lines.
# The request would reach or pass lastbyte ...
if ((self.realpos + size) >= self.lastbyte):
179
# ... so shrink it to the bytes remaining in the range.
size = (self.lastbyte - self.realpos)
181
# Second assignment of the remaining byte count; presumably the branch
# for an unbounded request (size < 0) -- TODO confirm.
size = (self.lastbyte - self.realpos)
184
def _do_seek(self,offset):
185
"""Seek based on whether wrapped object supports seek().
186
offset is relative to the current position (self.realpos).
189
# Wrapped objects without a seek() method are advanced by reading and
# discarding data instead.
if not hasattr(self.fo, 'seek'):
190
self._poor_mans_seek(offset)
192
# The wrapped seek() is given an absolute position: current + offset.
# NOTE(review): the branch keyword separating this call from the fallback
# above is not visible in this excerpt.
self.fo.seek(self.realpos + offset)
193
# Keep the tracked absolute position in step with the wrapped object.
self.realpos+= offset
195
def _poor_mans_seek(self,offset):
196
"""Seek by calling the wrapped file object's read() method.
197
This is used for file like objects that do not have native
198
seek support. The wrapped object's read() method is called
199
to manually seek to the desired position.
200
offset -- read this number of bytes from the wrapped
202
raise RangeError if we encounter EOF before reaching the
208
# NOTE(review): the initialisation of pos/bufsize and the loop header are
# not visible in this excerpt.
# Shrink the chunk so the read never goes past the requested offset.
if (pos + bufsize) > offset:
209
bufsize = offset - pos
210
buf = self.fo.read(bufsize)
211
# A short read is treated as hitting EOF before the target position.
if len(buf) != bufsize:
212
raise RangeError('Requested Range Not Satisfiable')
215
class FileRangeHandler(urllib2.FileHandler):
216
"""FileHandler subclass that adds Range support.
217
This class handles Range headers exactly like an HTTP
220
# Open a file:// request, honouring an optional Range header by wrapping
# the opened file in a RangeableFileObject.
# raise urllib2.URLError if the URL names a non-local host; raise
# RangeError if the requested range lies outside the file.
def open_local_file(self, req):
223
host = req.get_host()
224
file = req.get_selector()
225
localfile = urllib.url2pathname(file)
226
# Size and mtime are taken from the file system for the response headers.
stats = os.stat(localfile)
227
size = stats[stat.ST_SIZE]
228
modified = rfc822.formatdate(stats[stat.ST_MTIME])
229
mtype = mimetypes.guess_type(file)[0]
231
host, port = urllib.splitport(host)
232
# Refuse URLs that carry a port or whose host does not resolve to one of
# this machine's own addresses.
if port or socket.gethostbyname(host) not in self.get_names():
233
raise urllib2.URLError('file not on local host')
234
# NOTE(review): the file is opened before the range is validated; if the
# RangeError below is raised, nothing visible here closes fo.
fo = open(localfile,'rb')
235
brange = req.headers.get('Range',None)
236
# None when no Range header was sent (see range_header_to_tuple).
brange = range_header_to_tuple(brange)
240
# An open-ended range ('' lastbyte) runs to the end of the file.
if lb == '': lb = size
241
if fb < 0 or fb > size or lb > size:
242
raise RangeError('Requested Range Not Satisfiable')
244
# From here on fo exposes only the requested bytes.
fo = RangeableFileObject(fo, (fb,lb))
245
# NOTE(review): Content-Length is built from `size`; any adjustment of
# size to the range length is not visible in this excerpt -- confirm.
headers = mimetools.Message(StringIO(
246
'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
247
(mtype or 'text/plain', size, modified)))
248
return urllib.addinfourl(fo, headers, 'file:'+file)
252
# Unfortunately, a large amount of base FTP code had to be copied
253
# from urllib and urllib2 in order to insert the FTP REST command.
254
# Code modifications for range support have been commented as
256
# -- range support modifications start/end here
258
from urllib import splitport, splituser, splitpasswd, splitattr, \
259
unquote, addclosehook, addinfourl
267
class FTPRangeHandler(urllib2.FTPHandler):
268
# Open an ftp:// request, honouring an optional Range header.
# Errors from ftplib are re-raised as IOError('ftp error', ...); range
# problems are reported as RangeError.
# NOTE(review): several guarding statements (if/try/else lines) are not
# visible in this excerpt; comments describe only the visible lines.
def ftp_open(self, req):
269
host = req.get_host()
271
raise IOError, ('ftp error', 'no host given')
272
host, port = splitport(host)
274
port = ftplib.FTP_PORT
276
# username/password handling
277
user, host = splituser(host)
279
user, passwd = splitpasswd(user)
283
user = unquote(user or '')
284
passwd = unquote(passwd or '')
287
# A failed name lookup is reported as URLError.
host = socket.gethostbyname(host)
288
except socket.error, msg:
289
raise urllib2.URLError(msg)
290
# Split the selector into its path and ';attr=value' parts, then the
# path into unquoted directory components plus a trailing file name.
path, attrs = splitattr(req.get_selector())
291
dirs = path.split('/')
292
dirs = map(unquote, dirs)
293
dirs, file = dirs[:-1], dirs[-1]
294
if dirs and not dirs[0]:
297
fw = self.connect_ftp(user, passwd, host, port, dirs)
298
# Default transfer type: 'I' when a file name is present, else 'D'.
type = file and 'I' or 'D'
300
attr, value = splitattr(attr)
301
if attr.lower() == 'type' and \
302
value in ('a', 'A', 'i', 'I', 'd', 'D'):
305
# -- range support modifications start here
307
range_tup = range_header_to_tuple(req.headers.get('Range',None))
308
# range_header_to_tuple returns () for a header that does not match the
# range spec.
# NOTE(review): this validation is an assert, so it raises AssertionError
# on a malformed header and is skipped entirely under python -O.
assert range_tup != ()
312
# -- range support modifications end here
314
# rest is passed through to ftpwrapper.retrfile as the restart offset.
fp, retrlen = fw.retrfile(file, type, rest)
316
# -- range support modifications start here
320
# Without a usable length from the server the range cannot be checked.
if retrlen is None or retrlen == 0:
321
raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
325
# beginning of range is larger than file
326
raise RangeError('Requested Range Not Satisfiable')
329
# Limit the data connection to retrlen bytes, counted from its current
# position.
fp = RangeableFileObject(fp, (0,retrlen))
330
# -- range support modifications end here
333
# Build response headers: a guessed content type and, when known, the length.
mtype = mimetypes.guess_type(req.get_full_url())[0]
335
headers += "Content-Type: %s\n" % mtype
336
if retrlen is not None and retrlen >= 0:
337
headers += "Content-Length: %d\n" % retrlen
338
sf = StringIO(headers)
339
headers = mimetools.Message(sf)
340
return addinfourl(fp, headers, req.get_full_url())
341
# Any ftplib failure becomes an IOError carrying the original traceback.
except ftplib.all_errors, msg:
342
raise IOError, ('ftp error', msg), sys.exc_info()[2]
344
# Build the range-aware ftpwrapper defined in this module.
# NOTE(review): no return statement is visible in this excerpt -- confirm
# fw is returned to ftp_open.
def connect_ftp(self, user, passwd, host, port, dirs):
345
fw = ftpwrapper(user, passwd, host, port, dirs)
348
class ftpwrapper(urllib.ftpwrapper):
349
# range support note:
350
# this ftpwrapper code is copied directly from
351
# urllib. The only enhancement is to add the rest
352
# argument and pass it on to ftp.ntransfercmd
353
# Retrieve `file` (or a directory listing) over FTP.
# rest -- optional restart offset forwarded to ftp.ntransfercmd.
# Returns a (file-like object, retrieval length) pair.
# NOTE(review): several try/else lines and command assignments are not
# visible in this excerpt; comments describe only the visible lines.
def retrfile(self, file, type, rest=None):
355
# 'd'/'D' selects a directory listing, which uses ASCII mode.
if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
356
else: cmd = 'TYPE ' + type; isdir = 0
358
self.ftp.voidcmd(cmd)
359
except ftplib.all_errors:
361
# Second attempt at the TYPE command after the first one failed.
self.ftp.voidcmd(cmd)
363
if file and not isdir:
364
# Use nlst to see if the file exists at all
367
except ftplib.error_perm, reason:
368
raise IOError, ('ftp error', reason), sys.exc_info()[2]
369
# Restore the transfer mode!
370
self.ftp.voidcmd(cmd)
371
# Try to retrieve as a file
374
conn = self.ftp.ntransfercmd(cmd, rest)
375
except ftplib.error_perm, reason:
376
if str(reason)[:3] == '501':
377
# workaround for REST not supported error
378
# Retry without a restart offset, then hide the first `rest` bytes on the
# client side by wrapping the stream in an open-ended range.
fp, retrlen = self.retrfile(file, type)
379
fp = RangeableFileObject(fp, (rest,''))
381
# Any permanent error other than 550 is re-raised as IOError.
elif str(reason)[:3] != '550':
382
raise IOError, ('ftp error', reason), sys.exc_info()[2]
384
# Set transfer mode to ASCII!
385
self.ftp.voidcmd('TYPE A')
386
# Try a directory listing
387
if file: cmd = 'LIST ' + file
389
conn = self.ftp.ntransfercmd(cmd)
391
# Pass back both a suitably decorated object and a retrieval length
392
return (addclosehook(conn[0].makefile('rb'),
393
self.endtransfer), conn[1])
396
####################################################################
397
# Range Tuple Functions
398
# XXX: These range tuple functions might go better in a class.
401
def range_header_to_tuple(range_header):
402
"""Get a (firstbyte,lastbyte) tuple from a Range header value.
404
Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
405
function pulls the firstbyte and lastbyte values and returns
406
a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
407
the header value, it is returned as an empty string in the
410
Return None if range_header is None
411
Return () if range_header does not conform to the range spec
416
if range_header is None: return None
419
# Group 1 is the mandatory first byte, group 2 the optional last byte.
# NOTE(review): the pattern is anchored only at the start, so trailing
# text (e.g. additional comma-separated ranges) is silently ignored.
_rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
420
match = _rangere.match(range_header)
422
# NOTE(review): the check that `match` is not None is not visible in this
# excerpt.
tup = range_tuple_normalize(match.group(1,2))
424
# The header's last byte is inclusive; the tuple form used in this module
# is exclusive (range_tuple_to_header subtracts 1 again), hence the +1.
tup = (tup[0],tup[1]+1)
428
def range_tuple_to_header(range_tup):
429
"""Convert a range tuple to a Range header value.
430
Return a string of the form "bytes=<firstbyte>-<lastbyte>" or None
431
if no range is needed.
433
if range_tup is None: return None
434
# May itself return None (whole-file range); see range_tuple_normalize.
range_tup = range_tuple_normalize(range_tup)
437
# Convert the exclusive end used by range tuples into the inclusive last
# byte used in the header.
# NOTE(review): the conditions guarding this line are not visible in this
# excerpt.
range_tup = (range_tup[0],range_tup[1] - 1)
438
return 'bytes=%s-%s' % range_tup
440
def range_tuple_normalize(range_tup):
441
"""Normalize a (first_byte,last_byte) range tuple.
442
Return a tuple whose first element is guaranteed to be an int
443
and whose second element will be '' (meaning: the last byte) or
444
an int. Finally, return None if the normalized tuple == (0,'')
445
as that is equivalent to retrieving the entire file.
447
if range_tup is None: return None
450
# A missing first byte means "start of file".
# NOTE(review): the statement that reads fb from range_tup and the int()
# conversion branch are not visible in this excerpt.
if fb in (None,''): fb = 0
453
# The last byte is optional: a one-element tuple is treated as open-ended.
try: lb = range_tup[1]
454
except IndexError: lb = ''
456
if lb is None: lb = ''
457
elif lb != '': lb = int(lb)
458
# check if range is over the entire file
459
if (fb,lb) == (0,''): return None
460
# check that the range is valid
461
# NOTE(review): when lb is '' this compares a str with an int, which
# relies on Python 2's mixed-type ordering; Python 3 would raise TypeError.
# NOTE(review): the final return of (fb,lb) is not visible in this excerpt.
if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))