# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
#      Free Software Foundation, Inc.,
#      59 Temple Place, Suite 330,
#      Boston, MA  02111-1307  USA
#
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.

>>> import urllib2
>>> from keepalive import HTTPHandler
>>> keepalive_handler = HTTPHandler()
>>> opener = urllib2.build_opener(keepalive_handler)
>>> urllib2.install_opener(opener)
>>>
>>> fo = urllib2.urlopen('http://www.python.org')

If a connection to a given host is requested, and all of the existing
connections are still in use, another connection will be opened.  If
the handler tries to use an existing connection but it fails in some
way, it will be closed and removed from the pool.

To remove the handler, simply re-run build_opener with no arguments, and
install that opener.

You can explicitly close connections by using the close_connection()
method of the returned file-like object (described below) or you can
use the handler methods:

  close_connection(host)
  close_all()
  open_connections()

NOTE: using the close_connection and close_all methods of the handler
should be done with care when using multiple threads.
  * there is nothing that prevents another thread from creating new
    connections immediately after connections are closed
  * no checks are done to prevent in-use connections from being closed

>>> keepalive_handler.close_all()

EXTRA ATTRIBUTES AND METHODS

  Upon a status of 200, the object returned has a few additional
  attributes and methods, which should not be used if you want to
  remain consistent with the normal urllib2-returned objects:

    close_connection()  -  close the connection to the host
    readlines()         -  you know, readlines()
    status              -  the return status (ie 404)
    reason              -  english translation of status (ie 'File not found')

  If you want the best of both worlds, use this inside an
  AttributeError-catching try:

  >>> try: status = fo.status
  >>> except AttributeError: status = None

  Unfortunately, these are ONLY there if status == 200, so it's not
  easy to distinguish between non-200 responses.  The reason is that
  urllib2 tries to do clever things with error codes 301, 302, 401,
  and 407, and it wraps the object upon return.

  For python versions earlier than 2.4, you can avoid this fancy error
  handling by setting the module-level global HANDLE_ERRORS to zero.
  You see, prior to 2.4, it's the HTTP Handler's job to determine what
  to handle specially, and what to just pass up.  HANDLE_ERRORS == 0
  means "pass everything up".  In python 2.4, however, this job no
  longer belongs to the HTTP Handler and is now done by a NEW handler,
  HTTPErrorProcessor.  Here's the bottom line:

    python version < 2.4
        HANDLE_ERRORS == 1  (default) pass up 200, treat the rest as
                            errors
        HANDLE_ERRORS == 0  pass everything up, error processing is
                            left to the calling code
    python version >= 2.4
        HANDLE_ERRORS == 1  pass up 200, treat the rest as errors
        HANDLE_ERRORS == 0  (default) pass everything up, let the
                            other handlers (specifically,
                            HTTPErrorProcessor) decide what to do

In practice, setting the variable either way makes little difference
in python 2.4, so for the most consistent behavior across versions,
you probably just want to use the defaults, which will give you
exceptions on errors.
"""

# $Id: keepalive.py,v 1.17 2006/12/08 00:14:16 mstenner Exp $

# NOTE(review): this copy of the file was corrupted in transit — the
# original import block (urllib2, httplib, socket, thread, sslfactory,
# and the module-level DEBUG = None logger slot) is missing here and
# must be restored for the classes below to run.  TODO confirm the
# exact import list against the upstream module.

import sys

# Pre-2.4 urllib2 expects the HTTP handler itself to turn non-200
# responses into errors; 2.4+ delegates that to HTTPErrorProcessor.
# See the module docstring for the full story.
if sys.version_info < (2, 4): HANDLE_ERRORS = 1
else: HANDLE_ERRORS = 0
class ConnectionManager:
    """Thread-safe registry of open connections, grouped by host.

    Tracks, for every connection: which host it belongs to and whether
    it is currently "ready" (idle and reusable) or busy serving a
    request.  All mutating operations are serialized with a single
    lock; ready-state toggling is best-effort (a connection may be
    removed concurrently, hence the KeyError tolerance).
    """
    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {}  # map hosts to a list of connections
        self._connmap = {}  # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        """Register <connection> for <host> with the given ready state."""
        self._lock.acquire()
        try:
            # 'in' instead of the deprecated dict.has_key (same behavior)
            if host not in self._hostmap: self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        """Forget <connection> entirely; unknown connections are ignored."""
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                # already removed (possibly by another thread) - nothing to do
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                # drop the host entry once its last connection is gone
                if not self._hostmap[host]: del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        """Mark <connection> ready (1) or busy (0); ignore if unknown."""
        try: self._readymap[connection] = ready
        except KeyError: pass

    def get_ready_conn(self, host):
        """Return an idle connection to <host> (marking it busy), or None."""
        conn = None
        self._lock.acquire()
        try:
            if host in self._hostmap:
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        # claim it before releasing the lock
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        """Return a copy of the connections for <host>, or of the whole
        host->connections map when host is None."""
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
177
class KeepAliveHandler:
179
self._cm = ConnectionManager()
181
#### Connection Management
182
def open_connections(self):
183
"""return a list of connected hosts and the number of connections
184
to each. [('foo.com:80', 2), ('bar.org', 1)]"""
185
return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
187
def close_connection(self, host):
188
"""close connection(s) to <host>
189
host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
190
no error occurs if there is no connection to that host."""
191
for h in self._cm.get_all(host):
196
"""close all open connections"""
197
for host, conns in self._cm.get_all().items():
202
def _request_closed(self, request, host, connection):
203
"""tells us that this request is now closed and the the
204
connection is ready for another request"""
205
self._cm.set_ready(connection, 1)
207
def _remove_connection(self, host, connection, close=0):
208
if close: connection.close()
209
self._cm.remove(connection)
211
#### Transaction Execution
212
def do_open(self, req):
213
host = req.get_host()
215
raise urllib2.URLError('no host given')
218
h = self._cm.get_ready_conn(host)
220
r = self._reuse_connection(h, req, host)
222
# if this response is non-None, then it worked and we're
223
# done. Break out, skipping the else block.
226
# connection is bad - possibly closed by server
227
# discard it and ask for the next free connection
230
h = self._cm.get_ready_conn(host)
232
# no (working) free connections were found. Create a new one.
233
h = self._get_connection(host)
234
if DEBUG: DEBUG.info("creating new connection to %s (%d)",
236
self._cm.add(host, h, 0)
237
self._start_transaction(h, req)
239
except (socket.error, httplib.HTTPException), err:
240
raise urllib2.URLError(err)
242
if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
244
# if not a persistent connection, don't try to reuse it
246
if DEBUG: DEBUG.info('server will close connection, discarding')
251
r._url = req.get_full_url()
257
if r.status == 200 or not HANDLE_ERRORS:
260
return self.parent.error('http', req, r,
261
r.status, r.msg, r.headers)
263
def _reuse_connection(self, h, req, host):
264
"""start the transaction with a re-used connection
265
return a response object (r) upon success or None on failure.
266
This DOES not close or remove bad connections in cases where
267
it returns. However, if an unexpected exception occurs, it
268
will close and remove the connection before re-raising.
271
self._start_transaction(h, req)
273
# note: just because we got something back doesn't mean it
274
# worked. We'll check the version below, too.
275
except (socket.error, httplib.HTTPException):
278
# adding this block just in case we've missed
279
# something we will still raise the exception, but
280
# lets try and close the connection and remove it
281
# first. We previously got into a nasty loop
282
# where an exception was uncaught, and so the
283
# connection stayed open. On the next try, the
284
# same exception was raised, etc. The tradeoff is
285
# that it's now possible this call will raise
286
# a DIFFERENT exception
287
if DEBUG: DEBUG.error("unexpected exception - closing " + \
288
"connection to %s (%d)", host, id(h))
293
if r is None or r.version == 9:
294
# httplib falls back to assuming HTTP 0.9 if it gets a
295
# bad header back. This is most likely to happen if
296
# the socket has been closed by the server since we
297
# last used the connection.
298
if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
302
if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
306
def _start_transaction(self, h, req):
309
data = req.get_data()
310
h.putrequest('POST', req.get_selector())
311
if not req.headers.has_key('Content-type'):
312
h.putheader('Content-type',
313
'application/x-www-form-urlencoded')
314
if not req.headers.has_key('Content-length'):
315
h.putheader('Content-length', '%d' % len(data))
317
h.putrequest('GET', req.get_selector())
318
except (socket.error, httplib.HTTPException), err:
319
raise urllib2.URLError(err)
321
for args in self.parent.addheaders:
323
for k, v in req.headers.items():
329
def _get_connection(self, host):
330
return NotImplementedError
class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    """urllib2 HTTP handler with keepalive support."""
    def __init__(self):
        KeepAliveHandler.__init__(self)

    def http_open(self, req):
        # urllib2 protocol entry point for http:// URLs
        return self.do_open(req)

    def _get_connection(self, host):
        return HTTPConnection(host)
class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler):
    """urllib2 HTTPS handler with keepalive support.

    ssl_factory, if given, must provide get_https_connection(host);
    otherwise the module-level sslfactory default is used.
    """
    def __init__(self, ssl_factory=None):
        KeepAliveHandler.__init__(self)
        if not ssl_factory:
            ssl_factory = sslfactory.get_factory()
        self._ssl_factory = ssl_factory

    def https_open(self, req):
        # urllib2 protocol entry point for https:// URLs
        return self.do_open(req)

    def _get_connection(self, host):
        # fall back to the plain HTTPSConnection if the factory doesn't
        # know how to build one
        try: return self._ssl_factory.get_https_connection(host)
        except AttributeError: return HTTPSConnection(host)
class HTTPResponse(httplib.HTTPResponse):
    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods

    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.

    # the read method wraps the original to accomodate buffering,
    # although read() never adds to the buffer.
    # Both readline and readlines have been stolen with almost no
    # modification from socket.py

    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        if method: # the httplib in python 2.3 uses the method arg
            httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''         # readline buffer
        self._rbufsize = 8096
        self._handler = None    # inserted by the handler later
        self._host = None       # (same)
        self._url = None        # (same)
        self._connection = None # (same)

    # keep a handle on the unbuffered httplib read
    _raw_read = httplib.HTTPResponse.read

    def close(self):
        """Close the response and tell the handler the connection is free."""
        if self.fp:
            self.fp.close()
            self.fp = None
            if self._handler:
                self._handler._request_closed(self, self._host,
                                              self._connection)

    def close_connection(self):
        """Close the underlying connection itself, not just this response."""
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()

    def info(self):
        return self.headers

    def geturl(self):
        return self._url

    def read(self, amt=None):
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                # buffer is smaller than the request - drain it, then
                # fall through to the raw read for the remainder
                amt -= L
            else:
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    def readline(self, limit=-1):
        data = ""
        i = self._rbuf.find('\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new: break
            i = new.find('\n')
            if i >= 0: i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data

    def readlines(self, sizehint = 0):
        total = 0
        lines = []  # renamed from 'list' - don't shadow the builtin
        while 1:
            line = self.readline()
            if not line: break
            lines.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return lines
class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class so keepalive bookkeeping and the
    # readline()/readlines() buffering are available on every response
    response_class = HTTPResponse
class HTTPSConnection(httplib.HTTPSConnection):
    # use the modified response class (see HTTPConnection above)
    response_class = HTTPResponse
#########################################################################
#####   TEST FUNCTIONS
#########################################################################
465
def error_handler(url):
468
keepalive_handler = HTTPHandler()
469
opener = urllib2.build_opener(keepalive_handler)
470
urllib2.install_opener(opener)
471
pos = {0: 'off', 1: 'on'}
473
print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
476
fo = urllib2.urlopen(url)
479
try: status, reason = fo.status, fo.reason
480
except AttributeError: status, reason = None, None
482
print " EXCEPTION: %s" % e
485
print " status = %s, reason = %s" % (status, reason)
487
hosts = keepalive_handler.open_connections()
488
print "open connections:", hosts
489
keepalive_handler.close_all()
495
# first fetch the file with the normal http handler
496
opener = urllib2.build_opener()
497
urllib2.install_opener(opener)
498
fo = urllib2.urlopen(url)
502
print format % ('normal urllib', m.hexdigest())
504
# now install the keepalive handler and try again
505
opener = urllib2.build_opener(HTTPHandler())
506
urllib2.install_opener(opener)
508
fo = urllib2.urlopen(url)
512
print format % ('keepalive read', m.hexdigest())
514
fo = urllib2.urlopen(url)
522
print format % ('keepalive readline', m.hexdigest())
525
print ' making %i connections to:\n %s' % (N, url)
527
sys.stdout.write(' first using the normal urllib handlers')
528
# first use normal opener
529
opener = urllib2.build_opener()
530
urllib2.install_opener(opener)
532
print ' TIME: %.3f s' % t1
534
sys.stdout.write(' now using the keepalive handler ')
535
# now install the keepalive handler and try again
536
opener = urllib2.build_opener(HTTPHandler())
537
urllib2.install_opener(opener)
539
print ' TIME: %.3f s' % t2
540
print ' improvement factor: %.2f' % (t1/t2, )
542
def fetch(N, url, delay=0):
545
starttime = time.time()
547
if delay and i > 0: time.sleep(delay)
548
fo = urllib2.urlopen(url)
551
lens.append(len(foo))
552
diff = time.time() - starttime
558
print "WARNING: inconsistent length on read %i: %i" % (j, i)
562
def test_timeout(url):
566
def debug(self, msg, *args): print msg % args
567
info = warning = error = debug
569
print " fetching the file to establish a connection"
570
fo = urllib2.urlopen(url)
575
print " waiting %i seconds for the server to close the connection" % i
577
sys.stdout.write('\r %2i' % i)
581
sys.stderr.write('\r')
583
print " fetching the file a second time"
584
fo = urllib2.urlopen(url)
589
print ' data are identical'
591
print ' ERROR: DATA DIFFER'
def test(url, N=10):
    """Run the full demo suite against <url>: error handling,
    continuity, speed comparison, dropped-connection recovery."""
    print "checking error hander (do this on a non-200)"
    try: error_handler(url)
    except IOError, e:
        print "exiting - exception will prevent further tests"
        sys.exit()
    print
    print "performing continuity test (making sure stuff isn't corrupted)"
    continuity(url)
    print
    print "performing speed comparison"
    comp(N, url)
    print
    print "performing dropped-connection check"
    test_timeout(url)
612
if __name__ == '__main__':
619
print "%s <integer> <url>" % sys.argv[0]