1
from __future__ import generators
5
A caching http interface that supports ETags and gzip
8
Requires Python 2.3 or later
11
2007-08-18, Rick: Modified so it's able to use a socks proxy if needed.
15
__author__ = "Joe Gregorio (joe@bitworking.org)"
16
__copyright__ = "Copyright 2006, Joe Gregorio"
17
__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
19
"Xavier Verges Farrero",
32
import email.FeedParser
44
# Remove deprecated warning in Python 2.6.
46
from hashlib import sha1 as _sha, md5 as _md5
53
from gettext import gettext as _
61
# Build the appropriate socket wrapper for ssl
63
import ssl # python 2.6
64
_ssl_wrap_socket = ssl.wrap_socket
66
def _ssl_wrap_socket(sock, key_file, cert_file):
    """Pre-2.6 fallback: wrap 'sock' in SSL via the legacy socket.ssl API.

    Used only when the 'ssl' module (Python 2.6+) is unavailable; returns
    an httplib.FakeSocket presenting the wrapped connection.
    """
    ssl_conn = socket.ssl(sock, key_file, cert_file)
    return httplib.FakeSocket(sock, ssl_conn)
71
if sys.version_info >= (2,3):
72
from iri2uri import iri2uri
77
def has_timeout(timeout): # python 2.6
    """Return True if 'timeout' is an explicit timeout value.

    None never counts as a timeout; on Python 2.6+ the socket module's
    _GLOBAL_DEFAULT_TIMEOUT sentinel is likewise treated as "not set".
    """
    if timeout is None:
        return False
    if hasattr(socket, '_GLOBAL_DEFAULT_TIMEOUT'):
        return timeout is not socket._GLOBAL_DEFAULT_TIMEOUT
    return True
82
__all__ = ['Http', 'Response', 'ProxyInfo', 'HttpLib2Error',
83
'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
84
'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
88
# The httplib debug level, set to a non-zero value to get debug output
93
if sys.version_info < (2,4):
99
def HTTPResponse__getheaders(self):
100
"""Return list of (header, value) tuples."""
102
raise httplib.ResponseNotReady()
103
return self.msg.items()
105
if not hasattr(httplib.HTTPResponse, 'getheaders'):
106
httplib.HTTPResponse.getheaders = HTTPResponse__getheaders
108
# All exceptions raised here derive from HttpLib2Error
class HttpLib2Error(Exception):
    """Base class for every exception this module raises."""
    pass

# Some exceptions can be caught and optionally
# be turned back into responses.
class HttpLib2ErrorWithResponse(HttpLib2Error):
    """An error that also carries the offending response and entity body."""
    def __init__(self, desc, response, content):
        HttpLib2Error.__init__(self, desc)
        self.response = response
        self.content = content

class RedirectMissingLocation(HttpLib2ErrorWithResponse):
    """A 3xx redirect arrived without a Location: header."""
    pass

class RedirectLimit(HttpLib2ErrorWithResponse):
    """More redirects were followed than the caller allowed."""
    pass

class FailedToDecompressContent(HttpLib2ErrorWithResponse):
    """The body claimed a compression encoding but would not decompress."""
    pass

class UnimplementedDigestAuthOptionError(HttpLib2ErrorWithResponse):
    """The server requested a Digest auth option we do not support."""
    pass

class UnimplementedHmacDigestAuthOptionError(HttpLib2ErrorWithResponse):
    """The server requested an HMACDigest auth option we do not support."""
    pass

class RelativeURIError(HttpLib2Error):
    """A relative, rather than absolute, URI was supplied."""
    pass

class ServerNotFoundError(HttpLib2Error):
    """The host could not be resolved."""
    pass
132
# Are we removing the cached content too soon on PUT (only delete on 200 Maybe?)
134
# Pluggable cache storage (supports storing the cache in
135
# flat files by default. We need a plug-in architecture
136
# that can support Berkeley DB and Squid)
139
# Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator.
140
# Does not handle Cache-Control: max-stale
141
# Does not use Age: headers when calculating cache freshness.
144
# The number of redirections to follow before giving up.
145
# Note that only GET redirects are automatically followed.
146
# Will also honor 301 requests by saving that info and never
147
# requesting that URI again.
148
DEFAULT_MAX_REDIRECTS = 5
150
# Which headers are hop-by-hop headers by default
151
HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade']
153
def _get_end2end_headers(response):
154
hopbyhop = list(HOP_BY_HOP)
155
hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')])
156
return [header for header in response.keys() if header not in hopbyhop]
158
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
161
"""Parses a URI using the regex given in Appendix B of RFC 3986.
163
(scheme, authority, path, query, fragment) = parse_uri(uri)
165
groups = URI.match(uri).groups()
166
return (groups[1], groups[3], groups[4], groups[6], groups[8])
169
(scheme, authority, path, query, fragment) = parse_uri(uri)
170
if not scheme or not authority:
171
raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
172
authority = authority.lower()
173
scheme = scheme.lower()
176
# Could do syntax based normalization of the URI before
177
# computing the digest. See Section 6.2.2 of Std 66.
178
request_uri = query and "?".join([path, query]) or path
179
scheme = scheme.lower()
180
defrag_uri = scheme + "://" + authority + request_uri
181
return scheme, authority, request_uri, defrag_uri
184
# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/)
185
re_url_scheme = re.compile(r'^\w+://')
186
re_slash = re.compile(r'[?/:|]+')
188
def safename(filename):
189
"""Return a filename suitable for the cache.
191
Strips dangerous and common characters to create a filename we
192
can use to store the cache in.
196
if re_url_scheme.match(filename):
197
if isinstance(filename,str):
198
filename = filename.decode('utf-8')
199
filename = filename.encode('idna')
201
filename = filename.encode('idna')
204
if isinstance(filename,unicode):
205
filename=filename.encode('utf-8')
206
filemd5 = _md5(filename).hexdigest()
207
filename = re_url_scheme.sub("", filename)
208
filename = re_slash.sub(",", filename)
210
# limit length of filename
211
if len(filename)>200:
212
filename=filename[:200]
213
return ",".join((filename, filemd5))
215
NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
216
def _normalize_headers(headers):
217
return dict([ (key.lower(), NORMALIZE_SPACE.sub(value, ' ').strip()) for (key, value) in headers.iteritems()])
219
def _parse_cache_control(headers):
221
if headers.has_key('cache-control'):
222
parts = headers['cache-control'].split(',')
223
parts_with_args = [tuple([x.strip().lower() for x in part.split("=", 1)]) for part in parts if -1 != part.find("=")]
224
parts_wo_args = [(name.strip().lower(), 1) for name in parts if -1 == name.find("=")]
225
retval = dict(parts_with_args + parts_wo_args)
228
# Whether to use a strict mode to parse WWW-Authenticate headers
229
# Might lead to bad results in case of ill-formed header value,
230
# so disabled by default, falling back to relaxed parsing.
231
# Set to true to turn on, useful for testing servers.
232
USE_WWW_AUTH_STRICT_PARSING = 0
235
# [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+ matches a "token" as defined by HTTP
236
# "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?" matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
237
# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
238
# \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
239
WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
240
WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
241
UNQUOTE_PAIRS = re.compile(r'\\(.)')
242
def _parse_www_authenticate(headers, headername='www-authenticate'):
243
"""Returns a dictionary of dictionaries, one dict
246
if headers.has_key(headername):
247
authenticate = headers[headername].strip()
248
www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED
250
# Break off the scheme at the beginning of the line
251
if headername == 'authentication-info':
252
(auth_scheme, the_rest) = ('digest', authenticate)
254
(auth_scheme, the_rest) = authenticate.split(" ", 1)
255
# Now loop over all the key value pairs that come after the scheme,
256
# being careful not to roll into the next scheme
257
match = www_auth.search(the_rest)
260
if match and len(match.groups()) == 3:
261
(key, value, the_rest) = match.groups()
262
auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value) # '\\'.join([x.replace('\\', '') for x in value.split('\\\\')])
263
match = www_auth.search(the_rest)
264
retval[auth_scheme.lower()] = auth_params
265
authenticate = the_rest.strip()
269
def _entry_disposition(response_headers, request_headers):
270
"""Determine freshness from the Date, Expires and Cache-Control headers.
272
We don't handle the following:
274
1. Cache-Control: max-stale
275
2. Age: headers are not used in the calculations.
277
Not that this algorithm is simpler than you might think
278
because we are operating as a private (non-shared) cache.
279
This lets us ignore 's-maxage'. We can also ignore
280
'proxy-invalidate' since we aren't a proxy.
281
We will never return a stale document as
282
fresh as a design decision, and thus the non-implementation
283
of 'max-stale'. This also lets us safely ignore 'must-revalidate'
284
since we operate as if every server has sent 'must-revalidate'.
285
Since we are private we get to ignore both 'public' and
286
'private' parameters. We also ignore 'no-transform' since
287
we don't do any transformations.
288
The 'no-store' parameter is handled at a higher level.
289
So the only Cache-Control parameters we look at are:
298
cc = _parse_cache_control(request_headers)
299
cc_response = _parse_cache_control(response_headers)
301
if request_headers.has_key('pragma') and request_headers['pragma'].lower().find('no-cache') != -1:
302
retval = "TRANSPARENT"
303
if 'cache-control' not in request_headers:
304
request_headers['cache-control'] = 'no-cache'
305
elif cc.has_key('no-cache'):
306
retval = "TRANSPARENT"
307
elif cc_response.has_key('no-cache'):
309
elif cc.has_key('only-if-cached'):
311
elif response_headers.has_key('date'):
312
date = calendar.timegm(email.Utils.parsedate_tz(response_headers['date']))
314
current_age = max(0, now - date)
315
if cc_response.has_key('max-age'):
317
freshness_lifetime = int(cc_response['max-age'])
319
freshness_lifetime = 0
320
elif response_headers.has_key('expires'):
321
expires = email.Utils.parsedate_tz(response_headers['expires'])
323
freshness_lifetime = 0
325
freshness_lifetime = max(0, calendar.timegm(expires) - date)
327
freshness_lifetime = 0
328
if cc.has_key('max-age'):
330
freshness_lifetime = int(cc['max-age'])
332
freshness_lifetime = 0
333
if cc.has_key('min-fresh'):
335
min_fresh = int(cc['min-fresh'])
338
current_age += min_fresh
339
if freshness_lifetime > current_age:
343
def _decompressContent(response, new_content):
344
content = new_content
346
encoding = response.get('content-encoding', None)
347
if encoding in ['gzip', 'deflate']:
348
if encoding == 'gzip':
349
content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
350
if encoding == 'deflate':
351
content = zlib.decompress(content)
352
response['content-length'] = str(len(content))
353
# Record the historical presence of the encoding in a way the won't interfere.
354
response['-content-encoding'] = response['content-encoding']
355
del response['content-encoding']
358
raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
361
def _updateCache(request_headers, response_headers, content, cache, cachekey):
363
cc = _parse_cache_control(request_headers)
364
cc_response = _parse_cache_control(response_headers)
365
if cc.has_key('no-store') or cc_response.has_key('no-store'):
366
cache.delete(cachekey)
368
info = email.Message.Message()
369
for key, value in response_headers.iteritems():
370
if key not in ['status','content-encoding','transfer-encoding']:
373
# Add annotations to the cache to indicate what headers
374
# are variant for this request.
375
vary = response_headers.get('vary', None)
377
vary_headers = vary.lower().replace(' ', '').split(',')
378
for header in vary_headers:
379
key = '-varied-%s' % header
381
info[key] = request_headers[header]
385
status = response_headers.status
389
status_header = 'status: %d\r\n' % response_headers.status
391
header_str = info.as_string()
393
header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str)
394
text = "".join([status_header, header_str, content])
396
cache.set(cachekey, text)
399
dig = _md5("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest()
402
def _wsse_username_token(cnonce, iso_now, password):
    """Compute the WSSE PasswordDigest: Base64(SHA1(nonce + created + password))."""
    raw_digest = _sha("%s%s%s" % (cnonce, iso_now, password)).digest()
    return base64.b64encode(raw_digest).strip()
406
# For credentials we need two things, first
407
# a pool of credentials to try (not necessarily tied to Basic, Digest, etc.)
408
# Then we also need a list of URIs that have already demanded authentication
409
# That list is tricky since sub-URIs can take the same auth, or the
410
# auth scheme may change as you descend the tree.
411
# So we also need each Auth instance to be able to tell us
412
# how close to the 'top' it is.
414
class Authentication(object):
415
def __init__(self, credentials, host, request_uri, headers, response, content, http):
416
(scheme, authority, path, query, fragment) = parse_uri(request_uri)
419
self.credentials = credentials
422
def depth(self, request_uri):
423
(scheme, authority, path, query, fragment) = parse_uri(request_uri)
424
return request_uri[len(self.path):].count("/")
426
def inscope(self, host, request_uri):
427
# XXX Should we normalize the request_uri?
428
(scheme, authority, path, query, fragment) = parse_uri(request_uri)
429
return (host == self.host) and path.startswith(self.path)
431
def request(self, method, request_uri, headers, content):
432
"""Modify the request headers to add the appropriate
433
Authorization header. Over-rise this in sub-classes."""
436
def response(self, response, content):
437
"""Gives us a chance to update with new nonces
438
or such returned from the last authorized response.
439
Over-rise this in sub-classes if necessary.
441
Return TRUE is the request is to be retried, for
442
example Digest may return stale=true.
448
class BasicAuthentication(Authentication):
    """RFC 2617 Basic access authentication: the name/password pair is
    sent base64-encoded on every request."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        userpass = "%s:%s" % self.credentials
        headers['authorization'] = 'Basic ' + base64.b64encode(userpass).strip()
458
class DigestAuthentication(Authentication):
459
"""Only do qop='auth' and MD5, since that
460
is all Apache currently implements"""
461
def __init__(self, credentials, host, request_uri, headers, response, content, http):
462
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
463
challenge = _parse_www_authenticate(response, 'www-authenticate')
464
self.challenge = challenge['digest']
465
qop = self.challenge.get('qop', 'auth')
466
self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None
467
if self.challenge['qop'] is None:
468
raise UnimplementedDigestAuthOptionError( _("Unsupported value for qop: %s." % qop))
469
self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5').upper()
470
if self.challenge['algorithm'] != 'MD5':
471
raise UnimplementedDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
472
self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
473
self.challenge['nc'] = 1
475
def request(self, method, request_uri, headers, content, cnonce = None):
476
"""Modify the request headers"""
477
H = lambda x: _md5(x).hexdigest()
478
KD = lambda s, d: H("%s:%s" % (s, d))
479
A2 = "".join([method, ":", request_uri])
480
self.challenge['cnonce'] = cnonce or _cnonce()
481
request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'],
482
'%08x' % self.challenge['nc'],
483
self.challenge['cnonce'],
484
self.challenge['qop'], H(A2)
486
headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % (
488
self.challenge['realm'],
489
self.challenge['nonce'],
491
self.challenge['algorithm'],
493
self.challenge['qop'],
494
self.challenge['nc'],
495
self.challenge['cnonce'],
497
self.challenge['nc'] += 1
499
def response(self, response, content):
500
if not response.has_key('authentication-info'):
501
challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
502
if 'true' == challenge.get('stale'):
503
self.challenge['nonce'] = challenge['nonce']
504
self.challenge['nc'] = 1
507
updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})
509
if updated_challenge.has_key('nextnonce'):
510
self.challenge['nonce'] = updated_challenge['nextnonce']
511
self.challenge['nc'] = 1
515
class HmacDigestAuthentication(Authentication):
516
"""Adapted from Robert Sayre's code and DigestAuthentication above."""
517
__author__ = "Thomas Broyer (t.broyer@ltgt.net)"
519
def __init__(self, credentials, host, request_uri, headers, response, content, http):
520
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
521
challenge = _parse_www_authenticate(response, 'www-authenticate')
522
self.challenge = challenge['hmacdigest']
523
# TODO: self.challenge['domain']
524
self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
525
if self.challenge['reason'] not in ['unauthorized', 'integrity']:
526
self.challenge['reason'] = 'unauthorized'
527
self.challenge['salt'] = self.challenge.get('salt', '')
528
if not self.challenge.get('snonce'):
529
raise UnimplementedHmacDigestAuthOptionError( _("The challenge doesn't contain a server nonce, or this one is empty."))
530
self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1')
531
if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']:
532
raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
533
self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1')
534
if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
535
raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for pw-algorithm: %s." % self.challenge['pw-algorithm']))
536
if self.challenge['algorithm'] == 'HMAC-MD5':
540
if self.challenge['pw-algorithm'] == 'MD5':
541
self.pwhashmod = _md5
543
self.pwhashmod = _sha
544
self.key = "".join([self.credentials[0], ":",
545
self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
546
":", self.challenge['realm']
548
self.key = self.pwhashmod.new(self.key).hexdigest().lower()
550
def request(self, method, request_uri, headers, content):
551
"""Modify the request headers"""
552
keys = _get_end2end_headers(headers)
553
keylist = "".join(["%s " % k for k in keys])
554
headers_val = "".join([headers[k] for k in keys])
555
created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
557
request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
558
request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
559
headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
561
self.challenge['realm'],
562
self.challenge['snonce'],
570
def response(self, response, content):
571
challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {})
572
if challenge.get('reason') in ['integrity', 'stale']:
577
class WsseAuthentication(Authentication):
578
"""This is thinly tested and should not be relied upon.
579
At this time there isn't any third party server to test against.
580
Blogger and TypePad implemented this algorithm at one point
581
but Blogger has since switched to Basic over HTTPS and
582
TypePad has implemented it wrong, by never issuing a 401
583
challenge but instead requiring your client to telepathically know that
584
their endpoint is expecting WSSE profile="UsernameToken"."""
585
def __init__(self, credentials, host, request_uri, headers, response, content, http):
586
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
588
def request(self, method, request_uri, headers, content):
589
"""Modify the request headers to add the appropriate
590
Authorization header."""
591
headers['Authorization'] = 'WSSE profile="UsernameToken"'
592
iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
594
password_digest = _wsse_username_token(cnonce, iso_now, self.credentials[1])
595
headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % (
601
class GoogleLoginAuthentication(Authentication):
602
def __init__(self, credentials, host, request_uri, headers, response, content, http):
603
from urllib import urlencode
604
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
605
challenge = _parse_www_authenticate(response, 'www-authenticate')
606
service = challenge['googlelogin'].get('service', 'xapi')
607
# Bloggger actually returns the service in the challenge
608
# For the rest we guess based on the URI
609
if service == 'xapi' and request_uri.find("calendar") > 0:
611
# No point in guessing Base or Spreadsheet
612
#elif request_uri.find("spreadsheets") > 0:
615
auth = dict(Email=credentials[0], Passwd=credentials[1], service=service, source=headers['user-agent'])
616
resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'})
617
lines = content.split('\n')
618
d = dict([tuple(line.split("=", 1)) for line in lines if line])
619
if resp.status == 403:
622
self.Auth = d['Auth']
624
def request(self, method, request_uri, headers, content):
625
"""Modify the request headers to add the appropriate
626
Authorization header."""
627
headers['authorization'] = 'GoogleLogin Auth=' + self.Auth
630
# Maps each WWW-Authenticate scheme name to the class that implements it.
AUTH_SCHEME_CLASSES = {
    "basic": BasicAuthentication,
    "wsse": WsseAuthentication,
    "digest": DigestAuthentication,
    "hmacdigest": HmacDigestAuthentication,
    "googlelogin": GoogleLoginAuthentication
}

# Strongest-first order in which challenge schemes are tried.
AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]
640
class FileCache(object):
641
"""Uses a local directory as a store for cached files.
642
Not really safe to use if multiple threads or processes are going to
643
be running on the same cache.
645
def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior
648
if not os.path.exists(cache):
649
os.makedirs(self.cache)
653
cacheFullPath = os.path.join(self.cache, self.safe(key))
655
f = file(cacheFullPath, "rb")
662
def set(self, key, value):
663
cacheFullPath = os.path.join(self.cache, self.safe(key))
664
f = file(cacheFullPath, "wb")
668
def delete(self, key):
669
cacheFullPath = os.path.join(self.cache, self.safe(key))
670
if os.path.exists(cacheFullPath):
671
os.remove(cacheFullPath)
673
class Credentials(object):
    """A registry of (domain, name, password) triples used to answer
    authentication challenges. An empty domain matches every host."""
    def __init__(self):
        self.credentials = []

    def add(self, name, password, domain=""):
        # Domains are stored lower-cased so matching is case-insensitive.
        self.credentials.append((domain.lower(), name, password))

    def clear(self):
        self.credentials = []

    def iter(self, domain):
        # Yield every (name, password) pair whose stored domain is the
        # wildcard "" or exactly matches the requested domain.
        for (stored_domain, name, password) in self.credentials:
            if stored_domain == "" or stored_domain == domain:
                yield (name, password)
688
class KeyCerts(Credentials):
    """Behaves exactly like Credentials, but the stored name/password
    pair is interpreted as an SSL key/cert pair."""
694
class ProxyInfo(object):
    """Collect information required to use a proxy."""
    def __init__(self, proxy_type, proxy_host, proxy_port, proxy_rdns=None, proxy_user=None, proxy_pass=None):
        """The parameter proxy_type must be set to one of socks.PROXY_TYPE_XXX
        constants. For example:

        p = ProxyInfo(proxy_type=socks.PROXY_TYPE_HTTP, proxy_host='localhost', proxy_port=8000)
        """
        self.proxy_type = proxy_type
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_rdns = proxy_rdns
        self.proxy_user = proxy_user
        self.proxy_pass = proxy_pass

    def astuple(self):
        # Order matches the positional arguments of socks.socksocket.setproxy().
        return (self.proxy_type, self.proxy_host, self.proxy_port, self.proxy_rdns,
                self.proxy_user, self.proxy_pass)

    def isgood(self):
        # Usable only when the socks module imported and both host and port were given.
        return socks and (self.proxy_host != None) and (self.proxy_port != None)
712
class HTTPConnectionWithTimeout(httplib.HTTPConnection):
713
"""HTTPConnection subclass that supports timeouts"""
715
def __init__(self, host, port=None, strict=None, timeout=None, proxy_info=None):
716
httplib.HTTPConnection.__init__(self, host, port, strict)
717
self.timeout = timeout
718
self.proxy_info = proxy_info
721
"""Connect to the host and port specified in __init__."""
722
# Mostly verbatim from httplib.py.
723
msg = "getaddrinfo returns an empty list"
724
for res in socket.getaddrinfo(self.host, self.port, 0,
726
af, socktype, proto, canonname, sa = res
728
if self.proxy_info and self.proxy_info.isgood():
729
self.sock = socks.socksocket(af, socktype, proto)
730
self.sock.setproxy(*self.proxy_info.astuple())
732
self.sock = socket.socket(af, socktype, proto)
733
# Different from httplib: support timeouts.
734
if has_timeout(self.timeout):
735
self.sock.settimeout(self.timeout)
736
# End of difference from httplib.
737
if self.debuglevel > 0:
738
print "connect: (%s, %s)" % (self.host, self.port)
740
self.sock.connect(sa)
741
except socket.error, msg:
742
if self.debuglevel > 0:
743
print 'connect fail:', (self.host, self.port)
750
raise socket.error, msg
752
class HTTPSConnectionWithTimeout(httplib.HTTPSConnection):
753
"This class allows communication via SSL."
755
def __init__(self, host, port=None, key_file=None, cert_file=None,
756
strict=None, timeout=None, proxy_info=None):
757
httplib.HTTPSConnection.__init__(self, host, port=port, key_file=key_file,
758
cert_file=cert_file, strict=strict)
759
self.timeout = timeout
760
self.proxy_info = proxy_info
763
"Connect to a host on a given (SSL) port."
765
if self.proxy_info and self.proxy_info.isgood():
766
sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
767
sock.setproxy(*self.proxy_info.astuple())
769
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
771
if has_timeout(self.timeout):
772
sock.settimeout(self.timeout)
773
sock.connect((self.host, self.port))
774
self.sock =_ssl_wrap_socket(sock, self.key_file, self.cert_file)
779
"""An HTTP client that handles:
791
def __init__(self, cache=None, timeout=None, proxy_info=None):
792
"""The value of proxy_info is a ProxyInfo instance.
794
If 'cache' is a string then it is used as a directory name
795
for a disk cache. Otherwise it must be an object that supports
796
the same interface as FileCache."""
797
self.proxy_info = proxy_info
798
# Map domain name to an httplib connection
799
self.connections = {}
800
# The location of the cache, for now a directory
801
# where cached responses are held.
802
if cache and isinstance(cache, str):
803
self.cache = FileCache(cache)
808
self.credentials = Credentials()
811
self.certificates = KeyCerts()
813
# authorization objects
814
self.authorizations = []
816
# If set to False then no redirects are followed, even safe ones.
817
self.follow_redirects = True
819
# Which HTTP methods do we apply optimistic concurrency to, i.e.
820
# which methods get an "if-match:" etag header added to them.
821
self.optimistic_concurrency_methods = ["PUT"]
823
# If 'follow_redirects' is True, and this is set to True then
824
# all redirecs are followed, including unsafe ones.
825
self.follow_all_redirects = False
827
self.ignore_etag = False
829
self.force_exception_to_status_code = False
831
self.timeout = timeout
833
def _auth_from_challenge(self, host, request_uri, headers, response, content):
834
"""A generator that creates Authorization objects
835
that can be applied to requests.
837
challenges = _parse_www_authenticate(response, 'www-authenticate')
838
for cred in self.credentials.iter(host):
839
for scheme in AUTH_SCHEME_ORDER:
840
if challenges.has_key(scheme):
841
yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)
843
def add_credentials(self, name, password, domain=""):
    """Register a name/password pair to be offered whenever a request
    (optionally restricted to 'domain') demands authentication."""
    self.credentials.add(name, password, domain)
848
def add_certificate(self, key, cert, domain):
    """Register an SSL client key/cert pair to present when 'domain'
    requires certificate authentication."""
    self.certificates.add(key, cert, domain)
853
def clear_credentials(self):
    """Forget every stored name/password pair and discard any
    Authorization handlers that were built from them."""
    self.credentials.clear()
    self.authorizations = []
859
def _conn_request(self, conn, request_uri, method, body, headers):
862
conn.request(method, request_uri, body, headers)
863
except socket.gaierror:
865
raise ServerNotFoundError("Unable to find the server at %s" % conn.host)
866
except (socket.error, httplib.HTTPException):
867
# Just because the server closed the connection doesn't apparently mean
868
# that the server didn't send a response.
871
response = conn.getresponse()
872
except (socket.error, httplib.HTTPException):
884
content = response.read()
885
response = Response(response)
887
content = _decompressContent(response, content)
889
return (response, content)
892
def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey):
893
"""Do the actual request using the connection object
894
and also follow one level of redirects if necessary"""
896
auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)]
897
auth = auths and sorted(auths)[0][1] or None
899
auth.request(method, request_uri, headers, body)
901
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
904
if auth.response(response, body):
905
auth.request(method, request_uri, headers, body)
906
(response, content) = self._conn_request(conn, request_uri, method, body, headers )
907
response._stale_digest = 1
909
if response.status == 401:
910
for authorization in self._auth_from_challenge(host, request_uri, headers, response, content):
911
authorization.request(method, request_uri, headers, body)
912
(response, content) = self._conn_request(conn, request_uri, method, body, headers, )
913
if response.status != 401:
914
self.authorizations.append(authorization)
915
authorization.response(response, body)
918
if (self.follow_all_redirects or (method in ["GET", "HEAD"]) or response.status == 303):
919
if self.follow_redirects and response.status in [300, 301, 302, 303, 307]:
920
# Pick out the location header and basically start from the beginning
921
# remembering first to strip the ETag header and decrement our 'depth'
923
if not response.has_key('location') and response.status != 300:
924
raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."), response, content)
925
# Fix-up relative redirects (which violate an RFC 2616 MUST)
926
if response.has_key('location'):
927
location = response['location']
928
(scheme, authority, path, query, fragment) = parse_uri(location)
929
if authority == None:
930
response['location'] = urlparse.urljoin(absolute_uri, location)
931
if response.status == 301 and method in ["GET", "HEAD"]:
932
response['-x-permanent-redirect-url'] = response['location']
933
if not response.has_key('content-location'):
934
response['content-location'] = absolute_uri
935
_updateCache(headers, response, content, self.cache, cachekey)
936
if headers.has_key('if-none-match'):
937
del headers['if-none-match']
938
if headers.has_key('if-modified-since'):
939
del headers['if-modified-since']
940
if response.has_key('location'):
941
location = response['location']
942
old_response = copy.deepcopy(response)
943
if not old_response.has_key('content-location'):
944
old_response['content-location'] = absolute_uri
945
redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
946
(response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
947
response.previous = old_response
949
raise RedirectLimit( _("Redirected more times than rediection_limit allows."), response, content)
950
elif response.status in [200, 203] and method == "GET":
951
# Don't cache 206's since we aren't going to handle byte range requests
952
if not response.has_key('content-location'):
953
response['content-location'] = absolute_uri
954
_updateCache(headers, response, content, self.cache, cachekey)
956
return (response, content)
958
def _normalize_headers(self, headers):
    """Normalize a request-header dict for this request.

    Delegates to the module-level _normalize_headers helper; exists as a
    method so subclasses can override the normalization policy.
    """
    return _normalize_headers(headers)
961
# Need to catch and rebrand some exceptions
962
# Then need to optionally turn all exceptions into status codes
963
# including all socket.* and httplib.* exceptions.
966
def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None):
967
""" Performs a single HTTP request.
968
The 'uri' is the URI of the HTTP resource and can begin
969
with either 'http' or 'https'. The value of 'uri' must be an absolute URI.
971
The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
972
There is no restriction on the methods allowed.
974
The 'body' is the entity body to be sent with the request. It is a string
977
Any extra headers that are to be sent with the request should be provided in the
978
'headers' dictionary.
980
The maximum number of redirect to follow before raising an
981
exception is 'redirections. The default is 5.
983
The return value is a tuple of (response, content), the first
984
being and instance of the 'Response' class, the second being
985
a string that contains the response entity body.
991
headers = self._normalize_headers(headers)
993
if not headers.has_key('user-agent'):
994
headers['user-agent'] = "Python-httplib2/%s" % __version__
998
(scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
999
domain_port = authority.split(":")[0:2]
1000
if len(domain_port) == 2 and domain_port[1] == '443' and scheme == 'http':
1002
authority = domain_port[0]
1004
conn_key = scheme+":"+authority
1005
if conn_key in self.connections:
1006
conn = self.connections[conn_key]
1008
if not connection_type:
1009
connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
1010
certs = list(self.certificates.iter(authority))
1011
if scheme == 'https' and certs:
1012
conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0],
1013
cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info)
1015
conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info)
1016
conn.set_debuglevel(debuglevel)
1018
if method in ["GET", "HEAD"] and 'range' not in headers and 'accept-encoding' not in headers:
1019
headers['accept-encoding'] = 'gzip, deflate'
1021
info = email.Message.Message()
1024
cachekey = defrag_uri
1025
cached_value = self.cache.get(cachekey)
1027
# info = email.message_from_string(cached_value)
1029
# Need to replace the line above with the kludge below
1030
# to fix the non-existent bug not fixed in this
1031
# bug report: http://mail.python.org/pipermail/python-bugs-list/2005-September/030289.html
1033
info, content = cached_value.split('\r\n\r\n', 1)
1034
feedparser = email.FeedParser.FeedParser()
1035
feedparser.feed(info)
1036
info = feedparser.close()
1037
feedparser._parse = None
1039
self.cache.delete(cachekey)
1045
if method in self.optimistic_concurrency_methods and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
1046
# http://www.w3.org/1999/04/Editing/
1047
headers['if-match'] = info['etag']
1049
if method not in ["GET", "HEAD"] and self.cache and cachekey:
1050
# RFC 2616 Section 13.10
1051
self.cache.delete(cachekey)
1053
# Check the vary header in the cache to see if this request
1054
# matches what varies in the cache.
1055
if method in ['GET', 'HEAD'] and 'vary' in info:
1057
vary_headers = vary.lower().replace(' ', '').split(',')
1058
for header in vary_headers:
1059
key = '-varied-%s' % header
1061
if headers.get(header, '') != value:
1065
if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
1066
if info.has_key('-x-permanent-redirect-url'):
1067
# Should cached permanent redirects be counted in our redirection count? For now, yes.
1068
(response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
1069
response.previous = Response(info)
1070
response.previous.fromcache = True
1072
# Determine our course of action:
1073
# Is the cached entry fresh or stale?
1074
# Has the client requested a non-cached response?
1076
# There seems to be three possible answers:
1077
# 1. [FRESH] Return the cache entry w/o doing a GET
1078
# 2. [STALE] Do the GET (but add in cache validators if available)
1079
# 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
1080
entry_disposition = _entry_disposition(info, headers)
1082
if entry_disposition == "FRESH":
1083
if not cached_value:
1084
info['status'] = '504'
1086
response = Response(info)
1088
response.fromcache = True
1089
return (response, content)
1091
if entry_disposition == "STALE":
1092
if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
1093
headers['if-none-match'] = info['etag']
1094
if info.has_key('last-modified') and not 'last-modified' in headers:
1095
headers['if-modified-since'] = info['last-modified']
1096
elif entry_disposition == "TRANSPARENT":
1099
(response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
1101
if response.status == 304 and method == "GET":
1102
# Rewrite the cache entry with the new end-to-end headers
1103
# Take all headers that are in response
1104
# and overwrite their values in info.
1105
# unless they are hop-by-hop, or are listed in the connection header.
1107
for key in _get_end2end_headers(response):
1108
info[key] = response[key]
1109
merged_response = Response(info)
1110
if hasattr(response, "_stale_digest"):
1111
merged_response._stale_digest = response._stale_digest
1112
_updateCache(headers, merged_response, content, self.cache, cachekey)
1113
response = merged_response
1114
response.status = 200
1115
response.fromcache = True
1117
elif response.status == 200:
1118
content = new_content
1120
self.cache.delete(cachekey)
1121
content = new_content
1123
cc = _parse_cache_control(headers)
1124
if cc.has_key('only-if-cached'):
1125
info['status'] = '504'
1126
response = Response(info)
1129
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
1130
except Exception, e:
1131
if self.force_exception_to_status_code:
1132
if isinstance(e, HttpLib2ErrorWithResponse):
1133
response = e.response
1135
response.status = 500
1136
response.reason = str(e)
1137
elif isinstance(e, socket.timeout):
1138
content = "Request Timeout"
1139
response = Response( {
1140
"content-type": "text/plain",
1142
"content-length": len(content)
1144
response.reason = "Request Timeout"
1147
response = Response( {
1148
"content-type": "text/plain",
1150
"content-length": len(content)
1152
response.reason = "Bad Request"
1157
return (response, content)
1161
class Response(dict):
1162
"""An object more like email.Message than httplib.HTTPResponse."""
1164
"""Is this response from our local cache"""
1167
"""HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1. """
1170
"Status code returned by server. "
1173
"""Reason phrase returned by server."""
1178
def __init__(self, info):
1179
# info is either an email.Message or
1180
# an httplib.HTTPResponse object.
1181
if isinstance(info, httplib.HTTPResponse):
1182
for key, value in info.getheaders():
1183
self[key.lower()] = value
1184
self.status = info.status
1185
self['status'] = str(self.status)
1186
self.reason = info.reason
1187
self.version = info.version
1188
elif isinstance(info, email.Message.Message):
1189
for key, value in info.items():
1191
self.status = int(self['status'])
1193
for key, value in info.iteritems():
1195
self.status = int(self.get('status', self.status))
1198
def __getattr__(self, name):
1202
raise AttributeError, name