1
"""An extensible library for opening URLs using a variety of protocols
3
The simplest way to use this module is to call the urlopen function,
4
which accepts a string containing a URL or a Request object (described
5
below). It opens the URL and returns the results as file-like
6
object; the returned object has some extra methods described below.
8
The OpenerDirector manages a collection of Handler objects that do
9
all the actual work. Each Handler implements a particular protocol or
10
option. The OpenerDirector is a composite object that invokes the
11
Handlers needed to open the requested URL. For example, the
12
HTTPHandler performs HTTP GET and POST requests and deals with
13
non-error returns. The HTTPRedirectHandler automatically deals with
14
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15
deals with digest authentication.
17
urlopen(url, data=None) -- basic usage is that same as original
18
urllib. pass the url and optionally data to post to an HTTP URL, and
19
get a file-like object back. One difference is that you can also pass
20
a Request instance instead of URL. Raises a URLError (subclass of
21
IOError); for HTTP errors, raises an HTTPError, which can also be
22
treated as a valid response.
24
build_opener -- function that creates a new OpenerDirector instance.
25
will install the default handlers. accepts one or more Handlers as
26
arguments, either instances or Handler classes that it will
27
instantiate. if one of the argument is a subclass of the default
28
handler, the argument will be installed instead of the default.
30
install_opener -- installs a new opener as the default opener.
35
Request -- an object that encapsulates the state of a request. the
36
state can be a simple as the URL. it can also include extra HTTP
37
headers, e.g. a User-Agent.
42
URLError-- a subclass of IOError, individual protocols have their own
45
HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46
as an exceptional event or valid response
49
BaseHandler and parent
50
_call_chain conventions
56
# set up authentication info
57
authinfo = urllib2.HTTPBasicAuthHandler()
58
authinfo.add_password('realm', 'host', 'username', 'password')
60
proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62
# build a new opener that adds authentication and caching FTP handlers
63
opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
66
urllib2.install_opener(opener)
68
f = urllib2.urlopen('http://www.python.org/')
74
# If an authentication error handler that tries to perform
75
# authentication for some reason but fails, how should the error be
76
# signalled? The client needs to know the HTTP error code. But if
77
# the handler knows that the problem was, e.g., that it didn't know
78
# that hash algo that requested in the challenge, it would be good to
79
# pass that information along to the client, too.
83
# documentation (getting there)
85
# abstract factory for opener
86
# ftp errors aren't handled cleanly
87
# gopher can return a socket.error
88
# check digest against correct (i.e. non-apache) implementation
111
from cStringIO import StringIO
113
from StringIO import StringIO
115
# not sure how many of these need to be gotten rid of
116
from urllib import (unwrap, unquote, splittype, splithost,
117
addinfourl, splitport, splitgophertype, splitquery,
118
splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
120
# support for FileHandler, proxies via environment variables
121
from urllib import localhost, url2pathname, getproxies
126
def urlopen(url, data=None):
    """Open *url* (a string or a Request object) using the global opener.

    *data* is optional POST data for HTTP URLs.  The module-level
    opener is built lazily on first use; install_opener() replaces it.
    Returns a file-like object; raises URLError on failure.
    """
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data)
132
def install_opener(opener):
    """Install *opener* as the global OpenerDirector used by urlopen()."""
    global _opener
    _opener = opener
136
# do these error classes make sense?
137
# make sure all of the IOError stuff is overridden. we just want to be
140
class URLError(IOError):
    # URLError is a sub-type of IOError, but it doesn't share any of
    # the implementation.  need to override __init__ and __str__.
    # It sets self.args for compatibility with other EnvironmentError
    # subclasses, but args doesn't have the typical format with errno in
    # slot 0 and strerror in slot 1.  This may be better than nothing.
    def __init__(self, reason):
        self.args = reason,
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason
153
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)
173
class GopherError(URLError):
179
def __init__(self, url, data=None, headers={},
180
origin_req_host=None, unverifiable=False):
181
# unwrap('<URL:type://host/path>') --> 'type://host/path'
182
self.__original = unwrap(url)
184
# self.__r_type is what's left after doing the splittype
189
for key, value in headers.items():
190
self.add_header(key, value)
191
self.unredirected_hdrs = {}
192
if origin_req_host is None:
193
origin_req_host = cookielib.request_host(self)
194
self.origin_req_host = origin_req_host
195
self.unverifiable = unverifiable
197
def __getattr__(self, attr):
198
# XXX this is a fallback mechanism to guard against these
199
# methods getting called in a non-standard order. this may be
200
# too complicated and/or unnecessary.
201
# XXX should the __r_XXX attributes be public?
202
if attr[:12] == '_Request__r_':
204
if hasattr(Request, 'get_' + name):
205
getattr(self, 'get_' + name)()
206
return getattr(self, attr)
207
raise AttributeError, attr
209
def get_method(self):
215
# XXX these helper methods are lame
217
def add_data(self, data):
221
return self.data is not None
226
def get_full_url(self):
    """Return the complete URL that was passed to the constructor."""
    return self.__original
230
if self.type is None:
231
self.type, self.__r_type = splittype(self.__original)
232
if self.type is None:
233
raise ValueError, "unknown url type: %s" % self.__original
237
if self.host is None:
238
self.host, self.__r_host = splithost(self.__r_type)
240
self.host = unquote(self.host)
243
def get_selector(self):
246
def set_proxy(self, host, type):
    """Redirect this request through the proxy *host* using scheme *type*.

    The selector becomes the full original URL, as required when
    talking to a proxy.
    """
    self.host, self.type = host, type
    self.__r_host = self.__original
250
def get_origin_req_host(self):
    """Return the host of the origin request (used for cookie handling)."""
    return self.origin_req_host
253
def is_unverifiable(self):
    """Return the request's "unverifiable" flag (RFC 2965 terminology)."""
    return self.unverifiable
256
def add_header(self, key, val):
    """Set a request header, replacing any existing value for the key.

    Keys are normalized with str.capitalize() so repeated adds with
    different casing collapse to one header.
    """
    # useful for something like authentication
    self.headers[key.capitalize()] = val
260
def add_unredirected_header(self, key, val):
    """Set a header that will NOT be copied onto a redirected request."""
    # will not be added to a redirected request
    self.unredirected_hdrs[key.capitalize()] = val
264
def has_header(self, header_name):
    """Return True if the header is set, normal or unredirected."""
    return (header_name in self.headers or
            header_name in self.unredirected_hdrs)
268
def get_header(self, header_name, default=None):
    """Return the header's value, falling back to the unredirected
    headers and then to *default*."""
    return self.headers.get(
        header_name,
        self.unredirected_hdrs.get(header_name, default))
273
def header_items(self):
    """Return all headers as (name, value) pairs.

    Normal headers take precedence over unredirected ones when the
    same key appears in both.
    """
    hdrs = self.unredirected_hdrs.copy()
    hdrs.update(self.headers)
    return hdrs.items()
278
class OpenerDirector:
280
server_version = "Python-urllib/%s" % __version__
281
self.addheaders = [('User-agent', server_version)]
282
# manage the individual handlers
284
self.handle_open = {}
285
self.handle_error = {}
286
self.process_response = {}
287
self.process_request = {}
289
def add_handler(self, handler):
291
for meth in dir(handler):
294
condition = meth[i+1:]
296
if condition.startswith("error"):
297
j = condition.find("_") + i + 1
303
lookup = self.handle_error.get(protocol, {})
304
self.handle_error[protocol] = lookup
305
elif condition == "open":
307
lookup = getattr(self, "handle_"+condition)
308
elif condition in ["response", "request"]:
310
lookup = getattr(self, "process_"+condition)
314
handlers = lookup.setdefault(kind, [])
316
bisect.insort(handlers, handler)
318
handlers.append(handler)
322
# XXX why does self.handlers need to be sorted?
323
bisect.insort(self.handlers, handler)
324
handler.add_parent(self)
327
# Only exists for backwards compatibility.
330
def _call_chain(self, chain, kind, meth_name, *args):
331
# XXX raise an exception if no one else should try to handle
332
# this url. return None if you can't but someone else could.
333
handlers = chain.get(kind, ())
334
for handler in handlers:
335
func = getattr(handler, meth_name)
338
if result is not None:
341
def open(self, fullurl, data=None):
342
# accept a URL or a Request object
343
if isinstance(fullurl, basestring):
344
req = Request(fullurl, data)
350
protocol = req.get_type()
352
# pre-process request
353
meth_name = protocol+"_request"
354
for processor in self.process_request.get(protocol, []):
355
meth = getattr(processor, meth_name)
358
response = self._open(req, data)
360
# post-process response
361
meth_name = protocol+"_response"
362
for processor in self.process_response.get(protocol, []):
363
meth = getattr(processor, meth_name)
364
response = meth(req, response)
368
def _open(self, req, data=None):
369
result = self._call_chain(self.handle_open, 'default',
374
protocol = req.get_type()
375
result = self._call_chain(self.handle_open, protocol, protocol +
380
return self._call_chain(self.handle_open, 'unknown',
383
def error(self, proto, *args):
384
if proto in ['http', 'https']:
385
# XXX http[s] protocols are special-cased
386
dict = self.handle_error['http'] # https is not different than http
387
proto = args[2] # YUCK!
388
meth_name = 'http_error_%s' % proto
392
dict = self.handle_error
393
meth_name = proto + '_error'
395
args = (dict, proto, meth_name) + args
396
result = self._call_chain(*args)
401
args = (dict, 'default', 'http_error_default') + orig_args
402
return self._call_chain(*args)
404
# XXX probably also want an abstract factory that knows when it makes
405
# sense to skip a superclass in favor of a subclass and when it might
406
# make sense to include both
408
def build_opener(*handlers):
409
"""Create an opener object from a list of handlers.
411
The opener will use several default handlers, including support
414
If any of the handlers passed as arguments are subclasses of the
415
default handlers, the default handlers will not be used.
418
opener = OpenerDirector()
419
default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
420
HTTPDefaultErrorHandler, HTTPRedirectHandler,
421
FTPHandler, FileHandler, HTTPErrorProcessor]
422
if hasattr(httplib, 'HTTPS'):
423
default_classes.append(HTTPSHandler)
425
for klass in default_classes:
426
for check in handlers:
427
if inspect.isclass(check):
428
if issubclass(check, klass):
430
elif isinstance(check, klass):
433
default_classes.remove(klass)
435
for klass in default_classes:
436
opener.add_handler(klass())
439
if inspect.isclass(h):
441
opener.add_handler(h)
447
def add_parent(self, parent):
451
# Only exists for backwards compatibility
454
def __lt__(self, other):
455
if not hasattr(other, "handler_order"):
456
# Try to preserve the old behavior of having custom classes
457
# inserted after default ones (works only for custom user
458
# classes which are not aware of handler_order).
460
return self.handler_order < other.handler_order
463
class HTTPErrorProcessor(BaseHandler):
464
"""Process HTTP error responses."""
465
handler_order = 1000 # after all other processing
467
def http_response(self, request, response):
468
code, msg, hdrs = response.code, response.msg, response.info()
470
if code not in (200, 206):
471
response = self.parent.error(
472
'http', request, response, code, msg, hdrs)
476
https_response = http_response
478
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: turn any unhandled HTTP error into HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
482
class HTTPRedirectHandler(BaseHandler):
483
# maximum number of redirections to any single URL
484
# this is needed because of the state that cookies introduce
486
# maximum total number of redirections (regardless of URL) before
487
# assuming we're in a loop
488
max_redirections = 10
490
def redirect_request(self, req, fp, code, msg, headers, newurl):
491
"""Return a Request or None in response to a redirect.
493
This is called by the http_error_30x methods when a
494
redirection response is received. If a redirection should
495
take place, return a new Request to allow http_error_30x to
496
perform the redirect. Otherwise, raise HTTPError if no-one
497
else should try to handle this url. Return None if you can't
498
but another Handler might.
501
if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
502
or code in (301, 302, 303) and m == "POST"):
503
# Strictly (according to RFC 2616), 301 or 302 in response
504
# to a POST MUST NOT cause a redirection without confirmation
505
# from the user (of urllib2, in this case). In practice,
506
# essentially all clients do redirect in this case, so we
508
return Request(newurl,
510
origin_req_host=req.get_origin_req_host(),
513
raise HTTPError(req.get_full_url(), code, msg, headers, fp)
515
# Implementation note: To avoid the server sending us into an
516
# infinite loop, the request object needs to track what URLs we
517
# have already seen. Do this by adding a handler-specific
518
# attribute to the Request object.
519
def http_error_302(self, req, fp, code, msg, headers):
520
# Some servers (incorrectly) return multiple Location headers
521
# (so probably same goes for URI). Use first header.
522
if 'location' in headers:
523
newurl = headers.getheaders('location')[0]
524
elif 'uri' in headers:
525
newurl = headers.getheaders('uri')[0]
528
newurl = urlparse.urljoin(req.get_full_url(), newurl)
530
# XXX Probably want to forget about the state of the current
531
# request, although that might interact poorly with other
532
# handlers that also use handler-specific request attributes
533
new = self.redirect_request(req, fp, code, msg, headers, newurl)
538
# .redirect_dict has a key url if url was previously visited.
539
if hasattr(req, 'redirect_dict'):
540
visited = new.redirect_dict = req.redirect_dict
541
if (visited.get(newurl, 0) >= self.max_repeats or
542
len(visited) >= self.max_redirections):
543
raise HTTPError(req.get_full_url(), code,
544
self.inf_msg + msg, headers, fp)
546
visited = new.redirect_dict = req.redirect_dict = {}
547
visited[newurl] = visited.get(newurl, 0) + 1
549
# Don't close the fp until we are sure that we won't use it
554
return self.parent.open(new)
556
http_error_301 = http_error_303 = http_error_307 = http_error_302
558
inf_msg = "The HTTP server returned a redirect error that would " \
559
"lead to an infinite loop.\n" \
560
"The last 30x error message was:\n"
562
class ProxyHandler(BaseHandler):
563
# Proxies must be in front
566
def __init__(self, proxies=None):
568
proxies = getproxies()
569
assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
570
self.proxies = proxies
571
for type, url in proxies.items():
572
setattr(self, '%s_open' % type,
573
lambda r, proxy=url, type=type, meth=self.proxy_open: \
574
meth(r, proxy, type))
576
def proxy_open(self, req, proxy, type):
577
orig_type = req.get_type()
578
type, r_type = splittype(proxy)
579
host, XXX = splithost(r_type)
581
user_pass, host = host.split('@', 1)
583
user, password = user_pass.split(':', 1)
584
user_pass = base64.encodestring('%s:%s' % (unquote(user),
586
req.add_header('Proxy-authorization', 'Basic ' + user_pass)
588
req.set_proxy(host, type)
589
if orig_type == type:
590
# let other handlers take care of it
591
# XXX this only makes sense if the proxy is before the
595
# need to start over, because the other handlers don't
596
# grok the proxy's URL type
597
return self.parent.open(req)
599
# feature suggested by Duncan Booth
600
# XXX custom is not a good name
602
# either pass a function to the constructor or override handle
603
def __init__(self, proto, func=None, proxy_addr=None):
606
self.addr = proxy_addr
608
def handle(self, req):
609
if self.func and self.func(req):
615
class CustomProxyHandler(BaseHandler):
616
# Proxies must be in front
619
def __init__(self, *proxies):
622
def proxy_open(self, req):
623
proto = req.get_type()
625
proxies = self.proxies[proto]
630
req.set_proxy(p.get_proxy())
631
return self.parent.open(req)
634
def do_proxy(self, p, req):
635
return self.parent.open(req)
637
def add_proxy(self, cpo):
638
if cpo.proto in self.proxies:
639
self.proxies[cpo.proto].append(cpo)
641
self.proxies[cpo.proto] = [cpo]
643
class HTTPPasswordMgr:
647
def add_password(self, realm, uri, user, passwd):
648
# uri could be a single URI or a sequence
649
if isinstance(uri, basestring):
651
uri = tuple(map(self.reduce_uri, uri))
652
if not realm in self.passwd:
653
self.passwd[realm] = {}
654
self.passwd[realm][uri] = (user, passwd)
656
def find_user_password(self, realm, authuri):
657
domains = self.passwd.get(realm, {})
658
authuri = self.reduce_uri(authuri)
659
for uris, authinfo in domains.iteritems():
661
if self.is_suburi(uri, authuri):
665
def reduce_uri(self, uri):
666
"""Accept netloc or URI and extract only the netloc and path"""
667
parts = urlparse.urlparse(uri)
669
return parts[1], parts[2] or '/'
673
def is_suburi(self, base, test):
674
"""Check if test is below base in a URI tree
676
Both args must be URIs in reduced form.
680
if base[0] != test[0]:
682
common = posixpath.commonprefix((base[1], test[1]))
683
if len(common) == len(base[1]):
688
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
690
def find_user_password(self, realm, authuri):
691
user, password = HTTPPasswordMgr.find_user_password(self, realm,
694
return user, password
695
return HTTPPasswordMgr.find_user_password(self, None, authuri)
698
class AbstractBasicAuthHandler:
700
rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
702
# XXX there can actually be multiple auth-schemes in a
703
# www-authenticate header. should probably be a lot more careful
704
# in parsing them to extract multiple alternatives
706
def __init__(self, password_mgr=None):
    """Initialize with *password_mgr*, creating a default HTTPPasswordMgr
    when none is supplied; expose its add_password as a bound shortcut."""
    if password_mgr is None:
        password_mgr = HTTPPasswordMgr()
    self.passwd = password_mgr
    self.add_password = self.passwd.add_password
712
def http_error_auth_reqed(self, authreq, host, req, headers):
713
# XXX could be multiple headers
714
authreq = headers.get(authreq, None)
716
mo = AbstractBasicAuthHandler.rx.search(authreq)
718
scheme, realm = mo.groups()
719
if scheme.lower() == 'basic':
720
return self.retry_http_basic_auth(host, req, realm)
722
def retry_http_basic_auth(self, host, req, realm):
723
user,pw = self.passwd.find_user_password(realm, host)
725
raw = "%s:%s" % (user, pw)
726
auth = 'Basic %s' % base64.encodestring(raw).strip()
727
if req.headers.get(self.auth_header, None) == auth:
729
req.add_header(self.auth_header, auth)
730
return self.parent.open(req)
734
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
736
auth_header = 'Authorization'
738
def http_error_401(self, req, fp, code, msg, headers):
739
host = urlparse.urlparse(req.get_full_url())[1]
740
return self.http_error_auth_reqed('www-authenticate',
744
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
746
auth_header = 'Proxy-authorization'
748
def http_error_407(self, req, fp, code, msg, headers):
749
host = req.get_host()
750
return self.http_error_auth_reqed('proxy-authenticate',
755
"""Return n random bytes."""
756
# Use /dev/urandom if it is available. Fall back to random module
757
# if not. It might be worthwhile to extend this function to use
758
# other platform-specific mechanisms for getting random bytes.
759
if os.path.exists("/dev/urandom"):
760
f = open("/dev/urandom")
765
L = [chr(random.randrange(0, 256)) for i in range(n)]
768
class AbstractDigestAuthHandler:
769
# Digest authentication is specified in RFC 2617.
771
# XXX The client does not inspect the Authentication-Info header
772
# in a successful response.
774
# XXX It should be possible to test this implementation against
775
# a mock server that just generates a static set of challenges.
777
# XXX qop="auth-int" supports is shaky
779
def __init__(self, passwd=None):
781
passwd = HTTPPasswordMgr()
783
self.add_password = self.passwd.add_password
787
def reset_retry_count(self):
    # Forget previous failed attempts so the next auth challenge
    # starts with a fresh retry budget.
    self.retried = 0
790
def http_error_auth_reqed(self, auth_header, host, req, headers):
791
authreq = headers.get(auth_header, None)
793
# Don't fail endlessly - if we failed once, we'll probably
794
# fail a second time. Hm. Unless the Password Manager is
795
# prompting for the information. Crap. This isn't great
796
# but it's better than the current 'repeat until recursion
797
# depth exceeded' approach <wink>
798
raise HTTPError(req.get_full_url(), 401, "digest auth failed",
803
scheme = authreq.split()[0]
804
if scheme.lower() == 'digest':
805
return self.retry_http_digest_auth(req, authreq)
807
raise ValueError("AbstractDigestAuthHandler doesn't know "
810
def retry_http_digest_auth(self, req, auth):
811
token, challenge = auth.split(' ', 1)
812
chal = parse_keqv_list(parse_http_list(challenge))
813
auth = self.get_authorization(req, chal)
815
auth_val = 'Digest %s' % auth
816
if req.headers.get(self.auth_header, None) == auth_val:
818
req.add_header(self.auth_header, auth_val)
819
resp = self.parent.open(req)
822
def get_cnonce(self, nonce):
823
# The cnonce-value is an opaque
824
# quoted string value provided by the client and used by both client
825
# and server to avoid chosen plaintext attacks, to provide mutual
826
# authentication, and to provide some message integrity protection.
827
# This isn't a fabulous effort, but it's probably Good Enough.
828
dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
829
randombytes(8))).hexdigest()
832
def get_authorization(self, req, chal):
834
realm = chal['realm']
835
nonce = chal['nonce']
836
qop = chal.get('qop')
837
algorithm = chal.get('algorithm', 'MD5')
838
# mod_digest doesn't send an opaque, even though it isn't
839
# supposed to be optional
840
opaque = chal.get('opaque', None)
844
H, KD = self.get_algorithm_impls(algorithm)
848
user, pw = self.passwd.find_user_password(realm, req.get_full_url())
852
# XXX not implemented yet
854
entdig = self.get_entity_digest(req.get_data(), chal)
858
A1 = "%s:%s:%s" % (user, realm, pw)
859
A2 = "%s:%s" % (req.get_method(),
860
# XXX selector: what about proxies and full urls
863
self.nonce_count += 1
864
ncvalue = '%08x' % self.nonce_count
865
cnonce = self.get_cnonce(nonce)
866
noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
867
respdig = KD(H(A1), noncebit)
869
respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
871
# XXX handle auth-int.
874
# XXX should the partial digests be encoded too?
876
base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
877
'response="%s"' % (user, realm, nonce, req.get_selector(),
880
base = base + ', opaque="%s"' % opaque
882
base = base + ', digest="%s"' % entdig
883
if algorithm != 'MD5':
884
base = base + ', algorithm="%s"' % algorithm
886
base = base + ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
889
def get_algorithm_impls(self, algorithm):
890
# lambdas assume digest modules are imported at the top level
891
if algorithm == 'MD5':
892
H = lambda x: md5.new(x).hexdigest()
893
elif algorithm == 'SHA':
894
H = lambda x: sha.new(x).hexdigest()
896
KD = lambda s, d: H("%s:%s" % (s, d))
899
def get_entity_digest(self, data, chal):
    # Entity digests (qop="auth-int") are not implemented yet;
    # callers treat None as "no entity digest available".
    return None
904
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
905
"""An authentication protocol defined by RFC 2069
907
Digest authentication improves on basic authentication because it
908
does not transmit passwords in the clear.
911
auth_header = 'Authorization'
913
def http_error_401(self, req, fp, code, msg, headers):
914
host = urlparse.urlparse(req.get_full_url())[1]
915
retry = self.http_error_auth_reqed('www-authenticate',
917
self.reset_retry_count()
921
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
923
auth_header = 'Proxy-Authorization'
925
def http_error_407(self, req, fp, code, msg, headers):
926
host = req.get_host()
927
retry = self.http_error_auth_reqed('proxy-authenticate',
929
self.reset_retry_count()
932
class AbstractHTTPHandler(BaseHandler):
934
def __init__(self, debuglevel=0):
    # Debug level is forwarded to the underlying httplib connection
    # in do_open() via set_debuglevel().
    self._debuglevel = debuglevel
937
def set_http_debuglevel(self, level):
    """Set the debug level used for subsequently opened connections."""
    self._debuglevel = level
940
def do_request_(self, request):
941
host = request.get_host()
943
raise URLError('no host given')
945
if request.has_data(): # POST
946
data = request.get_data()
947
if not request.has_header('Content-type'):
948
request.add_unredirected_header(
950
'application/x-www-form-urlencoded')
951
if not request.has_header('Content-length'):
952
request.add_unredirected_header(
953
'Content-length', '%d' % len(data))
955
scheme, sel = splittype(request.get_selector())
956
sel_host, sel_path = splithost(sel)
957
if not request.has_header('Host'):
958
request.add_unredirected_header('Host', sel_host or host)
959
for name, value in self.parent.addheaders:
960
name = name.capitalize()
961
if not request.has_header(name):
962
request.add_unredirected_header(name, value)
966
def do_open(self, http_class, req):
967
"""Return an addinfourl object for the request, using http_class.
969
http_class must implement the HTTPConnection API from httplib.
970
The addinfourl return value is a file-like object. It also
971
has methods and attributes including:
972
- info(): return a mimetools.Message object for the headers
973
- geturl(): return the original request URL
974
- code: HTTP status code
976
host = req.get_host()
978
raise URLError('no host given')
980
h = http_class(host) # will parse host:port
981
h.set_debuglevel(self._debuglevel)
983
headers = dict(req.headers)
984
headers.update(req.unredirected_hdrs)
985
# We want to make an HTTP/1.1 request, but the addinfourl
986
# class isn't prepared to deal with a persistent connection.
987
# It will try to read all remaining data from the socket,
988
# which will block while the server waits for the next request.
989
# So make sure the connection gets closed after the (only)
991
headers["Connection"] = "close"
993
h.request(req.get_method(), req.get_selector(), req.data, headers)
995
except socket.error, err: # XXX what error?
998
# Pick apart the HTTPResponse object to get the addinfourl
999
# object initialized properly.
1001
# Wrap the HTTPResponse object in socket's file object adapter
1002
# for Windows. That adapter calls recv(), so delegate recv()
1003
# to read(). This weird wrapping allows the returned object to
1004
# have readline() and readlines() methods.
1006
# XXX It might be better to extract the read buffering code
1007
# out of socket._fileobject() and into a base class.
1010
fp = socket._fileobject(r)
1012
resp = addinfourl(fp, r.msg, req.get_full_url())
1013
resp.code = r.status
1018
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs via httplib.HTTPConnection."""

    def http_open(self, req):
        return self.do_open(httplib.HTTPConnection, req)

    # Request pre-processing is shared with the HTTPS handler.
    http_request = AbstractHTTPHandler.do_request_
1025
# HTTPSHandler only exists when Python was built with SSL support.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs via httplib.HTTPSConnection."""

        def https_open(self, req):
            return self.do_open(httplib.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
1033
class HTTPCookieProcessor(BaseHandler):
1034
def __init__(self, cookiejar=None):
    # A fresh CookieJar is created when the caller does not supply one,
    # so every processor instance has its own cookie store by default.
    if cookiejar is None:
        cookiejar = cookielib.CookieJar()
    self.cookiejar = cookiejar
1039
def http_request(self, request):
1040
self.cookiejar.add_cookie_header(request)
1043
def http_response(self, request, response):
1044
self.cookiejar.extract_cookies(response, request)
1047
https_request = http_request
1048
https_response = http_response
1050
class UnknownHandler(BaseHandler):
    """Fallback handler: any URL scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        # Renamed local from 'type' to avoid shadowing the builtin.
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
1055
def parse_keqv_list(l):
1056
"""Parse list of key=value strings where keys are not duplicated."""
1059
k, v = elt.split('=', 1)
1060
if v[0] == '"' and v[-1] == '"':
1065
def parse_http_list(s):
1066
"""Parse lists as described by RFC 2068 Section 2.
1068
In particular, parse comma-separated lists where the elements of
1069
the list may include quoted-strings. A quoted-string could
1072
# XXX this function could probably use more testing
1084
list.append(s[start:])
1088
raise ValueError, "unbalanced quotes"
1090
list.append(s[start:i+c])
1095
list.append(s[start:i+c])
1103
list.append(s[start:i+c])
1109
return map(lambda x: x.strip(), list)
1111
class FileHandler(BaseHandler):
1112
# Use local file or FTP depending on form of URL
1113
def file_open(self, req):
1114
url = req.get_selector()
1115
if url[:2] == '//' and url[2:3] != '/':
1117
return self.parent.open(req)
1119
return self.open_local_file(req)
1121
# names for the localhost
1123
def get_names(self):
    """Return the addresses that count as the local host.

    The result is computed once and cached on the FileHandler class
    so the gethostbyname() lookups happen at most one time.
    """
    if FileHandler.names is None:
        FileHandler.names = (socket.gethostbyname('localhost'),
                             socket.gethostbyname(socket.gethostname()))
    return FileHandler.names
1129
# not entirely sure what the rules are here
1130
def open_local_file(self, req):
1132
host = req.get_host()
1133
file = req.get_selector()
1134
localfile = url2pathname(file)
1135
stats = os.stat(localfile)
1136
size = stats.st_size
1137
modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
1138
mtype = mimetypes.guess_type(file)[0]
1139
headers = mimetools.Message(StringIO(
1140
'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
1141
(mtype or 'text/plain', size, modified)))
1143
host, port = splitport(host)
1145
(not port and socket.gethostbyname(host) in self.get_names()):
1146
return addinfourl(open(localfile, 'rb'),
1147
headers, 'file:'+file)
1148
raise URLError('file not on local host')
1150
class FTPHandler(BaseHandler):
1151
def ftp_open(self, req):
1152
host = req.get_host()
1154
raise IOError, ('ftp error', 'no host given')
1155
host, port = splitport(host)
1157
port = ftplib.FTP_PORT
1161
# username/password handling
1162
user, host = splituser(host)
1164
user, passwd = splitpasswd(user)
1167
host = unquote(host)
1168
user = unquote(user or '')
1169
passwd = unquote(passwd or '')
1172
host = socket.gethostbyname(host)
1173
except socket.error, msg:
1175
path, attrs = splitattr(req.get_selector())
1176
dirs = path.split('/')
1177
dirs = map(unquote, dirs)
1178
dirs, file = dirs[:-1], dirs[-1]
1179
if dirs and not dirs[0]:
1182
fw = self.connect_ftp(user, passwd, host, port, dirs)
1183
type = file and 'I' or 'D'
1185
attr, value = splitvalue(attr)
1186
if attr.lower() == 'type' and \
1187
value in ('a', 'A', 'i', 'I', 'd', 'D'):
1188
type = value.upper()
1189
fp, retrlen = fw.retrfile(file, type)
1191
mtype = mimetypes.guess_type(req.get_full_url())[0]
1193
headers += "Content-type: %s\n" % mtype
1194
if retrlen is not None and retrlen >= 0:
1195
headers += "Content-length: %d\n" % retrlen
1196
sf = StringIO(headers)
1197
headers = mimetools.Message(sf)
1198
return addinfourl(fp, headers, req.get_full_url())
1199
except ftplib.all_errors, msg:
1200
raise IOError, ('ftp error', msg), sys.exc_info()[2]
1202
def connect_ftp(self, user, passwd, host, port, dirs):
1203
fw = ftpwrapper(user, passwd, host, port, dirs)
1204
## fw.ftp.set_debuglevel(1)
1207
class CacheFTPHandler(FTPHandler):
1208
# XXX would be nice to have pluggable cache strategies
1209
# XXX this stuff is definitely not thread safe
1217
def setTimeout(self, t):
1220
def setMaxConns(self, m):
1223
def connect_ftp(self, user, passwd, host, port, dirs):
1224
key = user, host, port, '/'.join(dirs)
1225
if key in self.cache:
1226
self.timeout[key] = time.time() + self.delay
1228
self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
1229
self.timeout[key] = time.time() + self.delay
1231
return self.cache[key]
1233
def check_cache(self):
1234
# first check for old ones
1236
if self.soonest <= t:
1237
for k, v in self.timeout.items():
1239
self.cache[k].close()
1242
self.soonest = min(self.timeout.values())
1244
# then check the size
1245
if len(self.cache) == self.max_conns:
1246
for k, v in self.timeout.items():
1247
if v == self.soonest:
1251
self.soonest = min(self.timeout.values())
1253
class GopherHandler(BaseHandler):
1254
def gopher_open(self, req):
1255
host = req.get_host()
1257
raise GopherError('no host given')
1258
host = unquote(host)
1259
selector = req.get_selector()
1260
type, selector = splitgophertype(selector)
1261
selector, query = splitquery(selector)
1262
selector = unquote(selector)
1264
query = unquote(query)
1265
fp = gopherlib.send_query(selector, query, host)
1267
fp = gopherlib.send_selector(selector, host)
1268
return addinfourl(fp, noheaders(), req.get_full_url())
1270
#bleck! don't use this yet
1271
class OpenerFactory:
1273
default_handlers = [UnknownHandler, HTTPHandler,
1274
HTTPDefaultErrorHandler, HTTPRedirectHandler,
1275
FTPHandler, FileHandler]
1277
replacement_handlers = []
1279
def add_handler(self, h):
1280
self.handlers = self.handlers + [h]
1282
def replace_handler(self, h):
1285
def build_opener(self):
1286
opener = OpenerDirector()
1287
for ph in self.default_handlers:
1288
if inspect.isclass(ph):
1290
opener.add_handler(ph)