1
"""Integration with Python standard library module urllib2.
3
Also includes a redirection bugfix, support for parsing HTML HEAD blocks for
4
the META HTTP-EQUIV tag contents, and following Refresh header redirects.
6
Copyright 2002-2003 John J Lee <jjl@pobox.com>
8
This code is free software; you can redistribute it and/or modify it under
9
the terms of the BSD License (see the file COPYING included with the
17
from _ClientCookie import CookieJar, request_host
18
from _Util import isstringlike, startswith, getheaders
19
from _Debug import getLogger
20
info = getLogger("ClientCookie").info
28
CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
31
"""Return method names of class instance.
33
dir(obj) doesn't work across Python versions, this does.
36
return methnames_of_instance_as_dict(obj).keys()
38
def methnames_of_instance_as_dict(inst):
    """Return {method_name: None} for all callables reachable from inst.

    Includes methods from the instance's class (and bases) as well as any
    callables bound directly on the instance itself.
    """
    names = {}
    names.update(methnames_of_class_as_dict(inst.__class__))
    for methname in dir(inst):
        candidate = getattr(inst, methname)
        if callable(candidate):
            names[methname] = None
    return names
47
def methnames_of_class_as_dict(klass):
    """Return {method_name: None} for all callables on klass and its bases.

    Recurses through __bases__ so inherited methods are included even on
    Python versions where dir() does not report them.
    """
    names = {}
    for methname in dir(klass):
        candidate = getattr(klass, methname)
        if callable(candidate):
            names[methname] = None
    for baseclass in klass.__bases__:
        names.update(methnames_of_class_as_dict(baseclass))
    return names
58
from urllib2 import AbstractHTTPHandler
from urllib2 import URLError, HTTPError

import urlparse, urllib2, urllib, httplib, robotparser
import types, string, socket, bisect, time
from cStringIO import StringIO

from _Util import response_seek_wrapper

try:
    import threading
    _threading = threading; del threading
except ImportError:
    import dummy_threading
    _threading = dummy_threading; del dummy_threading
74
# This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2
#  (http://www.python.org/sf/549151)
# 2.2.3 is broken here (my fault!), 2.3 is fixed.
class HTTPRedirectHandler(urllib2.BaseHandler):
    """Redirection handler with cookie-aware infinite-loop protection.

    Handles 301, 302, 303 and 307 responses, plus the pseudo-code
    "refresh" generated by HTTPRefreshProcessor.
    """
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    # Implementation notes:

    # To avoid the server sending us into an infinite loop, the request
    # object needs to track what URLs we have already seen.  Do this by
    # adding a handler-specific attribute to the Request object.  The value
    # of the dict is used to count the number of times the same url has
    # been visited.  This is needed because this isn't necessarily a loop:
    # there is more than one way to redirect (Refresh, 302, 303, 307).

    # Another handler-specific Request attribute, original_url, is used to
    # remember the URL of the original request so that it is possible to
    # decide whether or not RFC 2965 cookies should be turned on during
    # redirect.

    # Always unhandled redirection codes:
    # 300 Multiple Choices: should not handle this here.
    # 304 Not Modified: no need to handle here: only of interest to caches
    #     that do conditional GETs
    # 305 Use Proxy: probably not worth dealing with here
    # 306 Unused: what was this for in the previous versions of protocol??

    def redirect_request(self, newurl, req, fp, code, msg, headers):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a
        new Request to allow http_error_30x to perform the redirect;
        otherwise, raise HTTPError to indicate that the redirect should
        not be followed.
        """
        if code in (301, 302, 303, "refresh") or \
               (code == 307 and not req.has_data()):
            # Strictly (according to RFC 2616), 301 or 302 in response to
            # a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we do
            # the same.
            return Request(newurl, headers=req.headers)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        # Follow a redirect, tracking visited URLs to detect loops.
        if headers.has_key('location'):
            newurl = getheaders(headers, 'location')[0]
        elif headers.has_key('uri'):
            newurl = getheaders(headers, 'uri')[0]
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)
        if new is None:
            return

        # remember where we started from
        try: new.origin_req_host = req.origin_req_host
        except AttributeError: pass

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_refresh = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
191
class Request(urllib2.Request):
    """urllib2.Request plus headers that are not propagated on redirect."""

    # NOTE: mutable default headers={} matches the urllib2.Request
    # interface; it is passed through without being mutated here.
    def __init__(self, url, data=None, headers={}):
        urllib2.Request.__init__(self, url, data, headers)
        self.unredirected_hdrs = {}

    def add_unredirected_header(self, key, val):
        """Add a header that will not be added to a redirected request."""
        self.unredirected_hdrs[string.capitalize(key)] = val

    def has_header(self, header_name):
        """True iff request has named header (regular or unredirected)."""
        if (self.headers.has_key(header_name) or
            self.unredirected_hdrs.has_key(header_name)):
            return True
        return False

    def get_header(self, header_name, default=None):
        """Return a header value; regular headers take precedence."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def iter_headers(self):
        """Return (name, value) pairs for all headers, regular ones winning."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return hdrs.items()
219
class BaseProcessor:
    """Base class for request/response processors.

    Processors are sorted by processor_order (lower runs first); __cmp__
    implements that ordering for insort in OpenerDirector.add_handler.
    """
    processor_order = 500

    def add_parent(self, parent):
        # parent is the OpenerDirector this processor is registered with
        self.parent = parent
    def close(self):
        self.parent = None
    def __cmp__(self, other):
        # non-processors compare equal so mixed sorts don't blow up
        if not hasattr(other, "processor_order"):
            return 0
        return cmp(self.processor_order, other.processor_order)
234
class HTTPRequestUpgradeProcessor(BaseProcessor):
    # upgrade Request to class with support for headers that don't get
    # redirected
    processor_order = 0  # before anything else

    def http_request(self, request):
        """Rebuild a plain urllib2.Request as this module's Request."""
        if not hasattr(request, "add_unredirected_header"):
            newrequest = Request(request._Request__original, request.data,
                                 request.headers)
            # preserve handler-specific attributes if present
            try: newrequest.origin_req_host = request.origin_req_host
            except AttributeError: pass
            try: newrequest.unverifiable = request.unverifiable
            except AttributeError: pass
            request = newrequest
        return request

    https_request = http_request
253
class HTTPEquivProcessor(BaseProcessor):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""
    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        # grab HTTP-EQUIV headers and add them to the true HTTP headers
        headers = response.info()
        for hdr, val in parse_head(response):
            headers[hdr] = val
        # rewind so downstream consumers see the whole body
        response.seek(0)
        return response

    https_response = http_response
267
# XXX ATM this only takes notice of http responses -- probably
# should be independent of protocol scheme (http, ftp, etc.)
class SeekableProcessor(BaseProcessor):
    """Make responses seekable."""

    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            return response_seek_wrapper(response)
        return response

    https_response = http_response
279
# XXX if this gets added to urllib2, unverifiable would end up as an
# attribute / method on Request.
class HTTPCookieProcessor(BaseProcessor):
    """Handle HTTP cookies."""
    def __init__(self, cookies=None):
        if cookies is None:
            cookies = CookieJar()
        self.cookies = cookies

    def _unverifiable(self, request):
        """Return true if this transaction is 'unverifiable' (RFC 2965).

        A request is unverifiable if it results from a redirect or was
        explicitly flagged as such by the caller.
        """
        if hasattr(request, "redirect_dict") and request.redirect_dict:
            redirect = True
        else:
            redirect = False
        if (redirect or
            (hasattr(request, "unverifiable") and request.unverifiable)):
            unverifiable = True
        else:
            unverifiable = False
        return unverifiable

    def http_request(self, request):
        unverifiable = self._unverifiable(request)
        if not unverifiable:
            # Stuff request-host of this origin transaction into Request
            # object, because we need to know it to know whether cookies
            # should be in operation during derived requests (redirects,
            # specifically -- including refreshes).
            request.origin_req_host = request_host(request)
        self.cookies.add_cookie_header(request, unverifiable)
        return request

    def http_response(self, request, response):
        unverifiable = self._unverifiable(request)
        self.cookies.extract_cookies(response, request, unverifiable)
        return response

    https_request = http_request
    https_response = http_response
319
class RobotExclusionError(urllib2.HTTPError):
    """HTTPError raised when robots.txt forbids fetching a URL.

    The offending Request is kept on .request for inspection.
    """
    def __init__(self, request, *args):
        apply(urllib2.HTTPError.__init__, (self,)+args)
        self.request = request
324
class HTTPRobotRulesProcessor(BaseProcessor):
    """Enforce robots.txt rules before a request is sent."""
    # before redirections and response debugging, after everything else
    processor_order = 800

    def __init__(self, rfp_class=robotparser.RobotFileParser):
        self.rfp_class = rfp_class
        # cache the parsed robots.txt per host
        self.rfp = None
        self._host = None

    def http_request(self, request):
        host = request.get_host()
        if host != self._host:
            self.rfp = self.rfp_class()
            self.rfp.set_url("http://"+host+"/robots.txt")
            self.rfp.read()
            self._host = host

        ua = request.get_header("User-agent", "")
        if self.rfp.can_fetch(ua, request.get_full_url()):
            return request
        else:
            msg = "request disallowed by robots.txt"
            raise RobotExclusionError(
                request,
                request.get_full_url(),
                403, msg,
                httplib.HTTPMessage(StringIO()), StringIO(msg))

    https_request = http_request
352
class HTTPRefererProcessor(BaseProcessor):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests only (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).

    """
    def __init__(self):
        self.referer = None

    def http_request(self, request):
        if ((self.referer is not None) and
            not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        # remember this URL as the Referer for the next request
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response
377
class HTTPResponseDebugProcessor(BaseProcessor):
    """Log full response bodies for debugging."""
    processor_order = 900  # before redirections, after everything else

    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        info(response.read())
        info("*****************************************************")
        # rewind so the body is still readable downstream
        response.seek(0)
        return response

    https_response = http_response
390
class HTTPRedirectDebugProcessor(BaseProcessor):
    """Log each redirected request for debugging."""
    def http_request(self, request):
        # redirect_dict is only set on requests created by a redirect
        if hasattr(request, "redirect_dict"):
            info("redirecting to %s", request.get_full_url())
        return request
396
class HTTPRefreshProcessor(BaseProcessor):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time constructor argument to allow Refresh with longer pauses.  Use
    the honor_time argument to control whether the requested pause is
    honoured (with a time.sleep()) or skipped in favour of immediate
    redirection.

    """
    processor_order = 1000

    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time
        self.honor_time = honor_time

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and hdrs.has_key("refresh"):
            refresh = getheaders(hdrs, "refresh")[0]
            # header format: "<pause>; url=<newurl>"
            i = string.find(refresh, ";")
            if i != -1:
                pause, newurl_spec = refresh[:i], refresh[i+1:]
                i = string.find(newurl_spec, "=")
                if i != -1:
                    pause = int(pause)
                    if (self.max_time is None) or (pause <= self.max_time):
                        if pause != 0 and self.honor_time:
                            time.sleep(pause)
                        newurl = newurl_spec[i+1:]
                        # hand off to the redirect machinery as a
                        # pseudo-code "refresh" error
                        hdrs["location"] = newurl
                        # XXX worry about recursion
                        response = self.parent.error(
                            'http', request, response,
                            "refresh", msg, hdrs)

        return response

    https_response = http_response
439
class HTTPErrorProcessor(BaseProcessor):
    """Process HTTP error responses.

    The purpose of this handler is to to allow other response processors a
    look-in by removing the call to parent.error() from
    AbstractHTTPHandler.

    For non-200 error codes, this just passes the job on to the
    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
    HTTPError if no other handler handles the error.

    """
    processor_order = 1000  # after all other processors

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code != 200:
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
466
def insort(a, x, lo=0, hi=None, lt=lambda x,y: x<y):
    """Insert x into sorted list a, keeping it sorted.

    Like bisect.insort, but takes a comparison function lt so objects
    without rich comparisons (e.g. processors ordered by processor_order)
    can be sorted.  Inserts after existing equal entries.
    """
    if hi is None:
        hi = len(a)
    while lo < hi:
        mid = divmod((lo+hi), 2)[0]
        if lt(x, a[mid]): hi = mid
        else: lo = mid+1
    a.insert(lo, x)
475
class OpenerDirector(urllib2.OpenerDirector):
    """OpenerDirector extended with request/response processor support."""

    def __init__(self):
        urllib2.OpenerDirector.__init__(self)
        # protocol -> sorted list of processors
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        # Classify each method of handler by its name:
        #   <proto>_error_<kind>, <proto>_open, <proto>_request/_response.
        # NOTE(review): reconstructed from a damaged source -- confirm the
        # registration details against the upstream ClientCookie release.
        added = False
        for meth in methnames(handler):
            i = string.find(meth, "_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if startswith(condition, "error"):
                j = string.find(meth[i+1:], "_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                map = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = map
            elif (condition == "open" and
                  protocol != "do"):  # hack: see below
                kind = protocol
                map = self.handle_open
            elif (condition in ["response", "request"] and
                  protocol != "redirect"):  # yucky hack
                # hack above is to fix HTTPRedirectHandler problem, which
                # appears to above line to be a processor because of the
                # redirect_request method :-((
                kind = protocol
                map = getattr(self, "process_"+condition)
            else:
                continue

            if map.has_key(kind):
                if condition in ["response", "request"]:
                    # keep processors sorted by processor_order
                    lt = lambda x,y: x.processor_order < y.processor_order
                    insort(map[kind], handler, lt=lt)
                else:
                    map[kind].append(handler)
            else:
                map[kind] = [handler]
            added = True

        if added:
            # XXX why does self.handlers need to be sorted?
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def _request(self, url_or_req, data):
        """Coerce a URL string or Request into a Request with data set."""
        if isstringlike(url_or_req):
            req = Request(url_or_req, data)
        else:
            # already a urllib2.Request instance
            req = url_or_req
            if data is not None:
                req.add_data(data)
        return req

    def open(self, fullurl, data=None):
        req = self._request(fullurl, data)
        type_ = req.get_type()

        # pre-process request
        # XXX should we allow a Processor to change the type (URL
        # scheme) of the request?
        meth_name = type_+"_request"
        for processor in self.process_request.get(type_, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = urllib2.OpenerDirector.open(self, req, data)

        # post-process response
        meth_name = type_+"_response"
        for processor in self.process_response.get(type_, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def error(self, proto, *args):
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = apply(self._call_chain, args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return apply(self._call_chain, args)
584
# Note the absence of redirect and header-adding code here
585
# (AbstractHTTPHandler), and the lack of other clutter that would be
586
# here without Processors.
587
class AbstractHTTPHandler(urllib2.BaseHandler):
588
processor_order = 500
590
def __init__(self, debuglevel=0):
591
self._debuglevel = debuglevel
593
def set_http_debuglevel(self, level):
594
self._debuglevel = level
596
def do_request_(self, request):
597
host = request.get_host()
599
raise URLError('no host given')
601
if request.has_data(): # POST
602
data = request.get_data()
603
if not request.has_header('Content-type'):
604
request.add_unredirected_header(
606
'application/x-www-form-urlencoded')
607
if not request.has_header('Content-length'):
608
request.add_unredirected_header(
609
'Content-length', '%d' % len(data))
611
scheme, sel = urllib.splittype(request.get_selector())
612
sel_host, sel_path = urllib.splithost(sel)
613
if not request.has_header('Host'):
614
request.add_unredirected_header('Host', sel_host or host)
615
for name, value in self.parent.addheaders:
616
name = string.capitalize(name)
617
if not request.has_header(name):
618
request.add_unredirected_header(name, value)
622
def do_open(self, http_class, req):
623
host = req.get_host()
625
raise URLError('no host given')
627
h = http_class(host) # will parse host:port
628
h.set_debuglevel(self._debuglevel)
630
#h.putrequest(req.get_method(), req.get_selector())
632
h.putrequest('POST', req.get_selector())
634
h.putrequest('GET', req.get_selector())
636
for k, v in req.iter_headers():
639
# httplib will attempt to connect() here. be prepared
640
# to convert a socket error to a URLError.
643
except socket.error, err:
646
h.send(req.get_data())
648
code, msg, hdrs = h.getreply()
651
response = urllib.addinfourl(fp, hdrs, req.get_full_url())
657
# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception):
    """Raised by the HEAD parsers to stop parsing at the end of <head>."""
    pass
659
class AbstractHeadParser:
    """Collect META HTTP-EQUIV (name, content) pairs from an HTML head.

    Subclasses supply the actual parsing; results accumulate in
    self.http_equiv.
    """
    # only these elements are allowed in or before HEAD of document
    # NOTE(review): the middle of this tuple was reconstructed -- confirm
    # the exact element list against upstream.
    head_elems = ("html", "head",
                  "title", "base",
                  "script", "style", "meta", "link", "object")

    def __init__(self):
        self.http_equiv = []

    def start_meta(self, attrs):
        http_equiv = content = None
        for key, value in attrs:
            if key == "http-equiv":
                http_equiv = value
            elif key == "content":
                content = value
        if http_equiv is not None:
            self.http_equiv.append((http_equiv, content))

    def end_head(self):
        raise EndOfHeadError()
680
# use HTMLParser if we have it (it does XHTML), htmllib otherwise
try:
    import HTMLParser
except ImportError:
    import htmllib, formatter
    class HeadParser(AbstractHeadParser, htmllib.HTMLParser):
        """htmllib-based HEAD parser (pre-HTMLParser Pythons)."""
        def __init__(self):
            htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
            AbstractHeadParser.__init__(self)

        def handle_starttag(self, tag, method, attrs):
            if tag in self.head_elems:
                method(attrs)
            else:
                raise EndOfHeadError()

        def handle_endtag(self, tag, method):
            if tag in self.head_elems:
                method()
            else:
                raise EndOfHeadError()

    HEAD_PARSER_CLASS = HeadParser
else:
    class XHTMLCompatibleHeadParser(AbstractHeadParser,
                                    HTMLParser.HTMLParser):
        """HTMLParser-based HEAD parser; also copes with XHTML."""
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            AbstractHeadParser.__init__(self)

        def handle_starttag(self, tag, attrs):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'start_' + tag)
            except AttributeError:
                try:
                    method = getattr(self, 'do_' + tag)
                except AttributeError:
                    pass  # unknown tag
                else:
                    method(attrs)
            else:
                method(attrs)

        def handle_endtag(self, tag):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                pass  # unknown tag
            else:
                method()

        # handle_charref, handle_entityref and default entitydefs are taken
        # from sgmllib
        def handle_charref(self, name):
            try:
                n = int(name)
            except ValueError:
                self.unknown_charref(name)
                return
            if not 0 <= n <= 255:
                self.unknown_charref(name)
                return
            self.handle_data(chr(n))

        # Definition of entities -- derived classes may override
        entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

        def handle_entityref(self, name):
            table = self.entitydefs
            try:
                self.handle_data(table[name])
            except KeyError:
                self.unknown_entityref(name)

        def unknown_entityref(self, ref):
            self.handle_data("&%s;" % ref)

        def unknown_charref(self, ref):
            self.handle_data("&#%s;" % ref)

    HEAD_PARSER_CLASS = XHTMLCompatibleHeadParser
768
def parse_head(fileobj):
    """Return a list of key, value pairs."""
    hp = HEAD_PARSER_CLASS()
    while 1:
        data = fileobj.read(CHUNK)
        try:
            hp.feed(data)
        except EndOfHeadError:
            break
        if len(data) != CHUNK:
            # this should only happen if there is no HTML body, or if
            # CHUNK is big
            break
    return hp.http_equiv
784
class HTTPHandler(AbstractHTTPHandler):
    """http scheme handler using httplib.HTTP."""
    def http_open(self, req):
        return self.do_open(httplib.HTTP, req)

    http_request = AbstractHTTPHandler.do_request_
790
# only define the https handler when Python was built with SSL support
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """https scheme handler using httplib.HTTPS."""
        def https_open(self, req):
            return self.do_open(httplib.HTTPS, req)

        https_request = AbstractHTTPHandler.do_request_
797
def build_opener(*handlers):
    """Create an opener object from a list of handlers and processors.

    The opener will use several default handlers and processors, including
    support for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    """
    opener = OpenerDirector()
    # NOTE(review): reconstructed default list -- confirm the commented-out
    # optional processors against upstream.
    default_classes = [
        # handlers
        urllib2.ProxyHandler,
        urllib2.UnknownHandler,
        HTTPHandler,  # from this module (derived from new AbstractHTTPHandler)
        urllib2.HTTPDefaultErrorHandler,
        HTTPRedirectHandler,  # from this module (bugfixed)
        urllib2.FTPHandler,
        urllib2.FileHandler,
        # processors
        HTTPRequestUpgradeProcessor,
        #HTTPEquivProcessor,
        #SeekableProcessor,
        HTTPCookieProcessor,
        #HTTPRefererProcessor,
        #HTTPRefreshProcessor,
        HTTPErrorProcessor,
        ]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)
    # drop any default whose job is taken over by a passed-in handler
    skip = []
    for klass in default_classes:
        for check in handlers:
            if type(check) == types.ClassType:
                if issubclass(check, klass):
                    skip.append(klass)
            elif type(check) == types.InstanceType:
                if isinstance(check, klass):
                    skip.append(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if type(h) == types.ClassType:
            h = h()
        opener.add_handler(h)

    return opener
851
urlopen_lock = _threading.Lock()
def urlopen(url, data=None):
    """Open url with the shared global opener, building it on first use.

    The lock makes lazy construction of the module-global opener safe when
    urlopen is called from multiple threads.
    """
    global _opener
    if _opener is None:
        urlopen_lock.acquire()
        try:
            # double-check: another thread may have built it meanwhile
            if _opener is None:
                _opener = build_opener()
        finally:
            urlopen_lock.release()
    return _opener.open(url, data)
863
def install_opener(opener):