1
# -*- test-case-name: twisted.web.test.test_webclient -*-
2
# Copyright (c) 2001-2010 Twisted Matrix Laboratories.
3
# See LICENSE for details.
10
from urlparse import urlunparse
12
from twisted.python import log
13
from twisted.web import http
14
from twisted.internet import defer, protocol, reactor
15
from twisted.python import failure
16
from twisted.python.util import InsensitiveDict
17
from twisted.web import error
18
from twisted.web.http_headers import Headers
19
from twisted.python.compat import set
22
class PartialDownloadError(error.Error):
24
Page was only partially downloaded, we got disconnected in middle.
26
@ivar response: All of the response body which was downloaded.
30
class HTTPPageGetter(http.HTTPClient):
32
Gets a resource via HTTP, then quits.
34
Typically used with L{HTTPClientFactory}. Note that this class does not, by
35
itself, do anything with the response. If you want to download a resource
36
into a file, use L{HTTPPageDownloader} instead.
43
_specialHeaders = set(('host', 'user-agent', 'cookie', 'content-length'))
45
def connectionMade(self):
46
method = getattr(self.factory, 'method', 'GET')
47
self.sendCommand(method, self.factory.path)
48
self.sendHeader('Host', self.factory.headers.get("host", self.factory.host))
49
self.sendHeader('User-Agent', self.factory.agent)
50
data = getattr(self.factory, 'postdata', None)
52
self.sendHeader("Content-Length", str(len(data)))
55
for (key, value) in self.factory.headers.items():
56
if key.lower() not in self._specialHeaders:
57
# we calculated it on our own
58
self.sendHeader(key, value)
59
if key.lower() == 'cookie':
60
cookieData.append(value)
61
for cookie, cookval in self.factory.cookies.items():
62
cookieData.append('%s=%s' % (cookie, cookval))
64
self.sendHeader('Cookie', '; '.join(cookieData))
69
self.transport.write(data)
71
def handleHeader(self, key, value):
73
Called every time a header is received. Stores the header information
74
as key-value pairs in the C{headers} attribute.
77
@param key: An HTTP header field name.
80
@param value: An HTTP header field value.
83
l = self.headers.setdefault(key, [])
86
def handleStatus(self, version, status, message):
    """
    Remember the response status on this protocol and forward it to the
    factory.

    @param version: The version reported with the response status.
    @param status: The status of the response.
    @param message: The text message returned with the status.
    """
    self.version = version
    self.status = status
    self.message = message
    self.factory.gotStatus(version, status, message)
90
def handleEndHeaders(self):
91
self.factory.gotHeaders(self.headers)
92
m = getattr(self, 'handleStatus_'+self.status, self.handleStatusDefault)
95
def handleStatus_200(self):
98
handleStatus_201 = lambda self: self.handleStatus_200()
99
handleStatus_202 = lambda self: self.handleStatus_200()
101
def handleStatusDefault(self):
104
def handleStatus_301(self):
105
l = self.headers.get('location')
107
self.handleStatusDefault()
110
if self.followRedirect:
111
scheme, host, port, path = \
112
_parse(url, defaultPort=self.transport.getPeer().port)
114
self.factory._redirectCount += 1
115
if self.factory._redirectCount >= self.factory.redirectLimit:
116
err = error.InfiniteRedirection(
118
'Infinite redirection detected',
120
self.factory.noPage(failure.Failure(err))
121
self.quietLoss = True
122
self.transport.loseConnection()
125
self.factory.setURL(url)
127
if self.factory.scheme == 'https':
128
from twisted.internet import ssl
129
contextFactory = ssl.ClientContextFactory()
130
reactor.connectSSL(self.factory.host, self.factory.port,
131
self.factory, contextFactory)
133
reactor.connectTCP(self.factory.host, self.factory.port,
136
self.handleStatusDefault()
140
self.status, self.message, location = url)))
141
self.quietLoss = True
142
self.transport.loseConnection()
144
def handleStatus_302(self):
    """
    Handle a 302 response by dispatching to exactly one redirect handler.

    When C{afterFoundGet} is set the response is treated like a 303, which
    switches the request method to GET before following the redirect.
    Otherwise it is treated like a 301.
    """
    if self.afterFoundGet:
        # handleStatus_303 already ends by calling handleStatus_301, so the
        # 301 handler must not be invoked a second time from here.
        self.handleStatus_303()
    else:
        self.handleStatus_301()
150
def handleStatus_303(self):
    """
    Handle a 303 response: switch the factory's request method to GET, then
    continue with the 301 handling.
    """
    factory = self.factory
    factory.method = 'GET'
    self.handleStatus_301()
154
def connectionLost(self, reason):
    """
    React to the connection going away.

    @param reason: The reason the connection was lost; it is handed to the
        base class and to the factory's C{noPage}.
    """
    if self.quietLoss:
        # The flag is set by code that has already reported an outcome to
        # the factory before dropping the connection, so stay silent.
        return
    http.HTTPClient.connectionLost(self, reason)
    self.factory.noPage(reason)
159
def handleResponse(self, response):
166
self.status, self.message, response)))
167
if self.factory.method == 'HEAD':
168
# Callback with empty string, since there is never a response
169
# body for HEAD requests.
170
self.factory.page('')
171
elif self.length != None and self.length != 0:
172
self.factory.noPage(failure.Failure(
173
PartialDownloadError(self.status, self.message, response)))
175
self.factory.page(response)
176
# server might be stupid and not close connection. admittedly
177
# the fact we do only one request per connection is also
179
self.transport.loseConnection()
182
self.quietLoss = True
183
self.transport.loseConnection()
184
self.factory.noPage(defer.TimeoutError("Getting %s took longer than %s seconds." % (self.factory.url, self.factory.timeout)))
187
class HTTPPageDownloader(HTTPPageGetter):
191
def handleStatus_200(self, partialContent=0):
    """
    Run the base L{HTTPPageGetter} handling for a successful response, mark
    the page as being transmitted, and tell the factory the page is
    starting.

    @param partialContent: Passed through unchanged to the factory's
        C{pageStart}; C{handleStatus_206} passes C{1} here.
    """
    HTTPPageGetter.handleStatus_200(self)
    # Set the flag before notifying the factory, as the original does.
    self.transmittingPage = 1
    factory = self.factory
    factory.pageStart(partialContent)
196
def handleStatus_206(self):
    """
    Handle a 206 response by reusing the 200 handling with
    C{partialContent} set.
    """
    self.handleStatus_200(partialContent=1)
199
def handleResponsePart(self, data):
    """
    Pass a chunk of the response body on to the factory, but only while a
    page is being transmitted.

    @param data: The chunk of response body that was received.
    """
    if not self.transmittingPage:
        return
    self.factory.pagePart(data)
203
def handleResponseEnd(self):
205
self.transmittingPage = 0
208
PartialDownloadError(self.status)))
209
if self.transmittingPage:
210
self.factory.pageEnd()
211
self.transmittingPage = 0
216
self.status, self.message, None)))
217
self.transport.loseConnection()
220
class HTTPClientFactory(protocol.ClientFactory):
221
"""Download a given URL.
223
@type deferred: Deferred
224
@ivar deferred: A Deferred that will fire when the content has
225
been retrieved. Once this is fired, the ivars `status', `version',
226
and `message' will be set.
229
@ivar status: The status of the response.
232
@ivar version: The version of the response.
235
@ivar message: The text message returned with the status.
237
@type response_headers: dict
238
@ivar response_headers: The headers that were specified in the
239
response from the server.
242
@ivar method: The HTTP method to use in the request. This should be one of
243
OPTIONS, GET, HEAD, POST, PUT, DELETE, TRACE, or CONNECT (case
244
matters). Other values may be specified if the server being contacted
247
@type redirectLimit: int
248
@ivar redirectLimit: The maximum number of HTTP redirects that can occur
249
before it is assumed that the redirection is endless.
251
@type afterFoundGet: C{bool}
252
@ivar afterFoundGet: Deviate from the HTTP 1.1 RFC by handling redirects
253
the same way as most web browsers; if the request method is POST and a
254
302 status is encountered, the redirect is followed with a GET method
256
@type _redirectCount: int
257
@ivar _redirectCount: The current number of HTTP redirects encountered.
260
protocol = HTTPPageGetter
268
def __init__(self, url, method='GET', postdata=None, headers=None,
269
agent="Twisted PageGetter", timeout=0, cookies=None,
270
followRedirect=True, redirectLimit=20,
271
afterFoundGet=False):
272
self.followRedirect = followRedirect
273
self.redirectLimit = redirectLimit
274
self._redirectCount = 0
275
self.timeout = timeout
277
self.afterFoundGet = afterFoundGet
280
self.cookies = cookies
281
if headers is not None:
282
self.headers = InsensitiveDict(headers)
284
self.headers = InsensitiveDict()
285
if postdata is not None:
286
self.headers.setdefault('Content-Length', len(postdata))
287
# just in case a broken http/1.1 decides to keep connection alive
288
self.headers.setdefault("connection", "close")
289
self.postdata = postdata
295
self.deferred = defer.Deferred()
296
self.response_headers = None
299
return "<%s: %s>" % (self.__class__.__name__, self.url)
301
def setURL(self, url):
303
scheme, host, port, path = _parse(url)
310
def buildProtocol(self, addr):
311
p = protocol.ClientFactory.buildProtocol(self, addr)
312
p.followRedirect = self.followRedirect
313
p.afterFoundGet = self.afterFoundGet
315
timeoutCall = reactor.callLater(self.timeout, p.timeout)
316
self.deferred.addBoth(self._cancelTimeout, timeoutCall)
319
def _cancelTimeout(self, result, timeoutCall):
320
if timeoutCall.active():
324
def gotHeaders(self, headers):
325
self.response_headers = headers
326
if headers.has_key('set-cookie'):
327
for cookie in headers['set-cookie']:
328
cookparts = cookie.split(';')
331
k, v = cook.split('=', 1)
332
self.cookies[k.lstrip()] = v.lstrip()
334
def gotStatus(self, version, status, message):
    """
    Store the response status on the factory.

    @param version: The version of the response.
    @param status: The status of the response.
    @param message: The text message returned with the status.
    """
    self.version = version
    self.status = status
    self.message = message
337
def page(self, page):
340
self.deferred.callback(page)
342
def noPage(self, reason):
345
self.deferred.errback(reason)
347
def clientConnectionFailed(self, _, reason):
350
self.deferred.errback(reason)
353
class HTTPDownloader(HTTPClientFactory):
354
"""Download to a file."""
356
protocol = HTTPPageDownloader
359
def __init__(self, url, fileOrName,
360
method='GET', postdata=None, headers=None,
361
agent="Twisted client", supportPartial=0,
362
timeout=0, cookies=None, followRedirect=1,
364
self.requestedPartial = 0
365
if isinstance(fileOrName, types.StringTypes):
366
self.fileName = fileOrName
368
if supportPartial and os.path.exists(self.fileName):
369
fileLength = os.path.getsize(self.fileName)
371
self.requestedPartial = fileLength
374
headers["range"] = "bytes=%d-" % fileLength
376
self.file = fileOrName
377
HTTPClientFactory.__init__(
378
self, url, method=method, postdata=postdata, headers=headers,
379
agent=agent, timeout=timeout, cookies=cookies,
380
followRedirect=followRedirect, redirectLimit=redirectLimit)
383
def gotHeaders(self, headers):
384
HTTPClientFactory.gotHeaders(self, headers)
385
if self.requestedPartial:
386
contentRange = headers.get("content-range", None)
388
# server doesn't support partial requests, oh well
389
self.requestedPartial = 0
391
start, end, realLength = http.parseContentRange(contentRange[0])
392
if start != self.requestedPartial:
393
# server is acting weirdly
394
self.requestedPartial = 0
397
def openFile(self, partialContent):
399
file = open(self.fileName, 'rb+')
402
file = open(self.fileName, 'wb')
405
def pageStart(self, partialContent):
406
"""Called on page download start.
408
@param partialContent: tells us if the download is partial download we requested.
410
if partialContent and not self.requestedPartial:
411
raise ValueError, "we shouldn't get partial content response if we didn't want it!"
415
self.file = self.openFile(partialContent)
418
self.deferred.errback(failure.Failure())
420
def pagePart(self, data):
424
self.file.write(data)
428
self.deferred.errback(failure.Failure())
431
def noPage(self, reason):
433
Close the storage file and errback the waiting L{Deferred} with the
442
log.err(None, "Error closing HTTPDownloader file")
443
self.deferred.errback(reason)
453
self.deferred.errback(failure.Failure())
455
self.deferred.callback(self.value)
459
def _parse(url, defaultPort=None):
461
Split the given URL into the scheme, host, port, and path.
464
@param url: An URL to parse.
466
@type defaultPort: C{int} or C{None}
467
@param defaultPort: An alternate value to use as the port if the URL does
470
@return: A four-tuple of the scheme, host, port, and path of the URL. All
471
of these are C{str} instances except for port, which is an C{int}.
474
parsed = http.urlparse(url)
476
path = urlunparse(('', '') + parsed[2:])
478
if defaultPort is None:
479
if scheme == 'https':
484
host, port = parsed[1], defaultPort
486
host, port = host.split(':')
495
return scheme, host, port, path
498
def _makeGetterFactory(url, factoryFactory, contextFactory=None,
501
Create and connect an HTTP page getting factory.
503
Any additional positional or keyword arguments are used when calling
506
@param factoryFactory: Factory factory that is called with C{url}, C{args}
507
and C{kwargs} to produce the getter
509
@param contextFactory: Context factory to use when creating a secure
510
connection, defaulting to C{None}
512
@return: The factory created by C{factoryFactory}
514
scheme, host, port, path = _parse(url)
515
factory = factoryFactory(url, *args, **kwargs)
516
if scheme == 'https':
517
from twisted.internet import ssl
518
if contextFactory is None:
519
contextFactory = ssl.ClientContextFactory()
520
reactor.connectSSL(host, port, factory, contextFactory)
522
reactor.connectTCP(host, port, factory)
526
def getPage(url, contextFactory=None, *args, **kwargs):
528
Download a web page as a string.
530
Download a page. Return a deferred, which will callback with a
531
page (as a string) or errback with a description of the error.
533
See HTTPClientFactory to see what extra args can be passed.
535
return _makeGetterFactory(
538
contextFactory=contextFactory,
539
*args, **kwargs).deferred
542
def downloadPage(url, file, contextFactory=None, *args, **kwargs):
544
Download a web page to a file.
546
@param file: path to file on filesystem, or file-like object.
548
See HTTPDownloader to see what extra args can be passed.
550
factoryFactory = lambda url, *a, **kw: HTTPDownloader(url, file, *a, **kw)
551
return _makeGetterFactory(
554
contextFactory=contextFactory,
555
*args, **kwargs).deferred
558
# The code which follows is based on the new HTTP client implementation. It
559
# should be significantly better than anything above, though it is not yet
560
# feature equivalent.
562
from twisted.internet.protocol import ClientCreator
563
from twisted.web.error import SchemeNotSupported
564
from twisted.web._newclient import ResponseDone, Request, HTTP11ClientProtocol
565
from twisted.web._newclient import Response
569
L{Agent} is a very basic HTTP client. It supports I{HTTP} scheme URIs. It
570
does not support persistent connections.
572
@ivar _reactor: The L{IReactorTCP} implementation which will be used to set
573
up connections over which to issue requests.
577
_protocol = HTTP11ClientProtocol
579
def __init__(self, reactor):
580
self._reactor = reactor
583
def request(self, method, uri, headers=None, bodyProducer=None):
587
@param method: The request method to send.
590
@param uri: The request URI to send.
593
@param headers: The request headers to send. If no I{Host} header is
594
included, one will be added based on the request URI.
595
@type headers: L{Headers}
597
@param bodyProducer: An object which will produce the request body or,
598
if the request body is to be empty, L{None}.
599
@type bodyProducer: L{IBodyProducer} provider
601
@return: A L{Deferred} which fires with the result of the request (a
602
L{Response} instance), or fails if there is a problem setting up a
603
connection over which to issue the request. It may also fail with
604
L{SchemeNotSupported} if the scheme of the given URI is not
608
scheme, host, port, path = _parse(uri)
610
return defer.fail(SchemeNotSupported(
611
"Unsupported scheme: %r" % (scheme,)))
612
cc = ClientCreator(self._reactor, self._protocol)
613
d = cc.connectTCP(host, port)
616
if not headers.hasHeader('host'):
617
# This is a lot of copying. It might be nice if there were a bit
619
headers = Headers(dict(headers.getAllRawHeaders()))
620
headers.addRawHeader(
621
'host', self._computeHostValue(scheme, host, port))
622
def cbConnected(proto):
623
return proto.request(Request(method, path, headers, bodyProducer))
624
d.addCallback(cbConnected)
628
def _computeHostValue(self, scheme, host, port):
630
Compute the string to use for the value of the I{Host} header, based on
631
the given scheme, host name, and port number.
635
return '%s:%d' % (host, port)
640
'PartialDownloadError',
641
'HTTPPageGetter', 'HTTPPageDownloader', 'HTTPClientFactory', 'HTTPDownloader',
642
'getPage', 'downloadPage',
644
'ResponseDone', 'Response', 'Agent']