1
4
from optparse import OptionParser
2
5
from StringIO import StringIO
7
9
from twisted.internet.threads import deferToThread
10
from twisted.internet.defer import DeferredList
9
13
class FetchError(Exception):
12
17
class HTTPCodeError(FetchError):
14
19
def __init__(self, http_code, body):
34
40
return "<PyCurlError args=(%d, '%s')>" % (self.error_code,
37
48
def fetch(url, post=False, data="", headers={}, cainfo=None, curl=None,
38
49
connect_timeout=30, total_timeout=600):
39
50
"""Retrieve a URL and return the content.
65
76
curl.setopt(pycurl.HTTPHEADER,
66
77
["%s: %s" % pair for pair in sorted(headers.iteritems())])
68
curl.setopt(pycurl.URL, url)
79
curl.setopt(pycurl.URL, str(url))
69
80
curl.setopt(pycurl.FOLLOWLOCATION, True)
70
81
curl.setopt(pycurl.MAXREDIRS, 5)
71
82
curl.setopt(pycurl.CONNECTTIMEOUT, connect_timeout)
101
112
def fetch_async(*args, **kwargs):
    """Retrieve a URL asynchronously.

    All positional and keyword arguments are forwarded verbatim to the
    blocking C{fetch} function, which is run in a reactor thread-pool
    thread so the event loop is never blocked.

    @return: A C{Deferred} resulting in the URL content.
    """
    deferred = deferToThread(fetch, *args, **kwargs)
    return deferred
120
def fetch_many_async(urls, callback=None, errback=None, **kwargs):
    """
    Retrieve a list of URLs asynchronously.

    @param urls: The list of URLs to fetch.
    @param callback: Optionally, a function that will be fired one time for
        each successful URL, and will be passed its content and the URL itself.
    @param errback: Optionally, a function that will be fired one time for each
        failing URL, and will be passed the failure and the URL itself.
    @param kwargs: Extra keyword arguments forwarded to C{fetch_async} for
        every URL (e.g. timeouts, headers).
    @return: A C{DeferredList} whose callback chain will be fired as soon as
        all downloads have terminated. If an error occurs, the errback chain
        of the C{DeferredList} will be fired immediately.
    """
    results = []
    for url in urls:
        result = fetch_async(url, **kwargs)
        # Both hooks are optional; the URL is passed along with the
        # content/failure so the caller can tell the downloads apart.
        if callback:
            result.addCallback(callback, url)
        if errback:
            result.addErrback(errback, url)
        results.append(result)
    # consumeErrors prevents "Unhandled error in Deferred" noise for the
    # individual downloads; fireOnOneErrback surfaces the first failure.
    return DeferredList(results, fireOnOneErrback=True, consumeErrors=True)
143
def url_to_filename(url, directory=None):
    """Return the last component of the given C{url}.

    @param url: The URL to get the filename from.
    @param directory: Optionally a path to prepend to the returned filename.
    @return: The last path component of C{url}, optionally joined to
        C{directory}.

    @note: Any trailing slash in the C{url} will be removed.
    """
    filename = url.rstrip("/").split("/")[-1]
    if directory is not None:
        filename = os.path.join(directory, filename)
    # The computed name was previously dropped on the floor; the docstring
    # promises the last component is returned.
    return filename
157
def fetch_to_files(urls, directory, logger=None, **kwargs):
    """
    Retrieve a list of URLs and save their content as files in a directory.

    @param urls: The list of URLs to fetch.
    @param directory: The directory to save the files to, the name of the file
        will equal the last fragment of the URL.
    @param logger: Optional function to be used to log errors for failed URLs.
    @return: The C{DeferredList} produced by C{fetch_many_async}.
    """
    def write(data, url):
        # Save the downloaded content under the URL's last path component.
        filename = url_to_filename(url, directory=directory)
        fd = open(filename, "w")
        try:
            fd.write(data)
        finally:
            # Always release the file handle, even if the write fails.
            fd.close()

    def log_error(failure, url):
        # logger defaults to None, so guard before calling it.
        if logger:
            logger("Couldn't fetch file from %s (%s)" % (
                url, str(failure.value)))
        # Re-raise the failure so the DeferredList's errback chain (with
        # fireOnOneErrback) still fires instead of silently swallowing it.
        return failure

    return fetch_many_async(urls, callback=write, errback=log_error, **kwargs)
105
182
if __name__ == "__main__":
    # Manual smoke-test entry point.
    # NOTE(review): neither `test` nor `sys` is defined/imported in this
    # chunk — presumably provided earlier in the file; confirm before
    # running the module directly.
    test(sys.argv[1:])