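"""Fetch URLs over HTTP(S) with pycurl, synchronously or via Twisted threads."""
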
import os
import sys

from optparse import OptionParser
from StringIO import StringIO

from twisted.internet.threads import deferToThread
from twisted.internet.defer import DeferredList


class FetchError(Exception):
    pass


class HTTPCodeError(FetchError):

    def __init__(self, http_code, body):
        self.http_code = http_code
        self.body = body

    def __str__(self):
        return "Server returned HTTP code %d" % self.http_code

    def __repr__(self):
        return "<HTTPCodeError http_code=%d>" % self.http_code


class PyCurlError(FetchError):

    def __init__(self, error_code, message):
        self.error_code = error_code
        self._message = message

    def __str__(self):
        return "Error %d: %s" % (self.error_code, self.message)

    def __repr__(self):
        return "<PyCurlError args=(%d, '%s')>" % (self.error_code,
                                                  self.message)

    @property
    def message(self):
        # Exposed through a property since BaseException.message is
        # deprecated as of Python 2.6.
        return self._message


def fetch(url, post=False, data="", headers={}, cainfo=None, curl=None,
          connect_timeout=30, total_timeout=600):
    """Retrieve a URL and return the content.

    @param url: The url to be fetched.
    @param post: If true, the POST method will be used (defaults to GET).
    @param data: Data to be sent to the server as the POST content.
    @param headers: Dictionary of header => value entries to be used
                    on the request.
    @param cainfo: Path to the file with CA certificates.
    """
    import pycurl
    output = StringIO(data)  # Buffer the request body is read from.
    input = StringIO()  # Buffer the response body is written to.

    if curl is None:
        curl = pycurl.Curl()

    if post:
        curl.setopt(pycurl.POST, True)

        if data:
            curl.setopt(pycurl.POSTFIELDSIZE, len(data))
            curl.setopt(pycurl.READFUNCTION, output.read)

    if cainfo and url.startswith("https:"):
        curl.setopt(pycurl.CAINFO, cainfo)

    if headers:
        curl.setopt(pycurl.HTTPHEADER,
                    ["%s: %s" % pair for pair in sorted(headers.iteritems())])

    curl.setopt(pycurl.URL, str(url))
    curl.setopt(pycurl.FOLLOWLOCATION, True)
    curl.setopt(pycurl.MAXREDIRS, 5)
    curl.setopt(pycurl.CONNECTTIMEOUT, connect_timeout)
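    # Treat the transfer as timed out if it stalls below 1 byte/second
    # for total_timeout seconds.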
    curl.setopt(pycurl.LOW_SPEED_LIMIT, 1)
    curl.setopt(pycurl.LOW_SPEED_TIME, total_timeout)
    curl.setopt(pycurl.NOSIGNAL, 1)
    curl.setopt(pycurl.WRITEFUNCTION, input.write)
    curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
    curl.setopt(pycurl.ENCODING, "gzip,deflate")

    try:
        curl.perform()
    except pycurl.error, e:
        raise PyCurlError(e.args[0], e.args[1])

    body = input.getvalue()

    http_code = curl.getinfo(pycurl.HTTP_CODE)
    if http_code != 200:
        raise HTTPCodeError(http_code, body)

    return body
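
# Example synchronous use of fetch() (a sketch; the URL, header and CA
# bundle path are placeholders):
#
#     try:
#         body = fetch("https://example.com/resource",
#                      headers={"Accept": "application/json"},
#                      cainfo="/etc/ssl/certs/ca-certificates.crt")
#     except HTTPCodeError, e:
#         print "HTTP error %d" % e.http_code
#     except PyCurlError, e:
#         print "Transfer failed: %s" % e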


def test(args):
    parser = OptionParser()
    parser.add_option("--post", action="store_true")
    parser.add_option("--data", default="")
    parser.add_option("--cainfo")
    options, (url,) = parser.parse_args(args)
    print fetch(url, post=options.post, data=options.data,
                cainfo=options.cainfo)
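
# The test() helper above can be exercised from the command line, for
# example (the module filename and URL are placeholders):
#
#     python fetch.py --post --data "key=value" http://example.com/submit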


def fetch_async(*args, **kwargs):
    """Retrieve a URL asynchronously.

    @return: A C{Deferred} resulting in the URL content.
    """
    return deferToThread(fetch, *args, **kwargs)
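
# Example asynchronous use (a sketch; assumes a running Twisted reactor,
# and the URL is a placeholder):
#
#     def got_body(body):
#         print "Got %d bytes" % len(body)
#
#     def got_error(failure):
#         print "Download failed: %s" % failure.value
#
#     fetch_async("http://example.com/file").addCallbacks(got_body, got_error)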


def fetch_many_async(urls, callback=None, errback=None, **kwargs):
    """
    Retrieve a list of URLs asynchronously.

    @param callback: Optionally, a function that will be called once for
        each successful URL, and will be passed its content and the URL itself.
    @param errback: Optionally, a function that will be called once for each
        failing URL, and will be passed the failure and the URL itself.
    @return: A C{DeferredList} whose callback chain will be fired as soon as
        all downloads have terminated. If an error occurs, the errback chain
        of the C{DeferredList} will be fired immediately.
    """
    results = []
    for url in urls:
        result = fetch_async(url, **kwargs)
        if callback:
            result.addCallback(callback, url)
        if errback:
            result.addErrback(errback, url)
        results.append(result)
    return DeferredList(results, fireOnOneErrback=True, consumeErrors=True)
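
# Example of fetching several URLs concurrently (a sketch; the URLs are
# placeholders):
#
#     def on_page(body, url):
#         print "Fetched %s (%d bytes)" % (url, len(body))
#
#     def on_error(failure, url):
#         print "Failed to fetch %s: %s" % (url, failure.value)
#         return failure
#
#     deferred = fetch_many_async(["http://example.com/a",
#                                  "http://example.com/b"],
#                                 callback=on_page, errback=on_error)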


def url_to_filename(url, directory=None):
    """Return the last component of the given C{url}.

    @param url: The URL to get the filename from.
    @param directory: Optionally a path to prepend to the returned filename.

    @note: Any trailing slash in the C{url} will be removed.
    """
    filename = url.rstrip("/").split("/")[-1]
    if directory is not None:
        filename = os.path.join(directory, filename)
    return filename
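
# For example (placeholder URLs; note the trailing slash being stripped):
#
#     url_to_filename("http://example.com/pool/foo.deb/")
#     # -> "foo.deb"
#     url_to_filename("http://example.com/foo.deb", directory="/tmp")
#     # -> "/tmp/foo.deb"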


def fetch_to_files(urls, directory, logger=None, **kwargs):
    """
    Retrieve a list of URLs and save their content as files in a directory.

    @param urls: The list of URLs to fetch.
    @param directory: The directory to save the files to; the name of each
        file will be the last component of its URL.
    @param logger: Optional function to be used to log errors for failed URLs.
    """

    def write(data, url):
        filename = url_to_filename(url, directory=directory)
        fd = open(filename, "wb")  # Binary mode, the content may not be text.
        fd.write(data)
        fd.close()

    def log_error(failure, url):
        if logger:
            logger("Couldn't fetch file from %s (%s)" % (
                url, str(failure.value)))
        return failure

    return fetch_many_async(urls, callback=write, errback=log_error, **kwargs)
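
# Example of downloading several files into a directory (a sketch; the
# URLs and target directory are placeholders):
#
#     import logging
#     deferred = fetch_to_files(
#         ["http://example.com/a.deb", "http://example.com/b.deb"],
#         "/var/tmp/downloads", logger=logging.warning)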


if __name__ == "__main__":
    test(sys.argv[1:])