~chipaca/ytd/trunk

8 by john.lenton at canonical
reworked ytd a little
1
"""
2
YouTubeDownloader
3
4
(for now! eventually, any-video-site-out-there downloader ;-p)
5
"""
6
1 by john.lenton at canonical
initial checkin
7
import cgi
7 by john.lenton at canonical
moved ytd over to use aio
8
from cStringIO import StringIO
1 by john.lenton at canonical
initial checkin
9
import logging
3 by john.lenton at canonical
scraping!
10
import os
1 by john.lenton at canonical
initial checkin
11
import re
3 by john.lenton at canonical
scraping!
12
import sys
1 by john.lenton at canonical
initial checkin
13
import tempfile
14
import urllib
15
from htmlentitydefs import name2codepoint
16
17
import glib
18
import gtk
7 by john.lenton at canonical
moved ytd over to use aio
19
20
from async_downloader import AsyncDownloader
1 by john.lenton at canonical
initial checkin
21
22
logging.basicConfig(level=logging.DEBUG)

# Each known format code has its own container, and so its own file
# extension.  The keys are the format codes, as strings, that appear
# before the '|' in a fmt_url_map entry.
EXTS = {
    # flash video
    '0': 'flv',
    '5': 'flv',
    '6': 'flv',
    '34': 'flv',
    '35': 'flv',
    # mpeg-4
    '18': 'mp4',
    '22': 'mp4',
    '37': 'mp4',
    # 3gpp
    '13': '3gp',
    '17': '3gp',
    }

# Quality name (matching the radiobutton_<name> widgets) -> the format
# codes that provide it.  The order inside each list matters: the first
# code that the video actually offers is the one that gets used.
FMT_MAP = {
    '1080p': ['37'],
    '720p': ['22'],
    'high': ['35'],
    'medium': ['18', '6'],
    'low': ['34', '5', '0'],
    'mobile': ['17', '13'],
    }
1 by john.lenton at canonical
initial checkin
44
45
def match2char(match):
    """
    Transform an entity (in the match's groups) to its unicode character

    The match must have two groups: the first holds the decimal digits
    of a numeric entity, the second the name of a named entity; exactly
    one of them is expected to be set.

    Returns the character the entity stands for, or the matched text
    untouched if the entity is unknown or its number is not a valid
    codepoint.
    """
    digits, entity_name = match.groups()
    if digits is None:
        codepoint = name2codepoint.get(entity_name)
    else:
        codepoint = int(digits)
    if codepoint is None:
        # a named entity we know nothing about: leave it as it was
        return match.group()
    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        # number out of range for a character: leave it as it was
        return match.group()
62
63
def de_entify(string):
    """
    Replace the entities in a string with the characters they stand for

    Both decimal numeric entities (&#NNN;) and named entities (&name;)
    are handled; anything match2char cannot translate is left as is.
    """
    entity_re = re.compile(r'&(?:#(\d+)|(\w+));')
    return entity_re.sub(match2char, string)
68
69
70
71
# pylint: disable-msg=E1101
72
73
class YouTubeDownloader(object):
    """
    An app that lets you download a video from youtube

    The user interface is loaded from 'ytd.glade'; every named object in
    it becomes an attribute of the instance (with '-' replaced by '_'),
    and the on_* methods are the signal handlers the builder connects.
    """
    def __init__(self, url=None):
        """
        Build the user interface and initialise the app's state

        url is accepted for compatibility but is not used here; pass it
        to run() instead.
        """
        builder = gtk.Builder()
        builder.set_translation_domain('YouTubeDownloader')
        builder.add_from_file('ytd.glade')
        builder.connect_signals(self)
        for obj in builder.get_objects():
            name = getattr(obj, 'name', None)
            if name is None and isinstance(obj, gtk.Buildable):
                # work around bug lp:507739
                name = gtk.Buildable.get_name(obj)
            if name is None:
                logging.warning("%s has no name (??)", obj)
            else:
                setattr(self, name.replace('-', '_'), obj)

        # work around a bug somewhere
        self.entry_filename.set_property('secondary-icon-sensitive', False)
        self.entry_url.set_property('secondary-icon-sensitive', False)
        # random stuff we need to cart around
        self.entry_url_timer = None
        self.entry_filename_timer = None
        self._thumb_tempfile = None
        self.urlmap = None
        self.current_fmt = None
        self.file_size = None

    def run(self, url=None):
        """
        Start the app

        If a url is given it is put in the url entry and its metadata is
        requested as soon as the main loop starts; without one the app
        simply waits for the user to type a url.
        """
        if url:
            # set_text does not accept None, and there is nothing to
            # look up without a url
            self.entry_url.set_text(url)
            glib.timeout_add_seconds(0, self.get_info)
        try:
            gtk.main()
        except KeyboardInterrupt:
            pass

    def download(self):
        """
        Download the full video (at the quality selected)
        """
        if not self.urlmap or self.current_fmt not in self.urlmap:
            # check before touching the widgets: failing after they were
            # disabled would leave the window unusable
            self.error("No video information available; "
                       "look up a video URL first")
            return
        # forget the size of any previous download
        self.file_size = None
        self.vbox1.set_sensitive(False)
        self.button_ok.set_sensitive(False)
        self.entry_filename_timer = glib.timeout_add(250,
                                                     self.entry_filename_pulse)
        AsyncDownloader(self.urlmap[self.current_fmt],
                        self.partial_factory,
                        self._download_download_done_cb,
                        self.error,
                        query_info_cb=self._download_query_info_cb,
                        tick_cb=self._download_tick_cb).start()

    def partial_factory(self):
        """
        Create a .part file in a reasonably safe way

        The file is created in the folder selected in the file chooser,
        named after the filename entry plus '.part'.  Creation is
        exclusive, so an already existing .part file makes os.open raise
        OSError.

        Returns the file object, open for binary writing.
        """
        filename = os.path.join(self.filechooserbutton.get_current_folder(),
                                self.entry_filename.get_text() + '.part')
        filedes = os.open(filename, os.O_CREAT | os.O_EXCL | os.O_WRONLY,
                          0o600)
        # XXX this can still fail in older versions of NFS
        # the video is binary data, so don't open it in text mode
        fileobj = os.fdopen(filedes, 'wb')
        return fileobj

    def _download_query_info_cb(self, qinfo):
        """
        Callback for when the query_info arrives
        """
        self.file_size = qinfo.get_size()

    def _download_tick_cb(self, fileobj):
        """
        Callback for when we get a chunk of data

        Stops the "waiting" pulse and shows how much of the file we have;
        if the size of the file is not known, just pulses once per chunk.
        """
        if self.entry_filename_timer is not None:
            glib.source_remove(self.entry_filename_timer)
            self.entry_filename_timer = None
        if not self.file_size:
            # size unknown (or reported as zero): no fraction to compute
            self.entry_filename.progress_pulse()
            return
        fraction = min(1.0, float(fileobj.tell()) / self.file_size)
        self.entry_filename.set_progress_fraction(fraction)

    def _download_download_done_cb(self, fileobj):
        """
        Callback for when the download finishes

        Makes sure the data is on disk, drops the '.part' from the name
        of the file, and re-enables the user interface.
        """
        fileobj.flush()
        os.fsync(fileobj.fileno())
        filename = os.path.join(self.filechooserbutton.get_current_folder(),
                                self.entry_filename.get_text())
        os.rename(filename + '.part', filename)
        logging.debug('downloaded movie to %s', filename)
        self.clear_progress()
        self.vbox1.set_sensitive(True)
        self.button_ok.set_sensitive(True)

    @staticmethod
    def _unquote_next_url(match):
        """
        Turn the next_url parameter captured in match into an absolute url
        """
        target = urllib.unquote_plus(match.group(1))
        if target.startswith('/'):
            # a site-relative redirect target; make it absolute so the
            # video id can be found in it
            target = 'http://www.youtube.com' + target
        return target

    def get_video_id(self):
        """
        Extract the video id from the url

        Understands http and https urls, with or without a 'www.' or
        'm.' prefix, of the forms /watch?...v=ID and /v/ID; a url
        carrying a next_url parameter is first replaced by the url it
        redirects to.

        Returns the id, or None if the url is not recognised.
        """
        url = self.entry_url.get_text()
        url = re.sub(r'^https?://(?:www\.|m\.)?youtube\.com/'
                     r'.*next_url=([^&]+)',
                     self._unquote_next_url, url)
        # the lazy optional group lets 'v' come after other parameters,
        # while still preferring it as the first one
        match = re.match(r'https?://(?:www\.|m\.)?youtube\.com/'
                         r'(?:watch\?(?:[^#]*?&)??v=|v/)([^<>?&,]+)',
                         url)
        if match is None:
            return None
        return match.group(1)

    def get_thumb(self, uri):
        """
        Download the thumbnail of the video
        """
        AsyncDownloader(uri, tempfile.NamedTemporaryFile,
                        self._thumb_download_done_cb, self.error).start()

    def _thumb_download_done_cb(self, fileobj):
        """
        Callback for when the thumbnail download finished
        """
        fileobj.flush()
        logging.debug("downloaded thumb into %s", fileobj.name)
        self.img_preview.set_from_file(fileobj.name)
        self.clear_progress()

    def clear_progress(self):
        """
        Turn off any progress indicators
        """
        if self.entry_url_timer is not None:
            glib.source_remove(self.entry_url_timer)
            self.entry_url_timer = None
        self.entry_url.set_progress_fraction(0.)
        if self.entry_filename_timer is not None:
            glib.source_remove(self.entry_filename_timer)
            self.entry_filename_timer = None
        self.entry_filename.set_progress_fraction(0.)

    def error(self, exc):
        """
        Show and log an error

        exc can be an exception or just a message; it is shown in the
        error dialog, after which the user interface is re-enabled and
        the progress indicators are cleared.
        """
        msg = str(exc)
        if sys.exc_info()[0] is None:
            # not handling an exception: there is no traceback to log
            logging.error(msg)
        else:
            logging.exception(msg)
        self.main_window.set_sensitive(False)
        self.error_dialog.format_secondary_text(msg)
        self.error_dialog.run()
        self.error_dialog.hide()
        self.main_window.set_sensitive(True)
        self.vbox1.set_sensitive(True)
        self.button_ok.set_sensitive(True)
        self.clear_progress()

    def get_info_result(self, urls, title, thumbnail=None):
        """
        Called with the parsed result of the call to get_info

        Enables/disables the appropriate radiobuttons (depending on the
        qualities available), selects the best quality available, sets
        the suggested filename, and gets the thumbnail.

        urls is the fmt_url_map string ('FMT|URL,FMT|URL,...').

        Raises ValueError if none of the formats on offer is known.
        """
        logging.debug("get_info answered with: title: %r  thumb: %r  urls: %r",
                      title, thumbnail, urls)
        if isinstance(title, bytes):
            # mixing a non-ascii byte string with the unicode slash
            # below would blow up; assumes titles arrive utf-8 encoded
            title = title.decode('utf-8', 'replace')
        # a '/' can't be part of a filename; use a lookalike instead
        title = title.replace(u'/', u'\u2044')
        urls = re.findall(r'(\d+)\|([^,]+)', urls)
        self.urlmap = dict(urls)
        logging.debug("movie has formats %s", sorted(self.urlmap.keys()))
        logging.info("unknown formats: %s",
                     sorted(set(self.urlmap) - set(EXTS)))
        self.hbox_url_info.set_sensitive(True)
        best = None
        # from best to worst, so the first one found is the best one
        for name in '1080p', '720p', 'high', 'medium', 'low', 'mobile':
            radiobtn = getattr(self, 'radiobutton_' + name)
            radiobtn.set_active(False)
            radiobtn.fmt = None
            for fmt in FMT_MAP[name]:
                has_fmt = fmt in self.urlmap
                radiobtn.set_sensitive(has_fmt)
                if has_fmt:
                    radiobtn.fmt = fmt
                    if best is None:
                        best = fmt
                        radiobtn.set_active(True)
                    break
        if best is None:
            raise ValueError("No known formats available")
        self.current_fmt = best
        self.entry_filename.set_text(title + '.' + EXTS[best])
        if thumbnail:
            self.get_thumb(thumbnail)
        else:
            self.img_preview.set_from_icon_name('b0rken', gtk.ICON_SIZE_DND)
            self.clear_progress()

    def scrape(self):
        """
        Download the page, so it gets scraped (get_info failed)
        """
        # NOTE(review): self.error is passed as the error callback, as in
        # every other AsyncDownloader call here -- confirm against
        # AsyncDownloader's signature
        AsyncDownloader(self.entry_url.get_text(),
                        StringIO,
                        self._scrape_download_done,
                        self.error).start()

    def _scrape_download_done(self, buf):
        """
        extract the necessary info out of the page

        Any problem finding the format map or the title in the page is
        reported via error().
        """
        try:
            body = buf.getvalue()
            match = re.search(r"'SWF_ARGS' *: *({.*?})", body, re.S)
            if not match:
                # XXX check for error-box (cf. youtubedown)
                raise ValueError("Unable to scrape the HTML")
            match = re.search(r'"fmt_url_map": "(.*?)"', match.group(1), re.S)
            if not match:
                raise ValueError("no fmt_url_map found")
            urls = urllib.unquote(de_entify(match.group(1)))
            match = re.search(r"'VIDEO_TITLE': '(.*?)'", body, re.S)
            if not match:
                raise ValueError("No title found")
            title = de_entify(match.group(1)).strip()
            self.get_info_result(urls, title)
        except StandardError as exc:
            self.error(exc)
        self.vbox1.set_sensitive(True)

    def get_info(self):
        """
        Download the "get_info" page for the video, which is easier to
        parse than the "real" page

        Always returns False, so it runs only once when used as a glib
        timeout callback.
        """
        self.vbox1.set_sensitive(False)
        video_id = self.get_video_id()
        if video_id is None:
            self.error('Unable to find an ID in the URL %r'
                       % (self.entry_url.get_text(),))
        else:
            self.entry_url_timer = glib.timeout_add(250,
                                                    self.entry_url_pulse)
            info_url = "http://www.youtube.com/get_video_info?video_id=" \
                + video_id
            AsyncDownloader(info_url, StringIO,
                            self._get_info_download_done_cb, self.error).start()
        return False

    def _get_info_download_done_cb(self, fileobj):
        """
        The "get_info" page finished downloading

        If the page has no format map we fall back to scraping the real
        page; any other problem is reported via error().
        """
        try:
            result = de_entify(fileobj.getvalue())
            result = dict(cgi.parse_qsl(result))
            urls = result.get('fmt_url_map', result.get('fmt_stream_map'))
            if urls is None:
                # the scrape takes it from here (and re-enables the UI)
                self.scrape()
                return
            if 'title' not in result:
                raise RuntimeError("No title found")
            title = de_entify(result['title']).strip()
            self.get_info_result(urls, title, result.get('thumbnail_url'))
        except StandardError as exc:
            self.error(exc)
        self.vbox1.set_sensitive(True)

    def entry_filename_pulse(self):
        """
        pulse the filename entry progressbar

        Returns True so the glib timeout keeps calling it.
        """
        self.entry_filename.progress_pulse()
        return True

    def entry_url_pulse(self):
        """
        pulse the url entry progressbar

        Returns True so the glib timeout keeps calling it.
        """
        self.entry_url.progress_pulse()
        return True

    def on_radiobutton_fmt_toggled(self, widget):
        """
        Callback for when one of the format radiobuttons is toggled

        This sets the appropriate extension on the filename: the button
        being turned off removes its extension from the end of the
        filename, the one being turned on appends its own.
        """
        if widget.fmt is None:
            return
        this_ext = EXTS[widget.fmt]
        text = self.entry_filename.get_text()
        if widget.get_active():
            self.current_fmt = widget.fmt
            text += this_ext
        elif text.endswith(this_ext):
            # only the trailing extension goes; the same letters
            # elsewhere in the name are part of the title
            text = text[:-len(this_ext)]
        self.entry_filename.set_text(text)

    def on_entry_filename_activate(self, *ignored):
        """
        Callback for when the filename entry is activated: Start the download
        """
        return self.download()
    on_entry_filename_icon_press = on_entry_filename_activate

    def on_entry_url_activate(self, *ignored):
        """
        Callback for when the url entry is activated: get the metadata
        """
        return self.get_info()
    on_entry_url_icon_press = on_entry_url_activate

    def on_main_window_response(self, ignored, response):
        """
        Callback for handling a dialog response (either cancel close or ok)

        cancel should abort and close
        close should just close
        ok should start the download
        """
        if response == gtk.RESPONSE_OK:
            self.download()
        else:
            if response not in (gtk.RESPONSE_DELETE_EVENT, gtk.RESPONSE_CANCEL):
                # look up the name of the response, for the log
                desc = "???"
                for key, val in vars(gtk).items():
                    if key.startswith('RESPONSE_') and val == response:
                        desc = key
                        break
                logging.error("Unknown response %s (%s)", response, desc)
            gtk.main_quit()

    def on_entry_url_changed(self, widget):
        """
        Callback for when the url entry changed

        Used to give feedback about the validity / utility of the url
        """
        widget.set_property('secondary-icon-sensitive',
                            self.get_video_id() is not None)

    def on_entry_filename_changed(self, widget):
        """
        Callback for when the filename entry changed

        Used to give feedback about the validity / utility of the filename
        """
        widget.set_property('secondary-icon-sensitive', bool(widget.get_text()))
417
418
if __name__ == '__main__':
    # the first command line argument, if any, is the url of the video
    if len(sys.argv) > 1:
        START_URL = sys.argv[1]
    else:
        START_URL = None
    YouTubeDownloader().run(START_URL)