~ubuntu-branches/ubuntu/maverick/youtube-dl/maverick-updates

« back to all changes in this revision

Viewing changes to .pc/01-prefer-open-formats.patch/youtube-dl

  • Committer: Package Import Robot
  • Author(s): Evan Broder
  • Date: 2012-01-11 15:59:23 UTC
  • mfrom: (21.1.8 sid)
  • Revision ID: package-import@ubuntu.com-20120111155923-w53vce5ov71bti3c
Tags: 2011.08.04-1~maverick0.1
Backport new upstream release to Maverick to fix changes in
Youtube. (LP: #915029)

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/env python
 
2
# -*- coding: utf-8 -*-
 
3
# Author: Ricardo Garcia Gonzalez
 
4
# Author: Danny Colligan
 
5
# Author: Benjamin Johnson
 
6
# Author: Vasyl' Vavrychuk
 
7
# Author: Witold Baryluk
 
8
# Author: Paweł Paprota
 
9
# Author: Gergely Imreh
 
10
# License: Public domain code
 
11
import cookielib
 
12
import ctypes
 
13
import datetime
 
14
import email.utils
 
15
import gzip
 
16
import htmlentitydefs
 
17
import httplib
 
18
import locale
 
19
import math
 
20
import netrc
 
21
import os
 
22
import os.path
 
23
import re
 
24
import socket
 
25
import string
 
26
import StringIO
 
27
import subprocess
 
28
import sys
 
29
import time
 
30
import urllib
 
31
import urllib2
 
32
import zlib
 
33
 
 
34
# parse_qs was moved from the cgi module to the urlparse module recently.
 
35
try:
 
36
        from urlparse import parse_qs
 
37
except ImportError:
 
38
        from cgi import parse_qs
 
39
 
 
40
# HTTP headers added to every request by YoutubeDLHandler.http_request().
# A browser-like User-Agent avoids being served degraded or alternate pages.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
 
47
 
 
48
# Characters considered safe for "simple" titles: ASCII letters and digits
# (str.decode('ascii') is Python 2 and yields a unicode string).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
 
49
 
 
50
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported encoding actually works: some broken
                # locales name encodings the codec machinery cannot use.
                u'TEST'.encode(pref)
        except Exception:
                # Fall back to a safe default rather than crash later on
                # every message we try to print.
                pref = 'UTF-8'
        return pref
 
65
 
 
66
def htmlentity_transform(matchobj):
 
67
        """Transforms an HTML entity to a Unicode character.
 
68
 
 
69
        This function receives a match object and is intended to be used with
 
70
        the re.sub() function.
 
71
        """
 
72
        entity = matchobj.group(1)
 
73
 
 
74
        # Known non-numeric HTML entity
 
75
        if entity in htmlentitydefs.name2codepoint:
 
76
                return unichr(htmlentitydefs.name2codepoint[entity])
 
77
 
 
78
        # Unicode character
 
79
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
 
80
        if mobj is not None:
 
81
                numstr = mobj.group(1)
 
82
                if numstr.startswith(u'x'):
 
83
                        base = 16
 
84
                        numstr = u'0%s' % numstr
 
85
                else:
 
86
                        base = 10
 
87
                return unichr(long(numstr, base))
 
88
 
 
89
        # Unknown entity in name, return its literal representation
 
90
        return (u'&%s;' % entity)
 
91
 
 
92
def sanitize_title(utitle):
 
93
        """Sanitizes a video title so it could be used as part of a filename."""
 
94
        utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 
95
        return utitle.replace(unicode(os.sep), u'%')
 
96
 
 
97
def sanitize_open(filename, open_mode):
 
98
        """Try to open the given filename, and slightly tweak it if this fails.
 
99
 
 
100
        Attempts to open the given filename. If this fails, it tries to change
 
101
        the filename slightly, step by step, until it's either able to open it
 
102
        or it fails and raises a final exception, like the standard open()
 
103
        function.
 
104
 
 
105
        It returns the tuple (stream, definitive_file_name).
 
106
        """
 
107
        try:
 
108
                if filename == u'-':
 
109
                        if sys.platform == 'win32':
 
110
                                import msvcrt
 
111
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 
112
                        return (sys.stdout, filename)
 
113
                stream = open(filename, open_mode)
 
114
                return (stream, filename)
 
115
        except (IOError, OSError), err:
 
116
                # In case of error, try to remove win32 forbidden chars
 
117
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 
118
 
 
119
                # An exception here should be caught in the caller
 
120
                stream = open(filename, open_mode)
 
121
                return (stream, filename)
 
122
 
 
123
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz understands the RFC 2822 date format including the
    # numeric timezone offset; it returns None on unparsable input.
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
 
130
 
 
131
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when they are not configured to
        ignore errors; it carries the appropriate error message.
        """
        pass
 
139
 
 
140
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when a fixed output template would
        force several downloads to write to the same file on disk.
        """
        pass
 
147
 
 
148
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to indicate an error in
        the postprocessing task.
        """
        pass
 
155
 
 
156
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not available
        for that video.
        """
        pass
 
163
 
 
164
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when the data actually downloaded
        is smaller than what the server announced first, indicating the
        connection was probably interrupted.
        """
        # Both sizes are in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Deliberately does not call Exception.__init__: callers read
                # the two size attributes directly to build the message.
                self.downloaded = downloaded
                self.expected = expected
 
178
 
 
179
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

          http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send a raw deflate stream (no zlib header),
                # which needs -MAX_WBITS; fall back to a regular zlib stream.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Build an addinfourl that carries the HTTP status code; old
                # Python versions lack the 4-argument constructor/getcode().
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force the std_headers values, replacing any same-named
                # headers already set on the request.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Sentinel header: drop compression support for this request
                # only. (urllib2 capitalizes header names, hence the casing.)
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                old_resp = resp
                # gzip: wrap the raw stream in a GzipFile, preserving the
                # original headers, URL, status code and message.
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate: decompress eagerly and serve from a StringIO.
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
 
236
 
 
237
class FileDownloader(object):
 
238
        """File Downloader class.
 
239
 
 
240
        File downloader objects are the ones responsible of downloading the
 
241
        actual video file and writing it to disk if the user has requested
 
242
        it, among some other tasks. In most cases there should be one per
 
243
        program. As, given a video URL, the downloader doesn't know how to
 
244
        extract all the needed information, task that InfoExtractors do, it
 
245
        has to pass the URL to one of them.
 
246
 
 
247
        For this, file downloader objects have a method that allows
 
248
        InfoExtractors to be registered in a given order. When it is passed
 
249
        a URL, the file downloader handles it to the first InfoExtractor it
 
250
        finds that reports being able to handle it. The InfoExtractor extracts
 
251
        all the information about the video or videos the URL refers to, and
 
252
        asks the FileDownloader to process the video information, possibly
 
253
        downloading the video.
 
254
 
 
255
        File downloaders accept a lot of parameters. In order not to saturate
 
256
        the object constructor with arguments, it receives a dictionary of
 
257
        options instead. These options are available through the params
 
258
        attribute for the InfoExtractors to use. The FileDownloader also
 
259
        registers itself as the downloader in charge for the InfoExtractors
 
260
        that are added to it, so this is a "mutual registration".
 
261
 
 
262
        Available options:
 
263
 
 
264
        username:         Username for authentication purposes.
 
265
        password:         Password for authentication purposes.
 
266
        usenetrc:         Use netrc for authentication instead.
 
267
        quiet:            Do not print messages to stdout.
 
268
        forceurl:         Force printing final URL.
 
269
        forcetitle:       Force printing title.
 
270
        forcethumbnail:   Force printing thumbnail URL.
 
271
        forcedescription: Force printing description.
 
272
        forcefilename:    Force printing final filename.
 
273
        simulate:         Do not download the video files.
 
274
        format:           Video format code.
 
275
        format_limit:     Highest quality format to try.
 
276
        outtmpl:          Template for output names.
 
277
        ignoreerrors:     Do not stop on download errors.
 
278
        ratelimit:        Download speed limit, in bytes/sec.
 
279
        nooverwrites:     Prevent overwriting files.
 
280
        retries:          Number of times to retry for HTTP error 5xx
 
281
        continuedl:       Try to continue downloads if possible.
 
282
        noprogress:       Do not print the progress bar.
 
283
        playliststart:    Playlist item to start at.
 
284
        playlistend:      Playlist item to end at.
 
285
        logtostderr:      Log messages to stderr instead of stdout.
 
286
        consoletitle:     Display progress in console window's titlebar.
 
287
        nopart:           Do not use temporary .part files.
 
288
        updatetime:       Use the Last-modified header to set output file timestamps.
 
289
        """
 
290
 
 
291
        params = None
 
292
        _ies = []
 
293
        _pps = []
 
294
        _download_retcode = None
 
295
        _num_downloads = None
 
296
        _screen_file = None
 
297
 
 
298
        def __init__(self, params):
 
299
                """Create a FileDownloader object with the given options."""
 
300
                self._ies = []
 
301
                self._pps = []
 
302
                self._download_retcode = 0
 
303
                self._num_downloads = 0
 
304
                self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 
305
                self.params = params
 
306
 
 
307
        @staticmethod
 
308
        def pmkdir(filename):
 
309
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
 
310
                components = filename.split(os.sep)
 
311
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 
312
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 
313
                for dir in aggregate:
 
314
                        if not os.path.exists(dir):
 
315
                                os.mkdir(dir)
 
316
 
 
317
        @staticmethod
 
318
        def format_bytes(bytes):
 
319
                if bytes is None:
 
320
                        return 'N/A'
 
321
                if type(bytes) is str:
 
322
                        bytes = float(bytes)
 
323
                if bytes == 0.0:
 
324
                        exponent = 0
 
325
                else:
 
326
                        exponent = long(math.log(bytes, 1024.0))
 
327
                suffix = 'bkMGTPEZY'[exponent]
 
328
                converted = float(bytes) / float(1024**exponent)
 
329
                return '%.2f%s' % (converted, suffix)
 
330
 
 
331
        @staticmethod
 
332
        def calc_percent(byte_counter, data_len):
 
333
                if data_len is None:
 
334
                        return '---.-%'
 
335
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 
336
 
 
337
        @staticmethod
 
338
        def calc_eta(start, now, total, current):
 
339
                if total is None:
 
340
                        return '--:--'
 
341
                dif = now - start
 
342
                if current == 0 or dif < 0.001: # One millisecond
 
343
                        return '--:--'
 
344
                rate = float(current) / dif
 
345
                eta = long((float(total) - float(current)) / rate)
 
346
                (eta_mins, eta_secs) = divmod(eta, 60)
 
347
                if eta_mins > 99:
 
348
                        return '--:--'
 
349
                return '%02d:%02d' % (eta_mins, eta_secs)
 
350
 
 
351
        @staticmethod
 
352
        def calc_speed(start, now, bytes):
 
353
                dif = now - start
 
354
                if bytes == 0 or dif < 0.001: # One millisecond
 
355
                        return '%10s' % '---b/s'
 
356
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 
357
 
 
358
        @staticmethod
 
359
        def best_block_size(elapsed_time, bytes):
 
360
                new_min = max(bytes / 2.0, 1.0)
 
361
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 
362
                if elapsed_time < 0.001:
 
363
                        return long(new_max)
 
364
                rate = bytes / elapsed_time
 
365
                if rate > new_max:
 
366
                        return long(new_max)
 
367
                if rate < new_min:
 
368
                        return long(new_min)
 
369
                return long(rate)
 
370
 
 
371
        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                # Accepts an optional decimal part and an optional 1024-based
                # suffix, case-insensitively (e.g. '50k', '0.5M').
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # An absent suffix is '' whose index is 0, i.e. 1024**0 == 1.
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))
 
380
 
 
381
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration: give the IE a back-reference to this
                # downloader so it can report extracted info and errors.
                ie.set_downloader(self)
 
385
 
 
386
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                # Mutual registration, same as for InfoExtractors.
                pp.set_downloader(self)
 
390
 
 
391
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # skip_eol suppresses the newline so progress
                                # lines can be redrawn in place with '\r'.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        # Callers printing purely informational text may opt to
                        # swallow terminal-encoding failures.
                        if not ignore_encoding_errors:
                                raise
 
401
 
 
402
        def to_stderr(self, message):
                """Print message to stderr."""
                # Encode explicitly: stderr may not accept unicode directly.
                print >>sys.stderr, message.encode(preferredencoding())
 
405
 
 
406
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-compatible escape: OSC 0 sets the window title.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 
416
 
 
417
        def fixed_template(self):
 
418
                """Checks if the output template is fixed."""
 
419
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 
420
 
 
421
        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # Errors are being ignored: remember the failure in the
                # eventual exit code and carry on.
                self._download_retcode = 1
 
433
 
 
434
        def slow_down(self, start_time, byte_counter):
 
435
                """Sleep if the download speed is over the rate limit."""
 
436
                rate_limit = self.params.get('ratelimit', None)
 
437
                if rate_limit is None or byte_counter == 0:
 
438
                        return
 
439
                now = time.time()
 
440
                elapsed = now - start_time
 
441
                if elapsed <= 0.0:
 
442
                        return
 
443
                speed = float(byte_counter) / elapsed
 
444
                if speed > rate_limit:
 
445
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 
446
 
 
447
        def temp_name(self, filename):
 
448
                """Returns a temporary filename for the given filename."""
 
449
                if self.params.get('nopart', False) or filename == u'-' or \
 
450
                                (os.path.exists(filename) and not os.path.isfile(filename)):
 
451
                        return filename
 
452
                return filename + u'.part'
 
453
 
 
454
        def undo_temp_name(self, filename):
 
455
                if filename.endswith(u'.part'):
 
456
                        return filename[:-len(u'.part')]
 
457
                return filename
 
458
 
 
459
        def try_rename(self, old_filename, new_filename):
 
460
                try:
 
461
                        if old_filename == new_filename:
 
462
                                return
 
463
                        os.rename(old_filename, new_filename)
 
464
                except (IOError, OSError), err:
 
465
                        self.trouble(u'ERROR: unable to rename file')
 
466
        
 
467
        def try_utime(self, filename, last_modified_hdr):
 
468
                """Try to set the last-modified time of the given file."""
 
469
                if last_modified_hdr is None:
 
470
                        return
 
471
                if not os.path.isfile(filename):
 
472
                        return
 
473
                timestr = last_modified_hdr
 
474
                if timestr is None:
 
475
                        return
 
476
                filetime = timeconvert(timestr)
 
477
                if filetime is None:
 
478
                        return
 
479
                try:
 
480
                        os.utime(filename,(time.time(), filetime))
 
481
                except:
 
482
                        pass
 
483
 
 
484
        def report_destination(self, filename):
                """Report destination filename."""
                # ignore_encoding_errors: never abort a download just because
                # the filename cannot be rendered on this terminal.
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 
487
 
 
488
        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 
489
                """Report download progress."""
 
490
                if self.params.get('noprogress', False):
 
491
                        return
 
492
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 
493
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 
494
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 
495
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 
496
 
 
497
        def report_resuming_byte(self, resume_len):
 
498
                """Report attempt to resume at given byte."""
 
499
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 
500
 
 
501
        def report_retry(self, count, retries):
 
502
                """Report retry in case of HTTP error 5xx"""
 
503
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 
504
 
 
505
        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Fall back to a message without the (unprintable) name.
                        self.to_screen(u'[download] The file has already been downloaded')
 
511
 
 
512
        def report_unable_to_resume(self):
 
513
                """Report it was impossible to resume download."""
 
514
                self.to_screen(u'[download] Unable to resume')
 
515
 
 
516
        def report_finish(self):
 
517
                """Report download finished."""
 
518
                if self.params.get('noprogress', False):
 
519
                        self.to_screen(u'[download] Download completed')
 
520
                else:
 
521
                        self.to_screen(u'')
 
522
 
 
523
        def increment_downloads(self):
 
524
                """Increment the ordinal that assigns a number to each file."""
 
525
                self._num_downloads += 1
 
526
 
 
527
        def prepare_filename(self, info_dict):
 
528
                """Generate the output filename."""
 
529
                try:
 
530
                        template_dict = dict(info_dict)
 
531
                        template_dict['epoch'] = unicode(long(time.time()))
 
532
                        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 
533
                        filename = self.params['outtmpl'] % template_dict
 
534
                        return filename
 
535
                except (ValueError, KeyError), err:
 
536
                        self.trouble(u'ERROR: invalid system charset or erroneous output template')
 
537
                        return None
 
538
 
 
539
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # The filename is computed even in simulate mode so that
                # --get-filename works.
                filename = self.prepare_filename(info_dict)
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcefilename', False) and filename is not None:
                                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                # prepare_filename() already reported the template problem.
                if filename is None:
                        return
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        # Local I/O failure: report the video as unavailable.
                        raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                return
 
587
 
 
588
        def download(self, url_list):
 
589
                """Download a given list of URLs."""
 
590
                if len(url_list) > 1 and self.fixed_template():
 
591
                        raise SameFileError(self.params['outtmpl'])
 
592
 
 
593
                for url in url_list:
 
594
                        suitable_found = False
 
595
                        for ie in self._ies:
 
596
                                # Go to next InfoExtractor if not suitable
 
597
                                if not ie.suitable(url):
 
598
                                        continue
 
599
 
 
600
                                # Suitable InfoExtractor found
 
601
                                suitable_found = True
 
602
 
 
603
                                # Extract information from URL and process it
 
604
                                ie.extract(url)
 
605
 
 
606
                                # Suitable InfoExtractor had been found; go to next URL
 
607
                                break
 
608
 
 
609
                        if not suitable_found:
 
610
                                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 
611
 
 
612
                return self._download_retcode
 
613
 
 
614
        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                # Work on a copy so the caller's info dict is not mutated.
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        # Each processor receives the dict produced by the
                        # previous one; returning None stops the chain.
                        info = pp.run(info)
                        if info is None:
                                break
 
622
 
 
623
        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an RTMP stream by shelling out to rtmpdump.

                Returns True on success, False otherwise.
                """
                self.report_destination(filename)
                tmpfilename = self.temp_name(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrupted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(tmpfilename)
                        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        # Retry with resume (-e); after exit code 1, also pass
                        # '-k 1' (skip keyframes).
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(tmpfilename)
                        if prevsize == cursize and retval == 1:
                                # No progress since the last attempt: give up.
                                break
                if retval == 0:
                        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
                        self.try_rename(tmpfilename, filename)
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False
 
654
 
 
655
        def _do_download(self, filename, url, player_url):
                """Download url to filename over HTTP (or rtmp via rtmpdump).

                Supports resuming a partial .part file, a configurable number of
                retries, and a progress display. Returns True on success, False
                on a reported failure; raises ContentTooShortError when the
                received byte count disagrees with the server's Content-length.
                """
                # Check file already present
                if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
                        self.report_file_already_downloaded(filename)
                        return True

                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url, player_url)

                tmpfilename = self.temp_name(filename)
                stream = None          # opened lazily, on the first received block
                open_mode = 'wb'

                # Do not include the Accept-Encoding header: this custom marker
                # header is presumably stripped by an opener installed elsewhere
                # in the file — TODO confirm against the opener setup.
                headers = {'Youtubedl-no-compression': 'True'}
                # basic_request is kept without the Range header so the full
                # length can be re-queried if a resume attempt fails with 416.
                basic_request = urllib2.Request(url, None, headers)
                request = urllib2.Request(url, None, headers)

                # Establish possible resume length
                if os.path.isfile(tmpfilename):
                        resume_len = os.path.getsize(tmpfilename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range','bytes=%d-' % resume_len)
                        open_mode = 'ab'

                count = 0
                retries = self.params.get('retries', 0)
                while count <= retries:
                        # Establish connection
                        try:
                                data = urllib2.urlopen(request)
                                break
                        except (urllib2.HTTPError, ), err:
                                if (err.code < 500 or err.code >= 600) and err.code != 416:
                                        # Unexpected HTTP error: only 5xx and 416 are retried/handled.
                                        raise
                                elif err.code == 416:
                                        # Unable to resume (requested range not satisfiable)
                                        try:
                                                # Open the connection again without the range header
                                                data = urllib2.urlopen(basic_request)
                                                content_length = data.info()['Content-Length']
                                        except (urllib2.HTTPError, ), err:
                                                if err.code < 500 or err.code >= 600:
                                                        raise
                                                # NOTE(review): a 5xx here falls through to the
                                                # retry counter below rather than re-raising.
                                        else:
                                                # Examine the reported length
                                                if (content_length is not None and
                                                    (resume_len - 100 < long(content_length) < resume_len + 100)):
                                                        # The file had already been fully downloaded.
                                                        # Explanation to the above condition: in issue #175 it was revealed that
                                                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                                                        # changing the file size slightly and causing problems for some users. So
                                                        # I decided to implement a suggested change and consider the file
                                                        # completely downloaded if the file size differs less than 100 bytes from
                                                        # the one in the hard drive.
                                                        self.report_file_already_downloaded(filename)
                                                        self.try_rename(tmpfilename, filename)
                                                        return True
                                                else:
                                                        # The length does not match, we start the download over
                                                        self.report_unable_to_resume()
                                                        open_mode = 'wb'
                                                        break
                        # Retry
                        count += 1
                        if count <= retries:
                                self.report_retry(count, retries)

                if count > retries:
                        self.trouble(u'ERROR: giving up after %s retries' % retries)
                        return False

                data_len = data.info().get('Content-length', None)
                if data_len is not None:
                        # Server reports the remaining length; add what we already have.
                        data_len = long(data_len) + resume_len
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0 + resume_len
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        if len(data_block) == 0:
                                break
                        byte_counter += len(data_block)

                        # Open file just in time, so that an empty response never
                        # creates (or truncates) the destination file.
                        if stream is None:
                                try:
                                        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                                        filename = self.undo_temp_name(tmpfilename)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        try:
                                stream.write(data_block)
                        except (IOError, OSError), err:
                                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                                return False
                        # Adapt the block size to the observed throughput.
                        block_size = self.best_block_size(after - before, len(data_block))

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
                        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter - resume_len)

                stream.close()
                self.report_finish()
                if data_len is not None and byte_counter != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                self.try_rename(tmpfilename, filename)

                # Update file modification time
                if self.params.get('updatetime', True):
                        self.try_utime(filename, data.info().get('last-modified', None))

                return True
 
786
 
 
787
class InfoExtractor(object):
        """Base class for all information extractors.

        An information extractor takes a URL and extracts information about
        the video (or videos) it refers to: the real video URL, the literal
        and simplified titles, the uploader, and so on. The result is a
        dictionary handed to the FileDownloader, which may then download the
        video to the file system, among other outcomes. Each dictionary must
        include these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        Optional fields, used only when their respective forced printing
        functions are called (e.g. when youtube-dl serves as the backend for
        a video search function such as the one in youtube2mp3):

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should redefine _real_initialize() and _real_extract(),
        as well as the suitable() static method, and are typically
        instantiated and registered with the main downloader.
        """

        # _ready tracks whether lazy initialization has already run;
        # _downloader is the FileDownloader this extractor reports to.
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Return True if this extractor can handle the given URL."""
                return False

        def set_downloader(self, downloader):
                """Attach the downloader this extractor should report to."""
                self._downloader = downloader

        def initialize(self):
                """Run one-time setup (authentication, etc.) at most once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then extract information for url."""
                self.initialize()
                return self._real_extract(url)

        def _real_initialize(self):
                """Actual initialization; subclasses override this."""
                pass

        def _real_extract(self, url):
                """Actual extraction; subclasses override this."""
                pass
 
857
 
 
858
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # Group 1 matches the site prefix (optional for bare video ids),
        # group 2 is the 11-character-style video id; the (?(1).+)? tail only
        # allows trailing junk when a prefix was matched.
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality (itag codes, best first)
        _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
        # Maps itag codes to file extensions; anything absent defaults to 'flv'.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '45': 'webm',
        }

        @staticmethod
        def suitable(url):
                """Return True if url looks like a YouTube video URL."""
                return (re.match(YoutubeIE._VALID_URL, url) is not None)

        def report_lang(self):
                """Report attempt to set language."""
                self._downloader.to_screen(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self._downloader.to_screen(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[youtube] Confirming age')

        def report_video_webpage_download(self, video_id):
                """Report attempt to download video webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_video_info_webpage_download(self, video_id):
                """Report attempt to download video info webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

        def report_unavailable_format(self, video_id, format):
                """Report that the requested format is not available."""
                self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

        def report_rtmp_download(self):
                """Indicate the download will use the RTMP protocol."""
                self._downloader.to_screen(u'[youtube] RTMP download detected')

        def _real_initialize(self):
                """Set the site language and, when credentials are available,
                log in and confirm age. All failures before login are reported
                as warnings and abort initialization without raising."""
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # The login form reappearing in the response means the
                        # credentials were rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                """Extract video information for a YouTube URL and hand each
                selected format to the downloader via process_info(). Errors
                are reported through self._downloader.trouble()."""
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group(2)

                # Get video webpage
                self.report_video_webpage_download(video_id)
                request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
                try:
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                        return

                # Attempt to extract SWF player URL (the URL appears
                # backslash-escaped inside the page's JS config).
                mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
                if mobj is not None:
                        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
                else:
                        player_url = None

                # Get video info: try several 'el' variants until one of them
                # yields a response containing a token.
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                           % (video_id, el_type))
                        request = urllib2.Request(video_info_url)
                        try:
                                video_info_webpage = urllib2.urlopen(request).read()
                                video_info = parse_qs(video_info_webpage)
                                if 'token' in video_info:
                                        break
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                                return
                if 'token' not in video_info:
                        if 'reason' in video_info:
                                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                        else:
                                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
                        return

                # Start extracting information
                self.report_information_extraction(video_id)

                # uploader
                if 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = urllib.unquote_plus(video_info['author'][0])

                # title
                if 'title' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return
                video_title = urllib.unquote_plus(video_info['title'][0])
                video_title = video_title.decode('utf-8')
                video_title = sanitize_title(video_title)

                # simplified title: collapse every run of non-alphanumeric
                # characters to a single underscore
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
                simple_title = simple_title.strip(ur'_')

                # thumbnail image
                if 'thumbnail_url' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
                        video_thumbnail = ''
                else:   # don't panic if we can't find it
                        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

                # upload date: scraped from the watch page and normalized to
                # YYYYMMDD when one of the known date formats matches
                upload_date = u'NA'
                mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
                if mobj is not None:
                        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
                        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
                        for expression in format_expressions:
                                try:
                                        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                                except:
                                        # NOTE(review): bare except; ideally narrowed to ValueError
                                        pass

                # description (only fetched when --get-description was given)
                video_description = 'No description available.'
                if self._downloader.params.get('forcedescription', False):
                        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                        if mobj is not None:
                                video_description = mobj.group(1)

                # token
                # NOTE(review): video_token is not used below in this method.
                video_token = urllib.unquote_plus(video_info['token'][0])

                # Decide which formats to download
                req_format = self._downloader.params.get('format', None)

                if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
                        # Comma-separated list of &-joined key=value descriptors,
                        # one per available format.
                        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
                        url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
                        url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
                        # --max-quality caps the candidate list at format_limit.
                        format_limit = self._downloader.params.get('format_limit', None)
                        if format_limit is not None and format_limit in self._available_formats:
                                format_list = self._available_formats[self._available_formats.index(format_limit):]
                        else:
                                format_list = self._available_formats
                        existing_formats = [x for x in format_list if x in url_map]
                        if len(existing_formats) == 0:
                                self._downloader.trouble(u'ERROR: no known formats available for video')
                                return
                        if req_format is None:
                                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
                        elif req_format == '-1':
                                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                        else:
                                # Specific format
                                if req_format not in url_map:
                                        self._downloader.trouble(u'ERROR: requested format not available')
                                        return
                                video_url_list = [(req_format, url_map[req_format])] # Specific format

                elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                        # RTMP stream: no format selection possible.
                        self.report_rtmp_download()
                        video_url_list = [(None, video_info['conn'][0])]

                else:
                        self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
                        return

                for format_param, video_real_url in video_url_list:
                        # At this point we have a new video
                        self._downloader.increment_downloads()

                        # Extension
                        video_extension = self._video_extensions.get(format_param, 'flv')

                        # Find the video URL in fmt_url_map or conn parameters
                        try:
                                # Process video information
                                self._downloader.process_info({
                                        'id':           video_id.decode('utf-8'),
                                        'url':          video_real_url.decode('utf-8'),
                                        'uploader':     video_uploader.decode('utf-8'),
                                        'upload_date':  upload_date,
                                        'title':        video_title,
                                        'stitle':       simple_title,
                                        'ext':          video_extension.decode('utf-8'),
                                        'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                                        'thumbnail':    video_thumbnail.decode('utf-8'),
                                        'description':  video_description.decode('utf-8'),
                                        'player_url':   player_url,
                                })
                        except UnavailableVideoError, err:
                                self._downloader.trouble(u'\nERROR: unable to download video')
 
1139
 
 
1140
 
 
1141
class MetacafeIE(InfoExtractor):
 
1142
        """Information Extractor for metacafe.com."""
 
1143
 
 
1144
        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 
1145
        _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 
1146
        _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 
1147
        _youtube_ie = None
 
1148
 
 
1149
        def __init__(self, youtube_ie, downloader=None):
 
1150
                InfoExtractor.__init__(self, downloader)
 
1151
                self._youtube_ie = youtube_ie
 
1152
 
 
1153
        @staticmethod
 
1154
        def suitable(url):
 
1155
                return (re.match(MetacafeIE._VALID_URL, url) is not None)
 
1156
 
 
1157
        def report_disclaimer(self):
 
1158
                """Report disclaimer retrieval."""
 
1159
                self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 
1160
 
 
1161
        def report_age_confirmation(self):
 
1162
                """Report attempt to confirm age."""
 
1163
                self._downloader.to_screen(u'[metacafe] Confirming age')
 
1164
 
 
1165
        def report_download_webpage(self, video_id):
 
1166
                """Report webpage download."""
 
1167
                self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 
1168
 
 
1169
        def report_extraction(self, video_id):
 
1170
                """Report information extraction."""
 
1171
                self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 
1172
 
 
1173
        def _real_initialize(self):
 
1174
                # Retrieve disclaimer
 
1175
                request = urllib2.Request(self._DISCLAIMER)
 
1176
                try:
 
1177
                        self.report_disclaimer()
 
1178
                        disclaimer = urllib2.urlopen(request).read()
 
1179
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1180
                        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 
1181
                        return
 
1182
 
 
1183
                # Confirm age
 
1184
                disclaimer_form = {
 
1185
                        'filters': '0',
 
1186
                        'submit': "Continue - I'm over 18",
 
1187
                        }
 
1188
                request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 
1189
                try:
 
1190
                        self.report_age_confirmation()
 
1191
                        disclaimer = urllib2.urlopen(request).read()
 
1192
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1193
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 
1194
                        return
 
1195
 
 
1196
        def _real_extract(self, url):
 
1197
                # Extract id and simplified title from URL
 
1198
                mobj = re.match(self._VALID_URL, url)
 
1199
                if mobj is None:
 
1200
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 
1201
                        return
 
1202
 
 
1203
                video_id = mobj.group(1)
 
1204
 
 
1205
                # Check if video comes from YouTube
 
1206
                mobj2 = re.match(r'^yt-(.*)$', video_id)
 
1207
                if mobj2 is not None:
 
1208
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 
1209
                        return
 
1210
 
 
1211
                # At this point we have a new video
 
1212
                self._downloader.increment_downloads()
 
1213
 
 
1214
                simple_title = mobj.group(2).decode('utf-8')
 
1215
 
 
1216
                # Retrieve video webpage to extract further information
 
1217
                request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 
1218
                try:
 
1219
                        self.report_download_webpage(video_id)
 
1220
                        webpage = urllib2.urlopen(request).read()
 
1221
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1222
                        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 
1223
                        return
 
1224
 
 
1225
                # Extract URL, uploader and title from webpage
 
1226
                self.report_extraction(video_id)
 
1227
                mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 
1228
                if mobj is not None:
 
1229
                        mediaURL = urllib.unquote(mobj.group(1))
 
1230
                        video_extension = mediaURL[-3:]
 
1231
 
 
1232
                        # Extract gdaKey if available
 
1233
                        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 
1234
                        if mobj is None:
 
1235
                                video_url = mediaURL
 
1236
                        else:
 
1237
                                gdaKey = mobj.group(1)
 
1238
                                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 
1239
                else:
 
1240
                        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 
1241
                        if mobj is None:
 
1242
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1243
                                return
 
1244
                        vardict = parse_qs(mobj.group(1))
 
1245
                        if 'mediaData' not in vardict:
 
1246
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1247
                                return
 
1248
                        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 
1249
                        if mobj is None:
 
1250
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1251
                                return
 
1252
                        mediaURL = mobj.group(1).replace('\\/', '/')
 
1253
                        video_extension = mediaURL[-3:]
 
1254
                        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 
1255
 
 
1256
                mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 
1257
                if mobj is None:
 
1258
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1259
                        return
 
1260
                video_title = mobj.group(1).decode('utf-8')
 
1261
                video_title = sanitize_title(video_title)
 
1262
 
 
1263
                mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 
1264
                if mobj is None:
 
1265
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 
1266
                        return
 
1267
                video_uploader = mobj.group(1)
 
1268
 
 
1269
                try:
 
1270
                        # Process video information
 
1271
                        self._downloader.process_info({
 
1272
                                'id':           video_id.decode('utf-8'),
 
1273
                                'url':          video_url.decode('utf-8'),
 
1274
                                'uploader':     video_uploader.decode('utf-8'),
 
1275
                                'upload_date':  u'NA',
 
1276
                                'title':        video_title,
 
1277
                                'stitle':       simple_title,
 
1278
                                'ext':          video_extension.decode('utf-8'),
 
1279
                                'format':       u'NA',
 
1280
                                'player_url':   None,
 
1281
                        })
 
1282
                except UnavailableVideoError:
 
1283
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1284
 
 
1285
 
 
1286
class DailymotionIE(InfoExtractor):
 
1287
        """Information Extractor for Dailymotion"""
 
1288
 
 
1289
        _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
 
1290
 
 
1291
        def __init__(self, downloader=None):
 
1292
                InfoExtractor.__init__(self, downloader)
 
1293
 
 
1294
        @staticmethod
 
1295
        def suitable(url):
 
1296
                return (re.match(DailymotionIE._VALID_URL, url) is not None)
 
1297
 
 
1298
        def report_download_webpage(self, video_id):
 
1299
                """Report webpage download."""
 
1300
                self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 
1301
 
 
1302
        def report_extraction(self, video_id):
 
1303
                """Report information extraction."""
 
1304
                self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 
1305
 
 
1306
        def _real_initialize(self):
 
1307
                return
 
1308
 
 
1309
        def _real_extract(self, url):
 
1310
                # Extract id and simplified title from URL
 
1311
                mobj = re.match(self._VALID_URL, url)
 
1312
                if mobj is None:
 
1313
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 
1314
                        return
 
1315
 
 
1316
                # At this point we have a new video
 
1317
                self._downloader.increment_downloads()
 
1318
                video_id = mobj.group(1)
 
1319
 
 
1320
                simple_title = mobj.group(2).decode('utf-8')
 
1321
                video_extension = 'flv'
 
1322
 
 
1323
                # Retrieve video webpage to extract further information
 
1324
                request = urllib2.Request(url)
 
1325
                try:
 
1326
                        self.report_download_webpage(video_id)
 
1327
                        webpage = urllib2.urlopen(request).read()
 
1328
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1329
                        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 
1330
                        return
 
1331
 
 
1332
                # Extract URL, uploader and title from webpage
 
1333
                self.report_extraction(video_id)
 
1334
                mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
 
1335
                if mobj is None:
 
1336
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1337
                        return
 
1338
                mediaURL = urllib.unquote(mobj.group(1))
 
1339
 
 
1340
                # if needed add http://www.dailymotion.com/ if relative URL
 
1341
 
 
1342
                video_url = mediaURL
 
1343
 
 
1344
                # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
 
1345
                mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
 
1346
                if mobj is None:
 
1347
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1348
                        return
 
1349
                video_title = mobj.group(1).decode('utf-8')
 
1350
                video_title = sanitize_title(video_title)
 
1351
 
 
1352
                mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
 
1353
                if mobj is None:
 
1354
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 
1355
                        return
 
1356
                video_uploader = mobj.group(1)
 
1357
 
 
1358
                try:
 
1359
                        # Process video information
 
1360
                        self._downloader.process_info({
 
1361
                                'id':           video_id.decode('utf-8'),
 
1362
                                'url':          video_url.decode('utf-8'),
 
1363
                                'uploader':     video_uploader.decode('utf-8'),
 
1364
                                'upload_date':  u'NA',
 
1365
                                'title':        video_title,
 
1366
                                'stitle':       simple_title,
 
1367
                                'ext':          video_extension.decode('utf-8'),
 
1368
                                'format':       u'NA',
 
1369
                                'player_url':   None,
 
1370
                        })
 
1371
                except UnavailableVideoError:
 
1372
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1373
 
 
1374
class GoogleIE(InfoExtractor):
 
1375
        """Information extractor for video.google.com."""
 
1376
 
 
1377
        _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 
1378
 
 
1379
        def __init__(self, downloader=None):
 
1380
                InfoExtractor.__init__(self, downloader)
 
1381
 
 
1382
        @staticmethod
 
1383
        def suitable(url):
 
1384
                return (re.match(GoogleIE._VALID_URL, url) is not None)
 
1385
 
 
1386
        def report_download_webpage(self, video_id):
 
1387
                """Report webpage download."""
 
1388
                self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 
1389
 
 
1390
        def report_extraction(self, video_id):
 
1391
                """Report information extraction."""
 
1392
                self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 
1393
 
 
1394
        def _real_initialize(self):
 
1395
                return
 
1396
 
 
1397
        def _real_extract(self, url):
 
1398
                # Extract id from URL
 
1399
                mobj = re.match(self._VALID_URL, url)
 
1400
                if mobj is None:
 
1401
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1402
                        return
 
1403
 
 
1404
                # At this point we have a new video
 
1405
                self._downloader.increment_downloads()
 
1406
                video_id = mobj.group(1)
 
1407
 
 
1408
                video_extension = 'mp4'
 
1409
 
 
1410
                # Retrieve video webpage to extract further information
 
1411
                request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 
1412
                try:
 
1413
                        self.report_download_webpage(video_id)
 
1414
                        webpage = urllib2.urlopen(request).read()
 
1415
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1416
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1417
                        return
 
1418
 
 
1419
                # Extract URL, uploader, and title from webpage
 
1420
                self.report_extraction(video_id)
 
1421
                mobj = re.search(r"download_url:'([^']+)'", webpage)
 
1422
                if mobj is None:
 
1423
                        video_extension = 'flv'
 
1424
                        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 
1425
                if mobj is None:
 
1426
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1427
                        return
 
1428
                mediaURL = urllib.unquote(mobj.group(1))
 
1429
                mediaURL = mediaURL.replace('\\x3d', '\x3d')
 
1430
                mediaURL = mediaURL.replace('\\x26', '\x26')
 
1431
 
 
1432
                video_url = mediaURL
 
1433
 
 
1434
                mobj = re.search(r'<title>(.*)</title>', webpage)
 
1435
                if mobj is None:
 
1436
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1437
                        return
 
1438
                video_title = mobj.group(1).decode('utf-8')
 
1439
                video_title = sanitize_title(video_title)
 
1440
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 
1441
 
 
1442
                # Extract video description
 
1443
                mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 
1444
                if mobj is None:
 
1445
                        self._downloader.trouble(u'ERROR: unable to extract video description')
 
1446
                        return
 
1447
                video_description = mobj.group(1).decode('utf-8')
 
1448
                if not video_description:
 
1449
                        video_description = 'No description available.'
 
1450
 
 
1451
                # Extract video thumbnail
 
1452
                if self._downloader.params.get('forcethumbnail', False):
 
1453
                        request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 
1454
                        try:
 
1455
                                webpage = urllib2.urlopen(request).read()
 
1456
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1457
                                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1458
                                return
 
1459
                        mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 
1460
                        if mobj is None:
 
1461
                                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 
1462
                                return
 
1463
                        video_thumbnail = mobj.group(1)
 
1464
                else:   # we need something to pass to process_info
 
1465
                        video_thumbnail = ''
 
1466
 
 
1467
 
 
1468
                try:
 
1469
                        # Process video information
 
1470
                        self._downloader.process_info({
 
1471
                                'id':           video_id.decode('utf-8'),
 
1472
                                'url':          video_url.decode('utf-8'),
 
1473
                                'uploader':     u'NA',
 
1474
                                'upload_date':  u'NA',
 
1475
                                'title':        video_title,
 
1476
                                'stitle':       simple_title,
 
1477
                                'ext':          video_extension.decode('utf-8'),
 
1478
                                'format':       u'NA',
 
1479
                                'player_url':   None,
 
1480
                        })
 
1481
                except UnavailableVideoError:
 
1482
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1483
 
 
1484
 
 
1485
class PhotobucketIE(InfoExtractor):
 
1486
        """Information extractor for photobucket.com."""
 
1487
 
 
1488
        _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 
1489
 
 
1490
        def __init__(self, downloader=None):
 
1491
                InfoExtractor.__init__(self, downloader)
 
1492
 
 
1493
        @staticmethod
 
1494
        def suitable(url):
 
1495
                return (re.match(PhotobucketIE._VALID_URL, url) is not None)
 
1496
 
 
1497
        def report_download_webpage(self, video_id):
 
1498
                """Report webpage download."""
 
1499
                self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 
1500
 
 
1501
        def report_extraction(self, video_id):
 
1502
                """Report information extraction."""
 
1503
                self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 
1504
 
 
1505
        def _real_initialize(self):
 
1506
                return
 
1507
 
 
1508
        def _real_extract(self, url):
 
1509
                # Extract id from URL
 
1510
                mobj = re.match(self._VALID_URL, url)
 
1511
                if mobj is None:
 
1512
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1513
                        return
 
1514
 
 
1515
                # At this point we have a new video
 
1516
                self._downloader.increment_downloads()
 
1517
                video_id = mobj.group(1)
 
1518
 
 
1519
                video_extension = 'flv'
 
1520
 
 
1521
                # Retrieve video webpage to extract further information
 
1522
                request = urllib2.Request(url)
 
1523
                try:
 
1524
                        self.report_download_webpage(video_id)
 
1525
                        webpage = urllib2.urlopen(request).read()
 
1526
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1527
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1528
                        return
 
1529
 
 
1530
                # Extract URL, uploader, and title from webpage
 
1531
                self.report_extraction(video_id)
 
1532
                mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 
1533
                if mobj is None:
 
1534
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1535
                        return
 
1536
                mediaURL = urllib.unquote(mobj.group(1))
 
1537
 
 
1538
                video_url = mediaURL
 
1539
 
 
1540
                mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 
1541
                if mobj is None:
 
1542
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1543
                        return
 
1544
                video_title = mobj.group(1).decode('utf-8')
 
1545
                video_title = sanitize_title(video_title)
 
1546
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 
1547
 
 
1548
                video_uploader = mobj.group(2).decode('utf-8')
 
1549
 
 
1550
                try:
 
1551
                        # Process video information
 
1552
                        self._downloader.process_info({
 
1553
                                'id':           video_id.decode('utf-8'),
 
1554
                                'url':          video_url.decode('utf-8'),
 
1555
                                'uploader':     video_uploader,
 
1556
                                'upload_date':  u'NA',
 
1557
                                'title':        video_title,
 
1558
                                'stitle':       simple_title,
 
1559
                                'ext':          video_extension.decode('utf-8'),
 
1560
                                'format':       u'NA',
 
1561
                                'player_url':   None,
 
1562
                        })
 
1563
                except UnavailableVideoError:
 
1564
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1565
 
 
1566
 
 
1567
class YahooIE(InfoExtractor):
 
1568
        """Information extractor for video.yahoo.com."""
 
1569
 
 
1570
        # _VALID_URL matches all Yahoo! Video URLs
 
1571
        # _VPAGE_URL matches only the extractable '/watch/' URLs
 
1572
        _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 
1573
        _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 
1574
 
 
1575
        def __init__(self, downloader=None):
 
1576
                InfoExtractor.__init__(self, downloader)
 
1577
 
 
1578
        @staticmethod
 
1579
        def suitable(url):
 
1580
                return (re.match(YahooIE._VALID_URL, url) is not None)
 
1581
 
 
1582
        def report_download_webpage(self, video_id):
 
1583
                """Report webpage download."""
 
1584
                self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 
1585
 
 
1586
        def report_extraction(self, video_id):
 
1587
                """Report information extraction."""
 
1588
                self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 
1589
 
 
1590
        def _real_initialize(self):
 
1591
                return
 
1592
 
 
1593
        def _real_extract(self, url, new_video=True):
 
1594
                # Extract ID from URL
 
1595
                mobj = re.match(self._VALID_URL, url)
 
1596
                if mobj is None:
 
1597
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1598
                        return
 
1599
 
 
1600
                # At this point we have a new video
 
1601
                self._downloader.increment_downloads()
 
1602
                video_id = mobj.group(2)
 
1603
                video_extension = 'flv'
 
1604
 
 
1605
                # Rewrite valid but non-extractable URLs as
 
1606
                # extractable English language /watch/ URLs
 
1607
                if re.match(self._VPAGE_URL, url) is None:
 
1608
                        request = urllib2.Request(url)
 
1609
                        try:
 
1610
                                webpage = urllib2.urlopen(request).read()
 
1611
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1612
                                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1613
                                return
 
1614
 
 
1615
                        mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 
1616
                        if mobj is None:
 
1617
                                self._downloader.trouble(u'ERROR: Unable to extract id field')
 
1618
                                return
 
1619
                        yahoo_id = mobj.group(1)
 
1620
 
 
1621
                        mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 
1622
                        if mobj is None:
 
1623
                                self._downloader.trouble(u'ERROR: Unable to extract vid field')
 
1624
                                return
 
1625
                        yahoo_vid = mobj.group(1)
 
1626
 
 
1627
                        url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 
1628
                        return self._real_extract(url, new_video=False)
 
1629
 
 
1630
                # Retrieve video webpage to extract further information
 
1631
                request = urllib2.Request(url)
 
1632
                try:
 
1633
                        self.report_download_webpage(video_id)
 
1634
                        webpage = urllib2.urlopen(request).read()
 
1635
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1636
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1637
                        return
 
1638
 
 
1639
                # Extract uploader and title from webpage
 
1640
                self.report_extraction(video_id)
 
1641
                mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 
1642
                if mobj is None:
 
1643
                        self._downloader.trouble(u'ERROR: unable to extract video title')
 
1644
                        return
 
1645
                video_title = mobj.group(1).decode('utf-8')
 
1646
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 
1647
 
 
1648
                mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 
1649
                if mobj is None:
 
1650
                        self._downloader.trouble(u'ERROR: unable to extract video uploader')
 
1651
                        return
 
1652
                video_uploader = mobj.group(1).decode('utf-8')
 
1653
 
 
1654
                # Extract video thumbnail
 
1655
                mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 
1656
                if mobj is None:
 
1657
                        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 
1658
                        return
 
1659
                video_thumbnail = mobj.group(1).decode('utf-8')
 
1660
 
 
1661
                # Extract video description
 
1662
                mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 
1663
                if mobj is None:
 
1664
                        self._downloader.trouble(u'ERROR: unable to extract video description')
 
1665
                        return
 
1666
                video_description = mobj.group(1).decode('utf-8')
 
1667
                if not video_description: video_description = 'No description available.'
 
1668
 
 
1669
                # Extract video height and width
 
1670
                mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 
1671
                if mobj is None:
 
1672
                        self._downloader.trouble(u'ERROR: unable to extract video height')
 
1673
                        return
 
1674
                yv_video_height = mobj.group(1)
 
1675
 
 
1676
                mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 
1677
                if mobj is None:
 
1678
                        self._downloader.trouble(u'ERROR: unable to extract video width')
 
1679
                        return
 
1680
                yv_video_width = mobj.group(1)
 
1681
 
 
1682
                # Retrieve video playlist to extract media URL
 
1683
                # I'm not completely sure what all these options are, but we
 
1684
                # seem to need most of them, otherwise the server sends a 401.
 
1685
                yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 
1686
                yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 
1687
                request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 
1688
                                          '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 
1689
                                          '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 
1690
                try:
 
1691
                        self.report_download_webpage(video_id)
 
1692
                        webpage = urllib2.urlopen(request).read()
 
1693
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1694
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1695
                        return
 
1696
 
 
1697
                # Extract media URL from playlist XML
 
1698
                mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 
1699
                if mobj is None:
 
1700
                        self._downloader.trouble(u'ERROR: Unable to extract media URL')
 
1701
                        return
 
1702
                video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 
1703
                video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
 
1704
 
 
1705
                try:
 
1706
                        # Process video information
 
1707
                        self._downloader.process_info({
 
1708
                                'id':           video_id.decode('utf-8'),
 
1709
                                'url':          video_url,
 
1710
                                'uploader':     video_uploader,
 
1711
                                'upload_date':  u'NA',
 
1712
                                'title':        video_title,
 
1713
                                'stitle':       simple_title,
 
1714
                                'ext':          video_extension.decode('utf-8'),
 
1715
                                'thumbnail':    video_thumbnail.decode('utf-8'),
 
1716
                                'description':  video_description,
 
1717
                                'thumbnail':    video_thumbnail,
 
1718
                                'description':  video_description,
 
1719
                                'player_url':   None,
 
1720
                        })
 
1721
                except UnavailableVideoError:
 
1722
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1723
 
 
1724
 
 
1725
class GenericIE(InfoExtractor):
 
1726
        """Generic last-resort information extractor."""
 
1727
 
 
1728
        def __init__(self, downloader=None):
 
1729
                InfoExtractor.__init__(self, downloader)
 
1730
 
 
1731
        @staticmethod
 
1732
        def suitable(url):
 
1733
                return True
 
1734
 
 
1735
        def report_download_webpage(self, video_id):
 
1736
                """Report webpage download."""
 
1737
                self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
 
1738
                self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
 
1739
 
 
1740
        def report_extraction(self, video_id):
 
1741
                """Report information extraction."""
 
1742
                self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
 
1743
 
 
1744
        def _real_initialize(self):
 
1745
                return
 
1746
 
 
1747
        def _real_extract(self, url):
 
1748
                # At this point we have a new video
 
1749
                self._downloader.increment_downloads()
 
1750
 
 
1751
                video_id = url.split('/')[-1]
 
1752
                request = urllib2.Request(url)
 
1753
                try:
 
1754
                        self.report_download_webpage(video_id)
 
1755
                        webpage = urllib2.urlopen(request).read()
 
1756
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1757
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1758
                        return
 
1759
                except ValueError, err:
 
1760
                        # since this is the last-resort InfoExtractor, if
 
1761
                        # this error is thrown, it'll be thrown here
 
1762
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1763
                        return
 
1764
 
 
1765
                self.report_extraction(video_id)
 
1766
                # Start with something easy: JW Player in SWFObject
 
1767
                mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
 
1768
                if mobj is None:
 
1769
                        # Broaden the search a little bit
 
1770
                        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
 
1771
                if mobj is None:
 
1772
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1773
                        return
 
1774
 
 
1775
                # It's possible that one of the regexes
 
1776
                # matched, but returned an empty group:
 
1777
                if mobj.group(1) is None:
 
1778
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1779
                        return
 
1780
 
 
1781
                video_url = urllib.unquote(mobj.group(1))
 
1782
                video_id  = os.path.basename(video_url)
 
1783
 
 
1784
                # here's a fun little line of code for you:
 
1785
                video_extension = os.path.splitext(video_id)[1][1:]
 
1786
                video_id        = os.path.splitext(video_id)[0]
 
1787
 
 
1788
                # it's tempting to parse this further, but you would
 
1789
                # have to take into account all the variations like
 
1790
                #   Video Title - Site Name
 
1791
                #   Site Name | Video Title
 
1792
                #   Video Title - Tagline | Site Name
 
1793
                # and so on and so forth; it's just not practical
 
1794
                mobj = re.search(r'<title>(.*)</title>', webpage)
 
1795
                if mobj is None:
 
1796
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1797
                        return
 
1798
                video_title = mobj.group(1).decode('utf-8')
 
1799
                video_title = sanitize_title(video_title)
 
1800
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 
1801
 
 
1802
                # video uploader is domain name
 
1803
                mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
 
1804
                if mobj is None:
 
1805
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1806
                        return
 
1807
                video_uploader = mobj.group(1).decode('utf-8')
 
1808
 
 
1809
                try:
 
1810
                        # Process video information
 
1811
                        self._downloader.process_info({
 
1812
                                'id':           video_id.decode('utf-8'),
 
1813
                                'url':          video_url.decode('utf-8'),
 
1814
                                'uploader':     video_uploader,
 
1815
                                'upload_date':  u'NA',
 
1816
                                'title':        video_title,
 
1817
                                'stitle':       simple_title,
 
1818
                                'ext':          video_extension.decode('utf-8'),
 
1819
                                'format':       u'NA',
 
1820
                                'player_url':   None,
 
1821
                        })
 
1822
                except UnavailableVideoError, err:
 
1823
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1824
 
 
1825
 
 
1826
class YoutubeSearchIE(InfoExtractor):
 
1827
        """Information Extractor for YouTube search queries."""
 
1828
        _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 
1829
        _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 
1830
        _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 
1831
        _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
 
1832
        _youtube_ie = None
 
1833
        _max_youtube_results = 1000
 
1834
 
 
1835
        def __init__(self, youtube_ie, downloader=None):
 
1836
                InfoExtractor.__init__(self, downloader)
 
1837
                self._youtube_ie = youtube_ie
 
1838
 
 
1839
        @staticmethod
 
1840
        def suitable(url):
 
1841
                return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 
1842
 
 
1843
        def report_download_page(self, query, pagenum):
 
1844
                """Report attempt to download playlist page with given number."""
 
1845
                query = query.decode(preferredencoding())
 
1846
                self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 
1847
 
 
1848
        def _real_initialize(self):
 
1849
                self._youtube_ie.initialize()
 
1850
 
 
1851
        def _real_extract(self, query):
 
1852
                mobj = re.match(self._VALID_QUERY, query)
 
1853
                if mobj is None:
 
1854
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 
1855
                        return
 
1856
 
 
1857
                prefix, query = query.split(':')
 
1858
                prefix = prefix[8:]
 
1859
                query  = query.encode('utf-8')
 
1860
                if prefix == '':
 
1861
                        self._download_n_results(query, 1)
 
1862
                        return
 
1863
                elif prefix == 'all':
 
1864
                        self._download_n_results(query, self._max_youtube_results)
 
1865
                        return
 
1866
                else:
 
1867
                        try:
 
1868
                                n = long(prefix)
 
1869
                                if n <= 0:
 
1870
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 
1871
                                        return
 
1872
                                elif n > self._max_youtube_results:
 
1873
                                        self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
 
1874
                                        n = self._max_youtube_results
 
1875
                                self._download_n_results(query, n)
 
1876
                                return
 
1877
                        except ValueError: # parsing prefix as integer fails
 
1878
                                self._download_n_results(query, 1)
 
1879
                                return
 
1880
 
 
1881
        def _download_n_results(self, query, n):
 
1882
                """Downloads a specified number of results for a query"""
 
1883
 
 
1884
                video_ids = []
 
1885
                already_seen = set()
 
1886
                pagenum = 1
 
1887
 
 
1888
                while True:
 
1889
                        self.report_download_page(query, pagenum)
 
1890
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 
1891
                        request = urllib2.Request(result_url)
 
1892
                        try:
 
1893
                                page = urllib2.urlopen(request).read()
 
1894
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1895
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 
1896
                                return
 
1897
 
 
1898
                        # Extract video identifiers
 
1899
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 
1900
                                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 
1901
                                if video_id not in already_seen:
 
1902
                                        video_ids.append(video_id)
 
1903
                                        already_seen.add(video_id)
 
1904
                                        if len(video_ids) == n:
 
1905
                                                # Specified n videos reached
 
1906
                                                for id in video_ids:
 
1907
                                                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 
1908
                                                return
 
1909
 
 
1910
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
 
1911
                                for id in video_ids:
 
1912
                                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 
1913
                                return
 
1914
 
 
1915
                        pagenum = pagenum + 1
 
1916
 
 
1917
class GoogleSearchIE(InfoExtractor):
 
1918
        """Information Extractor for Google Video search queries."""
 
1919
        _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
 
1920
        _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
 
1921
        _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
 
1922
        _MORE_PAGES_INDICATOR = r'<span>Next</span>'
 
1923
        _google_ie = None
 
1924
        _max_google_results = 1000
 
1925
 
 
1926
        def __init__(self, google_ie, downloader=None):
 
1927
                InfoExtractor.__init__(self, downloader)
 
1928
                self._google_ie = google_ie
 
1929
 
 
1930
        @staticmethod
 
1931
        def suitable(url):
 
1932
                return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
 
1933
 
 
1934
        def report_download_page(self, query, pagenum):
 
1935
                """Report attempt to download playlist page with given number."""
 
1936
                query = query.decode(preferredencoding())
 
1937
                self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
 
1938
 
 
1939
        def _real_initialize(self):
 
1940
                self._google_ie.initialize()
 
1941
 
 
1942
        def _real_extract(self, query):
 
1943
                mobj = re.match(self._VALID_QUERY, query)
 
1944
                if mobj is None:
 
1945
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 
1946
                        return
 
1947
 
 
1948
                prefix, query = query.split(':')
 
1949
                prefix = prefix[8:]
 
1950
                query  = query.encode('utf-8')
 
1951
                if prefix == '':
 
1952
                        self._download_n_results(query, 1)
 
1953
                        return
 
1954
                elif prefix == 'all':
 
1955
                        self._download_n_results(query, self._max_google_results)
 
1956
                        return
 
1957
                else:
 
1958
                        try:
 
1959
                                n = long(prefix)
 
1960
                                if n <= 0:
 
1961
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 
1962
                                        return
 
1963
                                elif n > self._max_google_results:
 
1964
                                        self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
 
1965
                                        n = self._max_google_results
 
1966
                                self._download_n_results(query, n)
 
1967
                                return
 
1968
                        except ValueError: # parsing prefix as integer fails
 
1969
                                self._download_n_results(query, 1)
 
1970
                                return
 
1971
 
 
1972
        def _download_n_results(self, query, n):
 
1973
                """Downloads a specified number of results for a query"""
 
1974
 
 
1975
                video_ids = []
 
1976
                already_seen = set()
 
1977
                pagenum = 1
 
1978
 
 
1979
                while True:
 
1980
                        self.report_download_page(query, pagenum)
 
1981
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 
1982
                        request = urllib2.Request(result_url)
 
1983
                        try:
 
1984
                                page = urllib2.urlopen(request).read()
 
1985
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1986
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 
1987
                                return
 
1988
 
 
1989
                        # Extract video identifiers
 
1990
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 
1991
                                video_id = mobj.group(1)
 
1992
                                if video_id not in already_seen:
 
1993
                                        video_ids.append(video_id)
 
1994
                                        already_seen.add(video_id)
 
1995
                                        if len(video_ids) == n:
 
1996
                                                # Specified n videos reached
 
1997
                                                for id in video_ids:
 
1998
                                                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
 
1999
                                                return
 
2000
 
 
2001
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
 
2002
                                for id in video_ids:
 
2003
                                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
 
2004
                                return
 
2005
 
 
2006
                        pagenum = pagenum + 1
 
2007
 
 
2008
class YahooSearchIE(InfoExtractor):
 
2009
        """Information Extractor for Yahoo! Video search queries."""
 
2010
        _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
 
2011
        _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
 
2012
        _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
 
2013
        _MORE_PAGES_INDICATOR = r'\s*Next'
 
2014
        _yahoo_ie = None
 
2015
        _max_yahoo_results = 1000
 
2016
 
 
2017
        def __init__(self, yahoo_ie, downloader=None):
 
2018
                InfoExtractor.__init__(self, downloader)
 
2019
                self._yahoo_ie = yahoo_ie
 
2020
 
 
2021
        @staticmethod
 
2022
        def suitable(url):
 
2023
                return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
 
2024
 
 
2025
        def report_download_page(self, query, pagenum):
 
2026
                """Report attempt to download playlist page with given number."""
 
2027
                query = query.decode(preferredencoding())
 
2028
                self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
 
2029
 
 
2030
        def _real_initialize(self):
 
2031
                self._yahoo_ie.initialize()
 
2032
 
 
2033
        def _real_extract(self, query):
 
2034
                mobj = re.match(self._VALID_QUERY, query)
 
2035
                if mobj is None:
 
2036
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 
2037
                        return
 
2038
 
 
2039
                prefix, query = query.split(':')
 
2040
                prefix = prefix[8:]
 
2041
                query  = query.encode('utf-8')
 
2042
                if prefix == '':
 
2043
                        self._download_n_results(query, 1)
 
2044
                        return
 
2045
                elif prefix == 'all':
 
2046
                        self._download_n_results(query, self._max_yahoo_results)
 
2047
                        return
 
2048
                else:
 
2049
                        try:
 
2050
                                n = long(prefix)
 
2051
                                if n <= 0:
 
2052
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 
2053
                                        return
 
2054
                                elif n > self._max_yahoo_results:
 
2055
                                        self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
 
2056
                                        n = self._max_yahoo_results
 
2057
                                self._download_n_results(query, n)
 
2058
                                return
 
2059
                        except ValueError: # parsing prefix as integer fails
 
2060
                                self._download_n_results(query, 1)
 
2061
                                return
 
2062
 
 
2063
        def _download_n_results(self, query, n):
 
2064
                """Downloads a specified number of results for a query"""
 
2065
 
 
2066
                video_ids = []
 
2067
                already_seen = set()
 
2068
                pagenum = 1
 
2069
 
 
2070
                while True:
 
2071
                        self.report_download_page(query, pagenum)
 
2072
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 
2073
                        request = urllib2.Request(result_url)
 
2074
                        try:
 
2075
                                page = urllib2.urlopen(request).read()
 
2076
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
2077
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 
2078
                                return
 
2079
 
 
2080
                        # Extract video identifiers
 
2081
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 
2082
                                video_id = mobj.group(1)
 
2083
                                if video_id not in already_seen:
 
2084
                                        video_ids.append(video_id)
 
2085
                                        already_seen.add(video_id)
 
2086
                                        if len(video_ids) == n:
 
2087
                                                # Specified n videos reached
 
2088
                                                for id in video_ids:
 
2089
                                                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
 
2090
                                                return
 
2091
 
 
2092
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
 
2093
                                for id in video_ids:
 
2094
                                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
 
2095
                                return
 
2096
 
 
2097
                        pagenum = pagenum + 1
 
2098
 
 
2099
class YoutubePlaylistIE(InfoExtractor):
 
2100
        """Information Extractor for YouTube playlists."""
 
2101
 
 
2102
        _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
 
2103
        _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
 
2104
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 
2105
        _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
 
2106
        _youtube_ie = None
 
2107
 
 
2108
        def __init__(self, youtube_ie, downloader=None):
 
2109
                InfoExtractor.__init__(self, downloader)
 
2110
                self._youtube_ie = youtube_ie
 
2111
 
 
2112
        @staticmethod
 
2113
        def suitable(url):
 
2114
                return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 
2115
 
 
2116
        def report_download_page(self, playlist_id, pagenum):
 
2117
                """Report attempt to download playlist page with given number."""
 
2118
                self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 
2119
 
 
2120
        def _real_initialize(self):
 
2121
                self._youtube_ie.initialize()
 
2122
 
 
2123
        def _real_extract(self, url):
 
2124
                # Extract playlist id
 
2125
                mobj = re.match(self._VALID_URL, url)
 
2126
                if mobj is None:
 
2127
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
 
2128
                        return
 
2129
 
 
2130
                # Single video case
 
2131
                if mobj.group(3) is not None:
 
2132
                        self._youtube_ie.extract(mobj.group(3))
 
2133
                        return
 
2134
 
 
2135
                # Download playlist pages
 
2136
                # prefix is 'p' as default for playlists but there are other types that need extra care
 
2137
                playlist_prefix = mobj.group(1)
 
2138
                if playlist_prefix == 'a':
 
2139
                        playlist_access = 'artist'
 
2140
                else:
 
2141
                        playlist_prefix = 'p'
 
2142
                        playlist_access = 'view_play_list'
 
2143
                playlist_id = mobj.group(2)
 
2144
                video_ids = []
 
2145
                pagenum = 1
 
2146
 
 
2147
                while True:
 
2148
                        self.report_download_page(playlist_id, pagenum)
 
2149
                        request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
 
2150
                        try:
 
2151
                                page = urllib2.urlopen(request).read()
 
2152
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
2153
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 
2154
                                return
 
2155
 
 
2156
                        # Extract video identifiers
 
2157
                        ids_in_page = []
 
2158
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 
2159
                                if mobj.group(1) not in ids_in_page:
 
2160
                                        ids_in_page.append(mobj.group(1))
 
2161
                        video_ids.extend(ids_in_page)
 
2162
 
 
2163
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
 
2164
                                break
 
2165
                        pagenum = pagenum + 1
 
2166
 
 
2167
                playliststart = self._downloader.params.get('playliststart', 1) - 1
 
2168
                playlistend = self._downloader.params.get('playlistend', -1)
 
2169
                video_ids = video_ids[playliststart:playlistend]
 
2170
 
 
2171
                for id in video_ids:
 
2172
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 
2173
                return
 
2174
 
 
2175
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Fetches the list of a user's uploaded videos through the YouTube
	GData API (paged, 50 ids per request) and delegates each video to
	the YouTube IE.
	"""

	_VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# GData caps each uploads-feed response at 50 entries.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Keep a reference to the YouTube IE that actual downloads go through."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL / ytuser: token."""
		return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
					   (username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all video ids of the user, then extract each one."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# playliststart is 1-based in params; convert to 0-based index.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		# -1 means "no end limit"; slicing with [start:-1] would drop
		# the last id, hence the explicit branch.
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
					   (username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
 
2264
 
 
2265
 
 
2266
class DepositFilesIE(InfoExtractor):
 
2267
        """Information extractor for depositfiles.com"""
 
2268
 
 
2269
        _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
 
2270
 
 
2271
        def __init__(self, downloader=None):
 
2272
                InfoExtractor.__init__(self, downloader)
 
2273
 
 
2274
        @staticmethod
 
2275
        def suitable(url):
 
2276
                return (re.match(DepositFilesIE._VALID_URL, url) is not None)
 
2277
 
 
2278
        def report_download_webpage(self, file_id):
 
2279
                """Report webpage download."""
 
2280
                self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
 
2281
 
 
2282
        def report_extraction(self, file_id):
 
2283
                """Report information extraction."""
 
2284
                self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
 
2285
 
 
2286
        def _real_initialize(self):
 
2287
                return
 
2288
 
 
2289
	def _real_extract(self, url):
		"""Fetch the DepositFiles page for *url*, locate the real download
		URL and title, and hand the file information to the downloader.
		Errors are reported through self._downloader.trouble() and the
		method returns None in every case."""
		# At this point we have a new file
		self._downloader.increment_downloads()

		# The file id is simply the last path component of the URL.
		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		# (POSTing gateway_result=1 simulates the button press).
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
		try:
			self.report_download_webpage(file_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
			return

		# Search for the real file URL
		# NOTE(review): the '.' after "fileshare" is an unescaped regex dot,
		# so this also matches e.g. "fileshareX" -- presumably harmless here.
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				# Collapse whitespace of the site's restriction notice.
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			else:
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
			return

		file_url = mobj.group(1)
		# Extension is taken from the download URL, without the leading dot.
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		file_title = mobj.group(1).decode('utf-8')

		try:
			# Process file information
			# (uploader/date/format are not available on DepositFiles pages,
			# hence the u'NA' placeholders.)
			self._downloader.process_info({
				'id':		file_id.decode('utf-8'),
				'url':		file_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	file_title,
				'stitle':	file_title,
				'ext':		file_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
 
2344
 
 
2345
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	# The video id is captured in the named group "ID".
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	# Mobile login endpoint used by _real_initialize().
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'facebook'
	# Quality labels in preference order, best first.
	_available_formats = ['highqual', 'lowqual']
	# File extension for each quality label.
	_video_extensions = {
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True when *url* matches the Facebook video URL pattern."""
		return (re.match(FacebookIE._VALID_URL, url) is not None)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# General data: map of info-dict key -> regex with one capture group.
		data = {'title': r'class="video_title datawrap">(.*?)</',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'upload_date': r'data-date="(.*?)"',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				# Values are JS-escaped and URL-quoted in the page source.
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls: one entry per quality label found in the page.
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info

	def _real_initialize(self):
		"""Log in to Facebook using command-line or .netrc credentials.

		Silently does nothing when no downloader or no credentials are
		available; login failures are reported as warnings, not errors."""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		if useremail is None:
			# Anonymous operation; many videos are accessible without login.
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains a login form, authentication failed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Download the Facebook video page, extract the metadata and the
		format URLs, and pass one info dict per selected format to the
		downloader."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: keep only ASCII letters/digits, collapse the rest to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image (missing thumbnail is only a warning)
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date
		upload_date = u'NA'
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					# parsedate_tz returns a 10-tuple; strftime wants 9 fields.
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except:
					# Unparseable date: keep the u'NA' placeholder.
					pass

		# description (only extracted when explicitly requested)
		video_description = 'No description available.'
		if (self._downloader.params.get('forcedescription', False) and
		    'description' in video_info):
			video_description = video_info['description']

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		# NOTE(review): if url_map is empty, video_url_list is never assigned
		# and the loop below raises NameError -- confirm whether an empty map
		# can reach this point, or guard with an explicit error return.
		for format_param, video_real_url in video_url_list:

			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'mp4')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	None,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
 
2567
 
 
2568
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader with its add_post_processor()
	method.  After each successful download the downloader calls run() on
	every registered PostProcessor in turn, feeding the dictionary returned
	by one processor to the next.  The chain stops when a processor returns
	None or when the end of the chain is reached; run() may also raise
	PostProcessingError, which the calling downloader takes into account.

	The registration scheme mirrors the "mutual registration" used by
	InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		*information* is an InfoExtractor-style dictionary with one extra
		key, "filepath", naming the downloaded file.  The return value is
		passed to the next processor in the chain (it may be the received
		dictionary, possibly with fields changed); returning None stops
		the postprocessing chain.  PostProcessingError may be raised to
		report a failure to the downloader.
		"""
		# Base class behaviour: pass the information through untouched.
		return information
 
2613
 
 
2614
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that extracts the audio track of a downloaded video.

	Relies on the external "ffmpeg" and "ffprobe" programs.  preferredcodec
	may be 'best' (keep aac/mp3 streams untouched when possible, transcode
	to mp3 otherwise), 'aac' or 'mp3'.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of *path* as reported by ffprobe.

		Returns None when ffprobe is missing, exits non-zero, or reports
		no audio stream.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe not installed or not executable
			return None
		audio_codec = None
		for line in output.split('\n'):
			# Within a stream block, codec_name= precedes codec_type=.
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode *path* into *out_path* using *codec*; True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			# ffmpeg not installed or not executable
			return False

	def run(self, information):
		"""Convert information['filepath'] to an audio file and update it.

		Returns the updated information dictionary on success, or None on
		any failure (which stops the post-processing chain).  The original
		video file is removed after a successful conversion.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible: copy the stream instead of transcoding.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable standalone.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# A specific codec was requested: convert the audio (lossy).
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension

		# Bug fix: with a 'copy' codec the target extension can equal the
		# source extension; ffmpeg would then overwrite its own input and the
		# os.remove() below would delete the only copy of the audio.  Skip
		# the conversion entirely in that case.
		if new_path == path:
			self._downloader.to_screen(u'[ffmpeg] File is already in the target audio format, skipping conversion')
			return information

		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
 
2695
 
 
2696
### MAIN PROGRAM ###
 
2697
if __name__ == '__main__':
 
2698
        try:
 
2699
                # Modules needed only when running the main program
 
2700
                import getpass
 
2701
                import optparse
 
2702
 
 
2703
                # Function to update the program file with the latest version from the repository.
 
2704
                def update_self(downloader, filename):
 
2705
                        # Note: downloader only used for options
 
2706
                        if not os.access(filename, os.W_OK):
 
2707
                                sys.exit('ERROR: no write permissions on %s' % filename)
 
2708
 
 
2709
                        downloader.to_screen('Updating to latest stable version...')
 
2710
                        try:
 
2711
                                latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
 
2712
                                latest_version = urllib.urlopen(latest_url).read().strip()
 
2713
                                prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
 
2714
                                newcontent = urllib.urlopen(prog_url).read()
 
2715
                        except (IOError, OSError), err:
 
2716
                                sys.exit('ERROR: unable to download latest version')
 
2717
                        try:
 
2718
                                stream = open(filename, 'w')
 
2719
                                stream.write(newcontent)
 
2720
                                stream.close()
 
2721
                        except (IOError, OSError), err:
 
2722
                                sys.exit('ERROR: unable to overwrite current version')
 
2723
                        downloader.to_screen('Updated to version %s' % latest_version)
 
2724
 
 
2725
                # Parse command line
 
2726
                parser = optparse.OptionParser(
 
2727
                        usage='Usage: %prog [options] url...',
 
2728
                        version='2011.08.04',
 
2729
                        conflict_handler='resolve',
 
2730
                )
 
2731
 
 
2732
                parser.add_option('-h', '--help',
 
2733
                                action='help', help='print this help text and exit')
 
2734
                parser.add_option('-v', '--version',
 
2735
                                action='version', help='print program version and exit')
 
2736
                parser.add_option('-U', '--update',
 
2737
                                action='store_true', dest='update_self', help='update this program to latest stable version')
 
2738
                parser.add_option('-i', '--ignore-errors',
 
2739
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
 
2740
                parser.add_option('-r', '--rate-limit',
 
2741
                                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
 
2742
                parser.add_option('-R', '--retries',
 
2743
                                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
 
2744
                parser.add_option('--playlist-start',
 
2745
                                dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
 
2746
                parser.add_option('--playlist-end',
 
2747
                                dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
 
2748
                parser.add_option('--dump-user-agent',
 
2749
                                action='store_true', dest='dump_user_agent',
 
2750
                                help='display the current browser identification', default=False)
 
2751
 
 
2752
                authentication = optparse.OptionGroup(parser, 'Authentication Options')
 
2753
                authentication.add_option('-u', '--username',
 
2754
                                dest='username', metavar='USERNAME', help='account username')
 
2755
                authentication.add_option('-p', '--password',
 
2756
                                dest='password', metavar='PASSWORD', help='account password')
 
2757
                authentication.add_option('-n', '--netrc',
 
2758
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 
2759
                parser.add_option_group(authentication)
 
2760
 
 
2761
                video_format = optparse.OptionGroup(parser, 'Video Format Options')
 
2762
                video_format.add_option('-f', '--format',
 
2763
                                action='store', dest='format', metavar='FORMAT', help='video format code')
 
2764
                video_format.add_option('--all-formats',
 
2765
                                action='store_const', dest='format', help='download all available video formats', const='-1')
 
2766
                video_format.add_option('--max-quality',
 
2767
                                action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
 
2768
                parser.add_option_group(video_format)
 
2769
 
 
2770
                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
 
2771
                verbosity.add_option('-q', '--quiet',
 
2772
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
 
2773
                verbosity.add_option('-s', '--simulate',
 
2774
                                action='store_true', dest='simulate', help='do not download video', default=False)
 
2775
                verbosity.add_option('-g', '--get-url',
 
2776
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 
2777
                verbosity.add_option('-e', '--get-title',
 
2778
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 
2779
                verbosity.add_option('--get-thumbnail',
 
2780
                                action='store_true', dest='getthumbnail',
 
2781
                                help='simulate, quiet but print thumbnail URL', default=False)
 
2782
                verbosity.add_option('--get-description',
 
2783
                                action='store_true', dest='getdescription',
 
2784
                                help='simulate, quiet but print video description', default=False)
 
2785
                verbosity.add_option('--get-filename',
 
2786
                                action='store_true', dest='getfilename',
 
2787
                                help='simulate, quiet but print output filename', default=False)
 
2788
                verbosity.add_option('--no-progress',
 
2789
                                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
 
2790
                verbosity.add_option('--console-title',
 
2791
                                action='store_true', dest='consoletitle',
 
2792
                                help='display progress in console titlebar', default=False)
 
2793
                parser.add_option_group(verbosity)
 
2794
 
 
2795
                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
 
2796
                filesystem.add_option('-t', '--title',
 
2797
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
 
2798
                filesystem.add_option('-l', '--literal',
 
2799
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 
2800
                filesystem.add_option('-A', '--auto-number',
 
2801
                                action='store_true', dest='autonumber',
 
2802
                                help='number downloaded files starting from 00000', default=False)
 
2803
                filesystem.add_option('-o', '--output',
 
2804
                                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
 
2805
                filesystem.add_option('-a', '--batch-file',
 
2806
                                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
 
2807
                filesystem.add_option('-w', '--no-overwrites',
 
2808
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
 
2809
                filesystem.add_option('-c', '--continue',
 
2810
                                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
 
2811
                filesystem.add_option('--cookies',
 
2812
                                dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
 
2813
                filesystem.add_option('--no-part',
 
2814
                                action='store_true', dest='nopart', help='do not use .part files', default=False)
 
2815
                filesystem.add_option('--no-mtime',
 
2816
                                action='store_false', dest='updatetime',
 
2817
                                help='do not use the Last-modified header to set the file modification time', default=True)
 
2818
                parser.add_option_group(filesystem)
 
2819
 
 
2820
                postproc = optparse.OptionGroup(parser, 'Post-processing Options')
 
2821
                postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
 
2822
                                help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
 
2823
                postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
 
2824
                                help='"best", "aac" or "mp3"; best by default')
 
2825
                parser.add_option_group(postproc)
 
2826
 
 
2827
                (opts, args) = parser.parse_args()
 
2828
 
 
2829
                # Open appropriate CookieJar
 
2830
                if opts.cookiefile is None:
 
2831
                        jar = cookielib.CookieJar()
 
2832
                else:
 
2833
                        try:
 
2834
                                jar = cookielib.MozillaCookieJar(opts.cookiefile)
 
2835
                                if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
 
2836
                                        jar.load()
 
2837
                        except (IOError, OSError), err:
 
2838
                                sys.exit(u'ERROR: unable to open cookie file')
 
2839
 
 
2840
                # Dump user agent
 
2841
                if opts.dump_user_agent:
 
2842
                        print std_headers['User-Agent']
 
2843
                        sys.exit(0)
 
2844
 
 
2845
                # General configuration
 
2846
                cookie_processor = urllib2.HTTPCookieProcessor(jar)
 
2847
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
 
2848
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 
2849
 
 
2850
                # Batch file verification
 
2851
                batchurls = []
 
2852
                if opts.batchfile is not None:
 
2853
                        try:
 
2854
                                if opts.batchfile == '-':
 
2855
                                        batchfd = sys.stdin
 
2856
                                else:
 
2857
                                        batchfd = open(opts.batchfile, 'r')
 
2858
                                batchurls = batchfd.readlines()
 
2859
                                batchurls = [x.strip() for x in batchurls]
 
2860
                                batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
 
2861
                        except IOError:
 
2862
                                sys.exit(u'ERROR: batch file could not be read')
 
2863
                all_urls = batchurls + args
 
2864
 
 
2865
                # Conflicting, missing and erroneous options
 
2866
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
 
2867
                        parser.error(u'using .netrc conflicts with giving username/password')
 
2868
                if opts.password is not None and opts.username is None:
 
2869
                        parser.error(u'account username missing')
 
2870
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
 
2871
                        parser.error(u'using output template conflicts with using title, literal title or auto number')
 
2872
                if opts.usetitle and opts.useliteral:
 
2873
                        parser.error(u'using title conflicts with using literal title')
 
2874
                if opts.username is not None and opts.password is None:
 
2875
                        opts.password = getpass.getpass(u'Type account password and press return:')
 
2876
                if opts.ratelimit is not None:
 
2877
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
 
2878
                        if numeric_limit is None:
 
2879
                                parser.error(u'invalid rate limit specified')
 
2880
                        opts.ratelimit = numeric_limit
 
2881
                if opts.retries is not None:
 
2882
                        try:
 
2883
                                opts.retries = long(opts.retries)
 
2884
                        except (TypeError, ValueError), err:
 
2885
                                parser.error(u'invalid retry count specified')
 
2886
                try:
 
2887
                        opts.playliststart = long(opts.playliststart)
 
2888
                        if opts.playliststart <= 0:
 
2889
                                raise ValueError
 
2890
                except (TypeError, ValueError), err:
 
2891
                        parser.error(u'invalid playlist start number specified')
 
2892
                try:
 
2893
                        opts.playlistend = long(opts.playlistend)
 
2894
                        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
 
2895
                                raise ValueError
 
2896
                except (TypeError, ValueError), err:
 
2897
                        parser.error(u'invalid playlist end number specified')
 
2898
                if opts.extractaudio:
 
2899
                        if opts.audioformat not in ['best', 'aac', 'mp3']:
 
2900
                                parser.error(u'invalid audio format specified')
 
2901
 
 
2902
                # Information extractors
 
2903
                youtube_ie = YoutubeIE()
 
2904
                metacafe_ie = MetacafeIE(youtube_ie)
 
2905
                dailymotion_ie = DailymotionIE()
 
2906
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
 
2907
                youtube_user_ie = YoutubeUserIE(youtube_ie)
 
2908
                youtube_search_ie = YoutubeSearchIE(youtube_ie)
 
2909
                google_ie = GoogleIE()
 
2910
                google_search_ie = GoogleSearchIE(google_ie)
 
2911
                photobucket_ie = PhotobucketIE()
 
2912
                yahoo_ie = YahooIE()
 
2913
                yahoo_search_ie = YahooSearchIE(yahoo_ie)
 
2914
                deposit_files_ie = DepositFilesIE()
 
2915
                facebook_ie = FacebookIE()
 
2916
                generic_ie = GenericIE()
 
2917
 
 
2918
                # File downloader
 
2919
                fd = FileDownloader({
 
2920
                        'usenetrc': opts.usenetrc,
 
2921
                        'username': opts.username,
 
2922
                        'password': opts.password,
 
2923
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
 
2924
                        'forceurl': opts.geturl,
 
2925
                        'forcetitle': opts.gettitle,
 
2926
                        'forcethumbnail': opts.getthumbnail,
 
2927
                        'forcedescription': opts.getdescription,
 
2928
                        'forcefilename': opts.getfilename,
 
2929
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
 
2930
                        'format': opts.format,
 
2931
                        'format_limit': opts.format_limit,
 
2932
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
 
2933
                                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
 
2934
                                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
 
2935
                                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
 
2936
                                or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
 
2937
                                or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
 
2938
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
 
2939
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
 
2940
                                or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
 
2941
                                or u'%(id)s.%(ext)s'),
 
2942
                        'ignoreerrors': opts.ignoreerrors,
 
2943
                        'ratelimit': opts.ratelimit,
 
2944
                        'nooverwrites': opts.nooverwrites,
 
2945
                        'retries': opts.retries,
 
2946
                        'continuedl': opts.continue_dl,
 
2947
                        'noprogress': opts.noprogress,
 
2948
                        'playliststart': opts.playliststart,
 
2949
                        'playlistend': opts.playlistend,
 
2950
                        'logtostderr': opts.outtmpl == '-',
 
2951
                        'consoletitle': opts.consoletitle,
 
2952
                        'nopart': opts.nopart,
 
2953
                        'updatetime': opts.updatetime,
 
2954
                        })
 
2955
                fd.add_info_extractor(youtube_search_ie)
 
2956
                fd.add_info_extractor(youtube_pl_ie)
 
2957
                fd.add_info_extractor(youtube_user_ie)
 
2958
                fd.add_info_extractor(metacafe_ie)
 
2959
                fd.add_info_extractor(dailymotion_ie)
 
2960
                fd.add_info_extractor(youtube_ie)
 
2961
                fd.add_info_extractor(google_ie)
 
2962
                fd.add_info_extractor(google_search_ie)
 
2963
                fd.add_info_extractor(photobucket_ie)
 
2964
                fd.add_info_extractor(yahoo_ie)
 
2965
                fd.add_info_extractor(yahoo_search_ie)
 
2966
                fd.add_info_extractor(deposit_files_ie)
 
2967
                fd.add_info_extractor(facebook_ie)
 
2968
 
 
2969
                # This must come last since it's the
 
2970
                # fallback if none of the others work
 
2971
                fd.add_info_extractor(generic_ie)
 
2972
 
 
2973
                # PostProcessors
 
2974
                if opts.extractaudio:
 
2975
                        fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
 
2976
 
 
2977
                # Update version
 
2978
                if opts.update_self:
 
2979
                        update_self(fd, sys.argv[0])
 
2980
 
 
2981
                # Maybe do nothing
 
2982
                if len(all_urls) < 1:
 
2983
                        if not opts.update_self:
 
2984
                                parser.error(u'you must provide at least one URL')
 
2985
                        else:
 
2986
                                sys.exit()
 
2987
                retcode = fd.download(all_urls)
 
2988
 
 
2989
                # Dump cookie jar if requested
 
2990
                if opts.cookiefile is not None:
 
2991
                        try:
 
2992
                                jar.save()
 
2993
                        except (IOError, OSError), err:
 
2994
                                sys.exit(u'ERROR: unable to save cookie jar')
 
2995
 
 
2996
                sys.exit(retcode)
 
2997
 
 
2998
        except DownloadError:
 
2999
                sys.exit(1)
 
3000
        except SameFileError:
 
3001
                sys.exit(u'ERROR: fixed output name but more than one file to download')
 
3002
        except KeyboardInterrupt:
 
3003
                sys.exit(u'\nERROR: Interrupted by user')