~cairo-dock-team/cairo-dock-plug-ins-extras/git

« back to all changes in this revision

Viewing changes to YoutubeDl/youtubedl.py

  • Committer: fabounet03
  • Date: 2020-02-28 14:14:49 UTC
  • Revision ID: git-v1:75ca8cfb7bb3798de84d213f862589ff7765bd0b
Replaced YoutubeDl applet with a more generic and enhanced applet: VideoDownloader

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
#!/usr/bin/env python
2
 
# -*- coding: utf-8 -*-
3
 
 
4
 
__authors__  = (
5
 
        'Ricardo Garcia Gonzalez',
6
 
        'Danny Colligan',
7
 
        'Benjamin Johnson',
8
 
        'Vasyl\' Vavrychuk',
9
 
        'Witold Baryluk',
10
 
        'Paweł Paprota',
11
 
        'Gergely Imreh',
12
 
        'Rogério Brito',
13
 
        'Philipp Hagemeister',
14
 
        'Sören Schulze',
15
 
        'Kevin Ngo',
16
 
        'Ori Avtalion',
17
 
        'shizeeg',
18
 
        )
19
 
 
20
 
__license__ = 'Public Domain'
21
 
__version__ = '2012.02.27'
22
 
 
23
 
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
 
 
25
 
 
26
 
import cookielib
27
 
import datetime
28
 
import getpass
29
 
import gzip
30
 
import htmlentitydefs
31
 
import HTMLParser
32
 
import httplib
33
 
import locale
34
 
import math
35
 
import netrc
36
 
import optparse
37
 
import os
38
 
import os.path
39
 
import re
40
 
import shlex
41
 
import socket
42
 
import string
43
 
import subprocess
44
 
import sys
45
 
import time
46
 
import urllib
47
 
import urllib2
48
 
import warnings
49
 
import zlib
50
 
 
51
 
if os.name == 'nt':
52
 
        import ctypes
53
 
 
54
 
try:
55
 
        import email.utils
56
 
except ImportError: # Python 2.4
57
 
        import email.Utils
58
 
try:
59
 
        import cStringIO as StringIO
60
 
except ImportError:
61
 
        import StringIO
62
 
 
63
 
# parse_qs was moved from the cgi module to the urlparse module recently.
64
 
try:
65
 
        from urlparse import parse_qs
66
 
except ImportError:
67
 
        from cgi import parse_qs
68
 
 
69
 
try:
70
 
        import lxml.etree
71
 
except ImportError:
72
 
        pass # Handled below
73
 
 
74
 
try:
75
 
        import xml.etree.ElementTree
76
 
except ImportError: # Python<2.5: Not officially supported, but let it slip
77
 
        warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
 
 
79
 
std_headers = {
80
 
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81
 
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82
 
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83
 
        'Accept-Encoding': 'gzip, deflate',
84
 
        'Accept-Language': 'en-us,en;q=0.5',
85
 
}
86
 
 
87
 
try:
88
 
        import json
89
 
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90
 
        import re
91
 
        class json(object):
                # Minimal pure-Python JSON decoder (trivialjson) used only on
                # Python < 2.6 where the stdlib json module is unavailable.
                # Only loads() is provided; there is no dumps().
                @staticmethod
                def loads(s):
                        """Decode the JSON document in byte string *s* and return the
                        corresponding Python object. Raises ValueError on malformed input."""
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Uniform error reporting: position plus the unconsumed tail.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance i past JSON whitespace; optionally fail at end-of-input.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape (match.group(1) excludes the backslash).
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                # Plain \uXXXX escape.
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # UTF-16 surrogate pair \uD8xx\uDCxx combined into one code point.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote; returns (index after closing quote, text).
                                i += 1
                                e = i
                                while True:
                                        e = s.index('"', e)
                                        # A quote preceded by an odd number of backslashes is escaped.
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Surrogate pairs first, then single \uXXXX, then one-char escapes.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at '{'; returns (next index, dict).
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at '['; returns (next index, list).
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The three keyword literals of JSON.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                # JSON number grammar; a fraction or exponent forces float.
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; anything else is a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
199
 
 
200
 
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the reported
	codec is missing or unusable, fall back to UTF-8.
	"""
	# The original version wrapped this in a generator and called .next() on
	# it, which added nothing; it also used a bare except. Catch Exception
	# instead so SystemExit/KeyboardInterrupt are not swallowed.
	try:
		pref = locale.getpreferredencoding()
		# Verify the codec exists and can actually encode text.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
215
 
 
216
 
 
217
 
def htmlentity_transform(matchobj):
218
 
        """Transforms an HTML entity to a Unicode character.
219
 
 
220
 
        This function receives a match object and is intended to be used with
221
 
        the re.sub() function.
222
 
        """
223
 
        entity = matchobj.group(1)
224
 
 
225
 
        # Known non-numeric HTML entity
226
 
        if entity in htmlentitydefs.name2codepoint:
227
 
                return unichr(htmlentitydefs.name2codepoint[entity])
228
 
 
229
 
        # Unicode character
230
 
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
231
 
        if mobj is not None:
232
 
                numstr = mobj.group(1)
233
 
                if numstr.startswith(u'x'):
234
 
                        base = 16
235
 
                        numstr = u'0%s' % numstr
236
 
                else:
237
 
                        base = 10
238
 
                return unichr(long(numstr, base))
239
 
 
240
 
        # Unknown entity in name, return its literal representation
241
 
        return (u'&%s;' % entity)
242
 
 
243
 
 
244
 
def sanitize_title(utitle):
245
 
        """Sanitizes a video title so it could be used as part of a filename."""
246
 
        utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247
 
        return utitle.replace(unicode(os.sep), u'%')
248
 
 
249
 
 
250
 
def sanitize_open(filename, open_mode):
251
 
        """Try to open the given filename, and slightly tweak it if this fails.
252
 
 
253
 
        Attempts to open the given filename. If this fails, it tries to change
254
 
        the filename slightly, step by step, until it's either able to open it
255
 
        or it fails and raises a final exception, like the standard open()
256
 
        function.
257
 
 
258
 
        It returns the tuple (stream, definitive_file_name).
259
 
        """
260
 
        try:
261
 
                if filename == u'-':
262
 
                        if sys.platform == 'win32':
263
 
                                import msvcrt
264
 
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265
 
                        return (sys.stdout, filename)
266
 
                stream = open(_encodeFilename(filename), open_mode)
267
 
                return (stream, filename)
268
 
        except (IOError, OSError), err:
269
 
                # In case of error, try to remove win32 forbidden chars
270
 
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
271
 
 
272
 
                # An exception here should be caught in the caller
273
 
                stream = open(_encodeFilename(filename), open_mode)
274
 
                return (stream, filename)
275
 
 
276
 
 
277
 
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# parsedate_tz returns None for unparseable input; propagate that.
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
284
 
 
285
 
def _simplify_title(title):
286
 
        expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287
 
        return expr.sub(u'_', title).strip(u'_')
288
 
 
289
 
def _orderedSet(iterable):
290
 
        """ Remove all duplicates from the input iterable """
291
 
        res = []
292
 
        for el in iterable:
293
 
                if el not in res:
294
 
                        res.append(el)
295
 
        return res
296
 
 
297
 
def _unescapeHTML(s):
	"""
	@param s a string (of type unicode)
	"""
	assert type(s) == type(u'')
	# Delegate entity replacement to the stdlib HTML parser.
	return HTMLParser.HTMLParser().unescape(s)
305
 
 
306
 
def _encodeFilename(s):
307
 
        """
308
 
        @param s The name of the file (of type unicode)
309
 
        """
310
 
 
311
 
        assert type(s) == type(u'')
312
 
 
313
 
        if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314
 
                # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315
 
                # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316
 
                # match Windows 9x series as well. Besides, NT 4 is obsolete.)
317
 
                return s
318
 
        else:
319
 
                return s.encode(sys.getfilesystemencoding(), 'ignore')
320
 
 
321
 
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
329
 
 
330
 
 
331
 
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
338
 
 
339
 
 
340
 
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to indicate an error in the
	postprocessing task.
	"""
347
 
 
348
 
class MaxDownloadsReached(Exception):
	"""Raised once the --max-downloads limit has been reached."""
351
 
 
352
 
 
353
 
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that is not available
	for that video.
	"""
360
 
 
361
 
 
362
 
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file is smaller
	than the size the server announced first, indicating the connection
	was probably interrupted.
	"""

	# Both counters are in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded, self.expected = downloaded, expected
376
 
 
377
 
 
378
 
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Try raw deflate first; some servers send zlib-wrapped data
                # instead, which the second call handles.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Older urllib2.addinfourl has no 'code' constructor argument;
                # feature-detect via getcode and set the attribute manually.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force our std_headers onto the request, overriding any
                # same-named headers already present.
                # NOTE: the lowercased spellings below match how urllib2
                # capitalizes header names internally via add_header().
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the response body, re-wrapping it so
                # callers see an ordinary response object with the same
                # status/headers/url.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
435
 
 
436
 
 
437
 
class FileDownloader(object):
438
 
        """File Downloader class.
439
 
 
440
 
        File downloader objects are the ones responsible of downloading the
441
 
        actual video file and writing it to disk if the user has requested
442
 
        it, among some other tasks. In most cases there should be one per
443
 
        program. As, given a video URL, the downloader doesn't know how to
444
 
        extract all the needed information, task that InfoExtractors do, it
445
 
        has to pass the URL to one of them.
446
 
 
447
 
        For this, file downloader objects have a method that allows
448
 
        InfoExtractors to be registered in a given order. When it is passed
449
 
        a URL, the file downloader handles it to the first InfoExtractor it
450
 
        finds that reports being able to handle it. The InfoExtractor extracts
451
 
        all the information about the video or videos the URL refers to, and
452
 
        asks the FileDownloader to process the video information, possibly
453
 
        downloading the video.
454
 
 
455
 
        File downloaders accept a lot of parameters. In order not to saturate
456
 
        the object constructor with arguments, it receives a dictionary of
457
 
        options instead. These options are available through the params
458
 
        attribute for the InfoExtractors to use. The FileDownloader also
459
 
        registers itself as the downloader in charge for the InfoExtractors
460
 
        that are added to it, so this is a "mutual registration".
461
 
 
462
 
        Available options:
463
 
 
464
 
        username:         Username for authentication purposes.
465
 
        password:         Password for authentication purposes.
466
 
        usenetrc:         Use netrc for authentication instead.
467
 
        quiet:            Do not print messages to stdout.
468
 
        forceurl:         Force printing final URL.
469
 
        forcetitle:       Force printing title.
470
 
        forcethumbnail:   Force printing thumbnail URL.
471
 
        forcedescription: Force printing description.
472
 
        forcefilename:    Force printing final filename.
473
 
        simulate:         Do not download the video files.
474
 
        format:           Video format code.
475
 
        format_limit:     Highest quality format to try.
476
 
        outtmpl:          Template for output names.
477
 
        ignoreerrors:     Do not stop on download errors.
478
 
        ratelimit:        Download speed limit, in bytes/sec.
479
 
        nooverwrites:     Prevent overwriting files.
480
 
        retries:          Number of times to retry for HTTP error 5xx
481
 
        continuedl:       Try to continue downloads if possible.
482
 
        noprogress:       Do not print the progress bar.
483
 
        playliststart:    Playlist item to start at.
484
 
        playlistend:      Playlist item to end at.
485
 
        matchtitle:       Download only matching titles.
486
 
        rejecttitle:      Reject downloads for matching titles.
487
 
        logtostderr:      Log messages to stderr instead of stdout.
488
 
        consoletitle:     Display progress in console window's titlebar.
489
 
        nopart:           Do not use temporary .part files.
490
 
        updatetime:       Use the Last-modified header to set output file timestamps.
491
 
        writedescription: Write the video description to a .description file
492
 
        writeinfojson:    Write the video description to a .info.json file
493
 
        """
494
 
 
495
 
        params = None
496
 
        _ies = []
497
 
        _pps = []
498
 
        _download_retcode = None
499
 
        _num_downloads = None
500
 
        _screen_file = None
501
 
 
502
 
        def __init__(self, params):
503
 
                """Create a FileDownloader object with the given options."""
504
 
                self._ies = []
505
 
                self._pps = []
506
 
                self._download_retcode = 0
507
 
                self._num_downloads = 0
508
 
                self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
509
 
                self.params = params
510
 
 
511
 
        @staticmethod
512
 
        def format_bytes(bytes):
513
 
                if bytes is None:
514
 
                        return 'N/A'
515
 
                if type(bytes) is str:
516
 
                        bytes = float(bytes)
517
 
                if bytes == 0.0:
518
 
                        exponent = 0
519
 
                else:
520
 
                        exponent = long(math.log(bytes, 1024.0))
521
 
                suffix = 'bkMGTPEZY'[exponent]
522
 
                converted = float(bytes) / float(1024 ** exponent)
523
 
                return '%.2f%s' % (converted, suffix)
524
 
 
525
 
        @staticmethod
526
 
        def calc_percent(byte_counter, data_len):
527
 
                if data_len is None:
528
 
                        return '---.-%'
529
 
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
530
 
 
531
 
        @staticmethod
532
 
        def calc_eta(start, now, total, current):
533
 
                if total is None:
534
 
                        return '--:--'
535
 
                dif = now - start
536
 
                if current == 0 or dif < 0.001: # One millisecond
537
 
                        return '--:--'
538
 
                rate = float(current) / dif
539
 
                eta = long((float(total) - float(current)) / rate)
540
 
                (eta_mins, eta_secs) = divmod(eta, 60)
541
 
                if eta_mins > 99:
542
 
                        return '--:--'
543
 
                return '%02d:%02d' % (eta_mins, eta_secs)
544
 
 
545
 
        @staticmethod
546
 
        def calc_speed(start, now, bytes):
547
 
                dif = now - start
548
 
                if bytes == 0 or dif < 0.001: # One millisecond
549
 
                        return '%10s' % '---b/s'
550
 
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
551
 
 
552
 
        @staticmethod
553
 
        def best_block_size(elapsed_time, bytes):
554
 
                new_min = max(bytes / 2.0, 1.0)
555
 
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
556
 
                if elapsed_time < 0.001:
557
 
                        return long(new_max)
558
 
                rate = bytes / elapsed_time
559
 
                if rate > new_max:
560
 
                        return long(new_max)
561
 
                if rate < new_min:
562
 
                        return long(new_min)
563
 
                return long(rate)
564
 
 
565
 
        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer.

                Accepts an optional single-letter binary suffix (k, M, G, ...),
                case-insensitive. Returns None if the string does not parse.
                """
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # With no suffix group(2) is '', and str.index finds '' at
                # position 0, giving a multiplier of 1024**0 == 1.
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))
574
 
 
575
 
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list.

                Also registers self as the extractor's downloader (the "mutual
                registration" described in the class docstring).
                """
                self._ies.append(ie)
                ie.set_downloader(self)
579
 
 
580
 
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain.

                Also registers self as the post-processor's downloader.
                """
                self._pps.append(pp)
                pp.set_downloader(self)
584
 
 
585
 
        def to_screen(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode.

                message must be unicode; skip_eol suppresses the trailing
                newline (used for in-place progress updates).
                """
                assert type(message) == type(u'')
                if not self.params.get('quiet', False):
                        terminator = [u'\n', u''][skip_eol]
                        output = message + terminator

                        if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                                output = output.encode(preferredencoding(), 'ignore')
                        self._screen_file.write(output)
                        # Flush immediately so progress lines appear in real time.
                        self._screen_file.flush()
596
 
 
597
 
        def to_stderr(self, message):
                """Print message to stderr.

                The message is encoded with the locale's preferred encoding
                before writing (Python 2 print-chevron form).
                """
                print >>sys.stderr, message.encode(preferredencoding())
600
 
 
601
 
        def to_cons_title(self, message):
                """Set console/terminal window title to message.

                No-op unless the 'consoletitle' option is enabled. Uses the
                Win32 console API on Windows, an xterm escape elsewhere.
                """
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm title sequence: ESC ] 0 ; <title> BEL
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
611
 
 
612
 
        def fixed_template(self):
                """Checks if the output template is fixed.

                "Fixed" means the 'outtmpl' option contains no %(field)s
                placeholders, i.e. every download would get the same filename.
                """
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
615
 
 
616
 
        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # Only reached in "ignore errors" mode: remember the failure
                # for the eventual process exit code.
                self._download_retcode = 1
628
 
 
629
 
        def slow_down(self, start_time, byte_counter):
630
 
                """Sleep if the download speed is over the rate limit."""
631
 
                rate_limit = self.params.get('ratelimit', None)
632
 
                if rate_limit is None or byte_counter == 0:
633
 
                        return
634
 
                now = time.time()
635
 
                elapsed = now - start_time
636
 
                if elapsed <= 0.0:
637
 
                        return
638
 
                speed = float(byte_counter) / elapsed
639
 
                if speed > rate_limit:
640
 
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
641
 
 
642
 
        def temp_name(self, filename):
643
 
                """Returns a temporary filename for the given filename."""
644
 
                if self.params.get('nopart', False) or filename == u'-' or \
645
 
                                (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
646
 
                        return filename
647
 
                return filename + u'.part'
648
 
 
649
 
        def undo_temp_name(self, filename):
650
 
                if filename.endswith(u'.part'):
651
 
                        return filename[:-len(u'.part')]
652
 
                return filename
653
 
 
654
 
        def try_rename(self, old_filename, new_filename):
655
 
                try:
656
 
                        if old_filename == new_filename:
657
 
                                return
658
 
                        os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
659
 
                except (IOError, OSError), err:
660
 
                        self.trouble(u'ERROR: unable to rename file')
661
 
 
662
 
        def try_utime(self, filename, last_modified_hdr):
663
 
                """Try to set the last-modified time of the given file."""
664
 
                if last_modified_hdr is None:
665
 
                        return
666
 
                if not os.path.isfile(_encodeFilename(filename)):
667
 
                        return
668
 
                timestr = last_modified_hdr
669
 
                if timestr is None:
670
 
                        return
671
 
                filetime = timeconvert(timestr)
672
 
                if filetime is None:
673
 
                        return filetime
674
 
                try:
675
 
                        os.utime(filename, (time.time(), filetime))
676
 
                except:
677
 
                        pass
678
 
                return filetime
679
 
 
680
 
        def report_writedescription(self, descfn):
681
 
                """ Report that the description file is being written """
682
 
                self.to_screen(u'[info] Writing video description to: ' + descfn)
683
 
 
684
 
        def report_writeinfojson(self, infofn):
685
 
                """ Report that the metadata file has been written """
686
 
                self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
687
 
 
688
 
        def report_destination(self, filename):
689
 
                """Report destination filename."""
690
 
                self.to_screen(u'[download] Destination: ' + filename)
691
 
 
692
 
        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
693
 
                """Report download progress."""
694
 
                if self.params.get('noprogress', False):
695
 
                        return
696
 
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
697
 
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
698
 
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
699
 
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
700
 
 
701
 
        def report_resuming_byte(self, resume_len):
702
 
                """Report attempt to resume at given byte."""
703
 
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
704
 
 
705
 
        def report_retry(self, count, retries):
706
 
                """Report retry in case of HTTP error 5xx"""
707
 
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
708
 
 
709
 
        def report_file_already_downloaded(self, file_name):
710
 
                """Report file has already been fully downloaded."""
711
 
                try:
712
 
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
713
 
                except (UnicodeEncodeError), err:
714
 
                        self.to_screen(u'[download] The file has already been downloaded')
715
 
 
716
 
        def report_unable_to_resume(self):
717
 
                """Report it was impossible to resume download."""
718
 
                self.to_screen(u'[download] Unable to resume')
719
 
 
720
 
        def report_finish(self):
721
 
                """Report download finished."""
722
 
                if self.params.get('noprogress', False):
723
 
                        self.to_screen(u'[download] Download completed')
724
 
                else:
725
 
                        self.to_screen(u'')
726
 
 
727
 
        def increment_downloads(self):
728
 
                """Increment the ordinal that assigns a number to each file."""
729
 
                self._num_downloads += 1
730
 
 
731
 
	def prepare_filename(self, info_dict):
		"""Generate the output filename from the 'outtmpl' template.

		Two synthetic keys are added to a copy of info_dict before
		interpolation: 'epoch' (current UNIX time) and 'autonumber'
		(zero-padded per-session download counter). Returns None, after
		reporting through self.trouble(), when the template references
		a missing key or interpolation fails.
		"""
		try:
			template_dict = dict(info_dict)
			# NOTE: unicode/long are Python 2 builtins; this file
			# predates the Python 3 port of youtube-dl.
			template_dict['epoch'] = unicode(long(time.time()))
			# Zero-padded so lexicographic order matches download order.
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None
742
 
 
743
 
        def _match_entry(self, info_dict):
744
 
                """ Returns None iff the file should be downloaded """
745
 
 
746
 
                title = info_dict['title']
747
 
                matchtitle = self.params.get('matchtitle', False)
748
 
                if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
749
 
                        return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
750
 
                rejecttitle = self.params.get('rejecttitle', False)
751
 
                if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
752
 
                        return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
753
 
                return None
754
 
 
755
 
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Pipeline: title match/reject filters -> max-downloads guard ->
		forced metadata printing -> (unless simulating) create the
		output directory, optionally write .description and .info.json
		side files, download the video and run the postprocessors.
		Raises MaxDownloadsReached when the download cap is exceeded.
		"""

		# Skip entries rejected by --match-title / --reject-title.
		reason = self._match_entry(info_dict)
		if reason is not None:
			self.to_screen(u'[download] ' + reason)
			return

		max_downloads = self.params.get('max_downloads')
		if max_downloads is not None:
			if self._num_downloads > int(max_downloads):
				raise MaxDownloadsReached()

		filename = self.prepare_filename(info_dict)
		
		# Forced printings (--get-title, --get-url, ...); encoded for the
		# console with XML character references as the fallback.
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() reports its own error and returns None.
		if filename is None:
			return

		try:
			dn = os.path.dirname(_encodeFilename(filename))
			if dn != '' and not os.path.exists(dn): # dn is already encoded
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + u'.description'
				self.report_writedescription(descfn)
				descfile = open(_encodeFilename(descfn), 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + u'.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable json module (absent on Python < 2.6
			# without the simplejson backport installed).
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(_encodeFilename(infofn), 'wb')
				try:
					# 'urlhandle' holds a live connection object and is
					# not JSON-serializable, so it is filtered out.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
				success = True
			else:
				try:
					success = self._do_download(filename, info_dict)
				except (OSError, IOError), err:
					raise UnavailableVideoError
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					self.trouble(u'ERROR: unable to download video data: %s' % str(err))
					return
				except (ContentTooShortError, ), err:
					self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
					return
	
			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
852
 
 
853
 
        def download(self, url_list):
854
 
                """Download a given list of URLs."""
855
 
                if len(url_list) > 1 and self.fixed_template():
856
 
                        raise SameFileError(self.params['outtmpl'])
857
 
 
858
 
                for url in url_list:
859
 
                        suitable_found = False
860
 
                        for ie in self._ies:
861
 
                                # Go to next InfoExtractor if not suitable
862
 
                                if not ie.suitable(url):
863
 
                                        continue
864
 
 
865
 
                                # Suitable InfoExtractor found
866
 
                                suitable_found = True
867
 
 
868
 
                                # Extract information from URL and process it
869
 
                                ie.extract(url)
870
 
 
871
 
                                # Suitable InfoExtractor had been found; go to next URL
872
 
                                break
873
 
 
874
 
                        if not suitable_found:
875
 
                                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
876
 
 
877
 
                return self._download_retcode
878
 
 
879
 
        def post_process(self, filename, ie_info):
880
 
                """Run the postprocessing chain on the given file."""
881
 
                info = dict(ie_info)
882
 
                info['filepath'] = filename
883
 
                for pp in self._pps:
884
 
                        info = pp.run(info)
885
 
                        if info is None:
886
 
                                break
887
 
 
888
 
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the external
		rtmpdump tool, resuming in a loop until it finishes or stalls.

		Returns True on success, False on failure (after reporting
		through self.trouble()).
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [cond] subscript below indexes a two-element list with a
		# bool (0/1), selecting either no extra args or the option.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
		if self.params.get('verbose', False):
			try:
				# pipes.quote may be unavailable; fall back to repr().
				import pipes
				shell_quote = lambda args: ' '.join(map(pipes.quote, args))
			except ImportError:
				shell_quote = repr
			self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
		retval = subprocess.call(args)
		# Keep resuming (-e) while rtmpdump reports an interrupted (2)
		# or failed (1) transfer, as long as the file keeps growing.
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(_encodeFilename(tmpfilename))
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(_encodeFilename(tmpfilename))
			if prevsize == cursize and retval == 1:
				# Hard failure with no progress: give up.
				break
			 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
932
 
 
933
 
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] to filename over HTTP, or hand off
		to rtmpdump for rtmp:// URLs.

		Handles byte-range resuming (HTTP 206/416), retries on server
		5xx errors, adaptive block sizing, rate limiting and progress
		reporting. Returns True on success, False after a reported
		failure; raises ContentTooShortError when fewer bytes arrive
		than Content-Length promised, and lets other urllib2/socket
		errors propagate to process_info().
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None  # opened lazily, on the first received data block

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(_encodeFilename(tmpfilename)):
			resume_len = os.path.getsize(_encodeFilename(tmpfilename))
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				# NOTE(review): this 'urlhandle' assignment is
				# immediately overwritten by urlopen() below, so a
				# cached handle is never actually reused — looks like
				# an upstream bug; confirm before relying on it.
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Total size = remaining bytes reported + what is on disk.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1078
 
 
1079
 
 
1080
 
class InfoExtractor(object):
	"""Base class for all site-specific information extractors.

	An information extractor turns a URL into one or more dictionaries
	describing the video(s) behind it; those dictionaries are handed to
	the FileDownloader, which may download the video to the file system,
	print metadata, and so on. Mandatory dictionary fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the forced-printing options (they make
	youtube-dl usable as a backend for video search frontends):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize() and _real_extract()
	and define a _VALID_URL regexp; new extractors should also be added
	to the list of extractors.
	"""

	# One-shot initialization flag and the owning downloader.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Authenticate/prepare once; later calls are no-ops."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract information for url."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1148
 
 
1149
 
 
1150
 
class YoutubeIE(InfoExtractor):
1151
 
        """Information extractor for youtube.com."""
1152
 
 
1153
 
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1154
 
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1155
 
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1156
 
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1157
 
        _NETRC_MACHINE = 'youtube'
1158
 
        # Listed in order of quality
1159
 
        _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1160
 
        _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1161
 
        _video_extensions = {
1162
 
                '13': '3gp',
1163
 
                '17': 'mp4',
1164
 
                '18': 'mp4',
1165
 
                '22': 'mp4',
1166
 
                '37': 'mp4',
1167
 
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1168
 
                '43': 'webm',
1169
 
                '44': 'webm',
1170
 
                '45': 'webm',
1171
 
        }
1172
 
        _video_dimensions = {
1173
 
                '5': '240x400',
1174
 
                '6': '???',
1175
 
                '13': '???',
1176
 
                '17': '144x176',
1177
 
                '18': '360x640',
1178
 
                '22': '720x1280',
1179
 
                '34': '360x640',
1180
 
                '35': '480x854',
1181
 
                '37': '1080x1920',
1182
 
                '38': '3072x4096',
1183
 
                '43': '360x640',
1184
 
                '44': '480x854',
1185
 
                '45': '720x1280',
1186
 
        }       
1187
 
        IE_NAME = u'youtube'
1188
 
 
1189
 
        def report_lang(self):
1190
 
                """Report attempt to set language."""
1191
 
                self._downloader.to_screen(u'[youtube] Setting language')
1192
 
 
1193
 
        def report_login(self):
1194
 
                """Report attempt to log in."""
1195
 
                self._downloader.to_screen(u'[youtube] Logging in')
1196
 
 
1197
 
        def report_age_confirmation(self):
1198
 
                """Report attempt to confirm age."""
1199
 
                self._downloader.to_screen(u'[youtube] Confirming age')
1200
 
 
1201
 
        def report_video_webpage_download(self, video_id):
1202
 
                """Report attempt to download video webpage."""
1203
 
                self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1204
 
 
1205
 
        def report_video_info_webpage_download(self, video_id):
1206
 
                """Report attempt to download video info webpage."""
1207
 
                self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1208
 
 
1209
 
        def report_information_extraction(self, video_id):
1210
 
                """Report attempt to extract video information."""
1211
 
                self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1212
 
 
1213
 
        def report_unavailable_format(self, video_id, format):
1214
 
                """Report extracted video URL."""
1215
 
                self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1216
 
 
1217
 
        def report_rtmp_download(self):
1218
 
                """Indicate the download will use the RTMP protocol."""
1219
 
                self._downloader.to_screen(u'[youtube] RTMP download detected')
1220
 
 
1221
 
        def _print_formats(self, formats):
                """Print each itag in *formats* with its container and frame size.

                Itags missing from the lookup tables fall back to 'flv' for the
                extension and '???' for the dimensions.
                """
                print 'Available formats:'
                for x in formats:
                        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1225
 
 
1226
 
        def _real_initialize(self):
                """Set the YouTube interface language, then log in and confirm age.

                Credentials come from the downloader params ('username' /
                'password') or, when 'usenetrc' is set, from the ~/.netrc entry
                for this extractor's machine.  Every failure is reported via the
                downloader (warning or error) and aborts initialisation early;
                nothing is raised to the caller.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language (best-effort GET; a failure only logs a warning)
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # If the response still contains the login form, the
                        # credentials were rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
1294
 
 
1295
 
        def _real_extract(self, url):
                """Extract and process the video(s) behind a YouTube watch URL.

                Downloads the watch page and the get_video_info data, selects
                the itag(s) matching the downloader's format options and hands
                one info dict per selected format to the downloader.  All
                failures are reported through self._downloader.trouble() and
                abort extraction; nothing is raised to the caller.
                """
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group(2)

                # Get video webpage
                self.report_video_webpage_download(video_id)
                request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
                try:
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                        return

                # Attempt to extract SWF player URL (backslash-escaped in the
                # page source; un-escape it before use).  Optional: None if absent.
                mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
                if mobj is not None:
                        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
                else:
                        player_url = None

                # Get video info: try each 'el' variant until one response
                # contains a 'token' parameter.
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                        % (video_id, el_type))
                        request = urllib2.Request(video_info_url)
                        try:
                                video_info_webpage = urllib2.urlopen(request).read()
                                video_info = parse_qs(video_info_webpage)
                                if 'token' in video_info:
                                        break
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                                return
                if 'token' not in video_info:
                        if 'reason' in video_info:
                                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                        else:
                                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
                        return

                # Start extracting information
                self.report_information_extraction(video_id)

                # uploader
                if 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = urllib.unquote_plus(video_info['author'][0])

                # title
                if 'title' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return
                video_title = urllib.unquote_plus(video_info['title'][0])
                video_title = video_title.decode('utf-8')
                video_title = sanitize_title(video_title)

                # simplified title
                simple_title = _simplify_title(video_title)

                # thumbnail image
                if 'thumbnail_url' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
                        video_thumbnail = ''
                else:   # don't panic if we can't find it
                        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

                # upload date: normalise separators, then try several date
                # formats; on total failure upload_date keeps its current value.
                upload_date = u'NA'
                mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
                if mobj is not None:
                        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
                        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
                        for expression in format_expressions:
                                try:
                                        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                                except:
                                        pass

                # description: lxml is optional — the NameError probe detects
                # whether it was imported; without it fall back to the
                # <meta name="description"> tag.
                try:
                        lxml.etree
                except NameError:
                        video_description = u'No description available.'
                        mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
                        if mobj is not None:
                                video_description = mobj.group(1).decode('utf-8')
                else:
                        html_parser = lxml.etree.HTMLParser(encoding='utf-8')
                        vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
                        video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
                        # TODO use another parser

                # token (extracted but not used further in this method)
                video_token = urllib.unquote_plus(video_info['token'][0])

                # Decide which formats to download
                req_format = self._downloader.params.get('format', None)

                if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                        self.report_rtmp_download()
                        video_url_list = [(None, video_info['conn'][0])]
                elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
                        # Build itag -> URL map from the comma-separated,
                        # url-encoded stream descriptions.
                        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
                        url_data = [parse_qs(uds) for uds in url_data_strs]
                        url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
                        url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

                        format_limit = self._downloader.params.get('format_limit', None)
                        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
                        if format_limit is not None and format_limit in available_formats:
                                format_list = available_formats[available_formats.index(format_limit):]
                        else:
                                format_list = available_formats
                        existing_formats = [x for x in format_list if x in url_map]
                        if len(existing_formats) == 0:
                                self._downloader.trouble(u'ERROR: no known formats available for video')
                                return
                        if self._downloader.params.get('listformats', None):
                                self._print_formats(existing_formats)
                                return
                        if req_format is None or req_format == 'best':
                                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
                        elif req_format == 'worst':
                                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
                        elif req_format in ('-1', 'all'):
                                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                        else:
                                # Specific formats. We pick the first in a slash-delimeted sequence.
                                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                                req_formats = req_format.split('/')
                                video_url_list = None
                                for rf in req_formats:
                                        if rf in url_map:
                                                video_url_list = [(rf, url_map[rf])]
                                                break
                                if video_url_list is None:
                                        self._downloader.trouble(u'ERROR: requested format not available')
                                        return
                else:
                        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
                        return

                for format_param, video_real_url in video_url_list:
                        # At this point we have a new video
                        self._downloader.increment_downloads()

                        # Extension
                        video_extension = self._video_extensions.get(format_param, 'flv')

                        try:
                                # Process video information
                                self._downloader.process_info({
                                        'id':           video_id.decode('utf-8'),
                                        'url':          video_real_url.decode('utf-8'),
                                        'uploader':     video_uploader.decode('utf-8'),
                                        'upload_date':  upload_date,
                                        'title':        video_title,
                                        'stitle':       simple_title,
                                        'ext':          video_extension.decode('utf-8'),
                                        'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                                        'thumbnail':    video_thumbnail.decode('utf-8'),
                                        'description':  video_description,
                                        'player_url':   player_url,
                                })
                        except UnavailableVideoError, err:
                                self._downloader.trouble(u'\nERROR: unable to download video')
1467
 
 
1468
 
 
1469
 
class MetacafeIE(InfoExtractor):
        """Information Extractor for metacafe.com."""

        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
        # Family-filter pages: fetched once in _real_initialize to obtain the
        # disclaimer and to post the over-18 confirmation.
        _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
        _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
        # YouTube extractor to delegate to for 'yt-' prefixed video ids.
        _youtube_ie = None
        IE_NAME = u'metacafe'

        def __init__(self, youtube_ie, downloader=None):
                """Store the YouTube extractor used for Metacafe-hosted YouTube videos."""
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        def report_disclaimer(self):
                """Report disclaimer retrieval."""
                self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[metacafe] Confirming age')

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

        def _real_initialize(self):
                """Fetch the family-filter disclaimer and confirm being over 18.

                Failures are reported through the downloader and abort
                initialisation; nothing is raised.
                """
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                """Extract a Metacafe video, delegating 'yt-' ids to the YouTube IE."""
                # Extract id and simplified title from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                video_id = mobj.group(1)

                # Check if video comes from YouTube
                mobj2 = re.match(r'^yt-(.*)$', video_id)
                if mobj2 is not None:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
                        return

                # At this point we have a new video
                self._downloader.increment_downloads()

                simple_title = mobj.group(2).decode('utf-8')

                # Retrieve video webpage to extract further information
                request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
                        return

                # Extract URL, uploader and title from webpage
                self.report_extraction(video_id)
                mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
                if mobj is not None:
                        mediaURL = urllib.unquote(mobj.group(1))
                        # Extension taken from the last three characters of the URL.
                        video_extension = mediaURL[-3:]

                        # Extract gdaKey if available
                        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                        if mobj is None:
                                video_url = mediaURL
                        else:
                                gdaKey = mobj.group(1)
                                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
                else:
                        # Fallback: parse the flashvars blob for mediaData.
                        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                        if mobj is None:
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
                                return
                        vardict = parse_qs(mobj.group(1))
                        if 'mediaData' not in vardict:
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
                                return
                        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                        if mobj is None:
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
                                return
                        mediaURL = mobj.group(1).replace('\\/', '/')
                        video_extension = mediaURL[-3:]
                        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

                mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')
                video_title = sanitize_title(video_title)

                mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = mobj.group(1)

                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_url.decode('utf-8'),
                                'uploader':     video_uploader.decode('utf-8'),
                                'upload_date':  u'NA',
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       u'NA',
                                'player_url':   None,
                        })
                except UnavailableVideoError:
                        self._downloader.trouble(u'\nERROR: unable to download video')
1609
 
 
1610
 
 
1611
 
class DailymotionIE(InfoExtractor):
        """Information Extractor for Dailymotion"""

        _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
        IE_NAME = u'dailymotion'

        def __init__(self, downloader=None):
                """Delegate to the shared InfoExtractor initialisation."""
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

        def _real_extract(self, url):
                """Extract the SD stream URL, title and uploader of a Dailymotion video.

                Failures are reported through self._downloader.trouble() and
                abort extraction; nothing is raised.
                """
                # Extract id and simplified title from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                # At this point we have a new video
                self._downloader.increment_downloads()
                video_id = mobj.group(1)

                video_extension = 'flv'

                # Retrieve video webpage to extract further information
                # (the family_filter cookie disables the content filter).
                request = urllib2.Request(url)
                request.add_header('Cookie', 'family_filter=off')
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
                        return

                # Extract URL, uploader and title from webpage
                self.report_extraction(video_id)
                mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
                        return
                sequence = urllib.unquote(mobj.group(1))
                # 'sdURL' inside the sequence blob carries the SD stream URL.
                mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
                        return
                mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

                # if needed add http://www.dailymotion.com/ if relative URL

                video_url = mediaURL

                mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
                video_title = sanitize_title(video_title)
                simple_title = _simplify_title(video_title)

                mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = mobj.group(1)

                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_url.decode('utf-8'),
                                'uploader':     video_uploader.decode('utf-8'),
                                'upload_date':  u'NA',
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       u'NA',
                                'player_url':   None,
                        })
                except UnavailableVideoError:
                        self._downloader.trouble(u'\nERROR: unable to download video')
1697
 
 
1698
 
 
1699
 
class GoogleIE(InfoExtractor):
        """Information extractor for video.google.com."""

        # Accepts the country-specific video.google domains and captures the
        # docid query parameter as the video id.
        _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
        IE_NAME = u'video.google'
1704
 
 
1705
 
        def __init__(self, downloader=None):
                # Delegate to the shared InfoExtractor initialisation.
                InfoExtractor.__init__(self, downloader)
1707
 
 
1708
 
        def report_download_webpage(self, video_id):
1709
 
                """Report webpage download."""
1710
 
                self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1711
 
 
1712
 
        def report_extraction(self, video_id):
1713
 
                """Report information extraction."""
1714
 
                self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1715
 
 
1716
 
	def _real_extract(self, url):
		"""Download a video.google.com page, scrape the media URL, title,
		description and (optionally) thumbnail out of it, and hand the
		collected metadata to the downloader via process_info()."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link on the page: fall back to the
			# hex-escaped flv URL embedded in the player JavaScript.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript-style hex escaping of '=' and '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail.  A second page fetch is needed, so it
		# is only done when the user explicitly asked for the thumbnail.
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1801
 
 
1802
 
 
1803
 
class PhotobucketIE(InfoExtractor):
1804
 
        """Information extractor for photobucket.com."""
1805
 
 
1806
 
        _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1807
 
        IE_NAME = u'photobucket'
1808
 
 
1809
 
        def __init__(self, downloader=None):
1810
 
                InfoExtractor.__init__(self, downloader)
1811
 
 
1812
 
        def report_download_webpage(self, video_id):
1813
 
                """Report webpage download."""
1814
 
                self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1815
 
 
1816
 
        def report_extraction(self, video_id):
1817
 
                """Report information extraction."""
1818
 
                self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1819
 
 
1820
 
        def _real_extract(self, url):
1821
 
                # Extract id from URL
1822
 
                mobj = re.match(self._VALID_URL, url)
1823
 
                if mobj is None:
1824
 
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1825
 
                        return
1826
 
 
1827
 
                # At this point we have a new video
1828
 
                self._downloader.increment_downloads()
1829
 
                video_id = mobj.group(1)
1830
 
 
1831
 
                video_extension = 'flv'
1832
 
 
1833
 
                # Retrieve video webpage to extract further information
1834
 
                request = urllib2.Request(url)
1835
 
                try:
1836
 
                        self.report_download_webpage(video_id)
1837
 
                        webpage = urllib2.urlopen(request).read()
1838
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1839
 
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1840
 
                        return
1841
 
 
1842
 
                # Extract URL, uploader, and title from webpage
1843
 
                self.report_extraction(video_id)
1844
 
                mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1845
 
                if mobj is None:
1846
 
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
1847
 
                        return
1848
 
                mediaURL = urllib.unquote(mobj.group(1))
1849
 
 
1850
 
                video_url = mediaURL
1851
 
 
1852
 
                mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1853
 
                if mobj is None:
1854
 
                        self._downloader.trouble(u'ERROR: unable to extract title')
1855
 
                        return
1856
 
                video_title = mobj.group(1).decode('utf-8')
1857
 
                video_title = sanitize_title(video_title)
1858
 
                simple_title = _simplify_title(vide_title)
1859
 
 
1860
 
                video_uploader = mobj.group(2).decode('utf-8')
1861
 
 
1862
 
                try:
1863
 
                        # Process video information
1864
 
                        self._downloader.process_info({
1865
 
                                'id':           video_id.decode('utf-8'),
1866
 
                                'url':          video_url.decode('utf-8'),
1867
 
                                'uploader':     video_uploader,
1868
 
                                'upload_date':  u'NA',
1869
 
                                'title':        video_title,
1870
 
                                'stitle':       simple_title,
1871
 
                                'ext':          video_extension.decode('utf-8'),
1872
 
                                'format':       u'NA',
1873
 
                                'player_url':   None,
1874
 
                        })
1875
 
                except UnavailableVideoError:
1876
 
                        self._downloader.trouble(u'\nERROR: unable to download video')
1877
 
 
1878
 
 
1879
 
class YahooIE(InfoExtractor):
1880
 
        """Information extractor for video.yahoo.com."""
1881
 
 
1882
 
        # _VALID_URL matches all Yahoo! Video URLs
1883
 
        # _VPAGE_URL matches only the extractable '/watch/' URLs
1884
 
        _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1885
 
        _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1886
 
        IE_NAME = u'video.yahoo'
1887
 
 
1888
 
        def __init__(self, downloader=None):
1889
 
                InfoExtractor.__init__(self, downloader)
1890
 
 
1891
 
        def report_download_webpage(self, video_id):
1892
 
                """Report webpage download."""
1893
 
                self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1894
 
 
1895
 
        def report_extraction(self, video_id):
1896
 
                """Report information extraction."""
1897
 
                self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1898
 
 
1899
 
        def _real_extract(self, url, new_video=True):
1900
 
                # Extract ID from URL
1901
 
                mobj = re.match(self._VALID_URL, url)
1902
 
                if mobj is None:
1903
 
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1904
 
                        return
1905
 
 
1906
 
                # At this point we have a new video
1907
 
                self._downloader.increment_downloads()
1908
 
                video_id = mobj.group(2)
1909
 
                video_extension = 'flv'
1910
 
 
1911
 
                # Rewrite valid but non-extractable URLs as
1912
 
                # extractable English language /watch/ URLs
1913
 
                if re.match(self._VPAGE_URL, url) is None:
1914
 
                        request = urllib2.Request(url)
1915
 
                        try:
1916
 
                                webpage = urllib2.urlopen(request).read()
1917
 
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1918
 
                                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1919
 
                                return
1920
 
 
1921
 
                        mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1922
 
                        if mobj is None:
1923
 
                                self._downloader.trouble(u'ERROR: Unable to extract id field')
1924
 
                                return
1925
 
                        yahoo_id = mobj.group(1)
1926
 
 
1927
 
                        mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1928
 
                        if mobj is None:
1929
 
                                self._downloader.trouble(u'ERROR: Unable to extract vid field')
1930
 
                                return
1931
 
                        yahoo_vid = mobj.group(1)
1932
 
 
1933
 
                        url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1934
 
                        return self._real_extract(url, new_video=False)
1935
 
 
1936
 
                # Retrieve video webpage to extract further information
1937
 
                request = urllib2.Request(url)
1938
 
                try:
1939
 
                        self.report_download_webpage(video_id)
1940
 
                        webpage = urllib2.urlopen(request).read()
1941
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1942
 
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1943
 
                        return
1944
 
 
1945
 
                # Extract uploader and title from webpage
1946
 
                self.report_extraction(video_id)
1947
 
                mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1948
 
                if mobj is None:
1949
 
                        self._downloader.trouble(u'ERROR: unable to extract video title')
1950
 
                        return
1951
 
                video_title = mobj.group(1).decode('utf-8')
1952
 
                simple_title = _simplify_title(video_title)
1953
 
 
1954
 
                mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1955
 
                if mobj is None:
1956
 
                        self._downloader.trouble(u'ERROR: unable to extract video uploader')
1957
 
                        return
1958
 
                video_uploader = mobj.group(1).decode('utf-8')
1959
 
 
1960
 
                # Extract video thumbnail
1961
 
                mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1962
 
                if mobj is None:
1963
 
                        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1964
 
                        return
1965
 
                video_thumbnail = mobj.group(1).decode('utf-8')
1966
 
 
1967
 
                # Extract video description
1968
 
                mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1969
 
                if mobj is None:
1970
 
                        self._downloader.trouble(u'ERROR: unable to extract video description')
1971
 
                        return
1972
 
                video_description = mobj.group(1).decode('utf-8')
1973
 
                if not video_description:
1974
 
                        video_description = 'No description available.'
1975
 
 
1976
 
                # Extract video height and width
1977
 
                mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1978
 
                if mobj is None:
1979
 
                        self._downloader.trouble(u'ERROR: unable to extract video height')
1980
 
                        return
1981
 
                yv_video_height = mobj.group(1)
1982
 
 
1983
 
                mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1984
 
                if mobj is None:
1985
 
                        self._downloader.trouble(u'ERROR: unable to extract video width')
1986
 
                        return
1987
 
                yv_video_width = mobj.group(1)
1988
 
 
1989
 
                # Retrieve video playlist to extract media URL
1990
 
                # I'm not completely sure what all these options are, but we
1991
 
                # seem to need most of them, otherwise the server sends a 401.
1992
 
                yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1993
 
                yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1994
 
                request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1995
 
                                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1996
 
                                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1997
 
                try:
1998
 
                        self.report_download_webpage(video_id)
1999
 
                        webpage = urllib2.urlopen(request).read()
2000
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2001
 
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2002
 
                        return
2003
 
 
2004
 
                # Extract media URL from playlist XML
2005
 
                mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2006
 
                if mobj is None:
2007
 
                        self._downloader.trouble(u'ERROR: Unable to extract media URL')
2008
 
                        return
2009
 
                video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2010
 
                video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2011
 
 
2012
 
                try:
2013
 
                        # Process video information
2014
 
                        self._downloader.process_info({
2015
 
                                'id':           video_id.decode('utf-8'),
2016
 
                                'url':          video_url,
2017
 
                                'uploader':     video_uploader,
2018
 
                                'upload_date':  u'NA',
2019
 
                                'title':        video_title,
2020
 
                                'stitle':       simple_title,
2021
 
                                'ext':          video_extension.decode('utf-8'),
2022
 
                                'thumbnail':    video_thumbnail.decode('utf-8'),
2023
 
                                'description':  video_description,
2024
 
                                'thumbnail':    video_thumbnail,
2025
 
                                'player_url':   None,
2026
 
                        })
2027
 
                except UnavailableVideoError:
2028
 
                        self._downloader.trouble(u'\nERROR: unable to download video')
2029
 
 
2030
 
 
2031
 
class VimeoIE(InfoExtractor):
2032
 
        """Information extractor for vimeo.com."""
2033
 
 
2034
 
        # _VALID_URL matches Vimeo URLs
2035
 
        _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2036
 
        IE_NAME = u'vimeo'
2037
 
 
2038
 
        def __init__(self, downloader=None):
2039
 
                InfoExtractor.__init__(self, downloader)
2040
 
 
2041
 
        def report_download_webpage(self, video_id):
2042
 
                """Report webpage download."""
2043
 
                self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2044
 
 
2045
 
        def report_extraction(self, video_id):
2046
 
                """Report information extraction."""
2047
 
                self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2048
 
 
2049
 
        def _real_extract(self, url, new_video=True):
2050
 
                # Extract ID from URL
2051
 
                mobj = re.match(self._VALID_URL, url)
2052
 
                if mobj is None:
2053
 
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2054
 
                        return
2055
 
 
2056
 
                # At this point we have a new video
2057
 
                self._downloader.increment_downloads()
2058
 
                video_id = mobj.group(1)
2059
 
 
2060
 
                # Retrieve video webpage to extract further information
2061
 
                request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2062
 
                try:
2063
 
                        self.report_download_webpage(video_id)
2064
 
                        webpage = urllib2.urlopen(request).read()
2065
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2066
 
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2067
 
                        return
2068
 
 
2069
 
                # Now we begin extracting as much information as we can from what we
2070
 
                # retrieved. First we extract the information common to all extractors,
2071
 
                # and latter we extract those that are Vimeo specific.
2072
 
                self.report_extraction(video_id)
2073
 
 
2074
 
                # Extract title
2075
 
                mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2076
 
                if mobj is None:
2077
 
                        self._downloader.trouble(u'ERROR: unable to extract video title')
2078
 
                        return
2079
 
                video_title = mobj.group(1).decode('utf-8')
2080
 
                simple_title = _simplify_title(video_title)
2081
 
 
2082
 
                # Extract uploader
2083
 
                mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2084
 
                if mobj is None:
2085
 
                        self._downloader.trouble(u'ERROR: unable to extract video uploader')
2086
 
                        return
2087
 
                video_uploader = mobj.group(1).decode('utf-8')
2088
 
 
2089
 
                # Extract video thumbnail
2090
 
                mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2091
 
                if mobj is None:
2092
 
                        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2093
 
                        return
2094
 
                video_thumbnail = mobj.group(1).decode('utf-8')
2095
 
 
2096
 
                # # Extract video description
2097
 
                # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2098
 
                # if mobj is None:
2099
 
                #       self._downloader.trouble(u'ERROR: unable to extract video description')
2100
 
                #       return
2101
 
                # video_description = mobj.group(1).decode('utf-8')
2102
 
                # if not video_description: video_description = 'No description available.'
2103
 
                video_description = 'Foo.'
2104
 
 
2105
 
                # Vimeo specific: extract request signature
2106
 
                mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2107
 
                if mobj is None:
2108
 
                        self._downloader.trouble(u'ERROR: unable to extract request signature')
2109
 
                        return
2110
 
                sig = mobj.group(1).decode('utf-8')
2111
 
 
2112
 
                # Vimeo specific: extract video quality information
2113
 
                mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2114
 
                if mobj is None:
2115
 
                        self._downloader.trouble(u'ERROR: unable to extract video quality information')
2116
 
                        return
2117
 
                quality = mobj.group(1).decode('utf-8')
2118
 
 
2119
 
                if int(quality) == 1:
2120
 
                        quality = 'hd'
2121
 
                else:
2122
 
                        quality = 'sd'
2123
 
 
2124
 
                # Vimeo specific: Extract request signature expiration
2125
 
                mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2126
 
                if mobj is None:
2127
 
                        self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2128
 
                        return
2129
 
                sig_exp = mobj.group(1).decode('utf-8')
2130
 
 
2131
 
                video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2132
 
 
2133
 
                try:
2134
 
                        # Process video information
2135
 
                        self._downloader.process_info({
2136
 
                                'id':           video_id.decode('utf-8'),
2137
 
                                'url':          video_url,
2138
 
                                'uploader':     video_uploader,
2139
 
                                'upload_date':  u'NA',
2140
 
                                'title':        video_title,
2141
 
                                'stitle':       simple_title,
2142
 
                                'ext':          u'mp4',
2143
 
                                'thumbnail':    video_thumbnail.decode('utf-8'),
2144
 
                                'description':  video_description,
2145
 
                                'thumbnail':    video_thumbnail,
2146
 
                                'description':  video_description,
2147
 
                                'player_url':   None,
2148
 
                        })
2149
 
                except UnavailableVideoError:
2150
 
                        self._downloader.trouble(u'ERROR: unable to download video')
2151
 
 
2152
 
 
2153
 
class GenericIE(InfoExtractor):
2154
 
        """Generic last-resort information extractor."""
2155
 
 
2156
 
        _VALID_URL = r'.*'
2157
 
        IE_NAME = u'generic'
2158
 
 
2159
 
        def __init__(self, downloader=None):
2160
 
                InfoExtractor.__init__(self, downloader)
2161
 
 
2162
 
        def report_download_webpage(self, video_id):
2163
 
                """Report webpage download."""
2164
 
                self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2165
 
                self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2166
 
 
2167
 
        def report_extraction(self, video_id):
2168
 
                """Report information extraction."""
2169
 
                self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2170
 
 
2171
 
        def _real_extract(self, url):
2172
 
                # At this point we have a new video
2173
 
                self._downloader.increment_downloads()
2174
 
 
2175
 
                video_id = url.split('/')[-1]
2176
 
                request = urllib2.Request(url)
2177
 
                try:
2178
 
                        self.report_download_webpage(video_id)
2179
 
                        webpage = urllib2.urlopen(request).read()
2180
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181
 
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2182
 
                        return
2183
 
                except ValueError, err:
2184
 
                        # since this is the last-resort InfoExtractor, if
2185
 
                        # this error is thrown, it'll be thrown here
2186
 
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2187
 
                        return
2188
 
 
2189
 
                self.report_extraction(video_id)
2190
 
                # Start with something easy: JW Player in SWFObject
2191
 
                mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2192
 
                if mobj is None:
2193
 
                        # Broaden the search a little bit
2194
 
                        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2195
 
                if mobj is None:
2196
 
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2197
 
                        return
2198
 
 
2199
 
                # It's possible that one of the regexes
2200
 
                # matched, but returned an empty group:
2201
 
                if mobj.group(1) is None:
2202
 
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2203
 
                        return
2204
 
 
2205
 
                video_url = urllib.unquote(mobj.group(1))
2206
 
                video_id = os.path.basename(video_url)
2207
 
 
2208
 
                # here's a fun little line of code for you:
2209
 
                video_extension = os.path.splitext(video_id)[1][1:]
2210
 
                video_id = os.path.splitext(video_id)[0]
2211
 
 
2212
 
                # it's tempting to parse this further, but you would
2213
 
                # have to take into account all the variations like
2214
 
                #   Video Title - Site Name
2215
 
                #   Site Name | Video Title
2216
 
                #   Video Title - Tagline | Site Name
2217
 
                # and so on and so forth; it's just not practical
2218
 
                mobj = re.search(r'<title>(.*)</title>', webpage)
2219
 
                if mobj is None:
2220
 
                        self._downloader.trouble(u'ERROR: unable to extract title')
2221
 
                        return
2222
 
                video_title = mobj.group(1).decode('utf-8')
2223
 
                video_title = sanitize_title(video_title)
2224
 
                simple_title = _simplify_title(video_title)
2225
 
 
2226
 
                # video uploader is domain name
2227
 
                mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2228
 
                if mobj is None:
2229
 
                        self._downloader.trouble(u'ERROR: unable to extract title')
2230
 
                        return
2231
 
                video_uploader = mobj.group(1).decode('utf-8')
2232
 
 
2233
 
                try:
2234
 
                        # Process video information
2235
 
                        self._downloader.process_info({
2236
 
                                'id':           video_id.decode('utf-8'),
2237
 
                                'url':          video_url.decode('utf-8'),
2238
 
                                'uploader':     video_uploader,
2239
 
                                'upload_date':  u'NA',
2240
 
                                'title':        video_title,
2241
 
                                'stitle':       simple_title,
2242
 
                                'ext':          video_extension.decode('utf-8'),
2243
 
                                'format':       u'NA',
2244
 
                                'player_url':   None,
2245
 
                        })
2246
 
                except UnavailableVideoError, err:
2247
 
                        self._downloader.trouble(u'\nERROR: unable to download video')
2248
 
 
2249
 
 
2250
 
class YoutubeSearchIE(InfoExtractor):
2251
 
        """Information Extractor for YouTube search queries."""
2252
 
        _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2253
 
        _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2254
 
        _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2255
 
        _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2256
 
        _youtube_ie = None
2257
 
        _max_youtube_results = 1000
2258
 
        IE_NAME = u'youtube:search'
2259
 
 
2260
 
	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# The YoutubeIE instance used to download each individual
		# search result.
		self._youtube_ie = youtube_ie
2263
 
 
2264
 
        def report_download_page(self, query, pagenum):
2265
 
                """Report attempt to download playlist page with given number."""
2266
 
                query = query.decode(preferredencoding())
2267
 
                self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2268
 
 
2269
 
	def _real_initialize(self):
		# Results are downloaded through the wrapped YouTube IE, so it
		# must be initialized before any extraction happens.
		self._youtube_ie.initialize()
2271
 
 
2272
 
        def _real_extract(self, query):
2273
 
                mobj = re.match(self._VALID_URL, query)
2274
 
                if mobj is None:
2275
 
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2276
 
                        return
2277
 
 
2278
 
                prefix, query = query.split(':')
2279
 
                prefix = prefix[8:]
2280
 
                query = query.encode('utf-8')
2281
 
                if prefix == '':
2282
 
                        self._download_n_results(query, 1)
2283
 
                        return
2284
 
                elif prefix == 'all':
2285
 
                        self._download_n_results(query, self._max_youtube_results)
2286
 
                        return
2287
 
                else:
2288
 
                        try:
2289
 
                                n = long(prefix)
2290
 
                                if n <= 0:
2291
 
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2292
 
                                        return
2293
 
                                elif n > self._max_youtube_results:
2294
 
                                        self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2295
 
                                        n = self._max_youtube_results
2296
 
                                self._download_n_results(query, n)
2297
 
                                return
2298
 
                        except ValueError: # parsing prefix as integer fails
2299
 
                                self._download_n_results(query, 1)
2300
 
                                return
2301
 
 
2302
 
        def _download_n_results(self, query, n):
2303
 
                """Downloads a specified number of results for a query"""
2304
 
 
2305
 
                video_ids = []
2306
 
                already_seen = set()
2307
 
                pagenum = 1
2308
 
 
2309
 
                while True:
2310
 
                        self.report_download_page(query, pagenum)
2311
 
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2312
 
                        request = urllib2.Request(result_url)
2313
 
                        try:
2314
 
                                page = urllib2.urlopen(request).read()
2315
 
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2316
 
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2317
 
                                return
2318
 
 
2319
 
                        # Extract video identifiers
2320
 
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2321
 
                                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2322
 
                                if video_id not in already_seen:
2323
 
                                        video_ids.append(video_id)
2324
 
                                        already_seen.add(video_id)
2325
 
                                        if len(video_ids) == n:
2326
 
                                                # Specified n videos reached
2327
 
                                                for id in video_ids:
2328
 
                                                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2329
 
                                                return
2330
 
 
2331
 
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2332
 
                                for id in video_ids:
2333
 
                                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2334
 
                                return
2335
 
 
2336
 
                        pagenum = pagenum + 1
2337
 
 
2338
 
 
2339
 
class GoogleSearchIE(InfoExtractor):
2340
 
        """Information Extractor for Google Video search queries."""
2341
 
        _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2342
 
        _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2343
 
        _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2344
 
        _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2345
 
        _google_ie = None
2346
 
        _max_google_results = 1000
2347
 
        IE_NAME = u'video.google:search'
2348
 
 
2349
 
        def __init__(self, google_ie, downloader=None):
2350
 
                InfoExtractor.__init__(self, downloader)
2351
 
                self._google_ie = google_ie
2352
 
 
2353
 
        def report_download_page(self, query, pagenum):
2354
 
                """Report attempt to download playlist page with given number."""
2355
 
                query = query.decode(preferredencoding())
2356
 
                self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2357
 
 
2358
 
        def _real_initialize(self):
2359
 
                self._google_ie.initialize()
2360
 
 
2361
 
        def _real_extract(self, query):
2362
 
                mobj = re.match(self._VALID_URL, query)
2363
 
                if mobj is None:
2364
 
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2365
 
                        return
2366
 
 
2367
 
                prefix, query = query.split(':')
2368
 
                prefix = prefix[8:]
2369
 
                query = query.encode('utf-8')
2370
 
                if prefix == '':
2371
 
                        self._download_n_results(query, 1)
2372
 
                        return
2373
 
                elif prefix == 'all':
2374
 
                        self._download_n_results(query, self._max_google_results)
2375
 
                        return
2376
 
                else:
2377
 
                        try:
2378
 
                                n = long(prefix)
2379
 
                                if n <= 0:
2380
 
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2381
 
                                        return
2382
 
                                elif n > self._max_google_results:
2383
 
                                        self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2384
 
                                        n = self._max_google_results
2385
 
                                self._download_n_results(query, n)
2386
 
                                return
2387
 
                        except ValueError: # parsing prefix as integer fails
2388
 
                                self._download_n_results(query, 1)
2389
 
                                return
2390
 
 
2391
 
        def _download_n_results(self, query, n):
2392
 
                """Downloads a specified number of results for a query"""
2393
 
 
2394
 
                video_ids = []
2395
 
                already_seen = set()
2396
 
                pagenum = 1
2397
 
 
2398
 
                while True:
2399
 
                        self.report_download_page(query, pagenum)
2400
 
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2401
 
                        request = urllib2.Request(result_url)
2402
 
                        try:
2403
 
                                page = urllib2.urlopen(request).read()
2404
 
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2405
 
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2406
 
                                return
2407
 
 
2408
 
                        # Extract video identifiers
2409
 
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2410
 
                                video_id = mobj.group(1)
2411
 
                                if video_id not in already_seen:
2412
 
                                        video_ids.append(video_id)
2413
 
                                        already_seen.add(video_id)
2414
 
                                        if len(video_ids) == n:
2415
 
                                                # Specified n videos reached
2416
 
                                                for id in video_ids:
2417
 
                                                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2418
 
                                                return
2419
 
 
2420
 
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2421
 
                                for id in video_ids:
2422
 
                                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2423
 
                                return
2424
 
 
2425
 
                        pagenum = pagenum + 1
2426
 
 
2427
 
 
2428
 
class YahooSearchIE(InfoExtractor):
2429
 
        """Information Extractor for Yahoo! Video search queries."""
2430
 
        _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2431
 
        _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2432
 
        _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2433
 
        _MORE_PAGES_INDICATOR = r'\s*Next'
2434
 
        _yahoo_ie = None
2435
 
        _max_yahoo_results = 1000
2436
 
        IE_NAME = u'video.yahoo:search'
2437
 
 
2438
 
        def __init__(self, yahoo_ie, downloader=None):
2439
 
                InfoExtractor.__init__(self, downloader)
2440
 
                self._yahoo_ie = yahoo_ie
2441
 
 
2442
 
        def report_download_page(self, query, pagenum):
2443
 
                """Report attempt to download playlist page with given number."""
2444
 
                query = query.decode(preferredencoding())
2445
 
                self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2446
 
 
2447
 
        def _real_initialize(self):
2448
 
                self._yahoo_ie.initialize()
2449
 
 
2450
 
        def _real_extract(self, query):
2451
 
                mobj = re.match(self._VALID_URL, query)
2452
 
                if mobj is None:
2453
 
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2454
 
                        return
2455
 
 
2456
 
                prefix, query = query.split(':')
2457
 
                prefix = prefix[8:]
2458
 
                query = query.encode('utf-8')
2459
 
                if prefix == '':
2460
 
                        self._download_n_results(query, 1)
2461
 
                        return
2462
 
                elif prefix == 'all':
2463
 
                        self._download_n_results(query, self._max_yahoo_results)
2464
 
                        return
2465
 
                else:
2466
 
                        try:
2467
 
                                n = long(prefix)
2468
 
                                if n <= 0:
2469
 
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2470
 
                                        return
2471
 
                                elif n > self._max_yahoo_results:
2472
 
                                        self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2473
 
                                        n = self._max_yahoo_results
2474
 
                                self._download_n_results(query, n)
2475
 
                                return
2476
 
                        except ValueError: # parsing prefix as integer fails
2477
 
                                self._download_n_results(query, 1)
2478
 
                                return
2479
 
 
2480
 
        def _download_n_results(self, query, n):
2481
 
                """Downloads a specified number of results for a query"""
2482
 
 
2483
 
                video_ids = []
2484
 
                already_seen = set()
2485
 
                pagenum = 1
2486
 
 
2487
 
                while True:
2488
 
                        self.report_download_page(query, pagenum)
2489
 
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2490
 
                        request = urllib2.Request(result_url)
2491
 
                        try:
2492
 
                                page = urllib2.urlopen(request).read()
2493
 
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2494
 
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2495
 
                                return
2496
 
 
2497
 
                        # Extract video identifiers
2498
 
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2499
 
                                video_id = mobj.group(1)
2500
 
                                if video_id not in already_seen:
2501
 
                                        video_ids.append(video_id)
2502
 
                                        already_seen.add(video_id)
2503
 
                                        if len(video_ids) == n:
2504
 
                                                # Specified n videos reached
2505
 
                                                for id in video_ids:
2506
 
                                                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2507
 
                                                return
2508
 
 
2509
 
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2510
 
                                for id in video_ids:
2511
 
                                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2512
 
                                return
2513
 
 
2514
 
                        pagenum = pagenum + 1
2515
 
 
2516
 
 
2517
 
class YoutubePlaylistIE(InfoExtractor):
2518
 
        """Information Extractor for YouTube playlists."""
2519
 
 
2520
 
        _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2521
 
        _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2522
 
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2523
 
        _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2524
 
        _youtube_ie = None
2525
 
        IE_NAME = u'youtube:playlist'
2526
 
 
2527
 
        def __init__(self, youtube_ie, downloader=None):
2528
 
                InfoExtractor.__init__(self, downloader)
2529
 
                self._youtube_ie = youtube_ie
2530
 
 
2531
 
        def report_download_page(self, playlist_id, pagenum):
2532
 
                """Report attempt to download playlist page with given number."""
2533
 
                self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2534
 
 
2535
 
        def _real_initialize(self):
2536
 
                self._youtube_ie.initialize()
2537
 
 
2538
 
        def _real_extract(self, url):
2539
 
                # Extract playlist id
2540
 
                mobj = re.match(self._VALID_URL, url)
2541
 
                if mobj is None:
2542
 
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2543
 
                        return
2544
 
 
2545
 
                # Single video case
2546
 
                if mobj.group(3) is not None:
2547
 
                        self._youtube_ie.extract(mobj.group(3))
2548
 
                        return
2549
 
 
2550
 
                # Download playlist pages
2551
 
                # prefix is 'p' as default for playlists but there are other types that need extra care
2552
 
                playlist_prefix = mobj.group(1)
2553
 
                if playlist_prefix == 'a':
2554
 
                        playlist_access = 'artist'
2555
 
                else:
2556
 
                        playlist_prefix = 'p'
2557
 
                        playlist_access = 'view_play_list'
2558
 
                playlist_id = mobj.group(2)
2559
 
                video_ids = []
2560
 
                pagenum = 1
2561
 
 
2562
 
                while True:
2563
 
                        self.report_download_page(playlist_id, pagenum)
2564
 
                        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2565
 
                        request = urllib2.Request(url)
2566
 
                        try:
2567
 
                                page = urllib2.urlopen(request).read()
2568
 
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2569
 
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2570
 
                                return
2571
 
 
2572
 
                        # Extract video identifiers
2573
 
                        ids_in_page = []
2574
 
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2575
 
                                if mobj.group(1) not in ids_in_page:
2576
 
                                        ids_in_page.append(mobj.group(1))
2577
 
                        video_ids.extend(ids_in_page)
2578
 
 
2579
 
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2580
 
                                break
2581
 
                        pagenum = pagenum + 1
2582
 
 
2583
 
                playliststart = self._downloader.params.get('playliststart', 1) - 1
2584
 
                playlistend = self._downloader.params.get('playlistend', -1)
2585
 
                video_ids = video_ids[playliststart:playlistend]
2586
 
 
2587
 
                for id in video_ids:
2588
 
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2589
 
                return
2590
 
 
2591
 
 
2592
 
class YoutubeUserIE(InfoExtractor):
2593
 
        """Information Extractor for YouTube users."""
2594
 
 
2595
 
        _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2596
 
        _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2597
 
        _GDATA_PAGE_SIZE = 50
2598
 
        _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2599
 
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2600
 
        _youtube_ie = None
2601
 
        IE_NAME = u'youtube:user'
2602
 
 
2603
 
        def __init__(self, youtube_ie, downloader=None):
2604
 
                InfoExtractor.__init__(self, downloader)
2605
 
                self._youtube_ie = youtube_ie
2606
 
 
2607
 
        def report_download_page(self, username, start_index):
2608
 
                """Report attempt to download user page."""
2609
 
                self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2610
 
                                (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2611
 
 
2612
 
        def _real_initialize(self):
2613
 
                self._youtube_ie.initialize()
2614
 
 
2615
 
        def _real_extract(self, url):
2616
 
                # Extract username
2617
 
                mobj = re.match(self._VALID_URL, url)
2618
 
                if mobj is None:
2619
 
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2620
 
                        return
2621
 
 
2622
 
                username = mobj.group(1)
2623
 
 
2624
 
                # Download video ids using YouTube Data API. Result size per
2625
 
                # query is limited (currently to 50 videos) so we need to query
2626
 
                # page by page until there are no video ids - it means we got
2627
 
                # all of them.
2628
 
 
2629
 
                video_ids = []
2630
 
                pagenum = 0
2631
 
 
2632
 
                while True:
2633
 
                        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2634
 
                        self.report_download_page(username, start_index)
2635
 
 
2636
 
                        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2637
 
 
2638
 
                        try:
2639
 
                                page = urllib2.urlopen(request).read()
2640
 
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2641
 
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2642
 
                                return
2643
 
 
2644
 
                        # Extract video identifiers
2645
 
                        ids_in_page = []
2646
 
 
2647
 
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2648
 
                                if mobj.group(1) not in ids_in_page:
2649
 
                                        ids_in_page.append(mobj.group(1))
2650
 
 
2651
 
                        video_ids.extend(ids_in_page)
2652
 
 
2653
 
                        # A little optimization - if current page is not
2654
 
                        # "full", ie. does not contain PAGE_SIZE video ids then
2655
 
                        # we can assume that this page is the last one - there
2656
 
                        # are no more ids on further pages - no need to query
2657
 
                        # again.
2658
 
 
2659
 
                        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2660
 
                                break
2661
 
 
2662
 
                        pagenum += 1
2663
 
 
2664
 
                all_ids_count = len(video_ids)
2665
 
                playliststart = self._downloader.params.get('playliststart', 1) - 1
2666
 
                playlistend = self._downloader.params.get('playlistend', -1)
2667
 
 
2668
 
                if playlistend == -1:
2669
 
                        video_ids = video_ids[playliststart:]
2670
 
                else:
2671
 
                        video_ids = video_ids[playliststart:playlistend]
2672
 
 
2673
 
                self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2674
 
                                (username, all_ids_count, len(video_ids)))
2675
 
 
2676
 
                for video_id in video_ids:
2677
 
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2678
 
 
2679
 
 
2680
 
class DepositFilesIE(InfoExtractor):
2681
 
        """Information extractor for depositfiles.com"""
2682
 
 
2683
 
        _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2684
 
        IE_NAME = u'DepositFiles'
2685
 
 
2686
 
        def __init__(self, downloader=None):
2687
 
                InfoExtractor.__init__(self, downloader)
2688
 
 
2689
 
        def report_download_webpage(self, file_id):
2690
 
                """Report webpage download."""
2691
 
                self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2692
 
 
2693
 
        def report_extraction(self, file_id):
2694
 
                """Report information extraction."""
2695
 
                self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2696
 
 
2697
 
        def _real_extract(self, url):
2698
 
                # At this point we have a new file
2699
 
                self._downloader.increment_downloads()
2700
 
 
2701
 
                file_id = url.split('/')[-1]
2702
 
                # Rebuild url in english locale
2703
 
                url = 'http://depositfiles.com/en/files/' + file_id
2704
 
 
2705
 
                # Retrieve file webpage with 'Free download' button pressed
2706
 
                free_download_indication = { 'gateway_result' : '1' }
2707
 
                request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2708
 
                try:
2709
 
                        self.report_download_webpage(file_id)
2710
 
                        webpage = urllib2.urlopen(request).read()
2711
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2712
 
                        self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2713
 
                        return
2714
 
 
2715
 
                # Search for the real file URL
2716
 
                mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2717
 
                if (mobj is None) or (mobj.group(1) is None):
2718
 
                        # Try to figure out reason of the error.
2719
 
                        mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2720
 
                        if (mobj is not None) and (mobj.group(1) is not None):
2721
 
                                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2722
 
                                self._downloader.trouble(u'ERROR: %s' % restriction_message)
2723
 
                        else:
2724
 
                                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2725
 
                        return
2726
 
 
2727
 
                file_url = mobj.group(1)
2728
 
                file_extension = os.path.splitext(file_url)[1][1:]
2729
 
 
2730
 
                # Search for file title
2731
 
                mobj = re.search(r'<b title="(.*?)">', webpage)
2732
 
                if mobj is None:
2733
 
                        self._downloader.trouble(u'ERROR: unable to extract title')
2734
 
                        return
2735
 
                file_title = mobj.group(1).decode('utf-8')
2736
 
 
2737
 
                try:
2738
 
                        # Process file information
2739
 
                        self._downloader.process_info({
2740
 
                                'id':           file_id.decode('utf-8'),
2741
 
                                'url':          file_url.decode('utf-8'),
2742
 
                                'uploader':     u'NA',
2743
 
                                'upload_date':  u'NA',
2744
 
                                'title':        file_title,
2745
 
                                'stitle':       file_title,
2746
 
                                'ext':          file_extension.decode('utf-8'),
2747
 
                                'format':       u'NA',
2748
 
                                'player_url':   None,
2749
 
                        })
2750
 
                except UnavailableVideoError, err:
2751
 
                        self._downloader.trouble(u'ERROR: unable to download file')
2752
 
 
2753
 
 
2754
 
class FacebookIE(InfoExtractor):
2755
 
        """Information Extractor for Facebook"""
2756
 
 
2757
 
        _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2758
 
        _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2759
 
        _NETRC_MACHINE = 'facebook'
2760
 
        _available_formats = ['video', 'highqual', 'lowqual']
2761
 
        _video_extensions = {
2762
 
                'video': 'mp4',
2763
 
                'highqual': 'mp4',
2764
 
                'lowqual': 'mp4',
2765
 
        }
2766
 
        IE_NAME = u'facebook'
2767
 
 
2768
 
        def __init__(self, downloader=None):
2769
 
                InfoExtractor.__init__(self, downloader)
2770
 
 
2771
 
        def _reporter(self, message):
2772
 
                """Add header and report message."""
2773
 
                self._downloader.to_screen(u'[facebook] %s' % message)
2774
 
 
2775
 
        def report_login(self):
2776
 
                """Report attempt to log in."""
2777
 
                self._reporter(u'Logging in')
2778
 
 
2779
 
        def report_video_webpage_download(self, video_id):
2780
 
                """Report attempt to download video webpage."""
2781
 
                self._reporter(u'%s: Downloading video webpage' % video_id)
2782
 
 
2783
 
        def report_information_extraction(self, video_id):
2784
 
                """Report attempt to extract video information."""
2785
 
                self._reporter(u'%s: Extracting video information' % video_id)
2786
 
 
2787
 
        def _parse_page(self, video_webpage):
2788
 
                """Extract video information from page"""
2789
 
                # General data
2790
 
                data = {'title': r'\("video_title", "(.*?)"\)',
2791
 
                        'description': r'<div class="datawrap">(.*?)</div>',
2792
 
                        'owner': r'\("video_owner_name", "(.*?)"\)',
2793
 
                        'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2794
 
                        }
2795
 
                video_info = {}
2796
 
                for piece in data.keys():
2797
 
                        mobj = re.search(data[piece], video_webpage)
2798
 
                        if mobj is not None:
2799
 
                                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2800
 
 
2801
 
                # Video urls
2802
 
                video_urls = {}
2803
 
                for fmt in self._available_formats:
2804
 
                        mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2805
 
                        if mobj is not None:
2806
 
                                # URL is in a Javascript segment inside an escaped Unicode format within
2807
 
                                # the generally utf-8 page
2808
 
                                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2809
 
                video_info['video_urls'] = video_urls
2810
 
 
2811
 
                return video_info
2812
 
 
2813
 
        def _real_initialize(self):
2814
 
                if self._downloader is None:
2815
 
                        return
2816
 
 
2817
 
                useremail = None
2818
 
                password = None
2819
 
                downloader_params = self._downloader.params
2820
 
 
2821
 
                # Attempt to use provided username and password or .netrc data
2822
 
                if downloader_params.get('username', None) is not None:
2823
 
                        useremail = downloader_params['username']
2824
 
                        password = downloader_params['password']
2825
 
                elif downloader_params.get('usenetrc', False):
2826
 
                        try:
2827
 
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2828
 
                                if info is not None:
2829
 
                                        useremail = info[0]
2830
 
                                        password = info[2]
2831
 
                                else:
2832
 
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2833
 
                        except (IOError, netrc.NetrcParseError), err:
2834
 
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2835
 
                                return
2836
 
 
2837
 
                if useremail is None:
2838
 
                        return
2839
 
 
2840
 
                # Log in
2841
 
                login_form = {
2842
 
                        'email': useremail,
2843
 
                        'pass': password,
2844
 
                        'login': 'Log+In'
2845
 
                        }
2846
 
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2847
 
                try:
2848
 
                        self.report_login()
2849
 
                        login_results = urllib2.urlopen(request).read()
2850
 
                        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2851
 
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2852
 
                                return
2853
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2854
 
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2855
 
                        return
2856
 
 
2857
 
	def _real_extract(self, url):
		"""Extract and download a Facebook video.

		Matches the video ID out of `url`, fetches the watch page,
		parses it via self._parse_page(), selects the format(s) to
		download according to the downloader's 'format'/'format_limit'
		params, and hands one info dict per selected format to
		self._downloader.process_info().
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information (dict of page metadata; schema defined by _parse_page)
		video_info = self._parse_page(video_webpage)

		# uploader (mandatory — abort if missing)
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title (mandatory — abort if missing)
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# Filesystem-safe variant of the title for templating.
		simple_title = _simplify_title(video_title)

		# thumbnail image (optional — warn and fall back to empty string)
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date (optional): parse an RFC-2822 style date into YYYYMMDD
		upload_date = u'NA'
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					# parsedate_tz returns a 10-tuple; strftime needs the
					# first 9 fields (a struct_time-compatible tuple).
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except:
					# NOTE(review): bare except silently keeps u'NA' on any
					# strftime failure — deliberate best-effort, but it would
					# also hide unrelated errors.
					pass

		# description (optional, with placeholder default)
		video_description = video_info.get('description', 'No description available.')

		# Map of format id -> direct video URL, as parsed from the page.
		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			# Restrict the preference list to formats at or below the limit.
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		# NOTE(review): if url_map is empty, video_url_list is never assigned
		# and this loop raises NameError — confirm whether _parse_page
		# guarantees a non-empty 'video_urls'.
		for format_param, video_real_url in video_url_list:

			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension (default to mp4 when the format is unknown)
			video_extension = self._video_extensions.get(format_param, 'mp4')

			try:
				# Process video information
				self._downloader.process_info({
					'id':           video_id.decode('utf-8'),
					'url':          video_real_url.decode('utf-8'),
					'uploader':     video_uploader.decode('utf-8'),
					'upload_date':  upload_date,
					'title':        video_title,
					'stitle':       simple_title,
					'ext':          video_extension.decode('utf-8'),
					'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':    video_thumbnail.decode('utf-8'),
					'description':  video_description.decode('utf-8'),
					'player_url':   None,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
 
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv

	Handles two cases: URLs that already point at a raw video file
	(detected via the response Content-Type), and regular page URLs,
	for which the site's JSON API (skin=json) is queried.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Used to pull the filename extension off the media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video information from a blip.tv URL and queue the download."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-API query string with the correct separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The URL is the media file itself; derive id/title/ext
				# from the last path component.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					# Keep the open handle so the downloader can reuse it.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# The API may wrap the payload in a 'Post' envelope.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data
	
				# e.g. '05-31-11 08:21PM' -> '20110531'
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)
	
				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Covers both malformed JSON values and missing keys.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
 
class MyVideoIE(InfoExtractor):
3065
 
        """Information Extractor for myvideo.de."""
3066
 
 
3067
 
        _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3068
 
        IE_NAME = u'myvideo'
3069
 
 
3070
 
        def __init__(self, downloader=None):
3071
 
                InfoExtractor.__init__(self, downloader)
3072
 
        
3073
 
        def report_download_webpage(self, video_id):
3074
 
                """Report webpage download."""
3075
 
                self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3076
 
 
3077
 
        def report_extraction(self, video_id):
3078
 
                """Report information extraction."""
3079
 
                self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3080
 
 
3081
 
        def _real_extract(self,url):
3082
 
                mobj = re.match(self._VALID_URL, url)
3083
 
                if mobj is None:
3084
 
                        self._download.trouble(u'ERROR: invalid URL: %s' % url)
3085
 
                        return
3086
 
 
3087
 
                video_id = mobj.group(1)
3088
 
 
3089
 
                # Get video webpage
3090
 
                request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3091
 
                try:
3092
 
                        self.report_download_webpage(video_id)
3093
 
                        webpage = urllib2.urlopen(request).read()
3094
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3095
 
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3096
 
                        return
3097
 
 
3098
 
                self.report_extraction(video_id)
3099
 
                mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3100
 
                                 webpage)
3101
 
                if mobj is None:
3102
 
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
3103
 
                        return
3104
 
                video_url = mobj.group(1) + ('/%s.flv' % video_id)
3105
 
 
3106
 
                mobj = re.search('<title>([^<]+)</title>', webpage)
3107
 
                if mobj is None:
3108
 
                        self._downloader.trouble(u'ERROR: unable to extract title')
3109
 
                        return
3110
 
 
3111
 
                video_title = mobj.group(1)
3112
 
                video_title = sanitize_title(video_title)
3113
 
 
3114
 
                simple_title = _simplify_title(video_title)
3115
 
 
3116
 
                try:
3117
 
                        self._downloader.process_info({
3118
 
                                'id':           video_id,
3119
 
                                'url':          video_url,
3120
 
                                'uploader':     u'NA',
3121
 
                                'upload_date':  u'NA',
3122
 
                                'title':        video_title,
3123
 
                                'stitle':       simple_title,
3124
 
                                'ext':          u'flv',
3125
 
                                'format':       u'NA',
3126
 
                                'player_url':   None,
3127
 
                        })
3128
 
                except UnavailableVideoError:
3129
 
                        self._downloader.trouble(u'\nERROR: Unable to download video')
3130
 
 
3131
 
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report

	Accepts either short aliases (':tds', ':colbert', ...) or full
	episode URLs. Resolves an episode page to an MRSS index, then a
	per-media configuration XML, and downloads the highest-bitrate
	rendition of each item found.
	"""

	# Short names match via the 'shortname' group; full URLs populate
	# 'showname' and (possibly empty) 'episode'.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
	
	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the MRSS show index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve `url` to one or more media items and queue downloads."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand short aliases into the show's full-episodes front page.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode -> the front page redirects to the newest one.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect: the final URL names the newest episode.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Each match: (full player URL, mtvnservices media URI).
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to get the canonical player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		# One MRSS <item> per act/segment of the episode.
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			# Collect (bitrate, url) pairs for every available rendition.
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# NOTE(review): assumes renditions are listed in ascending
			# bitrate order — confirm against the feed.
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
 
class EscapistIE(InfoExtractor):
	"""Information extractor for The Escapist

	Scrapes OpenGraph meta tags from the episode page to locate the
	player and its configuration URL, then reads the (JavaScript-ish)
	config to find the actual media URL.
	"""

	_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
	IE_NAME = u'escapist'

	def report_extraction(self, showName):
		"""Report start of information extraction."""
		self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

	def report_config_download(self, showName):
		"""Report download of the player configuration."""
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

	def _real_extract(self, url):
		"""Extract video information from an Escapist episode URL."""
		# Used to unescape HTML entities in the scraped meta values.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		showName = mobj.group('showname')
		videoId = mobj.group('episode')

		self.report_extraction(showName)
		try:
			webPage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
			return

		# NOTE(review): the .group(1) calls below raise AttributeError if a
		# meta tag is missing — no None checks; confirm acceptable.
		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
		description = htmlParser.unescape(descMatch.group(1))
		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
		imgUrl = htmlParser.unescape(imgMatch.group(1))
		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
		# The player URL carries its config location as a query parameter.
		configUrlMatch = re.search('config=(.*)$', playerUrl)
		configUrl = urllib2.unquote(configUrlMatch.group(1))

		self.report_config_download(showName)
		try:
			configJSON = urllib2.urlopen(configUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
			return

		# Technically, it's JavaScript, not JSON — normalize quotes so
		# json.loads can parse it.
		configJSON = configJSON.replace("'", '"')

		try:
			config = json.loads(configJSON)
		except (ValueError,), err:
			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
			return

		playlist = config['playlist']
		# Playlist entry 0 is not the episode; entry 1 holds the media URL.
		videoUrl = playlist[1]['url']

		self._downloader.increment_downloads()
		info = {
			'id': videoId,
			'url': videoUrl,
			'uploader': showName,
			'upload_date': None,
			'title': showName,
			'stitle': _simplify_title(showName),
			'ext': 'flv',
			'format': 'flv',
			'thumbnail': imgUrl,
			'description': description,
			'player_url': playerUrl,
		}

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
 
class CollegeHumorIE(InfoExtractor):
	"""Information extractor for collegehumor.com

	Maps the public video id to an internal id scraped from the page,
	then reads the 'moogaloop' metadata XML to obtain title,
	description, media URL and thumbnail.
	"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
	IE_NAME = u'collegehumor'

	def report_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract video information from a collegehumor.com video URL."""
		# NOTE(review): htmlParser is created but never used in this
		# method — possibly leftover from a copied template.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('videoid')

		self.report_webpage(video_id)
		request = urllib2.Request(url)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# The page embeds an internal numeric id distinct from the URL id.
		m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
		if m is None:
			self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
			return
		internal_video_id = m.group('internalvideoid')

		info = {
			'id': video_id,
			'internal_id': internal_video_id,
		}

		self.report_extraction(video_id)
		xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
		try:
			metaXml = urllib2.urlopen(xmlUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
			return

		mdoc = xml.etree.ElementTree.fromstring(metaXml)
		try:
			# Any missing element makes findall(...) return [] and the
			# [0] index raise IndexError, handled below.
			videoNode = mdoc.findall('./video')[0]
			info['description'] = videoNode.findall('./description')[0].text
			info['title'] = videoNode.findall('./caption')[0].text
			info['stitle'] = _simplify_title(info['title'])
			info['url'] = videoNode.findall('./file')[0].text
			info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
			# Extension is whatever follows the last '.' in the file URL.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
		except IndexError:
			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
			return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
 
class XVideosIE(InfoExtractor):
3418
 
        """Information extractor for xvideos.com"""
3419
 
 
3420
 
        _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3421
 
        IE_NAME = u'xvideos'
3422
 
 
3423
 
        def report_webpage(self, video_id):
3424
 
                """Report information extraction."""
3425
 
                self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3426
 
 
3427
 
        def report_extraction(self, video_id):
3428
 
                """Report information extraction."""
3429
 
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3430
 
 
3431
 
        def _real_extract(self, url):
3432
 
                htmlParser = HTMLParser.HTMLParser()
3433
 
 
3434
 
                mobj = re.match(self._VALID_URL, url)
3435
 
                if mobj is None:
3436
 
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3437
 
                        return
3438
 
                video_id = mobj.group(1).decode('utf-8')
3439
 
 
3440
 
                self.report_webpage(video_id)
3441
 
 
3442
 
                request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3443
 
                try:
3444
 
                        webpage = urllib2.urlopen(request).read()
3445
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3446
 
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3447
 
                        return
3448
 
 
3449
 
                self.report_extraction(video_id)
3450
 
 
3451
 
 
3452
 
                # Extract video URL
3453
 
                mobj = re.search(r'flv_url=(.+?)&', webpage)
3454
 
                if mobj is None:
3455
 
                        self._downloader.trouble(u'ERROR: unable to extract video url')
3456
 
                        return
3457
 
                video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3458
 
 
3459
 
 
3460
 
                # Extract title
3461
 
                mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3462
 
                if mobj is None:
3463
 
                        self._downloader.trouble(u'ERROR: unable to extract video title')
3464
 
                        return
3465
 
                video_title = mobj.group(1).decode('utf-8')
3466
 
 
3467
 
 
3468
 
                # Extract video thumbnail
3469
 
                mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3470
 
                if mobj is None:
3471
 
                        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3472
 
                        return
3473
 
                video_thumbnail = mobj.group(1).decode('utf-8')
3474
 
 
3475
 
 
3476
 
 
3477
 
                self._downloader.increment_downloads()
3478
 
                info = {
3479
 
                        'id': video_id,
3480
 
                        'url': video_url,
3481
 
                        'uploader': None,
3482
 
                        'upload_date': None,
3483
 
                        'title': video_title,
3484
 
                        'stitle': _simplify_title(video_title),
3485
 
                        'ext': 'flv',
3486
 
                        'format': 'flv',
3487
 
                        'thumbnail': video_thumbnail,
3488
 
                        'description': None,
3489
 
                        'player_url': None,
3490
 
                }
3491
 
 
3492
 
                try:
3493
 
                        self._downloader.process_info(info)
3494
 
                except UnavailableVideoError, err:
3495
 
                        self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3496
 
 
3497
 
 
3498
 
class SoundcloudIE(InfoExtractor):
3499
 
        """Information extractor for soundcloud.com
3500
 
           To access the media, the uid of the song and a stream token
3501
 
           must be extracted from the page source and the script must make
3502
 
           a request to media.soundcloud.com/crossdomain.xml. Then
3503
 
           the media can be grabbed by requesting from an url composed
3504
 
           of the stream token and uid
3505
 
         """
3506
 
 
3507
 
        _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3508
 
        IE_NAME = u'soundcloud'
3509
 
 
3510
 
        def __init__(self, downloader=None):
3511
 
                InfoExtractor.__init__(self, downloader)
3512
 
 
3513
 
        def report_webpage(self, video_id):
3514
 
                """Report information extraction."""
3515
 
                self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3516
 
 
3517
 
        def report_extraction(self, video_id):
3518
 
                """Report information extraction."""
3519
 
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3520
 
 
3521
 
        def _real_extract(self, url):
3522
 
                htmlParser = HTMLParser.HTMLParser()
3523
 
 
3524
 
                mobj = re.match(self._VALID_URL, url)
3525
 
                if mobj is None:
3526
 
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3527
 
                        return
3528
 
 
3529
 
                # extract uploader (which is in the url)
3530
 
                uploader = mobj.group(1).decode('utf-8')
3531
 
                # extract simple title (uploader + slug of song title)
3532
 
                slug_title =  mobj.group(2).decode('utf-8')
3533
 
                simple_title = uploader + '-' + slug_title
3534
 
 
3535
 
                self.report_webpage('%s/%s' % (uploader, slug_title))
3536
 
 
3537
 
                request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3538
 
                try:
3539
 
                        webpage = urllib2.urlopen(request).read()
3540
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3541
 
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3542
 
                        return
3543
 
 
3544
 
                self.report_extraction('%s/%s' % (uploader, slug_title))
3545
 
 
3546
 
                # extract uid and stream token that soundcloud hands out for access
3547
 
                mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3548
 
                if mobj:
3549
 
                        video_id = mobj.group(1)
3550
 
                        stream_token = mobj.group(2)
3551
 
 
3552
 
                # extract unsimplified title
3553
 
                mobj = re.search('"title":"(.*?)",', webpage)
3554
 
                if mobj:
3555
 
                        title = mobj.group(1)
3556
 
 
3557
 
                # construct media url (with uid/token)
3558
 
                mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3559
 
                mediaURL = mediaURL % (video_id, stream_token)
3560
 
 
3561
 
                # description
3562
 
                description = u'No description available'
3563
 
                mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3564
 
                if mobj:
3565
 
                        description = mobj.group(1)
3566
 
                
3567
 
                # upload date
3568
 
                upload_date = None
3569
 
                mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3570
 
                if mobj:
3571
 
                        try:
3572
 
                                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3573
 
                        except Exception, e:
3574
 
                                print str(e)
3575
 
 
3576
 
                # for soundcloud, a request to a cross domain is required for cookies
3577
 
                request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3578
 
 
3579
 
                try:
3580
 
                        self._downloader.process_info({
3581
 
                                'id':           video_id.decode('utf-8'),
3582
 
                                'url':          mediaURL,
3583
 
                                'uploader':     uploader.decode('utf-8'),
3584
 
                                'upload_date':  upload_date,
3585
 
                                'title':        simple_title.decode('utf-8'),
3586
 
                                'stitle':       simple_title.decode('utf-8'),
3587
 
                                'ext':          u'mp3',
3588
 
                                'format':       u'NA',
3589
 
                                'player_url':   None,
3590
 
                                'description': description.decode('utf-8')
3591
 
                        })
3592
 
                except UnavailableVideoError:
3593
 
                        self._downloader.trouble(u'\nERROR: unable to download video')
3594
 
 
3595
 
 
3596
 
class InfoQIE(InfoExtractor):
3597
 
        """Information extractor for infoq.com"""
3598
 
 
3599
 
        _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3600
 
        IE_NAME = u'infoq'
3601
 
 
3602
 
        def report_webpage(self, video_id):
3603
 
                """Report information extraction."""
3604
 
                self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3605
 
 
3606
 
        def report_extraction(self, video_id):
3607
 
                """Report information extraction."""
3608
 
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3609
 
 
3610
 
        def _real_extract(self, url):
3611
 
                htmlParser = HTMLParser.HTMLParser()
3612
 
 
3613
 
                mobj = re.match(self._VALID_URL, url)
3614
 
                if mobj is None:
3615
 
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3616
 
                        return
3617
 
 
3618
 
                self.report_webpage(url)
3619
 
 
3620
 
                request = urllib2.Request(url)
3621
 
                try:
3622
 
                        webpage = urllib2.urlopen(request).read()
3623
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3624
 
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3625
 
                        return
3626
 
 
3627
 
                self.report_extraction(url)
3628
 
 
3629
 
 
3630
 
                # Extract video URL
3631
 
                mobj = re.search(r"jsclassref='([^']*)'", webpage)
3632
 
                if mobj is None:
3633
 
                        self._downloader.trouble(u'ERROR: unable to extract video url')
3634
 
                        return
3635
 
                video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3636
 
 
3637
 
 
3638
 
                # Extract title
3639
 
                mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3640
 
                if mobj is None:
3641
 
                        self._downloader.trouble(u'ERROR: unable to extract video title')
3642
 
                        return
3643
 
                video_title = mobj.group(1).decode('utf-8')
3644
 
 
3645
 
                # Extract description
3646
 
                video_description = u'No description available.'
3647
 
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3648
 
                if mobj is not None:
3649
 
                        video_description = mobj.group(1).decode('utf-8')
3650
 
 
3651
 
                video_filename = video_url.split('/')[-1]
3652
 
                video_id, extension = video_filename.split('.')
3653
 
 
3654
 
                self._downloader.increment_downloads()
3655
 
                info = {
3656
 
                        'id': video_id,
3657
 
                        'url': video_url,
3658
 
                        'uploader': None,
3659
 
                        'upload_date': None,
3660
 
                        'title': video_title,
3661
 
                        'stitle': _simplify_title(video_title),
3662
 
                        'ext': extension,
3663
 
                        'format': extension, # Extension is always(?) mp4, but seems to be flv
3664
 
                        'thumbnail': None,
3665
 
                        'description': video_description,
3666
 
                        'player_url': None,
3667
 
                }
3668
 
 
3669
 
                try:
3670
 
                        self._downloader.process_info(info)
3671
 
                except UnavailableVideoError, err:
3672
 
                        self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3673
 
 
3674
 
class MixcloudIE(InfoExtractor):
3675
 
        """Information extractor for www.mixcloud.com"""
3676
 
        _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3677
 
        IE_NAME = u'mixcloud'
3678
 
 
3679
 
        def __init__(self, downloader=None):
3680
 
                InfoExtractor.__init__(self, downloader)
3681
 
 
3682
 
        def report_download_json(self, file_id):
3683
 
                """Report JSON download."""
3684
 
                self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3685
 
 
3686
 
        def report_extraction(self, file_id):
3687
 
                """Report information extraction."""
3688
 
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3689
 
 
3690
 
        def get_urls(self, jsonData, fmt, bitrate='best'):
3691
 
                """Get urls from 'audio_formats' section in json"""
3692
 
                file_url = None
3693
 
                try:
3694
 
                        bitrate_list = jsonData[fmt]
3695
 
                        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3696
 
                                bitrate = max(bitrate_list) # select highest
3697
 
 
3698
 
                        url_list = jsonData[fmt][bitrate]
3699
 
                except TypeError: # we have no bitrate info.
3700
 
                        url_list = jsonData[fmt]
3701
 
                                
3702
 
                return url_list
3703
 
 
3704
 
        def check_urls(self, url_list):
3705
 
                """Returns 1st active url from list"""
3706
 
                for url in url_list:
3707
 
                        try:
3708
 
                                urllib2.urlopen(url)
3709
 
                                return url
3710
 
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3711
 
                                url = None
3712
 
 
3713
 
                return None
3714
 
 
3715
 
        def _print_formats(self, formats):
3716
 
                print 'Available formats:'
3717
 
                for fmt in formats.keys():
3718
 
                        for b in formats[fmt]:
3719
 
                                try:
3720
 
                                        ext = formats[fmt][b][0]
3721
 
                                        print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3722
 
                                except TypeError: # we have no bitrate info
3723
 
                                        ext = formats[fmt][0]
3724
 
                                        print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3725
 
                                        break
3726
 
 
3727
 
        def _real_extract(self, url):
3728
 
                mobj = re.match(self._VALID_URL, url)
3729
 
                if mobj is None:
3730
 
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3731
 
                        return
3732
 
                # extract uploader & filename from url
3733
 
                uploader = mobj.group(1).decode('utf-8')
3734
 
                file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3735
 
 
3736
 
                # construct API request
3737
 
                file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3738
 
                # retrieve .json file with links to files
3739
 
                request = urllib2.Request(file_url)
3740
 
                try:
3741
 
                        self.report_download_json(file_url)
3742
 
                        jsonData = urllib2.urlopen(request).read()
3743
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3744
 
                        self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3745
 
                        return
3746
 
 
3747
 
                # parse JSON
3748
 
                json_data = json.loads(jsonData)
3749
 
                player_url = json_data['player_swf_url']
3750
 
                formats = dict(json_data['audio_formats'])
3751
 
 
3752
 
                req_format = self._downloader.params.get('format', None)
3753
 
                bitrate = None
3754
 
 
3755
 
                if self._downloader.params.get('listformats', None):
3756
 
                        self._print_formats(formats)
3757
 
                        return
3758
 
 
3759
 
                if req_format is None or req_format == 'best':
3760
 
                        for format_param in formats.keys():
3761
 
                                url_list = self.get_urls(formats, format_param)
3762
 
                                # check urls
3763
 
                                file_url = self.check_urls(url_list)
3764
 
                                if file_url is not None:
3765
 
                                        break # got it!
3766
 
                else:
3767
 
                        if req_format not in formats.keys():
3768
 
                                self._downloader.trouble(u'ERROR: format is not available')
3769
 
                                return
3770
 
 
3771
 
                        url_list = self.get_urls(formats, req_format)
3772
 
                        file_url = self.check_urls(url_list)
3773
 
                        format_param = req_format
3774
 
 
3775
 
                # We have audio
3776
 
                self._downloader.increment_downloads()
3777
 
                try:
3778
 
                        # Process file information
3779
 
                        self._downloader.process_info({
3780
 
                                'id': file_id.decode('utf-8'),
3781
 
                                'url': file_url.decode('utf-8'),
3782
 
                                'uploader':     uploader.decode('utf-8'),
3783
 
                                'upload_date': u'NA',
3784
 
                                'title': json_data['name'],
3785
 
                                'stitle': _simplify_title(json_data['name']),
3786
 
                                'ext': file_url.split('.')[-1].decode('utf-8'),
3787
 
                                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3788
 
                                'thumbnail': json_data['thumbnail_url'],
3789
 
                                'description': json_data['description'],
3790
 
                                'player_url': player_url.decode('utf-8'),
3791
 
                        })
3792
 
                except UnavailableVideoError, err:
3793
 
                        self._downloader.trouble(u'ERROR: unable to download file')
3794
 
 
3795
 
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom.

	Handles three URL shapes, distinguished by the regex named groups:
	a single video page (course + video), a course page (course only,
	treated as a playlist of video pages), and the site root (treated
	as a playlist of course pages). Playlist entries are re-dispatched
	through self.extract().
	"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the videos.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# IndexError from findall(...)[0] means a malformed XML doc.
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title; fall back to the id when the <h1> is missing.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect the course's video-page links (deduplicated, in order).
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			# Recurse into each video page.
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Collect every course-page link and recurse into each one.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
class MTVIE(InfoExtractor):
3915
 
        """Information extractor for MTV.com"""
3916
 
 
3917
 
        _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3918
 
        IE_NAME = u'mtv'
3919
 
 
3920
 
        def report_webpage(self, video_id):
3921
 
                """Report information extraction."""
3922
 
                self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3923
 
 
3924
 
        def report_extraction(self, video_id):
3925
 
                """Report information extraction."""
3926
 
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3927
 
 
3928
 
        def _real_extract(self, url):
3929
 
                mobj = re.match(self._VALID_URL, url)
3930
 
                if mobj is None:
3931
 
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3932
 
                        return
3933
 
                if not mobj.group('proto'):
3934
 
                        url = 'http://' + url
3935
 
                video_id = mobj.group('videoid')
3936
 
                self.report_webpage(video_id)
3937
 
 
3938
 
                request = urllib2.Request(url)
3939
 
                try:
3940
 
                        webpage = urllib2.urlopen(request).read()
3941
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3942
 
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3943
 
                        return
3944
 
 
3945
 
                mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3946
 
                if mobj is None:
3947
 
                        self._downloader.trouble(u'ERROR: unable to extract song name')
3948
 
                        return
3949
 
                song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3950
 
                mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3951
 
                if mobj is None:
3952
 
                        self._downloader.trouble(u'ERROR: unable to extract performer')
3953
 
                        return
3954
 
                performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3955
 
                video_title = performer + ' - ' + song_name 
3956
 
 
3957
 
                mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3958
 
                if mobj is None:
3959
 
                        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3960
 
                        return
3961
 
                mtvn_uri = mobj.group(1)
3962
 
 
3963
 
                mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3964
 
                if mobj is None:
3965
 
                        self._downloader.trouble(u'ERROR: unable to extract content id')
3966
 
                        return
3967
 
                content_id = mobj.group(1)
3968
 
 
3969
 
                videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3970
 
                self.report_extraction(video_id)
3971
 
                request = urllib2.Request(videogen_url)
3972
 
                try:
3973
 
                        metadataXml = urllib2.urlopen(request).read()
3974
 
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3975
 
                        self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3976
 
                        return
3977
 
 
3978
 
                mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3979
 
                renditions = mdoc.findall('.//rendition')
3980
 
 
3981
 
                # For now, always pick the highest quality.
3982
 
                rendition = renditions[-1]
3983
 
 
3984
 
                try:
3985
 
                        _,_,ext = rendition.attrib['type'].partition('/')
3986
 
                        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3987
 
                        video_url = rendition.find('./src').text
3988
 
                except KeyError:
3989
 
                        self._downloader.trouble('Invalid rendition field.')
3990
 
                        return
3991
 
 
3992
 
                self._downloader.increment_downloads()
3993
 
                info = {
3994
 
                        'id': video_id,
3995
 
                        'url': video_url,
3996
 
                        'uploader': performer,
3997
 
                        'title': video_title,
3998
 
                        'stitle': _simplify_title(video_title),
3999
 
                        'ext': ext,
4000
 
                        'format': format,
4001
 
                }
4002
 
 
4003
 
                try:
4004
 
                        self._downloader.process_info(info)
4005
 
                except UnavailableVideoError, err:
4006
 
                        self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4007
 
 
4008
 
 
4009
 
class PostProcessor(object):
        """Base class for post-download processing steps.

        A PostProcessor is attached to a downloader through its
        add_post_processor() method. After every successful download the
        downloader walks its chain of processors, handing each one the
        dictionary produced by the previous one.

        The chain stops as soon as a processor returns None or the end of
        the chain is reached. Registration is mutual, mirroring how
        InfoExtractor objects are wired up.
        """

        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach the downloader this post processor reports to."""
                self._downloader = downloader

        def run(self, information):
                """Process one finished download.

                "information" is an InfoExtractor-style dictionary with one
                extra key, "filepath", naming the downloaded file. Returning
                None halts the post-processing chain; returning an information
                dictionary (possibly with fields changed) forwards it to the
                next processor. Implementations may raise PostProcessingError,
                which the calling downloader handles.
                """
                return information
class AudioConversionError(Exception):
        """Raised when ffmpeg/ffprobe audio extraction or conversion fails.

        Derives from Exception rather than BaseException: BaseException is
        reserved for exit-style signals (KeyboardInterrupt, SystemExit), and
        deriving from it made this error invisible to generic
        ``except Exception`` handlers.
        """

        def __init__(self, message):
                # Initialize the base class too so str(err) carries the
                # message; keep the explicit .message attribute because
                # callers (e.g. FFmpegExtractAudioPP.run) read it directly.
                Exception.__init__(self, message)
                self.message = message
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that extracts the audio track of a downloaded
        video with ffmpeg, probing the source codec with ffprobe and
        stream-copying losslessly whenever the preferred codec allows it."""

        def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
                # preferredcodec: 'best', 'aac', 'mp3', 'vorbis', 'm4a' or 'wav';
                #   None is treated as 'best' (keep the source codec when possible).
                # preferredquality: ffmpeg audio bitrate spec (e.g. '128K') or None.
                # keepvideo: when False, the source video file is removed after a
                #   successful extraction.
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec
                self._preferredquality = preferredquality
                self._keepvideo = keepvideo

        @staticmethod
        def get_audio_codec(path):
                # Probe `path` with ffprobe and return the codec name of its audio
                # stream, or None when probing fails or no audio stream is found.
                try:
                        cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
                        # `file()` is the Python 2 builtin; stderr is discarded.
                        handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
                        output = handle.communicate()[0]
                        if handle.wait() != 0:
                                return None
                except (IOError, OSError):
                        # ffprobe missing or not executable.
                        return None
                audio_codec = None
                # ffprobe emits key=value lines; within a stream section
                # codec_name precedes codec_type, so remember the last
                # codec_name seen and report it once an audio stream is hit.
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=')[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                # Re-mux or transcode the audio of `path` into `out_path`.
                # codec None lets ffmpeg choose; raises AudioConversionError on
                # any failure, including ffmpeg not being installed.
                if codec is None:
                        acodec_opts = []
                else:
                        acodec_opts = ['-acodec', codec]
                cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
                try:
                        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        stdout,stderr = p.communicate()
                except (IOError, OSError):
                        e = sys.exc_info()[1]
                        if isinstance(e, OSError) and e.errno == 2:
                                # errno 2 == ENOENT: the ffmpeg binary was not found.
                                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
                        else:
                                raise e
                if p.returncode != 0:
                        # Surface ffmpeg's last stderr line as the error message.
                        msg = stderr.strip().split('\n')[-1]
                        raise AudioConversionError(msg)

        def run(self, information):
                # Entry point of the post-processing chain: extract the audio of
                # information['filepath'], pick codec/container options, and on
                # success rewrite 'filepath' to point at the audio file.
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                more_opts = []
                if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
                        if self._preferredcodec == 'm4a' and filecodec == 'aac':
                                # Lossless, but in another container
                                acodec = 'copy'
                                extension = self._preferredcodec
                                more_opts = ['-absf', 'aac_adtstoasc']
                        elif filecodec in ['aac', 'mp3', 'vorbis']:
                                # Lossless if possible
                                acodec = 'copy'
                                extension = filecodec
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                                if filecodec == 'vorbis':
                                        extension = 'ogg'
                        else:
                                # MP3 otherwise.
                                acodec = 'libmp3lame'
                                extension = 'mp3'
                                more_opts = []
                                if self._preferredquality is not None:
                                        more_opts += ['-ab', self._preferredquality]
                else:
                        # We convert the audio (lossy)
                        acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
                        extension = self._preferredcodec
                        more_opts = []
                        if self._preferredquality is not None:
                                more_opts += ['-ab', self._preferredquality]
                        if self._preferredcodec == 'aac':
                                more_opts += ['-f', 'adts']
                        if self._preferredcodec == 'm4a':
                                more_opts += ['-absf', 'aac_adtstoasc']
                        if self._preferredcodec == 'vorbis':
                                extension = 'ogg'
                        if self._preferredcodec == 'wav':
                                extension = 'wav'
                                more_opts += ['-f', 'wav']

                prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
                new_path = prefix + sep + extension
                self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
                try:
                        self.run_ffmpeg(path, new_path, acodec, more_opts)
                except:
                        etype,e,tb = sys.exc_info()
                        if isinstance(e, AudioConversionError):
                                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
                        else:
                                self._downloader.to_stderr(u'ERROR: error running ffmpeg')
                        return None

                # Try to update the date time for extracted audio file.
                if information.get('filetime') is not None:
                        try:
                                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
                        except:
                                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

                if not self._keepvideo:
                        try:
                                os.remove(_encodeFilename(path))
                        except (IOError, OSError):
                                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                                return None

                information['filepath'] = new_path
                return information
4185
 
def updateSelf(downloader, filename):
4186
 
        ''' Update the program file with the latest version from the repository '''
4187
 
        # Note: downloader only used for options
4188
 
        if not os.access(filename, os.W_OK):
4189
 
                sys.exit('ERROR: no write permissions on %s' % filename)
4190
 
 
4191
 
        downloader.to_screen(u'Updating to latest version...')
4192
 
 
4193
 
        try:
4194
 
                try:
4195
 
                        urlh = urllib.urlopen(UPDATE_URL)
4196
 
                        newcontent = urlh.read()
4197
 
                        
4198
 
                        vmatch = re.search("__version__ = '([^']+)'", newcontent)
4199
 
                        if vmatch is not None and vmatch.group(1) == __version__:
4200
 
                                downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4201
 
                                return
4202
 
                finally:
4203
 
                        urlh.close()
4204
 
        except (IOError, OSError), err:
4205
 
                sys.exit('ERROR: unable to download latest version')
4206
 
 
4207
 
        try:
4208
 
                outf = open(filename, 'wb')
4209
 
                try:
4210
 
                        outf.write(newcontent)
4211
 
                finally:
4212
 
                        outf.close()
4213
 
        except (IOError, OSError), err:
4214
 
                sys.exit('ERROR: unable to overwrite current version')
4215
 
 
4216
 
        downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4217
 
 
4218
 
def parseOpts():
        """Build the optparse parser, read config files, and parse argv.

        Returns (parser, opts, args) where args are the leftover positional
        URL arguments. Options are read from /etc/youtube-dl.conf, the user
        config file, then the command line, in that order (later wins).
        """
        def _readOptions(filename_bytes):
                # Read one option-file: shell-style tokens, '#' comments allowed.
                # A missing file yields an empty option list.
                try:
                        optionf = open(filename_bytes)
                except IOError:
                        return [] # silently skip if file is not present
                try:
                        res = []
                        for l in optionf:
                                res += shlex.split(l, comments=True)
                finally:
                        optionf.close()
                return res

        def _format_option_string(option):
                ''' ('-o', '--option') -> -o, --format METAVAR'''

                opts = []

                if option._short_opts: opts.append(option._short_opts[0])
                if option._long_opts: opts.append(option._long_opts[0])
                if len(opts) > 1: opts.insert(1, ', ')

                if option.takes_value(): opts.append(' %s' % option.metavar)

                return "".join(opts)

        def _find_term_columns():
                # Terminal width: $COLUMNS if set, else ask `stty size`,
                # else None (fall back to the default wrap width).
                columns = os.environ.get('COLUMNS', None)
                if columns:
                        return int(columns)

                try:
                        sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        out,err = sp.communicate()
                        return int(out.split()[1])
                except:
                        pass
                return None

        max_width = 80
        max_help_position = 80

        # No need to wrap help messages if we're on a wide console
        columns = _find_term_columns()
        if columns: max_width = columns

        fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
        fmt.format_option_strings = _format_option_string

        kw = {
                'version'   : __version__,
                'formatter' : fmt,
                'usage' : '%prog [options] url [url...]',
                'conflict_handler' : 'resolve',
        }

        parser = optparse.OptionParser(**kw)

        # option groups
        general        = optparse.OptionGroup(parser, 'General Options')
        selection      = optparse.OptionGroup(parser, 'Video Selection')
        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        video_format   = optparse.OptionGroup(parser, 'Video Format Options')
        postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
        filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
        verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

        general.add_option('-h', '--help',
                        action='help', help='print this help text and exit')
        general.add_option('-v', '--version',
                        action='version', help='print program version and exit')
        general.add_option('-U', '--update',
                        action='store_true', dest='update_self', help='update this program to latest version')
        general.add_option('-i', '--ignore-errors',
                        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        general.add_option('-r', '--rate-limit',
                        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        general.add_option('-R', '--retries',
                        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        general.add_option('--dump-user-agent',
                        action='store_true', dest='dump_user_agent',
                        help='display the current browser identification', default=False)
        general.add_option('--list-extractors',
                        action='store_true', dest='list_extractors',
                        help='List all supported extractors and the URLs they would handle', default=False)

        selection.add_option('--playlist-start',
                        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        selection.add_option('--playlist-end',
                        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
        selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
        selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
        selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

        authentication.add_option('-u', '--username',
                        dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                        dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


        video_format.add_option('-f', '--format',
                        action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('--all-formats',
                        action='store_const', dest='format', help='download all available video formats', const='all')
        video_format.add_option('--prefer-free-formats',
                        action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
        video_format.add_option('--max-quality',
                        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-F', '--list-formats',
                        action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


        verbosity.add_option('-q', '--quiet',
                        action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                        action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
        verbosity.add_option('--skip-download',
                        action='store_true', dest='skip_download', help='do not download the video', default=False)
        verbosity.add_option('-g', '--get-url',
                        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                        action='store_true', dest='getthumbnail',
                        help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                        action='store_true', dest='getdescription',
                        help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--get-filename',
                        action='store_true', dest='getfilename',
                        help='simulate, quiet but print output filename', default=False)
        verbosity.add_option('--get-format',
                        action='store_true', dest='getformat',
                        help='simulate, quiet but print output format', default=False)
        verbosity.add_option('--no-progress',
                        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        verbosity.add_option('--console-title',
                        action='store_true', dest='consoletitle',
                        help='display progress in console titlebar', default=False)
        # NOTE: parser uses conflict_handler='resolve', so this -v
        # deliberately overrides the earlier -v/--version short flag.
        verbosity.add_option('-v', '--verbose',
                        action='store_true', dest='verbose', help='print various debugging information', default=False)


        filesystem.add_option('-t', '--title',
                        action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                        action='store_true', dest='autonumber',
                        help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                        dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
        filesystem.add_option('-a', '--batch-file',
                        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
        filesystem.add_option('--no-continue',
                        action='store_false', dest='continue_dl',
                        help='do not resume partially downloaded files (restart from beginning)')
        filesystem.add_option('--cookies',
                        dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
        filesystem.add_option('--no-part',
                        action='store_true', dest='nopart', help='do not use .part files', default=False)
        filesystem.add_option('--no-mtime',
                        action='store_false', dest='updatetime',
                        help='do not use the Last-modified header to set the file modification time', default=True)
        filesystem.add_option('--write-description',
                        action='store_true', dest='writedescription',
                        help='write video description to a .description file', default=False)
        filesystem.add_option('--write-info-json',
                        action='store_true', dest='writeinfojson',
                        help='write video metadata to a .info.json file', default=False)


        postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
                        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
        postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
                        help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
        postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
                        help='ffmpeg audio bitrate specification, 128k by default')
        postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
                        help='keeps the video file on disk after the post-processing; the video is erased by default')


        parser.add_option_group(general)
        parser.add_option_group(selection)
        parser.add_option_group(filesystem)
        parser.add_option_group(verbosity)
        parser.add_option_group(video_format)
        parser.add_option_group(authentication)
        parser.add_option_group(postproc)

        # User config: $XDG_CONFIG_HOME/youtube-dl.conf, falling back to
        # ~/.config/youtube-dl.conf when XDG_CONFIG_HOME is unset/empty.
        xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
        if xdg_config_home:
                userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
        else:
                userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
        # Precedence: system config, then user config, then command line.
        argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
        opts, args = parser.parse_args(argv)

        return parser, opts, args
 
def gen_extractors():
        """Return an instance of every supported extractor.

        Order matters: the first extractor whose suitable() accepts a URL
        is the one that handles it, which is why the catch-all GenericIE
        sits at the very end of the list.
        """
        youtube = YoutubeIE()
        google = GoogleIE()
        yahoo = YahooIE()

        extractors = [
                YoutubePlaylistIE(youtube),
                YoutubeUserIE(youtube),
                YoutubeSearchIE(youtube),
                youtube,
                MetacafeIE(youtube),
                DailymotionIE(),
                google,
                GoogleSearchIE(google),
                PhotobucketIE(),
                yahoo,
                YahooSearchIE(yahoo),
                DepositFilesIE(),
                FacebookIE(),
                BlipTVIE(),
                VimeoIE(),
                MyVideoIE(),
                ComedyCentralIE(),
                EscapistIE(),
                CollegeHumorIE(),
                XVideosIE(),
                SoundcloudIE(),
                InfoQIE(),
                MixcloudIE(),
                StanfordOpenClassroomIE(),
                MTVIE(),

                GenericIE(),
        ]
        return extractors
 
def _real_main():
4463
 
        parser, opts, args = parseOpts()
4464
 
 
4465
 
        # Open appropriate CookieJar
4466
 
        if opts.cookiefile is None:
4467
 
                jar = cookielib.CookieJar()
4468
 
        else:
4469
 
                try:
4470
 
                        jar = cookielib.MozillaCookieJar(opts.cookiefile)
4471
 
                        if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4472
 
                                jar.load()
4473
 
                except (IOError, OSError), err:
4474
 
                        sys.exit(u'ERROR: unable to open cookie file')
4475
 
 
4476
 
        # Dump user agent
4477
 
        if opts.dump_user_agent:
4478
 
                print std_headers['User-Agent']
4479
 
                sys.exit(0)
4480
 
 
4481
 
        # Batch file verification
4482
 
        batchurls = []
4483
 
        if opts.batchfile is not None:
4484
 
                try:
4485
 
                        if opts.batchfile == '-':
4486
 
                                batchfd = sys.stdin
4487
 
                        else:
4488
 
                                batchfd = open(opts.batchfile, 'r')
4489
 
                        batchurls = batchfd.readlines()
4490
 
                        batchurls = [x.strip() for x in batchurls]
4491
 
                        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4492
 
                except IOError:
4493
 
                        sys.exit(u'ERROR: batch file could not be read')
4494
 
        all_urls = batchurls + args
4495
 
 
4496
 
        # General configuration
4497
 
        cookie_processor = urllib2.HTTPCookieProcessor(jar)
4498
 
        proxy_handler = urllib2.ProxyHandler()
4499
 
        opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4500
 
        urllib2.install_opener(opener)
4501
 
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4502
 
 
4503
 
        if opts.verbose:
4504
 
                print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4505
 
 
4506
 
        extractors = gen_extractors()
4507
 
 
4508
 
        if opts.list_extractors:
4509
 
                for ie in extractors:
4510
 
                        print(ie.IE_NAME)
4511
 
                        matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4512
 
                        all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4513
 
                        for mu in matchedUrls:
4514
 
                                print(u'  ' + mu)
4515
 
                sys.exit(0)
4516
 
 
4517
 
        # Conflicting, missing and erroneous options
4518
 
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
4519
 
                parser.error(u'using .netrc conflicts with giving username/password')
4520
 
        if opts.password is not None and opts.username is None:
4521
 
                parser.error(u'account username missing')
4522
 
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4523
 
                parser.error(u'using output template conflicts with using title, literal title or auto number')
4524
 
        if opts.usetitle and opts.useliteral:
4525
 
                parser.error(u'using title conflicts with using literal title')
4526
 
        if opts.username is not None and opts.password is None:
4527
 
                opts.password = getpass.getpass(u'Type account password and press return:')
4528
 
        if opts.ratelimit is not None:
4529
 
                numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4530
 
                if numeric_limit is None:
4531
 
                        parser.error(u'invalid rate limit specified')
4532
 
                opts.ratelimit = numeric_limit
4533
 
        if opts.retries is not None:
4534
 
                try:
4535
 
                        opts.retries = long(opts.retries)
4536
 
                except (TypeError, ValueError), err:
4537
 
                        parser.error(u'invalid retry count specified')
4538
 
        try:
4539
 
                opts.playliststart = int(opts.playliststart)
4540
 
                if opts.playliststart <= 0:
4541
 
                        raise ValueError(u'Playlist start must be positive')
4542
 
        except (TypeError, ValueError), err:
4543
 
                parser.error(u'invalid playlist start number specified')
4544
 
        try:
4545
 
                opts.playlistend = int(opts.playlistend)
4546
 
                if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4547
 
                        raise ValueError(u'Playlist end must be greater than playlist start')
4548
 
        except (TypeError, ValueError), err:
4549
 
                parser.error(u'invalid playlist end number specified')
4550
 
        if opts.extractaudio:
4551
 
                if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4552
 
                        parser.error(u'invalid audio format specified')
4553
 
 
4554
 
        # File downloader
4555
 
        fd = FileDownloader({
4556
 
                'usenetrc': opts.usenetrc,
4557
 
                'username': opts.username,
4558
 
                'password': opts.password,
4559
 
                'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4560
 
                'forceurl': opts.geturl,
4561
 
                'forcetitle': opts.gettitle,
4562
 
                'forcethumbnail': opts.getthumbnail,
4563
 
                'forcedescription': opts.getdescription,
4564
 
                'forcefilename': opts.getfilename,
4565
 
                'forceformat': opts.getformat,
4566
 
                'simulate': opts.simulate,
4567
 
                'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4568
 
                'format': opts.format,
4569
 
                'format_limit': opts.format_limit,
4570
 
                'listformats': opts.listformats,
4571
 
                'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4572
 
                        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4573
 
                        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4574
 
                        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4575
 
                        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4576
 
                        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4577
 
                        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4578
 
                        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4579
 
                        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4580
 
                        or u'%(id)s.%(ext)s'),
4581
 
                'ignoreerrors': opts.ignoreerrors,
4582
 
                'ratelimit': opts.ratelimit,
4583
 
                'nooverwrites': opts.nooverwrites,
4584
 
                'retries': opts.retries,
4585
 
                'continuedl': opts.continue_dl,
4586
 
                'noprogress': opts.noprogress,
4587
 
                'playliststart': opts.playliststart,
4588
 
                'playlistend': opts.playlistend,
4589
 
                'logtostderr': opts.outtmpl == '-',
4590
 
                'consoletitle': opts.consoletitle,
4591
 
                'nopart': opts.nopart,
4592
 
                'updatetime': opts.updatetime,
4593
 
                'writedescription': opts.writedescription,
4594
 
                'writeinfojson': opts.writeinfojson,
4595
 
                'matchtitle': opts.matchtitle,
4596
 
                'rejecttitle': opts.rejecttitle,
4597
 
                'max_downloads': opts.max_downloads,
4598
 
                'prefer_free_formats': opts.prefer_free_formats,
4599
 
                'verbose': opts.verbose,
4600
 
                })
4601
 
        for extractor in extractors:
4602
 
                fd.add_info_extractor(extractor)
4603
 
 
4604
 
        # PostProcessors
4605
 
        if opts.extractaudio:
4606
 
                fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4607
 
 
4608
 
        # Update version
4609
 
        if opts.update_self:
4610
 
                updateSelf(fd, sys.argv[0])
4611
 
 
4612
 
        # Maybe do nothing
4613
 
        if len(all_urls) < 1:
4614
 
                if not opts.update_self:
4615
 
                        parser.error(u'you must provide at least one URL')
4616
 
                else:
4617
 
                        sys.exit()
4618
 
        
4619
 
        try:
4620
 
                retcode = fd.download(all_urls)
4621
 
        except MaxDownloadsReached:
4622
 
                fd.to_screen(u'--max-download limit reached, aborting.')
4623
 
                retcode = 101
4624
 
 
4625
 
        # Dump cookie jar if requested
4626
 
        if opts.cookiefile is not None:
4627
 
                try:
4628
 
                        jar.save()
4629
 
                except (IOError, OSError), err:
4630
 
                        sys.exit(u'ERROR: unable to save cookie jar')
4631
 
 
4632
 
        sys.exit(retcode)
4633
 
 
4634
 
def main():
	"""Top-level entry point: run the downloader and map known failures to exit statuses.

	Delegates all real work to _real_main() and translates the exceptions
	that are expected to escape it into process exit codes/messages:

	- DownloadError: the failure was already reported to the user by the
	  downloader, so exit silently with status 1.
	- SameFileError: a fixed output template would make several downloads
	  collide on one file name; exit with an explanatory message.
	- KeyboardInterrupt: the user aborted with Ctrl-C; exit with a short
	  notice (leading newline keeps it off the progress line).
	"""
	try:
		_real_main()
		return  # normal completion: fall back to the default exit status
	except DownloadError:
		status = 1
	except SameFileError:
		status = u'ERROR: fixed output name but more than one file to download'
	except KeyboardInterrupt:
		status = u'\nERROR: Interrupted by user'
	# sys.exit accepts either an int (exit code) or a string
	# (printed to stderr, process exits with status 1).
	sys.exit(status)
4643
 
 
4644
 
# Script entry point: run the downloader only when executed directly,
# not when this module is imported by other code.
if __name__ == '__main__':
	main()
4646
 
 
4647
 
# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: