2
# -*- coding: utf-8 -*-
5
'Ricardo Garcia Gonzalez',
13
'Philipp Hagemeister',
20
__license__ = 'Public Domain'
21
__version__ = '2012.02.27'
23
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56
except ImportError: # Python 2.4
59
import cStringIO as StringIO
63
# parse_qs was moved from the cgi module to the urlparse module recently.
65
from urlparse import parse_qs
67
from cgi import parse_qs
75
import xml.etree.ElementTree
76
except ImportError: # Python<2.5: Not officially supported, but let it slip
77
warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83
'Accept-Encoding': 'gzip, deflate',
84
'Accept-Language': 'en-us,en;q=0.5',
89
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
95
def raiseError(msg, i):
96
raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
97
def skipSpace(i, expectMore=True):
98
while i < len(s) and s[i] in ' \t\r\n':
102
raiseError('Premature end', i)
104
def decodeEscape(match):
120
return unichr(int(esc[1:5], 16))
121
if len(esc) == 5+6 and esc[5:7] == '\\u':
122
hi = int(esc[1:5], 16)
123
low = int(esc[7:11], 16)
124
return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
125
raise ValueError('Unknown escape ' + str(esc))
132
while s[e-bslashes-1] == '\\':
134
if bslashes % 2 == 1:
138
rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
139
stri = rexp.sub(decodeEscape, s[i:e])
145
if s[i] == '}': # Empty dictionary
149
raiseError('Expected a string object key', i)
150
i,key = parseString(i)
152
if i >= len(s) or s[i] != ':':
153
raiseError('Expected a colon', i)
160
raiseError('Expected comma or closing curly brace', i)
165
if s[i] == ']': # Empty array
170
i = skipSpace(i) # Raise exception if premature end
174
raiseError('Expected a comma or closing bracket', i)
176
def parseDiscrete(i):
177
for k,v in {'true': True, 'false': False, 'null': None}.items():
178
if s.startswith(k, i):
180
raiseError('Not a boolean (or null)', i)
182
mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
184
raiseError('Not a number', i)
186
if '.' in nums or 'e' in nums or 'E' in nums:
187
return (i+len(nums), float(nums))
188
return (i+len(nums), int(nums))
189
CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
192
i,res = CHARMAP.get(s[i], parseNumber)(i)
193
i = skipSpace(i, False)
197
raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
200
def preferredencoding():
201
"""Get preferred encoding.
203
Returns the best encoding scheme for the system, based on
204
locale.getpreferredencoding() and some further tweaks.
206
def yield_preferredencoding():
208
pref = locale.getpreferredencoding()
214
return yield_preferredencoding().next()
217
def htmlentity_transform(matchobj):
218
"""Transforms an HTML entity to a Unicode character.
220
This function receives a match object and is intended to be used with
221
the re.sub() function.
223
entity = matchobj.group(1)
225
# Known non-numeric HTML entity
226
if entity in htmlentitydefs.name2codepoint:
227
return unichr(htmlentitydefs.name2codepoint[entity])
230
mobj = re.match(ur'(?u)#(x?\d+)', entity)
232
numstr = mobj.group(1)
233
if numstr.startswith(u'x'):
235
numstr = u'0%s' % numstr
238
return unichr(long(numstr, base))
240
# Unknown entity in name, return its literal representation
241
return (u'&%s;' % entity)
244
def sanitize_title(utitle):
    """Make a video title safe for use inside a filename."""
    # Decode HTML entities first, then keep path separators out of the name.
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    return decoded.replace(unicode(os.sep), u'%')
250
def sanitize_open(filename, open_mode):
251
"""Try to open the given filename, and slightly tweak it if this fails.
253
Attempts to open the given filename. If this fails, it tries to change
254
the filename slightly, step by step, until it's either able to open it
255
or it fails and raises a final exception, like the standard open()
258
It returns the tuple (stream, definitive_file_name).
262
if sys.platform == 'win32':
264
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265
return (sys.stdout, filename)
266
stream = open(_encodeFilename(filename), open_mode)
267
return (stream, filename)
268
except (IOError, OSError), err:
269
# In case of error, try to remove win32 forbidden chars
270
filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
272
# An exception here should be caught in the caller
273
stream = open(_encodeFilename(filename), open_mode)
274
return (stream, filename)
277
def timeconvert(timestr):
278
"""Convert RFC 2822 defined time string into system timestamp"""
280
timetuple = email.utils.parsedate_tz(timestr)
281
if timetuple is not None:
282
timestamp = email.utils.mktime_tz(timetuple)
285
def _simplify_title(title):
286
expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287
return expr.sub(u'_', title).strip(u'_')
289
def _orderedSet(iterable):
290
""" Remove all duplicates from the input iterable """
297
def _unescapeHTML(s):
    """Decode HTML entities in a string.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    return HTMLParser.HTMLParser().unescape(s)
306
def _encodeFilename(s):
308
@param s The name of the file (of type unicode)
311
assert type(s) == type(u'')
313
if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314
# Pass u'' directly to use Unicode APIs on Windows 2000 and up
315
# (Detecting Windows NT 4 is tricky because 'major >= 4' would
316
# match Windows 9x series as well. Besides, NT 4 is obsolete.)
319
return s.encode(sys.getfilesystemencoding(), 'ignore')
321
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects when they are not configured to
    continue on errors; carries the relevant error message.
    """
    pass
331
class SameFileError(Exception):
    """Same File exception.

    Raised by FileDownloader objects when they detect that multiple
    files would have to be downloaded to the same file on disk.
    """
    pass
340
class PostProcessingError(Exception):
    """Post Processing exception.

    May be raised by a PostProcessor's .run() method to indicate an
    error in the postprocessing task.
    """
    pass
348
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
353
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available
    for that video.
    """
    pass
362
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than what the server announced first, which suggests the connection
    was interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes.
        self.downloaded = downloaded
        self.expected = expected
378
class YoutubeDLHandler(urllib2.HTTPHandler):
379
"""Handler for HTTP requests and responses.
381
This class, when installed with an OpenerDirector, automatically adds
382
the standard headers to every HTTP request and handles gzipped and
383
deflated responses from web servers. If compression is to be avoided in
384
a particular request, the original request in the program code only has
385
to include the HTTP header "Youtubedl-No-Compression", which will be
386
removed before making the real request.
388
Part of this code was copied from:
390
http://techknack.net/python-urllib2-handlers/
392
Andrew Rowls, the author of that code, agreed to release it to the
399
return zlib.decompress(data, -zlib.MAX_WBITS)
401
return zlib.decompress(data)
404
def addinfourl_wrapper(stream, headers, url, code):
405
if hasattr(urllib2.addinfourl, 'getcode'):
406
return urllib2.addinfourl(stream, headers, url, code)
407
ret = urllib2.addinfourl(stream, headers, url)
411
def http_request(self, req):
412
for h in std_headers:
415
req.add_header(h, std_headers[h])
416
if 'Youtubedl-no-compression' in req.headers:
417
if 'Accept-encoding' in req.headers:
418
del req.headers['Accept-encoding']
419
del req.headers['Youtubedl-no-compression']
422
def http_response(self, req, resp):
425
if resp.headers.get('Content-encoding', '') == 'gzip':
426
gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427
resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428
resp.msg = old_resp.msg
430
if resp.headers.get('Content-encoding', '') == 'deflate':
431
gz = StringIO.StringIO(self.deflate(resp.read()))
432
resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433
resp.msg = old_resp.msg
437
class FileDownloader(object):
438
"""File Downloader class.
440
File downloader objects are the ones responsible of downloading the
441
actual video file and writing it to disk if the user has requested
442
it, among some other tasks. In most cases there should be one per
443
program. As, given a video URL, the downloader doesn't know how to
444
extract all the needed information, task that InfoExtractors do, it
445
has to pass the URL to one of them.
447
For this, file downloader objects have a method that allows
448
InfoExtractors to be registered in a given order. When it is passed
449
a URL, the file downloader handles it to the first InfoExtractor it
450
finds that reports being able to handle it. The InfoExtractor extracts
451
all the information about the video or videos the URL refers to, and
452
asks the FileDownloader to process the video information, possibly
453
downloading the video.
455
File downloaders accept a lot of parameters. In order not to saturate
456
the object constructor with arguments, it receives a dictionary of
457
options instead. These options are available through the params
458
attribute for the InfoExtractors to use. The FileDownloader also
459
registers itself as the downloader in charge for the InfoExtractors
460
that are added to it, so this is a "mutual registration".
464
username: Username for authentication purposes.
465
password: Password for authentication purposes.
466
usenetrc: Use netrc for authentication instead.
467
quiet: Do not print messages to stdout.
468
forceurl: Force printing final URL.
469
forcetitle: Force printing title.
470
forcethumbnail: Force printing thumbnail URL.
471
forcedescription: Force printing description.
472
forcefilename: Force printing final filename.
473
simulate: Do not download the video files.
474
format: Video format code.
475
format_limit: Highest quality format to try.
476
outtmpl: Template for output names.
477
ignoreerrors: Do not stop on download errors.
478
ratelimit: Download speed limit, in bytes/sec.
479
nooverwrites: Prevent overwriting files.
480
retries: Number of times to retry for HTTP error 5xx
481
continuedl: Try to continue downloads if possible.
482
noprogress: Do not print the progress bar.
483
playliststart: Playlist item to start at.
484
playlistend: Playlist item to end at.
485
matchtitle: Download only matching titles.
486
rejecttitle: Reject downloads for matching titles.
487
logtostderr: Log messages to stderr instead of stdout.
488
consoletitle: Display progress in console window's titlebar.
489
nopart: Do not use temporary .part files.
490
updatetime: Use the Last-modified header to set output file timestamps.
491
writedescription: Write the video description to a .description file
492
writeinfojson: Write the video description to a .info.json file
498
_download_retcode = None
499
_num_downloads = None
502
def __init__(self, params):
503
"""Create a FileDownloader object with the given options."""
506
self._download_retcode = 0
507
self._num_downloads = 0
508
self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
512
def format_bytes(bytes):
515
if type(bytes) is str:
520
exponent = long(math.log(bytes, 1024.0))
521
suffix = 'bkMGTPEZY'[exponent]
522
converted = float(bytes) / float(1024 ** exponent)
523
return '%.2f%s' % (converted, suffix)
526
def calc_percent(byte_counter, data_len):
529
return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
532
def calc_eta(start, now, total, current):
536
if current == 0 or dif < 0.001: # One millisecond
538
rate = float(current) / dif
539
eta = long((float(total) - float(current)) / rate)
540
(eta_mins, eta_secs) = divmod(eta, 60)
543
return '%02d:%02d' % (eta_mins, eta_secs)
546
def calc_speed(start, now, bytes):
548
if bytes == 0 or dif < 0.001: # One millisecond
549
return '%10s' % '---b/s'
550
return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
553
def best_block_size(elapsed_time, bytes):
554
new_min = max(bytes / 2.0, 1.0)
555
new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
556
if elapsed_time < 0.001:
558
rate = bytes / elapsed_time
566
def parse_bytes(bytestr):
567
"""Parse a string indicating a byte quantity into a long integer."""
568
matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
571
number = float(matchobj.group(1))
572
multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
573
return long(round(number * multiplier))
575
def add_info_extractor(self, ie):
576
"""Add an InfoExtractor object to the end of the list."""
578
ie.set_downloader(self)
580
def add_post_processor(self, pp):
581
"""Add a PostProcessor object to the end of the chain."""
583
pp.set_downloader(self)
585
def to_screen(self, message, skip_eol=False):
586
"""Print message to stdout if not in quiet mode."""
587
assert type(message) == type(u'')
588
if not self.params.get('quiet', False):
589
terminator = [u'\n', u''][skip_eol]
590
output = message + terminator
592
if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
593
output = output.encode(preferredencoding(), 'ignore')
594
self._screen_file.write(output)
595
self._screen_file.flush()
597
def to_stderr(self, message):
    """Print message to stderr, encoded in the preferred encoding."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
601
def to_cons_title(self, message):
602
"""Set console/terminal window title to message."""
603
if not self.params.get('consoletitle', False):
605
if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
606
# c_wchar_p() might not be necessary if `message` is
607
# already of type unicode()
608
ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
609
elif 'TERM' in os.environ:
610
sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
612
def fixed_template(self):
613
"""Checks if the output template is fixed."""
614
return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
616
def trouble(self, message=None):
617
"""Determine action to take when a download problem appears.
619
Depending on if the downloader has been configured to ignore
620
download errors or not, this method may throw an exception or
621
not when errors are found, after printing the message.
623
if message is not None:
624
self.to_stderr(message)
625
if not self.params.get('ignoreerrors', False):
626
raise DownloadError(message)
627
self._download_retcode = 1
629
def slow_down(self, start_time, byte_counter):
630
"""Sleep if the download speed is over the rate limit."""
631
rate_limit = self.params.get('ratelimit', None)
632
if rate_limit is None or byte_counter == 0:
635
elapsed = now - start_time
638
speed = float(byte_counter) / elapsed
639
if speed > rate_limit:
640
time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
642
def temp_name(self, filename):
643
"""Returns a temporary filename for the given filename."""
644
if self.params.get('nopart', False) or filename == u'-' or \
645
(os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
647
return filename + u'.part'
649
def undo_temp_name(self, filename):
650
if filename.endswith(u'.part'):
651
return filename[:-len(u'.part')]
654
def try_rename(self, old_filename, new_filename):
656
if old_filename == new_filename:
658
os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
659
except (IOError, OSError), err:
660
self.trouble(u'ERROR: unable to rename file')
662
def try_utime(self, filename, last_modified_hdr):
663
"""Try to set the last-modified time of the given file."""
664
if last_modified_hdr is None:
666
if not os.path.isfile(_encodeFilename(filename)):
668
timestr = last_modified_hdr
671
filetime = timeconvert(timestr)
675
os.utime(filename, (time.time(), filetime))
680
def report_writedescription(self, descfn):
681
""" Report that the description file is being written """
682
self.to_screen(u'[info] Writing video description to: ' + descfn)
684
def report_writeinfojson(self, infofn):
685
""" Report that the metadata file has been written """
686
self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
688
def report_destination(self, filename):
689
"""Report destination filename."""
690
self.to_screen(u'[download] Destination: ' + filename)
692
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
693
"""Report download progress."""
694
if self.params.get('noprogress', False):
696
self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
697
(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
698
self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
699
(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
701
def report_resuming_byte(self, resume_len):
702
"""Report attempt to resume at given byte."""
703
self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
705
def report_retry(self, count, retries):
706
"""Report retry in case of HTTP error 5xx"""
707
self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
709
def report_file_already_downloaded(self, file_name):
710
"""Report file has already been fully downloaded."""
712
self.to_screen(u'[download] %s has already been downloaded' % file_name)
713
except (UnicodeEncodeError), err:
714
self.to_screen(u'[download] The file has already been downloaded')
716
def report_unable_to_resume(self):
717
"""Report it was impossible to resume download."""
718
self.to_screen(u'[download] Unable to resume')
720
def report_finish(self):
721
"""Report download finished."""
722
if self.params.get('noprogress', False):
723
self.to_screen(u'[download] Download completed')
727
def increment_downloads(self):
728
"""Increment the ordinal that assigns a number to each file."""
729
self._num_downloads += 1
731
def prepare_filename(self, info_dict):
732
"""Generate the output filename."""
734
template_dict = dict(info_dict)
735
template_dict['epoch'] = unicode(long(time.time()))
736
template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
737
filename = self.params['outtmpl'] % template_dict
739
except (ValueError, KeyError), err:
740
self.trouble(u'ERROR: invalid system charset or erroneous output template')
743
def _match_entry(self, info_dict):
744
""" Returns None iff the file should be downloaded """
746
title = info_dict['title']
747
matchtitle = self.params.get('matchtitle', False)
748
if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
749
return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
750
rejecttitle = self.params.get('rejecttitle', False)
751
if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
752
return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
755
def process_info(self, info_dict):
756
"""Process a single dictionary returned by an InfoExtractor."""
758
reason = self._match_entry(info_dict)
759
if reason is not None:
760
self.to_screen(u'[download] ' + reason)
763
max_downloads = self.params.get('max_downloads')
764
if max_downloads is not None:
765
if self._num_downloads > int(max_downloads):
766
raise MaxDownloadsReached()
768
filename = self.prepare_filename(info_dict)
771
if self.params.get('forcetitle', False):
772
print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
773
if self.params.get('forceurl', False):
774
print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
775
if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
776
print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
777
if self.params.get('forcedescription', False) and 'description' in info_dict:
778
print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
779
if self.params.get('forcefilename', False) and filename is not None:
780
print filename.encode(preferredencoding(), 'xmlcharrefreplace')
781
if self.params.get('forceformat', False):
782
print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
784
# Do nothing else if in simulate mode
785
if self.params.get('simulate', False):
792
dn = os.path.dirname(_encodeFilename(filename))
793
if dn != '' and not os.path.exists(dn): # dn is already encoded
795
except (OSError, IOError), err:
796
self.trouble(u'ERROR: unable to create directory ' + unicode(err))
799
if self.params.get('writedescription', False):
801
descfn = filename + u'.description'
802
self.report_writedescription(descfn)
803
descfile = open(_encodeFilename(descfn), 'wb')
805
descfile.write(info_dict['description'].encode('utf-8'))
808
except (OSError, IOError):
809
self.trouble(u'ERROR: Cannot write description file ' + descfn)
812
if self.params.get('writeinfojson', False):
813
infofn = filename + u'.info.json'
814
self.report_writeinfojson(infofn)
817
except (NameError,AttributeError):
818
self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
821
infof = open(_encodeFilename(infofn), 'wb')
823
json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
824
json.dump(json_info_dict, infof)
827
except (OSError, IOError):
828
self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
831
if not self.params.get('skip_download', False):
832
if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
836
success = self._do_download(filename, info_dict)
837
except (OSError, IOError), err:
838
raise UnavailableVideoError
839
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
840
self.trouble(u'ERROR: unable to download video data: %s' % str(err))
842
except (ContentTooShortError, ), err:
843
self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
848
self.post_process(filename, info_dict)
849
except (PostProcessingError), err:
850
self.trouble(u'ERROR: postprocessing: %s' % str(err))
853
def download(self, url_list):
854
"""Download a given list of URLs."""
855
if len(url_list) > 1 and self.fixed_template():
856
raise SameFileError(self.params['outtmpl'])
859
suitable_found = False
861
# Go to next InfoExtractor if not suitable
862
if not ie.suitable(url):
865
# Suitable InfoExtractor found
866
suitable_found = True
868
# Extract information from URL and process it
871
# Suitable InfoExtractor had been found; go to next URL
874
if not suitable_found:
875
self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
877
return self._download_retcode
879
def post_process(self, filename, ie_info):
880
"""Run the postprocessing chain on the given file."""
882
info['filepath'] = filename
888
def _download_with_rtmpdump(self, filename, url, player_url):
889
self.report_destination(filename)
890
tmpfilename = self.temp_name(filename)
892
# Check for rtmpdump first
894
subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
895
except (OSError, IOError):
896
self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
899
# Download using rtmpdump. rtmpdump returns exit code 2 when
900
# the connection was interrumpted and resuming appears to be
901
# possible. This is part of rtmpdump's normal usage, AFAIK.
902
basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
903
args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
904
if self.params.get('verbose', False):
907
shell_quote = lambda args: ' '.join(map(pipes.quote, args))
910
self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
911
retval = subprocess.call(args)
912
while retval == 2 or retval == 1:
913
prevsize = os.path.getsize(_encodeFilename(tmpfilename))
914
self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
915
time.sleep(5.0) # This seems to be needed
916
retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
917
cursize = os.path.getsize(_encodeFilename(tmpfilename))
918
if prevsize == cursize and retval == 1:
920
# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
921
if prevsize == cursize and retval == 2 and cursize > 1024:
922
self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
926
self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
927
self.try_rename(tmpfilename, filename)
930
self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
933
def _do_download(self, filename, info_dict):
934
url = info_dict['url']
935
player_url = info_dict.get('player_url', None)
937
# Check file already present
938
if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
939
self.report_file_already_downloaded(filename)
942
# Attempt to download using rtmpdump
943
if url.startswith('rtmp'):
944
return self._download_with_rtmpdump(filename, url, player_url)
946
tmpfilename = self.temp_name(filename)
949
# Do not include the Accept-Encoding header
950
headers = {'Youtubedl-no-compression': 'True'}
951
basic_request = urllib2.Request(url, None, headers)
952
request = urllib2.Request(url, None, headers)
954
# Establish possible resume length
955
if os.path.isfile(_encodeFilename(tmpfilename)):
956
resume_len = os.path.getsize(_encodeFilename(tmpfilename))
962
if self.params.get('continuedl', False):
963
self.report_resuming_byte(resume_len)
964
request.add_header('Range','bytes=%d-' % resume_len)
970
retries = self.params.get('retries', 0)
971
while count <= retries:
972
# Establish connection
974
if count == 0 and 'urlhandle' in info_dict:
975
data = info_dict['urlhandle']
976
data = urllib2.urlopen(request)
978
except (urllib2.HTTPError, ), err:
979
if (err.code < 500 or err.code >= 600) and err.code != 416:
980
# Unexpected HTTP error
982
elif err.code == 416:
983
# Unable to resume (requested range not satisfiable)
985
# Open the connection again without the range header
986
data = urllib2.urlopen(basic_request)
987
content_length = data.info()['Content-Length']
988
except (urllib2.HTTPError, ), err:
989
if err.code < 500 or err.code >= 600:
992
# Examine the reported length
993
if (content_length is not None and
994
(resume_len - 100 < long(content_length) < resume_len + 100)):
995
# The file had already been fully downloaded.
996
# Explanation to the above condition: in issue #175 it was revealed that
997
# YouTube sometimes adds or removes a few bytes from the end of the file,
998
# changing the file size slightly and causing problems for some users. So
999
# I decided to implement a suggested change and consider the file
1000
# completely downloaded if the file size differs less than 100 bytes from
1001
# the one in the hard drive.
1002
self.report_file_already_downloaded(filename)
1003
self.try_rename(tmpfilename, filename)
1006
# The length does not match, we start the download over
1007
self.report_unable_to_resume()
1012
if count <= retries:
1013
self.report_retry(count, retries)
1016
self.trouble(u'ERROR: giving up after %s retries' % retries)
1019
data_len = data.info().get('Content-length', None)
1020
if data_len is not None:
1021
data_len = long(data_len) + resume_len
1022
data_len_str = self.format_bytes(data_len)
1023
byte_counter = 0 + resume_len
1027
# Download and write
1028
before = time.time()
1029
data_block = data.read(block_size)
1031
if len(data_block) == 0:
1033
byte_counter += len(data_block)
1035
# Open file just in time
1038
(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1039
assert stream is not None
1040
filename = self.undo_temp_name(tmpfilename)
1041
self.report_destination(filename)
1042
except (OSError, IOError), err:
1043
self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1046
stream.write(data_block)
1047
except (IOError, OSError), err:
1048
self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1050
block_size = self.best_block_size(after - before, len(data_block))
1053
speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1054
if data_len is None:
1055
self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1057
percent_str = self.calc_percent(byte_counter, data_len)
1058
eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1059
self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1062
self.slow_down(start, byte_counter - resume_len)
1065
self.trouble(u'\nERROR: Did not get any data blocks')
1068
self.report_finish()
1069
if data_len is not None and byte_counter != data_len:
1070
raise ContentTooShortError(byte_counter, long(data_len))
1071
self.try_rename(tmpfilename, filename)
1073
# Update file modification time
1074
if self.params.get('updatetime', True):
1075
info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1080
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor receives a URL and pulls out of it everything
    the FileDownloader needs: the real video URL, the literal and simplified
    titles, the uploader nickname and so on. The result is a dictionary
    which is handed to the FileDownloader, which may then download the video
    to the file system, among other possible outcomes.

    Mandatory dictionary fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    Optional fields, mainly useful when youtube-dl serves as the backend of
    a video search frontend (such as youtube2mp3); they are only used when
    their respective forced printing functions are called:

    thumbnail:  Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses should redefine _real_initialize() and _real_extract() and
    define a _VALID_URL regexp. They should probably also be added to the
    list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        # bool(match) is equivalent to `match is not None` here.
        return bool(re.match(self._VALID_URL, url))

    def initialize(self):
        """Initialize the instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Set the FileDownloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
1150
class YoutubeIE(InfoExtractor):
1151
"""Information extractor for youtube.com."""
1153
_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1154
_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1155
_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1156
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1157
_NETRC_MACHINE = 'youtube'
1158
# Listed in order of quality
1159
_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1160
_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1161
_video_extensions = {
1167
'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1172
_video_dimensions = {
1187
IE_NAME = u'youtube'
1189
def report_lang(self):
    """Announce the attempt to force the site language."""
    message = u'[youtube] Setting language'
    self._downloader.to_screen(message)
1193
def report_login(self):
    """Announce the login attempt."""
    message = u'[youtube] Logging in'
    self._downloader.to_screen(message)
1197
def report_age_confirmation(self):
    """Announce the age-confirmation attempt."""
    message = u'[youtube] Confirming age'
    self._downloader.to_screen(message)
1201
def report_video_webpage_download(self, video_id):
    """Announce the download of the watch page for video_id."""
    message = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(message)
1205
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the get_video_info page for video_id."""
    message = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(message)
1209
def report_information_extraction(self, video_id):
    """Announce the start of information extraction for video_id."""
    message = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(message)
1213
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for video_id.

    Note: the original docstring said "Report extracted video URL.",
    a copy-paste error — this method reports format unavailability.
    """
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1217
def report_rtmp_download(self):
    """Indicate that the download will use the RTMP protocol."""
    message = u'[youtube] RTMP download detected'
    self._downloader.to_screen(message)
1221
def _print_formats(self, formats):
1222
print 'Available formats:'
1224
print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1226
def _real_initialize(self):
1227
if self._downloader is None:
1232
downloader_params = self._downloader.params
1234
# Attempt to use provided username and password or .netrc data
1235
if downloader_params.get('username', None) is not None:
1236
username = downloader_params['username']
1237
password = downloader_params['password']
1238
elif downloader_params.get('usenetrc', False):
1240
info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1241
if info is not None:
1245
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1246
except (IOError, netrc.NetrcParseError), err:
1247
self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1251
request = urllib2.Request(self._LANG_URL)
1254
urllib2.urlopen(request).read()
1255
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1256
self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1259
# No authentication to be performed
1260
if username is None:
1265
'current_form': 'loginForm',
1267
'action_login': 'Log In',
1268
'username': username,
1269
'password': password,
1271
request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1274
login_results = urllib2.urlopen(request).read()
1275
if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1276
self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1278
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1279
self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1285
'action_confirm': 'Confirm',
1287
request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1289
self.report_age_confirmation()
1290
age_results = urllib2.urlopen(request).read()
1291
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1292
self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1295
def _real_extract(self, url):
1296
# Extract video id from URL
1297
mobj = re.match(self._VALID_URL, url)
1299
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1301
video_id = mobj.group(2)
1304
self.report_video_webpage_download(video_id)
1305
request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1307
video_webpage = urllib2.urlopen(request).read()
1308
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1309
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1312
# Attempt to extract SWF player URL
1313
mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1314
if mobj is not None:
1315
player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1320
self.report_video_info_webpage_download(video_id)
1321
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1322
video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1323
% (video_id, el_type))
1324
request = urllib2.Request(video_info_url)
1326
video_info_webpage = urllib2.urlopen(request).read()
1327
video_info = parse_qs(video_info_webpage)
1328
if 'token' in video_info:
1330
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1331
self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1333
if 'token' not in video_info:
1334
if 'reason' in video_info:
1335
self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1337
self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1340
# Start extracting information
1341
self.report_information_extraction(video_id)
1344
if 'author' not in video_info:
1345
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1347
video_uploader = urllib.unquote_plus(video_info['author'][0])
1350
if 'title' not in video_info:
1351
self._downloader.trouble(u'ERROR: unable to extract video title')
1353
video_title = urllib.unquote_plus(video_info['title'][0])
1354
video_title = video_title.decode('utf-8')
1355
video_title = sanitize_title(video_title)
1358
simple_title = _simplify_title(video_title)
1361
if 'thumbnail_url' not in video_info:
1362
self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1363
video_thumbnail = ''
1364
else: # don't panic if we can't find it
1365
video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1369
mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1370
if mobj is not None:
1371
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1372
format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1373
for expression in format_expressions:
1375
upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1383
video_description = u'No description available.'
1384
mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1385
if mobj is not None:
1386
video_description = mobj.group(1).decode('utf-8')
1388
html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1389
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1390
video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1391
# TODO use another parser
1394
video_token = urllib.unquote_plus(video_info['token'][0])
1396
# Decide which formats to download
1397
req_format = self._downloader.params.get('format', None)
1399
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1400
self.report_rtmp_download()
1401
video_url_list = [(None, video_info['conn'][0])]
1402
elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1403
url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1404
url_data = [parse_qs(uds) for uds in url_data_strs]
1405
url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1406
url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1408
format_limit = self._downloader.params.get('format_limit', None)
1409
available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1410
if format_limit is not None and format_limit in available_formats:
1411
format_list = available_formats[available_formats.index(format_limit):]
1413
format_list = available_formats
1414
existing_formats = [x for x in format_list if x in url_map]
1415
if len(existing_formats) == 0:
1416
self._downloader.trouble(u'ERROR: no known formats available for video')
1418
if self._downloader.params.get('listformats', None):
1419
self._print_formats(existing_formats)
1421
if req_format is None or req_format == 'best':
1422
video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1423
elif req_format == 'worst':
1424
video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1425
elif req_format in ('-1', 'all'):
1426
video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1428
# Specific formats. We pick the first in a slash-delimeted sequence.
1429
# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1430
req_formats = req_format.split('/')
1431
video_url_list = None
1432
for rf in req_formats:
1434
video_url_list = [(rf, url_map[rf])]
1436
if video_url_list is None:
1437
self._downloader.trouble(u'ERROR: requested format not available')
1440
self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1443
for format_param, video_real_url in video_url_list:
1444
# At this point we have a new video
1445
self._downloader.increment_downloads()
1448
video_extension = self._video_extensions.get(format_param, 'flv')
1451
# Process video information
1452
self._downloader.process_info({
1453
'id': video_id.decode('utf-8'),
1454
'url': video_real_url.decode('utf-8'),
1455
'uploader': video_uploader.decode('utf-8'),
1456
'upload_date': upload_date,
1457
'title': video_title,
1458
'stitle': simple_title,
1459
'ext': video_extension.decode('utf-8'),
1460
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1461
'thumbnail': video_thumbnail.decode('utf-8'),
1462
'description': video_description,
1463
'player_url': player_url,
1465
except UnavailableVideoError, err:
1466
self._downloader.trouble(u'\nERROR: unable to download video')
1469
class MetacafeIE(InfoExtractor):
1470
"""Information Extractor for metacafe.com."""
1472
_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1473
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1474
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1476
IE_NAME = u'metacafe'
1478
def __init__(self, youtube_ie, downloader=None):
    """Store the YoutubeIE used for yt- redirects; pass downloader up."""
    self._youtube_ie = youtube_ie
    InfoExtractor.__init__(self, downloader)
1482
def report_disclaimer(self):
    """Announce retrieval of the family-filter disclaimer page."""
    message = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(message)
1486
def report_age_confirmation(self):
    """Announce the age-confirmation attempt."""
    message = u'[metacafe] Confirming age'
    self._downloader.to_screen(message)
1490
def report_download_webpage(self, video_id):
    """Announce the webpage download for video_id."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
1494
def report_extraction(self, video_id):
    """Announce information extraction for video_id."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1498
def _real_initialize(self):
1499
# Retrieve disclaimer
1500
request = urllib2.Request(self._DISCLAIMER)
1502
self.report_disclaimer()
1503
disclaimer = urllib2.urlopen(request).read()
1504
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1505
self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1511
'submit': "Continue - I'm over 18",
1513
request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1515
self.report_age_confirmation()
1516
disclaimer = urllib2.urlopen(request).read()
1517
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1518
self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1521
def _real_extract(self, url):
1522
# Extract id and simplified title from URL
1523
mobj = re.match(self._VALID_URL, url)
1525
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1528
video_id = mobj.group(1)
1530
# Check if video comes from YouTube
1531
mobj2 = re.match(r'^yt-(.*)$', video_id)
1532
if mobj2 is not None:
1533
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1536
# At this point we have a new video
1537
self._downloader.increment_downloads()
1539
simple_title = mobj.group(2).decode('utf-8')
1541
# Retrieve video webpage to extract further information
1542
request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1544
self.report_download_webpage(video_id)
1545
webpage = urllib2.urlopen(request).read()
1546
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1547
self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1550
# Extract URL, uploader and title from webpage
1551
self.report_extraction(video_id)
1552
mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1553
if mobj is not None:
1554
mediaURL = urllib.unquote(mobj.group(1))
1555
video_extension = mediaURL[-3:]
1557
# Extract gdaKey if available
1558
mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1560
video_url = mediaURL
1562
gdaKey = mobj.group(1)
1563
video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1565
mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1567
self._downloader.trouble(u'ERROR: unable to extract media URL')
1569
vardict = parse_qs(mobj.group(1))
1570
if 'mediaData' not in vardict:
1571
self._downloader.trouble(u'ERROR: unable to extract media URL')
1573
mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1575
self._downloader.trouble(u'ERROR: unable to extract media URL')
1577
mediaURL = mobj.group(1).replace('\\/', '/')
1578
video_extension = mediaURL[-3:]
1579
video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1581
mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1583
self._downloader.trouble(u'ERROR: unable to extract title')
1585
video_title = mobj.group(1).decode('utf-8')
1586
video_title = sanitize_title(video_title)
1588
mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1590
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1592
video_uploader = mobj.group(1)
1595
# Process video information
1596
self._downloader.process_info({
1597
'id': video_id.decode('utf-8'),
1598
'url': video_url.decode('utf-8'),
1599
'uploader': video_uploader.decode('utf-8'),
1600
'upload_date': u'NA',
1601
'title': video_title,
1602
'stitle': simple_title,
1603
'ext': video_extension.decode('utf-8'),
1607
except UnavailableVideoError:
1608
self._downloader.trouble(u'\nERROR: unable to download video')
1611
class DailymotionIE(InfoExtractor):
1612
"""Information Extractor for Dailymotion"""
1614
_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1615
IE_NAME = u'dailymotion'
1617
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    super(DailymotionIE, self).__init__(downloader)
1620
def report_download_webpage(self, video_id):
    """Announce the webpage download for video_id."""
    message = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
1624
def report_extraction(self, video_id):
    """Announce information extraction for video_id."""
    message = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1628
def _real_extract(self, url):
1629
# Extract id and simplified title from URL
1630
mobj = re.match(self._VALID_URL, url)
1632
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1635
# At this point we have a new video
1636
self._downloader.increment_downloads()
1637
video_id = mobj.group(1)
1639
video_extension = 'flv'
1641
# Retrieve video webpage to extract further information
1642
request = urllib2.Request(url)
1643
request.add_header('Cookie', 'family_filter=off')
1645
self.report_download_webpage(video_id)
1646
webpage = urllib2.urlopen(request).read()
1647
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1648
self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1651
# Extract URL, uploader and title from webpage
1652
self.report_extraction(video_id)
1653
mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1655
self._downloader.trouble(u'ERROR: unable to extract media URL')
1657
sequence = urllib.unquote(mobj.group(1))
1658
mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1660
self._downloader.trouble(u'ERROR: unable to extract media URL')
1662
mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1664
# if needed add http://www.dailymotion.com/ if relative URL
1666
video_url = mediaURL
1668
mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1670
self._downloader.trouble(u'ERROR: unable to extract title')
1672
video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1673
video_title = sanitize_title(video_title)
1674
simple_title = _simplify_title(video_title)
1676
mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1678
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1680
video_uploader = mobj.group(1)
1683
# Process video information
1684
self._downloader.process_info({
1685
'id': video_id.decode('utf-8'),
1686
'url': video_url.decode('utf-8'),
1687
'uploader': video_uploader.decode('utf-8'),
1688
'upload_date': u'NA',
1689
'title': video_title,
1690
'stitle': simple_title,
1691
'ext': video_extension.decode('utf-8'),
1695
except UnavailableVideoError:
1696
self._downloader.trouble(u'\nERROR: unable to download video')
1699
class GoogleIE(InfoExtractor):
1700
"""Information extractor for video.google.com."""
1702
_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1703
IE_NAME = u'video.google'
1705
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    super(GoogleIE, self).__init__(downloader)
1708
def report_download_webpage(self, video_id):
    """Announce the webpage download for video_id."""
    message = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
1712
def report_extraction(self, video_id):
    """Announce information extraction for video_id."""
    message = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1716
def _real_extract(self, url):
1717
# Extract id from URL
1718
mobj = re.match(self._VALID_URL, url)
1720
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1723
# At this point we have a new video
1724
self._downloader.increment_downloads()
1725
video_id = mobj.group(1)
1727
video_extension = 'mp4'
1729
# Retrieve video webpage to extract further information
1730
request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1732
self.report_download_webpage(video_id)
1733
webpage = urllib2.urlopen(request).read()
1734
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1735
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1738
# Extract URL, uploader, and title from webpage
1739
self.report_extraction(video_id)
1740
mobj = re.search(r"download_url:'([^']+)'", webpage)
1742
video_extension = 'flv'
1743
mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1745
self._downloader.trouble(u'ERROR: unable to extract media URL')
1747
mediaURL = urllib.unquote(mobj.group(1))
1748
mediaURL = mediaURL.replace('\\x3d', '\x3d')
1749
mediaURL = mediaURL.replace('\\x26', '\x26')
1751
video_url = mediaURL
1753
mobj = re.search(r'<title>(.*)</title>', webpage)
1755
self._downloader.trouble(u'ERROR: unable to extract title')
1757
video_title = mobj.group(1).decode('utf-8')
1758
video_title = sanitize_title(video_title)
1759
simple_title = _simplify_title(video_title)
1761
# Extract video description
1762
mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1764
self._downloader.trouble(u'ERROR: unable to extract video description')
1766
video_description = mobj.group(1).decode('utf-8')
1767
if not video_description:
1768
video_description = 'No description available.'
1770
# Extract video thumbnail
1771
if self._downloader.params.get('forcethumbnail', False):
1772
request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1774
webpage = urllib2.urlopen(request).read()
1775
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1778
mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1780
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1782
video_thumbnail = mobj.group(1)
1783
else: # we need something to pass to process_info
1784
video_thumbnail = ''
1787
# Process video information
1788
self._downloader.process_info({
1789
'id': video_id.decode('utf-8'),
1790
'url': video_url.decode('utf-8'),
1792
'upload_date': u'NA',
1793
'title': video_title,
1794
'stitle': simple_title,
1795
'ext': video_extension.decode('utf-8'),
1799
except UnavailableVideoError:
1800
self._downloader.trouble(u'\nERROR: unable to download video')
1803
class PhotobucketIE(InfoExtractor):
1804
"""Information extractor for photobucket.com."""
1806
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1807
IE_NAME = u'photobucket'
1809
def __init__(self, downloader=None):
1810
InfoExtractor.__init__(self, downloader)
1812
def report_download_webpage(self, video_id):
1813
"""Report webpage download."""
1814
self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1816
def report_extraction(self, video_id):
1817
"""Report information extraction."""
1818
self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1820
def _real_extract(self, url):
1821
# Extract id from URL
1822
mobj = re.match(self._VALID_URL, url)
1824
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1827
# At this point we have a new video
1828
self._downloader.increment_downloads()
1829
video_id = mobj.group(1)
1831
video_extension = 'flv'
1833
# Retrieve video webpage to extract further information
1834
request = urllib2.Request(url)
1836
self.report_download_webpage(video_id)
1837
webpage = urllib2.urlopen(request).read()
1838
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1839
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1842
# Extract URL, uploader, and title from webpage
1843
self.report_extraction(video_id)
1844
mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1846
self._downloader.trouble(u'ERROR: unable to extract media URL')
1848
mediaURL = urllib.unquote(mobj.group(1))
1850
video_url = mediaURL
1852
mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1854
self._downloader.trouble(u'ERROR: unable to extract title')
1856
video_title = mobj.group(1).decode('utf-8')
1857
video_title = sanitize_title(video_title)
1858
simple_title = _simplify_title(vide_title)
1860
video_uploader = mobj.group(2).decode('utf-8')
1863
# Process video information
1864
self._downloader.process_info({
1865
'id': video_id.decode('utf-8'),
1866
'url': video_url.decode('utf-8'),
1867
'uploader': video_uploader,
1868
'upload_date': u'NA',
1869
'title': video_title,
1870
'stitle': simple_title,
1871
'ext': video_extension.decode('utf-8'),
1875
except UnavailableVideoError:
1876
self._downloader.trouble(u'\nERROR: unable to download video')
1879
class YahooIE(InfoExtractor):
1880
"""Information extractor for video.yahoo.com."""
1882
# _VALID_URL matches all Yahoo! Video URLs
1883
# _VPAGE_URL matches only the extractable '/watch/' URLs
1884
_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1885
_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1886
IE_NAME = u'video.yahoo'
1888
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    super(YahooIE, self).__init__(downloader)
1891
def report_download_webpage(self, video_id):
    """Announce the webpage download for video_id."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
1895
def report_extraction(self, video_id):
    """Announce information extraction for video_id."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1899
def _real_extract(self, url, new_video=True):
1900
# Extract ID from URL
1901
mobj = re.match(self._VALID_URL, url)
1903
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1906
# At this point we have a new video
1907
self._downloader.increment_downloads()
1908
video_id = mobj.group(2)
1909
video_extension = 'flv'
1911
# Rewrite valid but non-extractable URLs as
1912
# extractable English language /watch/ URLs
1913
if re.match(self._VPAGE_URL, url) is None:
1914
request = urllib2.Request(url)
1916
webpage = urllib2.urlopen(request).read()
1917
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1918
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1921
mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1923
self._downloader.trouble(u'ERROR: Unable to extract id field')
1925
yahoo_id = mobj.group(1)
1927
mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1929
self._downloader.trouble(u'ERROR: Unable to extract vid field')
1931
yahoo_vid = mobj.group(1)
1933
url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1934
return self._real_extract(url, new_video=False)
1936
# Retrieve video webpage to extract further information
1937
request = urllib2.Request(url)
1939
self.report_download_webpage(video_id)
1940
webpage = urllib2.urlopen(request).read()
1941
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1942
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1945
# Extract uploader and title from webpage
1946
self.report_extraction(video_id)
1947
mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1949
self._downloader.trouble(u'ERROR: unable to extract video title')
1951
video_title = mobj.group(1).decode('utf-8')
1952
simple_title = _simplify_title(video_title)
1954
mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1956
self._downloader.trouble(u'ERROR: unable to extract video uploader')
1958
video_uploader = mobj.group(1).decode('utf-8')
1960
# Extract video thumbnail
1961
mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1963
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1965
video_thumbnail = mobj.group(1).decode('utf-8')
1967
# Extract video description
1968
mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1970
self._downloader.trouble(u'ERROR: unable to extract video description')
1972
video_description = mobj.group(1).decode('utf-8')
1973
if not video_description:
1974
video_description = 'No description available.'
1976
# Extract video height and width
1977
mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1979
self._downloader.trouble(u'ERROR: unable to extract video height')
1981
yv_video_height = mobj.group(1)
1983
mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1985
self._downloader.trouble(u'ERROR: unable to extract video width')
1987
yv_video_width = mobj.group(1)
1989
# Retrieve video playlist to extract media URL
1990
# I'm not completely sure what all these options are, but we
1991
# seem to need most of them, otherwise the server sends a 401.
1992
yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1993
yv_bitrate = '700' # according to Wikipedia this is hard-coded
1994
request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1995
'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1996
'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1998
self.report_download_webpage(video_id)
1999
webpage = urllib2.urlopen(request).read()
2000
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2001
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2004
# Extract media URL from playlist XML
2005
mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2007
self._downloader.trouble(u'ERROR: Unable to extract media URL')
2009
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2010
video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2013
# Process video information
2014
self._downloader.process_info({
2015
'id': video_id.decode('utf-8'),
2017
'uploader': video_uploader,
2018
'upload_date': u'NA',
2019
'title': video_title,
2020
'stitle': simple_title,
2021
'ext': video_extension.decode('utf-8'),
2022
'thumbnail': video_thumbnail.decode('utf-8'),
2023
'description': video_description,
2024
'thumbnail': video_thumbnail,
2027
except UnavailableVideoError:
2028
self._downloader.trouble(u'\nERROR: unable to download video')
2031
class VimeoIE(InfoExtractor):
2032
"""Information extractor for vimeo.com."""
2034
# _VALID_URL matches Vimeo URLs
2035
_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2038
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    super(VimeoIE, self).__init__(downloader)
2041
def report_download_webpage(self, video_id):
    """Announce the webpage download for video_id."""
    message = u'[vimeo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
2045
def report_extraction(self, video_id):
    """Announce information extraction for video_id."""
    message = u'[vimeo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
2049
def _real_extract(self, url, new_video=True):
2050
# Extract ID from URL
2051
mobj = re.match(self._VALID_URL, url)
2053
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2056
# At this point we have a new video
2057
self._downloader.increment_downloads()
2058
video_id = mobj.group(1)
2060
# Retrieve video webpage to extract further information
2061
request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2063
self.report_download_webpage(video_id)
2064
webpage = urllib2.urlopen(request).read()
2065
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2066
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2069
# Now we begin extracting as much information as we can from what we
2070
# retrieved. First we extract the information common to all extractors,
2071
# and latter we extract those that are Vimeo specific.
2072
self.report_extraction(video_id)
2075
mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2077
self._downloader.trouble(u'ERROR: unable to extract video title')
2079
video_title = mobj.group(1).decode('utf-8')
2080
simple_title = _simplify_title(video_title)
2083
mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2085
self._downloader.trouble(u'ERROR: unable to extract video uploader')
2087
video_uploader = mobj.group(1).decode('utf-8')
2089
# Extract video thumbnail
2090
mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2092
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2094
video_thumbnail = mobj.group(1).decode('utf-8')
2096
# # Extract video description
2097
# mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2099
# self._downloader.trouble(u'ERROR: unable to extract video description')
2101
# video_description = mobj.group(1).decode('utf-8')
2102
# if not video_description: video_description = 'No description available.'
2103
video_description = 'Foo.'
2105
# Vimeo specific: extract request signature
2106
mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2108
self._downloader.trouble(u'ERROR: unable to extract request signature')
2110
sig = mobj.group(1).decode('utf-8')
2112
# Vimeo specific: extract video quality information
2113
mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2115
self._downloader.trouble(u'ERROR: unable to extract video quality information')
2117
quality = mobj.group(1).decode('utf-8')
2119
if int(quality) == 1:
2124
# Vimeo specific: Extract request signature expiration
2125
mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2127
self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2129
sig_exp = mobj.group(1).decode('utf-8')
2131
video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2134
# Process video information
2135
self._downloader.process_info({
2136
'id': video_id.decode('utf-8'),
2138
'uploader': video_uploader,
2139
'upload_date': u'NA',
2140
'title': video_title,
2141
'stitle': simple_title,
2143
'thumbnail': video_thumbnail.decode('utf-8'),
2144
'description': video_description,
2145
'thumbnail': video_thumbnail,
2146
'description': video_description,
2149
except UnavailableVideoError:
2150
self._downloader.trouble(u'ERROR: unable to download video')
2153
class GenericIE(InfoExtractor):
2154
"""Generic last-resort information extractor."""
2157
IE_NAME = u'generic'
2159
def __init__(self, downloader=None):
2160
InfoExtractor.__init__(self, downloader)
2162
def report_download_webpage(self, video_id):
2163
"""Report webpage download."""
2164
self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2165
self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2167
def report_extraction(self, video_id):
2168
"""Report information extraction."""
2169
self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2171
def _real_extract(self, url):
2172
# At this point we have a new video
2173
self._downloader.increment_downloads()
2175
video_id = url.split('/')[-1]
2176
request = urllib2.Request(url)
2178
self.report_download_webpage(video_id)
2179
webpage = urllib2.urlopen(request).read()
2180
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2183
except ValueError, err:
2184
# since this is the last-resort InfoExtractor, if
2185
# this error is thrown, it'll be thrown here
2186
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2189
self.report_extraction(video_id)
2190
# Start with something easy: JW Player in SWFObject
2191
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2193
# Broaden the search a little bit
2194
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2196
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2199
# It's possible that one of the regexes
2200
# matched, but returned an empty group:
2201
if mobj.group(1) is None:
2202
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2205
video_url = urllib.unquote(mobj.group(1))
2206
video_id = os.path.basename(video_url)
2208
# here's a fun little line of code for you:
2209
video_extension = os.path.splitext(video_id)[1][1:]
2210
video_id = os.path.splitext(video_id)[0]
2212
# it's tempting to parse this further, but you would
2213
# have to take into account all the variations like
2214
# Video Title - Site Name
2215
# Site Name | Video Title
2216
# Video Title - Tagline | Site Name
2217
# and so on and so forth; it's just not practical
2218
mobj = re.search(r'<title>(.*)</title>', webpage)
2220
self._downloader.trouble(u'ERROR: unable to extract title')
2222
video_title = mobj.group(1).decode('utf-8')
2223
video_title = sanitize_title(video_title)
2224
simple_title = _simplify_title(video_title)
2226
# video uploader is domain name
2227
mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2229
self._downloader.trouble(u'ERROR: unable to extract title')
2231
video_uploader = mobj.group(1).decode('utf-8')
2234
# Process video information
2235
self._downloader.process_info({
2236
'id': video_id.decode('utf-8'),
2237
'url': video_url.decode('utf-8'),
2238
'uploader': video_uploader,
2239
'upload_date': u'NA',
2240
'title': video_title,
2241
'stitle': simple_title,
2242
'ext': video_extension.decode('utf-8'),
2246
except UnavailableVideoError, err:
2247
self._downloader.trouble(u'\nERROR: unable to download video')
2250
class YoutubeSearchIE(InfoExtractor):
2251
"""Information Extractor for YouTube search queries."""
2252
_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2253
_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2254
_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2255
_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2257
_max_youtube_results = 1000
2258
IE_NAME = u'youtube:search'
2260
def __init__(self, youtube_ie, downloader=None):
2261
InfoExtractor.__init__(self, downloader)
2262
self._youtube_ie = youtube_ie
2264
def report_download_page(self, query, pagenum):
2265
"""Report attempt to download playlist page with given number."""
2266
query = query.decode(preferredencoding())
2267
self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2269
def _real_initialize(self):
2270
self._youtube_ie.initialize()
2272
def _real_extract(self, query):
2273
mobj = re.match(self._VALID_URL, query)
2275
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2278
prefix, query = query.split(':')
2280
query = query.encode('utf-8')
2282
self._download_n_results(query, 1)
2284
elif prefix == 'all':
2285
self._download_n_results(query, self._max_youtube_results)
2291
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2293
elif n > self._max_youtube_results:
2294
self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2295
n = self._max_youtube_results
2296
self._download_n_results(query, n)
2298
except ValueError: # parsing prefix as integer fails
2299
self._download_n_results(query, 1)
2302
def _download_n_results(self, query, n):
2303
"""Downloads a specified number of results for a query"""
2306
already_seen = set()
2310
self.report_download_page(query, pagenum)
2311
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2312
request = urllib2.Request(result_url)
2314
page = urllib2.urlopen(request).read()
2315
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2316
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2319
# Extract video identifiers
2320
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2321
video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2322
if video_id not in already_seen:
2323
video_ids.append(video_id)
2324
already_seen.add(video_id)
2325
if len(video_ids) == n:
2326
# Specified n videos reached
2327
for id in video_ids:
2328
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2331
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2332
for id in video_ids:
2333
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2336
pagenum = pagenum + 1
2339
class GoogleSearchIE(InfoExtractor):
2340
"""Information Extractor for Google Video search queries."""
2341
_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2342
_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2343
_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2344
_MORE_PAGES_INDICATOR = r'<span>Next</span>'
2346
_max_google_results = 1000
2347
IE_NAME = u'video.google:search'
2349
def __init__(self, google_ie, downloader=None):
2350
InfoExtractor.__init__(self, downloader)
2351
self._google_ie = google_ie
2353
def report_download_page(self, query, pagenum):
2354
"""Report attempt to download playlist page with given number."""
2355
query = query.decode(preferredencoding())
2356
self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2358
def _real_initialize(self):
2359
self._google_ie.initialize()
2361
def _real_extract(self, query):
2362
mobj = re.match(self._VALID_URL, query)
2364
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2367
prefix, query = query.split(':')
2369
query = query.encode('utf-8')
2371
self._download_n_results(query, 1)
2373
elif prefix == 'all':
2374
self._download_n_results(query, self._max_google_results)
2380
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2382
elif n > self._max_google_results:
2383
self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2384
n = self._max_google_results
2385
self._download_n_results(query, n)
2387
except ValueError: # parsing prefix as integer fails
2388
self._download_n_results(query, 1)
2391
def _download_n_results(self, query, n):
2392
"""Downloads a specified number of results for a query"""
2395
already_seen = set()
2399
self.report_download_page(query, pagenum)
2400
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2401
request = urllib2.Request(result_url)
2403
page = urllib2.urlopen(request).read()
2404
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2405
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2408
# Extract video identifiers
2409
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2410
video_id = mobj.group(1)
2411
if video_id not in already_seen:
2412
video_ids.append(video_id)
2413
already_seen.add(video_id)
2414
if len(video_ids) == n:
2415
# Specified n videos reached
2416
for id in video_ids:
2417
self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2420
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2421
for id in video_ids:
2422
self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2425
pagenum = pagenum + 1
2428
class YahooSearchIE(InfoExtractor):
2429
"""Information Extractor for Yahoo! Video search queries."""
2430
_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2431
_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2432
_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2433
_MORE_PAGES_INDICATOR = r'\s*Next'
2435
_max_yahoo_results = 1000
2436
IE_NAME = u'video.yahoo:search'
2438
def __init__(self, yahoo_ie, downloader=None):
2439
InfoExtractor.__init__(self, downloader)
2440
self._yahoo_ie = yahoo_ie
2442
def report_download_page(self, query, pagenum):
2443
"""Report attempt to download playlist page with given number."""
2444
query = query.decode(preferredencoding())
2445
self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2447
def _real_initialize(self):
2448
self._yahoo_ie.initialize()
2450
def _real_extract(self, query):
2451
mobj = re.match(self._VALID_URL, query)
2453
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2456
prefix, query = query.split(':')
2458
query = query.encode('utf-8')
2460
self._download_n_results(query, 1)
2462
elif prefix == 'all':
2463
self._download_n_results(query, self._max_yahoo_results)
2469
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2471
elif n > self._max_yahoo_results:
2472
self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2473
n = self._max_yahoo_results
2474
self._download_n_results(query, n)
2476
except ValueError: # parsing prefix as integer fails
2477
self._download_n_results(query, 1)
2480
def _download_n_results(self, query, n):
2481
"""Downloads a specified number of results for a query"""
2484
already_seen = set()
2488
self.report_download_page(query, pagenum)
2489
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2490
request = urllib2.Request(result_url)
2492
page = urllib2.urlopen(request).read()
2493
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2494
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2497
# Extract video identifiers
2498
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2499
video_id = mobj.group(1)
2500
if video_id not in already_seen:
2501
video_ids.append(video_id)
2502
already_seen.add(video_id)
2503
if len(video_ids) == n:
2504
# Specified n videos reached
2505
for id in video_ids:
2506
self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2509
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2510
for id in video_ids:
2511
self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2514
pagenum = pagenum + 1
2517
class YoutubePlaylistIE(InfoExtractor):
2518
"""Information Extractor for YouTube playlists."""
2520
_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2521
_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2522
_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2523
_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2525
IE_NAME = u'youtube:playlist'
2527
def __init__(self, youtube_ie, downloader=None):
2528
InfoExtractor.__init__(self, downloader)
2529
self._youtube_ie = youtube_ie
2531
def report_download_page(self, playlist_id, pagenum):
2532
"""Report attempt to download playlist page with given number."""
2533
self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2535
def _real_initialize(self):
2536
self._youtube_ie.initialize()
2538
def _real_extract(self, url):
2539
# Extract playlist id
2540
mobj = re.match(self._VALID_URL, url)
2542
self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2546
if mobj.group(3) is not None:
2547
self._youtube_ie.extract(mobj.group(3))
2550
# Download playlist pages
2551
# prefix is 'p' as default for playlists but there are other types that need extra care
2552
playlist_prefix = mobj.group(1)
2553
if playlist_prefix == 'a':
2554
playlist_access = 'artist'
2556
playlist_prefix = 'p'
2557
playlist_access = 'view_play_list'
2558
playlist_id = mobj.group(2)
2563
self.report_download_page(playlist_id, pagenum)
2564
url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2565
request = urllib2.Request(url)
2567
page = urllib2.urlopen(request).read()
2568
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2569
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2572
# Extract video identifiers
2574
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2575
if mobj.group(1) not in ids_in_page:
2576
ids_in_page.append(mobj.group(1))
2577
video_ids.extend(ids_in_page)
2579
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2581
pagenum = pagenum + 1
2583
playliststart = self._downloader.params.get('playliststart', 1) - 1
2584
playlistend = self._downloader.params.get('playlistend', -1)
2585
video_ids = video_ids[playliststart:playlistend]
2587
for id in video_ids:
2588
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2592
class YoutubeUserIE(InfoExtractor):
2593
"""Information Extractor for YouTube users."""
2595
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2596
_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2597
_GDATA_PAGE_SIZE = 50
2598
_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2599
_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2601
IE_NAME = u'youtube:user'
2603
def __init__(self, youtube_ie, downloader=None):
2604
InfoExtractor.__init__(self, downloader)
2605
self._youtube_ie = youtube_ie
2607
def report_download_page(self, username, start_index):
2608
"""Report attempt to download user page."""
2609
self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2610
(username, start_index, start_index + self._GDATA_PAGE_SIZE))
2612
def _real_initialize(self):
2613
self._youtube_ie.initialize()
2615
def _real_extract(self, url):
2617
mobj = re.match(self._VALID_URL, url)
2619
self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2622
username = mobj.group(1)
2624
# Download video ids using YouTube Data API. Result size per
2625
# query is limited (currently to 50 videos) so we need to query
2626
# page by page until there are no video ids - it means we got
2633
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2634
self.report_download_page(username, start_index)
2636
request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2639
page = urllib2.urlopen(request).read()
2640
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2641
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2644
# Extract video identifiers
2647
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2648
if mobj.group(1) not in ids_in_page:
2649
ids_in_page.append(mobj.group(1))
2651
video_ids.extend(ids_in_page)
2653
# A little optimization - if current page is not
2654
# "full", ie. does not contain PAGE_SIZE video ids then
2655
# we can assume that this page is the last one - there
2656
# are no more ids on further pages - no need to query
2659
if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2664
all_ids_count = len(video_ids)
2665
playliststart = self._downloader.params.get('playliststart', 1) - 1
2666
playlistend = self._downloader.params.get('playlistend', -1)
2668
if playlistend == -1:
2669
video_ids = video_ids[playliststart:]
2671
video_ids = video_ids[playliststart:playlistend]
2673
self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2674
(username, all_ids_count, len(video_ids)))
2676
for video_id in video_ids:
2677
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2680
class DepositFilesIE(InfoExtractor):
2681
"""Information extractor for depositfiles.com"""
2683
_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2684
IE_NAME = u'DepositFiles'
2686
def __init__(self, downloader=None):
2687
InfoExtractor.__init__(self, downloader)
2689
def report_download_webpage(self, file_id):
2690
"""Report webpage download."""
2691
self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2693
def report_extraction(self, file_id):
2694
"""Report information extraction."""
2695
self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2697
def _real_extract(self, url):
2698
# At this point we have a new file
2699
self._downloader.increment_downloads()
2701
file_id = url.split('/')[-1]
2702
# Rebuild url in english locale
2703
url = 'http://depositfiles.com/en/files/' + file_id
2705
# Retrieve file webpage with 'Free download' button pressed
2706
free_download_indication = { 'gateway_result' : '1' }
2707
request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2709
self.report_download_webpage(file_id)
2710
webpage = urllib2.urlopen(request).read()
2711
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2712
self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2715
# Search for the real file URL
2716
mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2717
if (mobj is None) or (mobj.group(1) is None):
2718
# Try to figure out reason of the error.
2719
mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2720
if (mobj is not None) and (mobj.group(1) is not None):
2721
restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2722
self._downloader.trouble(u'ERROR: %s' % restriction_message)
2724
self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2727
file_url = mobj.group(1)
2728
file_extension = os.path.splitext(file_url)[1][1:]
2730
# Search for file title
2731
mobj = re.search(r'<b title="(.*?)">', webpage)
2733
self._downloader.trouble(u'ERROR: unable to extract title')
2735
file_title = mobj.group(1).decode('utf-8')
2738
# Process file information
2739
self._downloader.process_info({
2740
'id': file_id.decode('utf-8'),
2741
'url': file_url.decode('utf-8'),
2743
'upload_date': u'NA',
2744
'title': file_title,
2745
'stitle': file_title,
2746
'ext': file_extension.decode('utf-8'),
2750
except UnavailableVideoError, err:
2751
self._downloader.trouble(u'ERROR: unable to download file')
2754
class FacebookIE(InfoExtractor):
2755
"""Information Extractor for Facebook"""
2757
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2758
_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2759
_NETRC_MACHINE = 'facebook'
2760
_available_formats = ['video', 'highqual', 'lowqual']
2761
_video_extensions = {
2766
IE_NAME = u'facebook'
2768
def __init__(self, downloader=None):
2769
InfoExtractor.__init__(self, downloader)
2771
def _reporter(self, message):
2772
"""Add header and report message."""
2773
self._downloader.to_screen(u'[facebook] %s' % message)
2775
def report_login(self):
2776
"""Report attempt to log in."""
2777
self._reporter(u'Logging in')
2779
def report_video_webpage_download(self, video_id):
2780
"""Report attempt to download video webpage."""
2781
self._reporter(u'%s: Downloading video webpage' % video_id)
2783
def report_information_extraction(self, video_id):
2784
"""Report attempt to extract video information."""
2785
self._reporter(u'%s: Extracting video information' % video_id)
2787
def _parse_page(self, video_webpage):
2788
"""Extract video information from page"""
2790
data = {'title': r'\("video_title", "(.*?)"\)',
2791
'description': r'<div class="datawrap">(.*?)</div>',
2792
'owner': r'\("video_owner_name", "(.*?)"\)',
2793
'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2796
for piece in data.keys():
2797
mobj = re.search(data[piece], video_webpage)
2798
if mobj is not None:
2799
video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2803
for fmt in self._available_formats:
2804
mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2805
if mobj is not None:
2806
# URL is in a Javascript segment inside an escaped Unicode format within
2807
# the generally utf-8 page
2808
video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2809
video_info['video_urls'] = video_urls
2813
def _real_initialize(self):
2814
if self._downloader is None:
2819
downloader_params = self._downloader.params
2821
# Attempt to use provided username and password or .netrc data
2822
if downloader_params.get('username', None) is not None:
2823
useremail = downloader_params['username']
2824
password = downloader_params['password']
2825
elif downloader_params.get('usenetrc', False):
2827
info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2828
if info is not None:
2832
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2833
except (IOError, netrc.NetrcParseError), err:
2834
self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2837
if useremail is None:
2846
request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2849
login_results = urllib2.urlopen(request).read()
2850
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2851
self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2853
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2854
self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2857
def _real_extract(self, url):
2858
mobj = re.match(self._VALID_URL, url)
2860
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2862
video_id = mobj.group('ID')
2865
self.report_video_webpage_download(video_id)
2866
request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2868
page = urllib2.urlopen(request)
2869
video_webpage = page.read()
2870
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2871
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2874
# Start extracting information
2875
self.report_information_extraction(video_id)
2877
# Extract information
2878
video_info = self._parse_page(video_webpage)
2881
if 'owner' not in video_info:
2882
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2884
video_uploader = video_info['owner']
2887
if 'title' not in video_info:
2888
self._downloader.trouble(u'ERROR: unable to extract video title')
2890
video_title = video_info['title']
2891
video_title = video_title.decode('utf-8')
2892
video_title = sanitize_title(video_title)
2894
simple_title = _simplify_title(video_title)
2897
if 'thumbnail' not in video_info:
2898
self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2899
video_thumbnail = ''
2901
video_thumbnail = video_info['thumbnail']
2905
if 'upload_date' in video_info:
2906
upload_time = video_info['upload_date']
2907
timetuple = email.utils.parsedate_tz(upload_time)
2908
if timetuple is not None:
2910
upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2915
video_description = video_info.get('description', 'No description available.')
2917
url_map = video_info['video_urls']
2918
if len(url_map.keys()) > 0:
2919
# Decide which formats to download
2920
req_format = self._downloader.params.get('format', None)
2921
format_limit = self._downloader.params.get('format_limit', None)
2923
if format_limit is not None and format_limit in self._available_formats:
2924
format_list = self._available_formats[self._available_formats.index(format_limit):]
2926
format_list = self._available_formats
2927
existing_formats = [x for x in format_list if x in url_map]
2928
if len(existing_formats) == 0:
2929
self._downloader.trouble(u'ERROR: no known formats available for video')
2931
if req_format is None:
2932
video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2933
elif req_format == 'worst':
2934
video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2935
elif req_format == '-1':
2936
video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2939
if req_format not in url_map:
2940
self._downloader.trouble(u'ERROR: requested format not available')
2942
video_url_list = [(req_format, url_map[req_format])] # Specific format
2944
for format_param, video_real_url in video_url_list:
2946
# At this point we have a new video
2947
self._downloader.increment_downloads()
2950
video_extension = self._video_extensions.get(format_param, 'mp4')
2953
# Process video information
2954
self._downloader.process_info({
2955
'id': video_id.decode('utf-8'),
2956
'url': video_real_url.decode('utf-8'),
2957
'uploader': video_uploader.decode('utf-8'),
2958
'upload_date': upload_date,
2959
'title': video_title,
2960
'stitle': simple_title,
2961
'ext': video_extension.decode('utf-8'),
2962
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2963
'thumbnail': video_thumbnail.decode('utf-8'),
2964
'description': video_description.decode('utf-8'),
2967
except UnavailableVideoError, err:
2968
self._downloader.trouble(u'\nERROR: unable to download video')
2970
class BlipTVIE(InfoExtractor):
2971
"""Information extractor for blip.tv"""
2973
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2974
_URL_EXT = r'^.*\.([a-z0-9]+)$'
2975
IE_NAME = u'blip.tv'
2977
def report_extraction(self, file_id):
2978
"""Report information extraction."""
2979
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2981
def report_direct_download(self, title):
2982
"""Report information extraction."""
2983
self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2985
def _real_extract(self, url):
2986
mobj = re.match(self._VALID_URL, url)
2988
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2995
json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2996
request = urllib2.Request(json_url)
2997
self.report_extraction(mobj.group(1))
3000
urlh = urllib2.urlopen(request)
3001
if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3002
basename = url.split('/')[-1]
3003
title,ext = os.path.splitext(basename)
3004
title = title.decode('UTF-8')
3005
ext = ext.replace('.', '')
3006
self.report_direct_download(title)
3011
'stitle': _simplify_title(title),
3015
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3016
self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3018
if info is None: # Regular URL
3020
json_code = urlh.read()
3021
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3022
self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3026
json_data = json.loads(json_code)
3027
if 'Post' in json_data:
3028
data = json_data['Post']
3032
upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3033
video_url = data['media']['url']
3034
umobj = re.match(self._URL_EXT, video_url)
3036
raise ValueError('Can not determine filename extension')
3037
ext = umobj.group(1)
3040
'id': data['item_id'],
3042
'uploader': data['display_name'],
3043
'upload_date': upload_date,
3044
'title': data['title'],
3045
'stitle': _simplify_title(data['title']),
3047
'format': data['media']['mimeType'],
3048
'thumbnail': data['thumbnailUrl'],
3049
'description': data['description'],
3050
'player_url': data['embedUrl']
3052
except (ValueError,KeyError), err:
3053
self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3056
self._downloader.increment_downloads()
3059
self._downloader.process_info(info)
3060
except UnavailableVideoError, err:
3061
self._downloader.trouble(u'\nERROR: unable to download video')
3064
class MyVideoIE(InfoExtractor):
3065
"""Information Extractor for myvideo.de."""
3067
_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3068
IE_NAME = u'myvideo'
3070
def __init__(self, downloader=None):
3071
InfoExtractor.__init__(self, downloader)
3073
def report_download_webpage(self, video_id):
3074
"""Report webpage download."""
3075
self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3077
def report_extraction(self, video_id):
3078
"""Report information extraction."""
3079
self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3081
def _real_extract(self,url):
3082
mobj = re.match(self._VALID_URL, url)
3084
self._download.trouble(u'ERROR: invalid URL: %s' % url)
3087
video_id = mobj.group(1)
3090
request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3092
self.report_download_webpage(video_id)
3093
webpage = urllib2.urlopen(request).read()
3094
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3095
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3098
self.report_extraction(video_id)
3099
mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3102
self._downloader.trouble(u'ERROR: unable to extract media URL')
3104
video_url = mobj.group(1) + ('/%s.flv' % video_id)
3106
mobj = re.search('<title>([^<]+)</title>', webpage)
3108
self._downloader.trouble(u'ERROR: unable to extract title')
3111
video_title = mobj.group(1)
3112
video_title = sanitize_title(video_title)
3114
simple_title = _simplify_title(video_title)
3117
self._downloader.process_info({
3121
'upload_date': u'NA',
3122
'title': video_title,
3123
'stitle': simple_title,
3128
except UnavailableVideoError:
3129
self._downloader.trouble(u'\nERROR: Unable to download video')
3131
class ComedyCentralIE(InfoExtractor):
3132
"""Information extractor for The Daily Show and Colbert Report """
3134
_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3135
IE_NAME = u'comedycentral'
3137
def report_extraction(self, episode_id):
3138
self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3140
def report_config_download(self, episode_id):
3141
self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3143
def report_index_download(self, episode_id):
3144
self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3146
def report_player_url(self, episode_id):
3147
self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3149
def _real_extract(self, url):
3150
mobj = re.match(self._VALID_URL, url)
3152
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3155
if mobj.group('shortname'):
3156
if mobj.group('shortname') in ('tds', 'thedailyshow'):
3157
url = u'http://www.thedailyshow.com/full-episodes/'
3159
url = u'http://www.colbertnation.com/full-episodes/'
3160
mobj = re.match(self._VALID_URL, url)
3161
assert mobj is not None
3163
dlNewest = not mobj.group('episode')
3165
epTitle = mobj.group('showname')
3167
epTitle = mobj.group('episode')
3169
req = urllib2.Request(url)
3170
self.report_extraction(epTitle)
3172
htmlHandle = urllib2.urlopen(req)
3173
html = htmlHandle.read()
3174
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3175
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3178
url = htmlHandle.geturl()
3179
mobj = re.match(self._VALID_URL, url)
3181
self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3183
if mobj.group('episode') == '':
3184
self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3186
epTitle = mobj.group('episode')
3188
mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3189
if len(mMovieParams) == 0:
3190
self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3193
playerUrl_raw = mMovieParams[0][0]
3194
self.report_player_url(epTitle)
3196
urlHandle = urllib2.urlopen(playerUrl_raw)
3197
playerUrl = urlHandle.geturl()
3198
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3199
self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3202
uri = mMovieParams[0][1]
3203
indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3204
self.report_index_download(epTitle)
3206
indexXml = urllib2.urlopen(indexUrl).read()
3207
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3208
self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3211
idoc = xml.etree.ElementTree.fromstring(indexXml)
3212
itemEls = idoc.findall('.//item')
3213
for itemEl in itemEls:
3214
mediaId = itemEl.findall('./guid')[0].text
3215
shortMediaId = mediaId.split(':')[-1]
3216
showId = mediaId.split(':')[-2].replace('.com', '')
3217
officialTitle = itemEl.findall('./title')[0].text
3218
officialDate = itemEl.findall('./pubDate')[0].text
3220
configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3221
urllib.urlencode({'uri': mediaId}))
3222
configReq = urllib2.Request(configUrl)
3223
self.report_config_download(epTitle)
3225
configXml = urllib2.urlopen(configReq).read()
3226
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3227
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3230
cdoc = xml.etree.ElementTree.fromstring(configXml)
3232
for rendition in cdoc.findall('.//rendition'):
3233
finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3237
self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3240
# For now, just pick the highest bitrate
3241
format,video_url = turls[-1]
3243
self._downloader.increment_downloads()
3245
effTitle = showId + u'-' + epTitle
3250
'upload_date': officialDate,
3252
'stitle': _simplify_title(effTitle),
3256
'description': officialTitle,
3257
'player_url': playerUrl
3261
self._downloader.process_info(info)
3262
except UnavailableVideoError, err:
3263
self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3267
class EscapistIE(InfoExtractor):
3268
"""Information extractor for The Escapist """
3270
_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3271
IE_NAME = u'escapist'
3273
def report_extraction(self, showName):
3274
self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3276
def report_config_download(self, showName):
3277
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3279
def _real_extract(self, url):
3280
htmlParser = HTMLParser.HTMLParser()
3282
mobj = re.match(self._VALID_URL, url)
3284
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3286
showName = mobj.group('showname')
3287
videoId = mobj.group('episode')
3289
self.report_extraction(showName)
3291
webPage = urllib2.urlopen(url).read()
3292
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3293
self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3296
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3297
description = htmlParser.unescape(descMatch.group(1))
3298
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3299
imgUrl = htmlParser.unescape(imgMatch.group(1))
3300
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3301
playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3302
configUrlMatch = re.search('config=(.*)$', playerUrl)
3303
configUrl = urllib2.unquote(configUrlMatch.group(1))
3305
self.report_config_download(showName)
3307
configJSON = urllib2.urlopen(configUrl).read()
3308
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3309
self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3312
# Technically, it's JavaScript, not JSON
3313
configJSON = configJSON.replace("'", '"')
3316
config = json.loads(configJSON)
3317
except (ValueError,), err:
3318
self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3321
playlist = config['playlist']
3322
videoUrl = playlist[1]['url']
3324
self._downloader.increment_downloads()
3328
'uploader': showName,
3329
'upload_date': None,
3331
'stitle': _simplify_title(showName),
3334
'thumbnail': imgUrl,
3335
'description': description,
3336
'player_url': playerUrl,
3340
self._downloader.process_info(info)
3341
except UnavailableVideoError, err:
3342
self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3345
class CollegeHumorIE(InfoExtractor):
3346
"""Information extractor for collegehumor.com"""
3348
_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3349
IE_NAME = u'collegehumor'
3351
def report_webpage(self, video_id):
3352
"""Report information extraction."""
3353
self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3355
def report_extraction(self, video_id):
3356
"""Report information extraction."""
3357
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3359
def _real_extract(self, url):
3360
htmlParser = HTMLParser.HTMLParser()
3362
mobj = re.match(self._VALID_URL, url)
3364
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3366
video_id = mobj.group('videoid')
3368
self.report_webpage(video_id)
3369
request = urllib2.Request(url)
3371
webpage = urllib2.urlopen(request).read()
3372
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3373
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3376
m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3378
self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3380
internal_video_id = m.group('internalvideoid')
3384
'internal_id': internal_video_id,
3387
self.report_extraction(video_id)
3388
xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3390
metaXml = urllib2.urlopen(xmlUrl).read()
3391
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3392
self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3395
mdoc = xml.etree.ElementTree.fromstring(metaXml)
3397
videoNode = mdoc.findall('./video')[0]
3398
info['description'] = videoNode.findall('./description')[0].text
3399
info['title'] = videoNode.findall('./caption')[0].text
3400
info['stitle'] = _simplify_title(info['title'])
3401
info['url'] = videoNode.findall('./file')[0].text
3402
info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3403
info['ext'] = info['url'].rpartition('.')[2]
3404
info['format'] = info['ext']
3406
self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3409
self._downloader.increment_downloads()
3412
self._downloader.process_info(info)
3413
except UnavailableVideoError, err:
3414
self._downloader.trouble(u'\nERROR: unable to download video')
3417
class XVideosIE(InfoExtractor):
3418
"""Information extractor for xvideos.com"""
3420
_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3421
IE_NAME = u'xvideos'
3423
def report_webpage(self, video_id):
3424
"""Report information extraction."""
3425
self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3427
def report_extraction(self, video_id):
3428
"""Report information extraction."""
3429
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3431
def _real_extract(self, url):
3432
htmlParser = HTMLParser.HTMLParser()
3434
mobj = re.match(self._VALID_URL, url)
3436
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3438
video_id = mobj.group(1).decode('utf-8')
3440
self.report_webpage(video_id)
3442
request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3444
webpage = urllib2.urlopen(request).read()
3445
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3446
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3449
self.report_extraction(video_id)
3453
mobj = re.search(r'flv_url=(.+?)&', webpage)
3455
self._downloader.trouble(u'ERROR: unable to extract video url')
3457
video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3461
mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3463
self._downloader.trouble(u'ERROR: unable to extract video title')
3465
video_title = mobj.group(1).decode('utf-8')
3468
# Extract video thumbnail
3469
mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3471
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3473
video_thumbnail = mobj.group(1).decode('utf-8')
3477
self._downloader.increment_downloads()
3482
'upload_date': None,
3483
'title': video_title,
3484
'stitle': _simplify_title(video_title),
3487
'thumbnail': video_thumbnail,
3488
'description': None,
3493
self._downloader.process_info(info)
3494
except UnavailableVideoError, err:
3495
self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3498
class SoundcloudIE(InfoExtractor):
3499
"""Information extractor for soundcloud.com
3500
To access the media, the uid of the song and a stream token
3501
must be extracted from the page source and the script must make
3502
a request to media.soundcloud.com/crossdomain.xml. Then
3503
the media can be grabbed by requesting from an url composed
3504
of the stream token and uid
3507
_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3508
IE_NAME = u'soundcloud'
3510
def __init__(self, downloader=None):
3511
InfoExtractor.__init__(self, downloader)
3513
def report_webpage(self, video_id):
3514
"""Report information extraction."""
3515
self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3517
def report_extraction(self, video_id):
3518
"""Report information extraction."""
3519
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3521
def _real_extract(self, url):
3522
htmlParser = HTMLParser.HTMLParser()
3524
mobj = re.match(self._VALID_URL, url)
3526
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3529
# extract uploader (which is in the url)
3530
uploader = mobj.group(1).decode('utf-8')
3531
# extract simple title (uploader + slug of song title)
3532
slug_title = mobj.group(2).decode('utf-8')
3533
simple_title = uploader + '-' + slug_title
3535
self.report_webpage('%s/%s' % (uploader, slug_title))
3537
request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3539
webpage = urllib2.urlopen(request).read()
3540
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3541
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3544
self.report_extraction('%s/%s' % (uploader, slug_title))
3546
# extract uid and stream token that soundcloud hands out for access
3547
mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3549
video_id = mobj.group(1)
3550
stream_token = mobj.group(2)
3552
# extract unsimplified title
3553
mobj = re.search('"title":"(.*?)",', webpage)
3555
title = mobj.group(1)
3557
# construct media url (with uid/token)
3558
mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3559
mediaURL = mediaURL % (video_id, stream_token)
3562
description = u'No description available'
3563
mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3565
description = mobj.group(1)
3569
mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3572
upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3573
except Exception, e:
3576
# for soundcloud, a request to a cross domain is required for cookies
3577
request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3580
self._downloader.process_info({
3581
'id': video_id.decode('utf-8'),
3583
'uploader': uploader.decode('utf-8'),
3584
'upload_date': upload_date,
3585
'title': simple_title.decode('utf-8'),
3586
'stitle': simple_title.decode('utf-8'),
3590
'description': description.decode('utf-8')
3592
except UnavailableVideoError:
3593
self._downloader.trouble(u'\nERROR: unable to download video')
3596
class InfoQIE(InfoExtractor):
3597
"""Information extractor for infoq.com"""
3599
_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3602
def report_webpage(self, video_id):
3603
"""Report information extraction."""
3604
self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3606
def report_extraction(self, video_id):
3607
"""Report information extraction."""
3608
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3610
def _real_extract(self, url):
3611
htmlParser = HTMLParser.HTMLParser()
3613
mobj = re.match(self._VALID_URL, url)
3615
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3618
self.report_webpage(url)
3620
request = urllib2.Request(url)
3622
webpage = urllib2.urlopen(request).read()
3623
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3624
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3627
self.report_extraction(url)
3631
mobj = re.search(r"jsclassref='([^']*)'", webpage)
3633
self._downloader.trouble(u'ERROR: unable to extract video url')
3635
video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3639
mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3641
self._downloader.trouble(u'ERROR: unable to extract video title')
3643
video_title = mobj.group(1).decode('utf-8')
3645
# Extract description
3646
video_description = u'No description available.'
3647
mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3648
if mobj is not None:
3649
video_description = mobj.group(1).decode('utf-8')
3651
video_filename = video_url.split('/')[-1]
3652
video_id, extension = video_filename.split('.')
3654
self._downloader.increment_downloads()
3659
'upload_date': None,
3660
'title': video_title,
3661
'stitle': _simplify_title(video_title),
3663
'format': extension, # Extension is always(?) mp4, but seems to be flv
3665
'description': video_description,
3670
self._downloader.process_info(info)
3671
except UnavailableVideoError, err:
3672
self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3674
class MixcloudIE(InfoExtractor):
3675
"""Information extractor for www.mixcloud.com"""
3676
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3677
IE_NAME = u'mixcloud'
3679
def __init__(self, downloader=None):
3680
InfoExtractor.__init__(self, downloader)
3682
def report_download_json(self, file_id):
3683
"""Report JSON download."""
3684
self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3686
def report_extraction(self, file_id):
3687
"""Report information extraction."""
3688
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3690
def get_urls(self, jsonData, fmt, bitrate='best'):
3691
"""Get urls from 'audio_formats' section in json"""
3694
bitrate_list = jsonData[fmt]
3695
if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3696
bitrate = max(bitrate_list) # select highest
3698
url_list = jsonData[fmt][bitrate]
3699
except TypeError: # we have no bitrate info.
3700
url_list = jsonData[fmt]
3704
def check_urls(self, url_list):
3705
"""Returns 1st active url from list"""
3706
for url in url_list:
3708
urllib2.urlopen(url)
3710
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3715
def _print_formats(self, formats):
3716
print 'Available formats:'
3717
for fmt in formats.keys():
3718
for b in formats[fmt]:
3720
ext = formats[fmt][b][0]
3721
print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3722
except TypeError: # we have no bitrate info
3723
ext = formats[fmt][0]
3724
print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3727
def _real_extract(self, url):
3728
mobj = re.match(self._VALID_URL, url)
3730
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3732
# extract uploader & filename from url
3733
uploader = mobj.group(1).decode('utf-8')
3734
file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3736
# construct API request
3737
file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3738
# retrieve .json file with links to files
3739
request = urllib2.Request(file_url)
3741
self.report_download_json(file_url)
3742
jsonData = urllib2.urlopen(request).read()
3743
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3744
self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3748
json_data = json.loads(jsonData)
3749
player_url = json_data['player_swf_url']
3750
formats = dict(json_data['audio_formats'])
3752
req_format = self._downloader.params.get('format', None)
3755
if self._downloader.params.get('listformats', None):
3756
self._print_formats(formats)
3759
if req_format is None or req_format == 'best':
3760
for format_param in formats.keys():
3761
url_list = self.get_urls(formats, format_param)
3763
file_url = self.check_urls(url_list)
3764
if file_url is not None:
3767
if req_format not in formats.keys():
3768
self._downloader.trouble(u'ERROR: format is not available')
3771
url_list = self.get_urls(formats, req_format)
3772
file_url = self.check_urls(url_list)
3773
format_param = req_format
3776
self._downloader.increment_downloads()
3778
# Process file information
3779
self._downloader.process_info({
3780
'id': file_id.decode('utf-8'),
3781
'url': file_url.decode('utf-8'),
3782
'uploader': uploader.decode('utf-8'),
3783
'upload_date': u'NA',
3784
'title': json_data['name'],
3785
'stitle': _simplify_title(json_data['name']),
3786
'ext': file_url.split('.')[-1].decode('utf-8'),
3787
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3788
'thumbnail': json_data['thumbnail_url'],
3789
'description': json_data['description'],
3790
'player_url': player_url.decode('utf-8'),
3792
except UnavailableVideoError, err:
3793
self._downloader.trouble(u'ERROR: unable to download file')
3795
class StanfordOpenClassroomIE(InfoExtractor):
3796
"""Information extractor for Stanford's Open ClassRoom"""
3798
_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3799
IE_NAME = u'stanfordoc'
3801
def report_download_webpage(self, objid):
3802
"""Report information extraction."""
3803
self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3805
def report_extraction(self, video_id):
3806
"""Report information extraction."""
3807
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3809
def _real_extract(self, url):
3810
mobj = re.match(self._VALID_URL, url)
3812
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3815
if mobj.group('course') and mobj.group('video'): # A specific video
3816
course = mobj.group('course')
3817
video = mobj.group('video')
3819
'id': _simplify_title(course + '_' + video),
3822
self.report_extraction(info['id'])
3823
baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3824
xmlUrl = baseUrl + video + '.xml'
3826
metaXml = urllib2.urlopen(xmlUrl).read()
3827
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3828
self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3830
mdoc = xml.etree.ElementTree.fromstring(metaXml)
3832
info['title'] = mdoc.findall('./title')[0].text
3833
info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3835
self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3837
info['stitle'] = _simplify_title(info['title'])
3838
info['ext'] = info['url'].rpartition('.')[2]
3839
info['format'] = info['ext']
3840
self._downloader.increment_downloads()
3842
self._downloader.process_info(info)
3843
except UnavailableVideoError, err:
3844
self._downloader.trouble(u'\nERROR: unable to download video')
3845
elif mobj.group('course'): # A course page
3846
unescapeHTML = HTMLParser.HTMLParser().unescape
3848
course = mobj.group('course')
3850
'id': _simplify_title(course),
3854
self.report_download_webpage(info['id'])
3856
coursepage = urllib2.urlopen(url).read()
3857
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3858
self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3861
m = re.search('<h1>([^<]+)</h1>', coursepage)
3863
info['title'] = unescapeHTML(m.group(1))
3865
info['title'] = info['id']
3866
info['stitle'] = _simplify_title(info['title'])
3868
m = re.search('<description>([^<]+)</description>', coursepage)
3870
info['description'] = unescapeHTML(m.group(1))
3872
links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3875
'type': 'reference',
3876
'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3880
for entry in info['list']:
3881
assert entry['type'] == 'reference'
3882
self.extract(entry['url'])
3884
unescapeHTML = HTMLParser.HTMLParser().unescape
3887
'id': 'Stanford OpenClassroom',
3891
self.report_download_webpage(info['id'])
3892
rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3894
rootpage = urllib2.urlopen(rootURL).read()
3895
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3896
self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3899
info['title'] = info['id']
3900
info['stitle'] = _simplify_title(info['title'])
3902
links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3905
'type': 'reference',
3906
'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3910
for entry in info['list']:
3911
assert entry['type'] == 'reference'
3912
self.extract(entry['url'])
3914
class MTVIE(InfoExtractor):
3915
"""Information extractor for MTV.com"""
3917
_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3920
def report_webpage(self, video_id):
3921
"""Report information extraction."""
3922
self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3924
def report_extraction(self, video_id):
3925
"""Report information extraction."""
3926
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3928
def _real_extract(self, url):
3929
mobj = re.match(self._VALID_URL, url)
3931
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3933
if not mobj.group('proto'):
3934
url = 'http://' + url
3935
video_id = mobj.group('videoid')
3936
self.report_webpage(video_id)
3938
request = urllib2.Request(url)
3940
webpage = urllib2.urlopen(request).read()
3941
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3942
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3945
mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3947
self._downloader.trouble(u'ERROR: unable to extract song name')
3949
song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3950
mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3952
self._downloader.trouble(u'ERROR: unable to extract performer')
3954
performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3955
video_title = performer + ' - ' + song_name
3957
mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3959
self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3961
mtvn_uri = mobj.group(1)
3963
mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3965
self._downloader.trouble(u'ERROR: unable to extract content id')
3967
content_id = mobj.group(1)
3969
videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3970
self.report_extraction(video_id)
3971
request = urllib2.Request(videogen_url)
3973
metadataXml = urllib2.urlopen(request).read()
3974
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3975
self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3978
mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3979
renditions = mdoc.findall('.//rendition')
3981
# For now, always pick the highest quality.
3982
rendition = renditions[-1]
3985
_,_,ext = rendition.attrib['type'].partition('/')
3986
format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3987
video_url = rendition.find('./src').text
3989
self._downloader.trouble('Invalid rendition field.')
3992
self._downloader.increment_downloads()
3996
'uploader': performer,
3997
'title': video_title,
3998
'stitle': _simplify_title(video_title),
4004
self._downloader.process_info(info)
4005
except UnavailableVideoError, err:
4006
self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4009
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    PostProcessor.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    # The downloader this PP is registered with; set via __init__ or
    # set_downloader().
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader
        it was called from.
        """
        return information # by default, do nothing
class AudioConversionError(BaseException):
    """Raised by FFmpegExtractAudioPP when ffmpeg/ffprobe fails.

    NOTE(review): subclassing BaseException (rather than Exception) means a
    generic `except Exception:` will NOT catch this — callers must catch
    AudioConversionError explicitly. Kept as-is for compatibility; consider
    deriving from Exception upstream.
    """

    def __init__(self, message):
        # Human-readable reason (e.g. last line of ffmpeg's stderr).
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
4061
def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4062
PostProcessor.__init__(self, downloader)
4063
if preferredcodec is None:
4064
preferredcodec = 'best'
4065
self._preferredcodec = preferredcodec
4066
self._preferredquality = preferredquality
4067
self._keepvideo = keepvideo
4070
def get_audio_codec(path):
4072
cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4073
handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4074
output = handle.communicate()[0]
4075
if handle.wait() != 0:
4077
except (IOError, OSError):
4080
for line in output.split('\n'):
4081
if line.startswith('codec_name='):
4082
audio_codec = line.split('=')[1].strip()
4083
elif line.strip() == 'codec_type=audio' and audio_codec is not None:
4088
# NOTE(review): corrupted extraction -- bare numeric lines are original
# line numbers; the try: header for the Popen call (and possibly a
# codec-is-None branch) is missing.  Code lines left byte-identical.
def run_ffmpeg(path, out_path, codec, more_opts):
# Transcode `path` into `out_path` with the given audio codec and extra
# ffmpeg options; raises AudioConversionError on failure.
4092
acodec_opts = ['-acodec', codec]
4093
# -vn drops the video stream; '--' guards against filenames that look
# like options.
cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4095
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4096
stdout,stderr = p.communicate()
4097
except (IOError, OSError):
4098
e = sys.exc_info()[1]
4099
# errno 2 == ENOENT: the ffmpeg binary itself was not found.
if isinstance(e, OSError) and e.errno == 2:
4100
raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4103
if p.returncode != 0:
4104
# Surface only ffmpeg's last stderr line as the error message.
msg = stderr.strip().split('\n')[-1]
4105
raise AudioConversionError(msg)
4107
# NOTE(review): corrupted extraction -- bare numeric lines are original
# line numbers, and many source lines are missing in this method (else:
# branches, try:/except headers, acodec assignments for the lossless
# paths).  Code lines left byte-identical; verify against a clean copy.
def run(self, information):
# Extract/convert the audio of information['filepath'] according to the
# preferred codec/quality, fix up the file mtime, optionally delete the
# source video, and point 'filepath' at the new audio file.
4108
path = information['filepath']
4110
filecodec = self.get_audio_codec(path)
4111
if filecodec is None:
4112
self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4116
# Source codec already acceptable: try to avoid re-encoding (lossless path).
if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4117
if self._preferredcodec == 'm4a' and filecodec == 'aac':
4118
# Lossless, but in another container
4120
extension = self._preferredcodec
4121
more_opts = ['-absf', 'aac_adtstoasc']
4122
elif filecodec in ['aac', 'mp3', 'vorbis']:
4123
# Lossless if possible
4125
extension = filecodec
4126
if filecodec == 'aac':
4127
more_opts = ['-f', 'adts']
4128
if filecodec == 'vorbis':
4132
acodec = 'libmp3lame'
4135
if self._preferredquality is not None:
4136
more_opts += ['-ab', self._preferredquality]
4138
# We convert the audio (lossy)
4139
# Map requested codec name to the ffmpeg encoder; None (wav) lets
# ffmpeg pick the default PCM encoder.
acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4140
extension = self._preferredcodec
4142
if self._preferredquality is not None:
4143
more_opts += ['-ab', self._preferredquality]
4144
if self._preferredcodec == 'aac':
4145
more_opts += ['-f', 'adts']
4146
if self._preferredcodec == 'm4a':
4147
more_opts += ['-absf', 'aac_adtstoasc']
4148
if self._preferredcodec == 'vorbis':
4150
if self._preferredcodec == 'wav':
4152
more_opts += ['-f', 'wav']
4154
prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4155
new_path = prefix + sep + extension
4156
self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4158
self.run_ffmpeg(path, new_path, acodec, more_opts)
4160
# Python-2 idiom: inspect the in-flight exception without re-raising.
etype,e,tb = sys.exc_info()
4161
if isinstance(e, AudioConversionError):
4162
self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4164
self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4167
# Try to update the date time for extracted audio file.
4168
if information.get('filetime') is not None:
4170
os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4172
self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4174
if not self._keepvideo:
4176
os.remove(_encodeFilename(path))
4177
except (IOError, OSError):
4178
# Best-effort cleanup: failure to delete the video is only a warning.
self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4181
# Hand the updated info dict to the next post-processor in the chain.
information['filepath'] = new_path
4185
# NOTE(review): corrupted extraction -- bare numeric lines are original
# line numbers; the try:/finally scaffolding around the download and the
# file write is missing here.  Code lines left byte-identical.
def updateSelf(downloader, filename):
''' Update the program file with the latest version from the repository '''
4187
# Note: downloader only used for options
4188
if not os.access(filename, os.W_OK):
4189
sys.exit('ERROR: no write permissions on %s' % filename)
4191
downloader.to_screen(u'Updating to latest version...')
4195
urlh = urllib.urlopen(UPDATE_URL)
4196
newcontent = urlh.read()
4198
# Parse the remote script's version string to detect "already current".
vmatch = re.search("__version__ = '([^']+)'", newcontent)
4199
if vmatch is not None and vmatch.group(1) == __version__:
4200
downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4204
except (IOError, OSError), err:
4205
sys.exit('ERROR: unable to download latest version')
4208
# Overwrite this very script in place with the downloaded content.
outf = open(filename, 'wb')
4210
outf.write(newcontent)
4213
except (IOError, OSError), err:
4214
sys.exit('ERROR: unable to overwrite current version')
4216
downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4219
# NOTE(review): corrupted extraction -- bare numeric lines are original
# line numbers; the try:/except around open(), the result-list setup and
# the per-line loop, and the final return are missing.  Code lines left
# byte-identical.
def _readOptions(filename_bytes):
# Read a config file and return its contents as a flat argv-style list;
# presumably shlex-splits each line with '#' comments honoured -- TODO
# confirm against a clean copy.
4221
optionf = open(filename_bytes)
4223
return [] # silently skip if file is not present
4227
res += shlex.split(l, comments=True)
4232
# NOTE(review): corrupted extraction -- bare numeric lines are original
# line numbers; the "opts = []" initialiser is missing here.  Code lines
# left byte-identical.
def _format_option_string(option):
''' ('-o', '--option') -> -o, --format METAVAR'''
4237
if option._short_opts: opts.append(option._short_opts[0])
4238
if option._long_opts: opts.append(option._long_opts[0])
4239
# Insert the ", " separator only when both a short and a long form exist.
if len(opts) > 1: opts.insert(1, ', ')
4241
if option.takes_value(): opts.append(' %s' % option.metavar)
4243
return "".join(opts)
4245
# NOTE(review): corrupted extraction -- bare numeric lines are original
# line numbers; the early-return for a set COLUMNS variable and the
# try:/except guard around the stty call are missing.  Code lines left
# byte-identical.
def _find_term_columns():
# Best-effort terminal width: the COLUMNS environment variable first,
# then `stty size` (whose output is "rows cols", hence split()[1]).
4246
columns = os.environ.get('COLUMNS', None)
4251
sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4252
out,err = sp.communicate()
4253
return int(out.split()[1])
4259
max_help_position = 80
4261
# No need to wrap help messages if we're on a wide console
4262
columns = _find_term_columns()
4263
if columns: max_width = columns
4265
fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4266
fmt.format_option_strings = _format_option_string
4269
'version' : __version__,
4271
'usage' : '%prog [options] url [url...]',
4272
'conflict_handler' : 'resolve',
4275
parser = optparse.OptionParser(**kw)
4278
general = optparse.OptionGroup(parser, 'General Options')
4279
selection = optparse.OptionGroup(parser, 'Video Selection')
4280
authentication = optparse.OptionGroup(parser, 'Authentication Options')
4281
video_format = optparse.OptionGroup(parser, 'Video Format Options')
4282
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4283
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4284
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4286
general.add_option('-h', '--help',
4287
action='help', help='print this help text and exit')
4288
general.add_option('-v', '--version',
4289
action='version', help='print program version and exit')
4290
general.add_option('-U', '--update',
4291
action='store_true', dest='update_self', help='update this program to latest version')
4292
general.add_option('-i', '--ignore-errors',
4293
action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4294
general.add_option('-r', '--rate-limit',
4295
dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4296
general.add_option('-R', '--retries',
4297
dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4298
general.add_option('--dump-user-agent',
4299
action='store_true', dest='dump_user_agent',
4300
help='display the current browser identification', default=False)
4301
general.add_option('--list-extractors',
4302
action='store_true', dest='list_extractors',
4303
help='List all supported extractors and the URLs they would handle', default=False)
4305
selection.add_option('--playlist-start',
4306
dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4307
selection.add_option('--playlist-end',
4308
dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4309
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4310
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4311
selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4313
authentication.add_option('-u', '--username',
4314
dest='username', metavar='USERNAME', help='account username')
4315
authentication.add_option('-p', '--password',
4316
dest='password', metavar='PASSWORD', help='account password')
4317
authentication.add_option('-n', '--netrc',
4318
action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4321
video_format.add_option('-f', '--format',
4322
action='store', dest='format', metavar='FORMAT', help='video format code')
4323
video_format.add_option('--all-formats',
4324
action='store_const', dest='format', help='download all available video formats', const='all')
4325
video_format.add_option('--prefer-free-formats',
4326
action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4327
video_format.add_option('--max-quality',
4328
action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4329
video_format.add_option('-F', '--list-formats',
4330
action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4333
verbosity.add_option('-q', '--quiet',
4334
action='store_true', dest='quiet', help='activates quiet mode', default=False)
4335
verbosity.add_option('-s', '--simulate',
4336
action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4337
verbosity.add_option('--skip-download',
4338
action='store_true', dest='skip_download', help='do not download the video', default=False)
4339
verbosity.add_option('-g', '--get-url',
4340
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4341
verbosity.add_option('-e', '--get-title',
4342
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4343
verbosity.add_option('--get-thumbnail',
4344
action='store_true', dest='getthumbnail',
4345
help='simulate, quiet but print thumbnail URL', default=False)
4346
verbosity.add_option('--get-description',
4347
action='store_true', dest='getdescription',
4348
help='simulate, quiet but print video description', default=False)
4349
verbosity.add_option('--get-filename',
4350
action='store_true', dest='getfilename',
4351
help='simulate, quiet but print output filename', default=False)
4352
verbosity.add_option('--get-format',
4353
action='store_true', dest='getformat',
4354
help='simulate, quiet but print output format', default=False)
4355
verbosity.add_option('--no-progress',
4356
action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4357
verbosity.add_option('--console-title',
4358
action='store_true', dest='consoletitle',
4359
help='display progress in console titlebar', default=False)
4360
verbosity.add_option('-v', '--verbose',
4361
action='store_true', dest='verbose', help='print various debugging information', default=False)
4364
filesystem.add_option('-t', '--title',
4365
action='store_true', dest='usetitle', help='use title in file name', default=False)
4366
filesystem.add_option('-l', '--literal',
4367
action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4368
filesystem.add_option('-A', '--auto-number',
4369
action='store_true', dest='autonumber',
4370
help='number downloaded files starting from 00000', default=False)
4371
filesystem.add_option('-o', '--output',
4372
dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4373
filesystem.add_option('-a', '--batch-file',
4374
dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4375
filesystem.add_option('-w', '--no-overwrites',
4376
action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4377
filesystem.add_option('-c', '--continue',
4378
action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4379
filesystem.add_option('--no-continue',
4380
action='store_false', dest='continue_dl',
4381
help='do not resume partially downloaded files (restart from beginning)')
4382
filesystem.add_option('--cookies',
4383
dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4384
filesystem.add_option('--no-part',
4385
action='store_true', dest='nopart', help='do not use .part files', default=False)
4386
filesystem.add_option('--no-mtime',
4387
action='store_false', dest='updatetime',
4388
help='do not use the Last-modified header to set the file modification time', default=True)
4389
filesystem.add_option('--write-description',
4390
action='store_true', dest='writedescription',
4391
help='write video description to a .description file', default=False)
4392
filesystem.add_option('--write-info-json',
4393
action='store_true', dest='writeinfojson',
4394
help='write video metadata to a .info.json file', default=False)
4397
postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4398
help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4399
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4400
help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4401
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4402
help='ffmpeg audio bitrate specification, 128k by default')
4403
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4404
help='keeps the video file on disk after the post-processing; the video is erased by default')
4407
parser.add_option_group(general)
4408
parser.add_option_group(selection)
4409
parser.add_option_group(filesystem)
4410
parser.add_option_group(verbosity)
4411
parser.add_option_group(video_format)
4412
parser.add_option_group(authentication)
4413
parser.add_option_group(postproc)
4415
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4417
userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4419
userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4420
argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4421
opts, args = parser.parse_args(argv)
4423
return parser, opts, args
4425
# NOTE(review): corrupted extraction -- bare numeric lines are original
# line numbers; the docstring closer, the "return [" opener, most list
# entries and the closing bracket are missing.  Code lines left
# byte-identical.
def gen_extractors():
""" Return a list of an instance of every supported extractor.
4427
The order does matter; the first extractor matched is the one handling the URL.
4429
# Shared IE instances: the playlist/user/search extractors delegate
# individual videos to these.
youtube_ie = YoutubeIE()
4430
google_ie = GoogleIE()
4431
yahoo_ie = YahooIE()
4433
YoutubePlaylistIE(youtube_ie),
4434
YoutubeUserIE(youtube_ie),
4435
YoutubeSearchIE(youtube_ie),
4437
MetacafeIE(youtube_ie),
4440
GoogleSearchIE(google_ie),
4443
YahooSearchIE(yahoo_ie),
4456
StanfordOpenClassroomIE(),
4463
parser, opts, args = parseOpts()
4465
# Open appropriate CookieJar
4466
if opts.cookiefile is None:
4467
jar = cookielib.CookieJar()
4470
jar = cookielib.MozillaCookieJar(opts.cookiefile)
4471
if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4473
except (IOError, OSError), err:
4474
sys.exit(u'ERROR: unable to open cookie file')
4477
if opts.dump_user_agent:
4478
print std_headers['User-Agent']
4481
# Batch file verification
4483
if opts.batchfile is not None:
4485
if opts.batchfile == '-':
4488
batchfd = open(opts.batchfile, 'r')
4489
batchurls = batchfd.readlines()
4490
batchurls = [x.strip() for x in batchurls]
4491
batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4493
sys.exit(u'ERROR: batch file could not be read')
4494
all_urls = batchurls + args
4496
# General configuration
4497
cookie_processor = urllib2.HTTPCookieProcessor(jar)
4498
proxy_handler = urllib2.ProxyHandler()
4499
opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4500
urllib2.install_opener(opener)
4501
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4504
print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4506
extractors = gen_extractors()
4508
if opts.list_extractors:
4509
for ie in extractors:
4511
matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4512
all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4513
for mu in matchedUrls:
4517
# Conflicting, missing and erroneous options
4518
if opts.usenetrc and (opts.username is not None or opts.password is not None):
4519
parser.error(u'using .netrc conflicts with giving username/password')
4520
if opts.password is not None and opts.username is None:
4521
parser.error(u'account username missing')
4522
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4523
parser.error(u'using output template conflicts with using title, literal title or auto number')
4524
if opts.usetitle and opts.useliteral:
4525
parser.error(u'using title conflicts with using literal title')
4526
if opts.username is not None and opts.password is None:
4527
opts.password = getpass.getpass(u'Type account password and press return:')
4528
if opts.ratelimit is not None:
4529
numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4530
if numeric_limit is None:
4531
parser.error(u'invalid rate limit specified')
4532
opts.ratelimit = numeric_limit
4533
if opts.retries is not None:
4535
opts.retries = long(opts.retries)
4536
except (TypeError, ValueError), err:
4537
parser.error(u'invalid retry count specified')
4539
opts.playliststart = int(opts.playliststart)
4540
if opts.playliststart <= 0:
4541
raise ValueError(u'Playlist start must be positive')
4542
except (TypeError, ValueError), err:
4543
parser.error(u'invalid playlist start number specified')
4545
opts.playlistend = int(opts.playlistend)
4546
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4547
raise ValueError(u'Playlist end must be greater than playlist start')
4548
except (TypeError, ValueError), err:
4549
parser.error(u'invalid playlist end number specified')
4550
if opts.extractaudio:
4551
if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4552
parser.error(u'invalid audio format specified')
4555
fd = FileDownloader({
4556
'usenetrc': opts.usenetrc,
4557
'username': opts.username,
4558
'password': opts.password,
4559
'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4560
'forceurl': opts.geturl,
4561
'forcetitle': opts.gettitle,
4562
'forcethumbnail': opts.getthumbnail,
4563
'forcedescription': opts.getdescription,
4564
'forcefilename': opts.getfilename,
4565
'forceformat': opts.getformat,
4566
'simulate': opts.simulate,
4567
'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4568
'format': opts.format,
4569
'format_limit': opts.format_limit,
4570
'listformats': opts.listformats,
4571
'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4572
or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4573
or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4574
or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4575
or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4576
or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4577
or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4578
or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4579
or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4580
or u'%(id)s.%(ext)s'),
4581
'ignoreerrors': opts.ignoreerrors,
4582
'ratelimit': opts.ratelimit,
4583
'nooverwrites': opts.nooverwrites,
4584
'retries': opts.retries,
4585
'continuedl': opts.continue_dl,
4586
'noprogress': opts.noprogress,
4587
'playliststart': opts.playliststart,
4588
'playlistend': opts.playlistend,
4589
'logtostderr': opts.outtmpl == '-',
4590
'consoletitle': opts.consoletitle,
4591
'nopart': opts.nopart,
4592
'updatetime': opts.updatetime,
4593
'writedescription': opts.writedescription,
4594
'writeinfojson': opts.writeinfojson,
4595
'matchtitle': opts.matchtitle,
4596
'rejecttitle': opts.rejecttitle,
4597
'max_downloads': opts.max_downloads,
4598
'prefer_free_formats': opts.prefer_free_formats,
4599
'verbose': opts.verbose,
4601
for extractor in extractors:
4602
fd.add_info_extractor(extractor)
4605
if opts.extractaudio:
4606
fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4609
if opts.update_self:
4610
updateSelf(fd, sys.argv[0])
4613
if len(all_urls) < 1:
4614
if not opts.update_self:
4615
parser.error(u'you must provide at least one URL')
4620
retcode = fd.download(all_urls)
4621
except MaxDownloadsReached:
4622
fd.to_screen(u'--max-download limit reached, aborting.')
4625
# Dump cookie jar if requested
4626
if opts.cookiefile is not None:
4629
except (IOError, OSError), err:
4630
sys.exit(u'ERROR: unable to save cookie jar')
4637
except DownloadError:
4639
except SameFileError:
4640
sys.exit(u'ERROR: fixed output name but more than one file to download')
4641
except KeyboardInterrupt:
4642
sys.exit(u'\nERROR: Interrupted by user')
4644
if __name__ == '__main__':
4647
# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: