# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# Author: Vasyl' Vavrychuk
# Author: Witold Baryluk
# Author: Paweł Paprota
# Author: Gergely Imreh
# License: Public domain code
34
# parse_qs was moved from the cgi module to the urlparse module recently.
# Try the new location first; fall back to cgi for older Python 2 versions.
try:
	from urlparse import parse_qs
except ImportError:
	from cgi import parse_qs
41
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
42
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
43
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44
'Accept-Encoding': 'gzip, deflate',
45
'Accept-Language': 'en-us,en;q=0.5',
48
# Characters allowed verbatim in a "simplified" title: ASCII letters and digits.
# str.decode('ascii') is the Python 2 idiom to get a unicode result here.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
50
def preferredencoding():
51
"""Get preferred encoding.
53
Returns the best encoding scheme for the system, based on
54
locale.getpreferredencoding() and some further tweaks.
56
def yield_preferredencoding():
58
pref = locale.getpreferredencoding()
64
return yield_preferredencoding().next()
66
def htmlentity_transform(matchobj):
67
"""Transforms an HTML entity to a Unicode character.
69
This function receives a match object and is intended to be used with
70
the re.sub() function.
72
entity = matchobj.group(1)
74
# Known non-numeric HTML entity
75
if entity in htmlentitydefs.name2codepoint:
76
return unichr(htmlentitydefs.name2codepoint[entity])
79
mobj = re.match(ur'(?u)#(x?\d+)', entity)
81
numstr = mobj.group(1)
82
if numstr.startswith(u'x'):
84
numstr = u'0%s' % numstr
87
return unichr(long(numstr, base))
89
# Unknown entity in name, return its literal representation
90
return (u'&%s;' % entity)
92
def sanitize_title(utitle):
93
"""Sanitizes a video title so it could be used as part of a filename."""
94
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
95
return utitle.replace(unicode(os.sep), u'%')
97
def sanitize_open(filename, open_mode):
98
"""Try to open the given filename, and slightly tweak it if this fails.
100
Attempts to open the given filename. If this fails, it tries to change
101
the filename slightly, step by step, until it's either able to open it
102
or it fails and raises a final exception, like the standard open()
105
It returns the tuple (stream, definitive_file_name).
109
if sys.platform == 'win32':
111
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
112
return (sys.stdout, filename)
113
stream = open(filename, open_mode)
114
return (stream, filename)
115
except (IOError, OSError), err:
116
# In case of error, try to remove win32 forbidden chars
117
filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
119
# An exception here should be caught in the caller
120
stream = open(filename, open_mode)
121
return (stream, filename)
123
def timeconvert(timestr):
124
"""Convert RFC 2822 defined time string into system timestamp"""
126
timetuple = email.utils.parsedate_tz(timestr)
127
if timetuple is not None:
128
timestamp = email.utils.mktime_tz(timetuple)
131
class DownloadError(Exception):
132
"""Download Error exception.
134
This exception may be thrown by FileDownloader objects if they are not
135
configured to continue on errors. They will contain the appropriate
140
class SameFileError(Exception):
141
"""Same File exception.
143
This exception will be thrown by FileDownloader objects if they detect
144
multiple files would have to be downloaded to the same file on disk.
148
class PostProcessingError(Exception):
149
"""Post Processing exception.
151
This exception may be raised by PostProcessor's .run() method to
152
indicate an error in the postprocessing task.
156
class UnavailableVideoError(Exception):
157
"""Unavailable Format exception.
159
This exception will be thrown when a video is requested
160
in a format that is not available for that video.
164
class ContentTooShortError(Exception):
165
"""Content Too Short exception.
167
This exception may be raised by FileDownloader objects when a file they
168
download is too small for what the server announced first, indicating
169
the connection was probably interrupted.
175
def __init__(self, downloaded, expected):
176
self.downloaded = downloaded
177
self.expected = expected
179
class YoutubeDLHandler(urllib2.HTTPHandler):
180
"""Handler for HTTP requests and responses.
182
This class, when installed with an OpenerDirector, automatically adds
183
the standard headers to every HTTP request and handles gzipped and
184
deflated responses from web servers. If compression is to be avoided in
185
a particular request, the original request in the program code only has
186
to include the HTTP header "Youtubedl-No-Compression", which will be
187
removed before making the real request.
189
Part of this code was copied from:
191
http://techknack.net/python-urllib2-handlers/
193
Andrew Rowls, the author of that code, agreed to release it to the
200
return zlib.decompress(data, -zlib.MAX_WBITS)
202
return zlib.decompress(data)
205
def addinfourl_wrapper(stream, headers, url, code):
206
if hasattr(urllib2.addinfourl, 'getcode'):
207
return urllib2.addinfourl(stream, headers, url, code)
208
ret = urllib2.addinfourl(stream, headers, url)
212
def http_request(self, req):
213
for h in std_headers:
216
req.add_header(h, std_headers[h])
217
if 'Youtubedl-no-compression' in req.headers:
218
if 'Accept-encoding' in req.headers:
219
del req.headers['Accept-encoding']
220
del req.headers['Youtubedl-no-compression']
223
def http_response(self, req, resp):
226
if resp.headers.get('Content-encoding', '') == 'gzip':
227
gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
228
resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
229
resp.msg = old_resp.msg
231
if resp.headers.get('Content-encoding', '') == 'deflate':
232
gz = StringIO.StringIO(self.deflate(resp.read()))
233
resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
234
resp.msg = old_resp.msg
237
class FileDownloader(object):
238
"""File Downloader class.
240
File downloader objects are the ones responsible of downloading the
241
actual video file and writing it to disk if the user has requested
242
it, among some other tasks. In most cases there should be one per
243
program. As, given a video URL, the downloader doesn't know how to
244
extract all the needed information, task that InfoExtractors do, it
245
has to pass the URL to one of them.
247
For this, file downloader objects have a method that allows
248
InfoExtractors to be registered in a given order. When it is passed
249
a URL, the file downloader handles it to the first InfoExtractor it
250
finds that reports being able to handle it. The InfoExtractor extracts
251
all the information about the video or videos the URL refers to, and
252
asks the FileDownloader to process the video information, possibly
253
downloading the video.
255
File downloaders accept a lot of parameters. In order not to saturate
256
the object constructor with arguments, it receives a dictionary of
257
options instead. These options are available through the params
258
attribute for the InfoExtractors to use. The FileDownloader also
259
registers itself as the downloader in charge for the InfoExtractors
260
that are added to it, so this is a "mutual registration".
264
username: Username for authentication purposes.
265
password: Password for authentication purposes.
266
usenetrc: Use netrc for authentication instead.
267
quiet: Do not print messages to stdout.
268
forceurl: Force printing final URL.
269
forcetitle: Force printing title.
270
forcethumbnail: Force printing thumbnail URL.
271
forcedescription: Force printing description.
272
forcefilename: Force printing final filename.
273
simulate: Do not download the video files.
274
format: Video format code.
275
format_limit: Highest quality format to try.
276
outtmpl: Template for output names.
277
ignoreerrors: Do not stop on download errors.
278
ratelimit: Download speed limit, in bytes/sec.
279
nooverwrites: Prevent overwriting files.
280
retries: Number of times to retry for HTTP error 5xx
281
continuedl: Try to continue downloads if possible.
282
noprogress: Do not print the progress bar.
283
playliststart: Playlist item to start at.
284
playlistend: Playlist item to end at.
285
logtostderr: Log messages to stderr instead of stdout.
286
consoletitle: Display progress in console window's titlebar.
287
nopart: Do not use temporary .part files.
288
updatetime: Use the Last-modified header to set output file timestamps.
294
_download_retcode = None
295
_num_downloads = None
298
def __init__(self, params):
299
"""Create a FileDownloader object with the given options."""
302
self._download_retcode = 0
303
self._num_downloads = 0
304
self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
308
def pmkdir(filename):
309
"""Create directory components in filename. Similar to Unix "mkdir -p"."""
310
components = filename.split(os.sep)
311
aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
312
aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
313
for dir in aggregate:
314
if not os.path.exists(dir):
318
def format_bytes(bytes):
321
if type(bytes) is str:
326
exponent = long(math.log(bytes, 1024.0))
327
suffix = 'bkMGTPEZY'[exponent]
328
converted = float(bytes) / float(1024**exponent)
329
return '%.2f%s' % (converted, suffix)
332
def calc_percent(byte_counter, data_len):
335
return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
338
def calc_eta(start, now, total, current):
342
if current == 0 or dif < 0.001: # One millisecond
344
rate = float(current) / dif
345
eta = long((float(total) - float(current)) / rate)
346
(eta_mins, eta_secs) = divmod(eta, 60)
349
return '%02d:%02d' % (eta_mins, eta_secs)
352
def calc_speed(start, now, bytes):
354
if bytes == 0 or dif < 0.001: # One millisecond
355
return '%10s' % '---b/s'
356
return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
359
def best_block_size(elapsed_time, bytes):
360
new_min = max(bytes / 2.0, 1.0)
361
new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
362
if elapsed_time < 0.001:
364
rate = bytes / elapsed_time
372
def parse_bytes(bytestr):
373
"""Parse a string indicating a byte quantity into a long integer."""
374
matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
377
number = float(matchobj.group(1))
378
multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
379
return long(round(number * multiplier))
381
def add_info_extractor(self, ie):
382
"""Add an InfoExtractor object to the end of the list."""
384
ie.set_downloader(self)
386
def add_post_processor(self, pp):
387
"""Add a PostProcessor object to the end of the chain."""
389
pp.set_downloader(self)
391
def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
392
"""Print message to stdout if not in quiet mode."""
394
if not self.params.get('quiet', False):
395
terminator = [u'\n', u''][skip_eol]
396
print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
397
self._screen_file.flush()
398
except (UnicodeEncodeError), err:
399
if not ignore_encoding_errors:
402
def to_stderr(self, message):
	"""Print message to stderr, encoded with the system's preferred encoding."""
	print >>sys.stderr, message.encode(preferredencoding())
406
def to_cons_title(self, message):
407
"""Set console/terminal window title to message."""
408
if not self.params.get('consoletitle', False):
410
if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
411
# c_wchar_p() might not be necessary if `message` is
412
# already of type unicode()
413
ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
414
elif 'TERM' in os.environ:
415
sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
417
def fixed_template(self):
418
"""Checks if the output template is fixed."""
419
return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
421
def trouble(self, message=None):
422
"""Determine action to take when a download problem appears.
424
Depending on if the downloader has been configured to ignore
425
download errors or not, this method may throw an exception or
426
not when errors are found, after printing the message.
428
if message is not None:
429
self.to_stderr(message)
430
if not self.params.get('ignoreerrors', False):
431
raise DownloadError(message)
432
self._download_retcode = 1
434
def slow_down(self, start_time, byte_counter):
435
"""Sleep if the download speed is over the rate limit."""
436
rate_limit = self.params.get('ratelimit', None)
437
if rate_limit is None or byte_counter == 0:
440
elapsed = now - start_time
443
speed = float(byte_counter) / elapsed
444
if speed > rate_limit:
445
time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
447
def temp_name(self, filename):
448
"""Returns a temporary filename for the given filename."""
449
if self.params.get('nopart', False) or filename == u'-' or \
450
(os.path.exists(filename) and not os.path.isfile(filename)):
452
return filename + u'.part'
454
def undo_temp_name(self, filename):
455
if filename.endswith(u'.part'):
456
return filename[:-len(u'.part')]
459
def try_rename(self, old_filename, new_filename):
461
if old_filename == new_filename:
463
os.rename(old_filename, new_filename)
464
except (IOError, OSError), err:
465
self.trouble(u'ERROR: unable to rename file')
467
def try_utime(self, filename, last_modified_hdr):
468
"""Try to set the last-modified time of the given file."""
469
if last_modified_hdr is None:
471
if not os.path.isfile(filename):
473
timestr = last_modified_hdr
476
filetime = timeconvert(timestr)
480
os.utime(filename,(time.time(), filetime))
484
def report_destination(self, filename):
	"""Report destination filename."""
	# Encoding errors are ignored: the filename may not be representable
	# in the console encoding, and this message is purely informative.
	self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
488
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
489
"""Report download progress."""
490
if self.params.get('noprogress', False):
492
self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
493
(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
494
self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
495
(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
497
def report_resuming_byte(self, resume_len):
	"""Report attempt to resume at given byte."""
	self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
501
def report_retry(self, count, retries):
	"""Report retry in case of HTTP error 5xx."""
	self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
505
def report_file_already_downloaded(self, file_name):
506
"""Report file has already been fully downloaded."""
508
self.to_screen(u'[download] %s has already been downloaded' % file_name)
509
except (UnicodeEncodeError), err:
510
self.to_screen(u'[download] The file has already been downloaded')
512
def report_unable_to_resume(self):
	"""Report it was impossible to resume download."""
	self.to_screen(u'[download] Unable to resume')
516
def report_finish(self):
517
"""Report download finished."""
518
if self.params.get('noprogress', False):
519
self.to_screen(u'[download] Download completed')
523
def increment_downloads(self):
	"""Increment the ordinal that assigns a number to each file."""
	self._num_downloads += 1
527
def prepare_filename(self, info_dict):
528
"""Generate the output filename."""
530
template_dict = dict(info_dict)
531
template_dict['epoch'] = unicode(long(time.time()))
532
template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
533
filename = self.params['outtmpl'] % template_dict
535
except (ValueError, KeyError), err:
536
self.trouble(u'ERROR: invalid system charset or erroneous output template')
539
def process_info(self, info_dict):
540
"""Process a single dictionary returned by an InfoExtractor."""
541
filename = self.prepare_filename(info_dict)
542
# Do nothing else if in simulate mode
543
if self.params.get('simulate', False):
545
if self.params.get('forcetitle', False):
546
print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
547
if self.params.get('forceurl', False):
548
print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
549
if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
550
print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
551
if self.params.get('forcedescription', False) and 'description' in info_dict:
552
print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
553
if self.params.get('forcefilename', False) and filename is not None:
554
print filename.encode(preferredencoding(), 'xmlcharrefreplace')
560
if self.params.get('nooverwrites', False) and os.path.exists(filename):
561
self.to_stderr(u'WARNING: file exists and will be skipped')
565
self.pmkdir(filename)
566
except (OSError, IOError), err:
567
self.trouble(u'ERROR: unable to create directories: %s' % str(err))
571
success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
572
except (OSError, IOError), err:
573
raise UnavailableVideoError
574
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
575
self.trouble(u'ERROR: unable to download video data: %s' % str(err))
577
except (ContentTooShortError, ), err:
578
self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
583
self.post_process(filename, info_dict)
584
except (PostProcessingError), err:
585
self.trouble(u'ERROR: postprocessing: %s' % str(err))
588
def download(self, url_list):
589
"""Download a given list of URLs."""
590
if len(url_list) > 1 and self.fixed_template():
591
raise SameFileError(self.params['outtmpl'])
594
suitable_found = False
596
# Go to next InfoExtractor if not suitable
597
if not ie.suitable(url):
600
# Suitable InfoExtractor found
601
suitable_found = True
603
# Extract information from URL and process it
606
# Suitable InfoExtractor had been found; go to next URL
609
if not suitable_found:
610
self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
612
return self._download_retcode
614
def post_process(self, filename, ie_info):
615
"""Run the postprocessing chain on the given file."""
617
info['filepath'] = filename
623
def _download_with_rtmpdump(self, filename, url, player_url):
624
self.report_destination(filename)
625
tmpfilename = self.temp_name(filename)
627
# Check for rtmpdump first
629
subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
630
except (OSError, IOError):
631
self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
634
# Download using rtmpdump. rtmpdump returns exit code 2 when
635
# the connection was interrumpted and resuming appears to be
636
# possible. This is part of rtmpdump's normal usage, AFAIK.
637
basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
638
retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
639
while retval == 2 or retval == 1:
640
prevsize = os.path.getsize(tmpfilename)
641
self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
642
time.sleep(5.0) # This seems to be needed
643
retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
644
cursize = os.path.getsize(tmpfilename)
645
if prevsize == cursize and retval == 1:
648
self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
649
self.try_rename(tmpfilename, filename)
652
self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
655
def _do_download(self, filename, url, player_url):
656
# Check file already present
657
if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
658
self.report_file_already_downloaded(filename)
661
# Attempt to download using rtmpdump
662
if url.startswith('rtmp'):
663
return self._download_with_rtmpdump(filename, url, player_url)
665
tmpfilename = self.temp_name(filename)
669
# Do not include the Accept-Encoding header
670
headers = {'Youtubedl-no-compression': 'True'}
671
basic_request = urllib2.Request(url, None, headers)
672
request = urllib2.Request(url, None, headers)
674
# Establish possible resume length
675
if os.path.isfile(tmpfilename):
676
resume_len = os.path.getsize(tmpfilename)
680
# Request parameters in case of being able to resume
681
if self.params.get('continuedl', False) and resume_len != 0:
682
self.report_resuming_byte(resume_len)
683
request.add_header('Range','bytes=%d-' % resume_len)
687
retries = self.params.get('retries', 0)
688
while count <= retries:
689
# Establish connection
691
data = urllib2.urlopen(request)
693
except (urllib2.HTTPError, ), err:
694
if (err.code < 500 or err.code >= 600) and err.code != 416:
695
# Unexpected HTTP error
697
elif err.code == 416:
698
# Unable to resume (requested range not satisfiable)
700
# Open the connection again without the range header
701
data = urllib2.urlopen(basic_request)
702
content_length = data.info()['Content-Length']
703
except (urllib2.HTTPError, ), err:
704
if err.code < 500 or err.code >= 600:
707
# Examine the reported length
708
if (content_length is not None and
709
(resume_len - 100 < long(content_length) < resume_len + 100)):
710
# The file had already been fully downloaded.
711
# Explanation to the above condition: in issue #175 it was revealed that
712
# YouTube sometimes adds or removes a few bytes from the end of the file,
713
# changing the file size slightly and causing problems for some users. So
714
# I decided to implement a suggested change and consider the file
715
# completely downloaded if the file size differs less than 100 bytes from
716
# the one in the hard drive.
717
self.report_file_already_downloaded(filename)
718
self.try_rename(tmpfilename, filename)
721
# The length does not match, we start the download over
722
self.report_unable_to_resume()
728
self.report_retry(count, retries)
731
self.trouble(u'ERROR: giving up after %s retries' % retries)
734
data_len = data.info().get('Content-length', None)
735
if data_len is not None:
736
data_len = long(data_len) + resume_len
737
data_len_str = self.format_bytes(data_len)
738
byte_counter = 0 + resume_len
744
data_block = data.read(block_size)
746
if len(data_block) == 0:
748
byte_counter += len(data_block)
750
# Open file just in time
753
(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
754
filename = self.undo_temp_name(tmpfilename)
755
self.report_destination(filename)
756
except (OSError, IOError), err:
757
self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
760
stream.write(data_block)
761
except (IOError, OSError), err:
762
self.trouble(u'\nERROR: unable to write data: %s' % str(err))
764
block_size = self.best_block_size(after - before, len(data_block))
767
percent_str = self.calc_percent(byte_counter, data_len)
768
eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
769
speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
770
self.report_progress(percent_str, data_len_str, speed_str, eta_str)
773
self.slow_down(start, byte_counter - resume_len)
777
if data_len is not None and byte_counter != data_len:
778
raise ContentTooShortError(byte_counter, long(data_len))
779
self.try_rename(tmpfilename, filename)
781
# Update file modification time
782
if self.params.get('updatetime', True):
783
self.try_utime(filename, data.info().get('last-modified', None))
787
class InfoExtractor(object):
788
"""Information Extractor class.
790
Information extractors are the classes that, given a URL, extract
791
information from the video (or videos) the URL refers to. This
792
information includes the real video URL, the video title and simplified
793
title, author and others. The information is stored in a dictionary
794
which is then passed to the FileDownloader. The FileDownloader
795
processes this information possibly downloading the video to the file
796
system, among other possible outcomes. The dictionaries must include
797
the following fields:
799
id: Video identifier.
800
url: Final video URL.
801
uploader: Nickname of the video uploader.
802
title: Literal title.
803
stitle: Simplified title.
804
ext: Video filename extension.
805
format: Video format.
806
player_url: SWF Player URL (may be None).
808
The following fields are optional. Their primary purpose is to allow
809
youtube-dl to serve as the backend for a video search function, such
810
as the one in youtube2mp3. They are only used when their respective
811
forced printing functions are called:
813
thumbnail: Full URL to a video thumbnail image.
814
description: One-line video description.
816
Subclasses of this one should re-define the _real_initialize() and
817
_real_extract() methods, as well as the suitable() static method.
818
Probably, they should also be instantiated and added to the main
825
def __init__(self, downloader=None):
826
"""Constructor. Receives an optional downloader."""
828
self.set_downloader(downloader)
832
"""Receives a URL and returns True if suitable for this IE."""
835
def initialize(self):
836
"""Initializes an instance (authentication, etc)."""
838
self._real_initialize()
841
def extract(self, url):
842
"""Extracts URL information and returns it in list of dicts."""
844
return self._real_extract(url)
846
def set_downloader(self, downloader):
	"""Sets the downloader for this IE."""
	self._downloader = downloader
850
def _real_initialize(self):
851
"""Real initialization process. Redefine in subclasses."""
854
def _real_extract(self, url):
855
"""Real extraction process. Redefine in subclasses."""
858
class YoutubeIE(InfoExtractor):
859
"""Information extractor for youtube.com."""
861
_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
862
_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
863
_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
864
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
865
_NETRC_MACHINE = 'youtube'
866
# Listed in order of quality
867
_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
868
_video_extensions = {
874
'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
881
return (re.match(YoutubeIE._VALID_URL, url) is not None)
883
def report_lang(self):
	"""Report attempt to set language."""
	self._downloader.to_screen(u'[youtube] Setting language')
887
def report_login(self):
	"""Report attempt to log in."""
	self._downloader.to_screen(u'[youtube] Logging in')
891
def report_age_confirmation(self):
	"""Report attempt to confirm age."""
	self._downloader.to_screen(u'[youtube] Confirming age')
895
def report_video_webpage_download(self, video_id):
	"""Report attempt to download video webpage."""
	self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
899
def report_video_info_webpage_download(self, video_id):
	"""Report attempt to download video info webpage."""
	self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
903
def report_information_extraction(self, video_id):
	"""Report attempt to extract video information."""
	self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
907
def report_unavailable_format(self, video_id, format):
	"""Report that the requested video format is not available."""
	self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
911
def report_rtmp_download(self):
	"""Indicate the download will use the RTMP protocol."""
	self._downloader.to_screen(u'[youtube] RTMP download detected')
915
def _real_initialize(self):
916
if self._downloader is None:
921
downloader_params = self._downloader.params
923
# Attempt to use provided username and password or .netrc data
924
if downloader_params.get('username', None) is not None:
925
username = downloader_params['username']
926
password = downloader_params['password']
927
elif downloader_params.get('usenetrc', False):
929
info = netrc.netrc().authenticators(self._NETRC_MACHINE)
934
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
935
except (IOError, netrc.NetrcParseError), err:
936
self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
940
request = urllib2.Request(self._LANG_URL)
943
urllib2.urlopen(request).read()
944
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
945
self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
948
# No authentication to be performed
954
'current_form': 'loginForm',
956
'action_login': 'Log In',
957
'username': username,
958
'password': password,
960
request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
963
login_results = urllib2.urlopen(request).read()
964
if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
965
self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
967
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
968
self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
974
'action_confirm': 'Confirm',
976
request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
978
self.report_age_confirmation()
979
age_results = urllib2.urlopen(request).read()
980
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
981
self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
984
def _real_extract(self, url):
985
# Extract video id from URL
986
mobj = re.match(self._VALID_URL, url)
988
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
990
video_id = mobj.group(2)
993
self.report_video_webpage_download(video_id)
994
request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
996
video_webpage = urllib2.urlopen(request).read()
997
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
998
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1001
# Attempt to extract SWF player URL
1002
mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1003
if mobj is not None:
1004
player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1009
self.report_video_info_webpage_download(video_id)
1010
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1011
video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1012
% (video_id, el_type))
1013
request = urllib2.Request(video_info_url)
1015
video_info_webpage = urllib2.urlopen(request).read()
1016
video_info = parse_qs(video_info_webpage)
1017
if 'token' in video_info:
1019
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1020
self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1022
if 'token' not in video_info:
1023
if 'reason' in video_info:
1024
self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1026
self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1029
# Start extracting information
1030
self.report_information_extraction(video_id)
1033
if 'author' not in video_info:
1034
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1036
video_uploader = urllib.unquote_plus(video_info['author'][0])
1039
if 'title' not in video_info:
1040
self._downloader.trouble(u'ERROR: unable to extract video title')
1042
video_title = urllib.unquote_plus(video_info['title'][0])
1043
video_title = video_title.decode('utf-8')
1044
video_title = sanitize_title(video_title)
1047
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1048
simple_title = simple_title.strip(ur'_')
1051
if 'thumbnail_url' not in video_info:
1052
self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1053
video_thumbnail = ''
1054
else: # don't panic if we can't find it
1055
video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1059
mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1060
if mobj is not None:
1061
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1062
format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1063
for expression in format_expressions:
1065
upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1070
video_description = 'No description available.'
1071
if self._downloader.params.get('forcedescription', False):
1072
mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1073
if mobj is not None:
1074
video_description = mobj.group(1)
1077
video_token = urllib.unquote_plus(video_info['token'][0])
1079
# Decide which formats to download
1080
req_format = self._downloader.params.get('format', None)
1082
if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1083
url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1084
url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1085
url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1086
format_limit = self._downloader.params.get('format_limit', None)
1087
if format_limit is not None and format_limit in self._available_formats:
1088
format_list = self._available_formats[self._available_formats.index(format_limit):]
1090
format_list = self._available_formats
1091
existing_formats = [x for x in format_list if x in url_map]
1092
if len(existing_formats) == 0:
1093
self._downloader.trouble(u'ERROR: no known formats available for video')
1095
if req_format is None:
1096
video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1097
elif req_format == '-1':
1098
video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1101
if req_format not in url_map:
1102
self._downloader.trouble(u'ERROR: requested format not available')
1104
video_url_list = [(req_format, url_map[req_format])] # Specific format
1106
elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1107
self.report_rtmp_download()
1108
video_url_list = [(None, video_info['conn'][0])]
1111
self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1114
for format_param, video_real_url in video_url_list:
1115
# At this point we have a new video
1116
self._downloader.increment_downloads()
1119
video_extension = self._video_extensions.get(format_param, 'flv')
1121
# Find the video URL in fmt_url_map or conn paramters
1123
# Process video information
1124
self._downloader.process_info({
1125
'id': video_id.decode('utf-8'),
1126
'url': video_real_url.decode('utf-8'),
1127
'uploader': video_uploader.decode('utf-8'),
1128
'upload_date': upload_date,
1129
'title': video_title,
1130
'stitle': simple_title,
1131
'ext': video_extension.decode('utf-8'),
1132
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1133
'thumbnail': video_thumbnail.decode('utf-8'),
1134
'description': video_description.decode('utf-8'),
1135
'player_url': player_url,
1137
except UnavailableVideoError, err:
1138
self._downloader.trouble(u'\nERROR: unable to download video')
1141
class MetacafeIE(InfoExtractor):
1142
"""Information Extractor for metacafe.com."""
1144
_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1145
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1146
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1149
def __init__(self, youtube_ie, downloader=None):
1150
InfoExtractor.__init__(self, downloader)
1151
self._youtube_ie = youtube_ie
1155
return (re.match(MetacafeIE._VALID_URL, url) is not None)
1157
def report_disclaimer(self):
1158
"""Report disclaimer retrieval."""
1159
self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1161
def report_age_confirmation(self):
1162
"""Report attempt to confirm age."""
1163
self._downloader.to_screen(u'[metacafe] Confirming age')
1165
def report_download_webpage(self, video_id):
1166
"""Report webpage download."""
1167
self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1169
def report_extraction(self, video_id):
1170
"""Report information extraction."""
1171
self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1173
def _real_initialize(self):
1174
# Retrieve disclaimer
1175
request = urllib2.Request(self._DISCLAIMER)
1177
self.report_disclaimer()
1178
disclaimer = urllib2.urlopen(request).read()
1179
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1180
self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1186
'submit': "Continue - I'm over 18",
1188
request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1190
self.report_age_confirmation()
1191
disclaimer = urllib2.urlopen(request).read()
1192
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1193
self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1196
def _real_extract(self, url):
1197
# Extract id and simplified title from URL
1198
mobj = re.match(self._VALID_URL, url)
1200
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1203
video_id = mobj.group(1)
1205
# Check if video comes from YouTube
1206
mobj2 = re.match(r'^yt-(.*)$', video_id)
1207
if mobj2 is not None:
1208
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1211
# At this point we have a new video
1212
self._downloader.increment_downloads()
1214
simple_title = mobj.group(2).decode('utf-8')
1216
# Retrieve video webpage to extract further information
1217
request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1219
self.report_download_webpage(video_id)
1220
webpage = urllib2.urlopen(request).read()
1221
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1222
self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1225
# Extract URL, uploader and title from webpage
1226
self.report_extraction(video_id)
1227
mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1228
if mobj is not None:
1229
mediaURL = urllib.unquote(mobj.group(1))
1230
video_extension = mediaURL[-3:]
1232
# Extract gdaKey if available
1233
mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1235
video_url = mediaURL
1237
gdaKey = mobj.group(1)
1238
video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1240
mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1242
self._downloader.trouble(u'ERROR: unable to extract media URL')
1244
vardict = parse_qs(mobj.group(1))
1245
if 'mediaData' not in vardict:
1246
self._downloader.trouble(u'ERROR: unable to extract media URL')
1248
mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1250
self._downloader.trouble(u'ERROR: unable to extract media URL')
1252
mediaURL = mobj.group(1).replace('\\/', '/')
1253
video_extension = mediaURL[-3:]
1254
video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1256
mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1258
self._downloader.trouble(u'ERROR: unable to extract title')
1260
video_title = mobj.group(1).decode('utf-8')
1261
video_title = sanitize_title(video_title)
1263
mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1265
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1267
video_uploader = mobj.group(1)
1270
# Process video information
1271
self._downloader.process_info({
1272
'id': video_id.decode('utf-8'),
1273
'url': video_url.decode('utf-8'),
1274
'uploader': video_uploader.decode('utf-8'),
1275
'upload_date': u'NA',
1276
'title': video_title,
1277
'stitle': simple_title,
1278
'ext': video_extension.decode('utf-8'),
1282
except UnavailableVideoError:
1283
self._downloader.trouble(u'\nERROR: unable to download video')
1286
class DailymotionIE(InfoExtractor):
1287
"""Information Extractor for Dailymotion"""
1289
_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1291
def __init__(self, downloader=None):
1292
InfoExtractor.__init__(self, downloader)
1296
return (re.match(DailymotionIE._VALID_URL, url) is not None)
1298
def report_download_webpage(self, video_id):
1299
"""Report webpage download."""
1300
self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1302
def report_extraction(self, video_id):
1303
"""Report information extraction."""
1304
self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1306
def _real_initialize(self):
1309
def _real_extract(self, url):
1310
# Extract id and simplified title from URL
1311
mobj = re.match(self._VALID_URL, url)
1313
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1316
# At this point we have a new video
1317
self._downloader.increment_downloads()
1318
video_id = mobj.group(1)
1320
simple_title = mobj.group(2).decode('utf-8')
1321
video_extension = 'flv'
1323
# Retrieve video webpage to extract further information
1324
request = urllib2.Request(url)
1326
self.report_download_webpage(video_id)
1327
webpage = urllib2.urlopen(request).read()
1328
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1329
self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1332
# Extract URL, uploader and title from webpage
1333
self.report_extraction(video_id)
1334
mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1336
self._downloader.trouble(u'ERROR: unable to extract media URL')
1338
mediaURL = urllib.unquote(mobj.group(1))
1340
# if needed add http://www.dailymotion.com/ if relative URL
1342
video_url = mediaURL
1344
# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1345
mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1347
self._downloader.trouble(u'ERROR: unable to extract title')
1349
video_title = mobj.group(1).decode('utf-8')
1350
video_title = sanitize_title(video_title)
1352
mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1354
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1356
video_uploader = mobj.group(1)
1359
# Process video information
1360
self._downloader.process_info({
1361
'id': video_id.decode('utf-8'),
1362
'url': video_url.decode('utf-8'),
1363
'uploader': video_uploader.decode('utf-8'),
1364
'upload_date': u'NA',
1365
'title': video_title,
1366
'stitle': simple_title,
1367
'ext': video_extension.decode('utf-8'),
1371
except UnavailableVideoError:
1372
self._downloader.trouble(u'\nERROR: unable to download video')
1374
class GoogleIE(InfoExtractor):
1375
"""Information extractor for video.google.com."""
1377
_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1379
def __init__(self, downloader=None):
1380
InfoExtractor.__init__(self, downloader)
1384
return (re.match(GoogleIE._VALID_URL, url) is not None)
1386
def report_download_webpage(self, video_id):
1387
"""Report webpage download."""
1388
self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1390
def report_extraction(self, video_id):
1391
"""Report information extraction."""
1392
self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1394
def _real_initialize(self):
1397
def _real_extract(self, url):
1398
# Extract id from URL
1399
mobj = re.match(self._VALID_URL, url)
1401
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1404
# At this point we have a new video
1405
self._downloader.increment_downloads()
1406
video_id = mobj.group(1)
1408
video_extension = 'mp4'
1410
# Retrieve video webpage to extract further information
1411
request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1413
self.report_download_webpage(video_id)
1414
webpage = urllib2.urlopen(request).read()
1415
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1416
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1419
# Extract URL, uploader, and title from webpage
1420
self.report_extraction(video_id)
1421
mobj = re.search(r"download_url:'([^']+)'", webpage)
1423
video_extension = 'flv'
1424
mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1426
self._downloader.trouble(u'ERROR: unable to extract media URL')
1428
mediaURL = urllib.unquote(mobj.group(1))
1429
mediaURL = mediaURL.replace('\\x3d', '\x3d')
1430
mediaURL = mediaURL.replace('\\x26', '\x26')
1432
video_url = mediaURL
1434
mobj = re.search(r'<title>(.*)</title>', webpage)
1436
self._downloader.trouble(u'ERROR: unable to extract title')
1438
video_title = mobj.group(1).decode('utf-8')
1439
video_title = sanitize_title(video_title)
1440
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1442
# Extract video description
1443
mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1445
self._downloader.trouble(u'ERROR: unable to extract video description')
1447
video_description = mobj.group(1).decode('utf-8')
1448
if not video_description:
1449
video_description = 'No description available.'
1451
# Extract video thumbnail
1452
if self._downloader.params.get('forcethumbnail', False):
1453
request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1455
webpage = urllib2.urlopen(request).read()
1456
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1457
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1459
mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1461
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1463
video_thumbnail = mobj.group(1)
1464
else: # we need something to pass to process_info
1465
video_thumbnail = ''
1469
# Process video information
1470
self._downloader.process_info({
1471
'id': video_id.decode('utf-8'),
1472
'url': video_url.decode('utf-8'),
1474
'upload_date': u'NA',
1475
'title': video_title,
1476
'stitle': simple_title,
1477
'ext': video_extension.decode('utf-8'),
1481
except UnavailableVideoError:
1482
self._downloader.trouble(u'\nERROR: unable to download video')
1485
class PhotobucketIE(InfoExtractor):
1486
"""Information extractor for photobucket.com."""
1488
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1490
def __init__(self, downloader=None):
1491
InfoExtractor.__init__(self, downloader)
1495
return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1497
def report_download_webpage(self, video_id):
1498
"""Report webpage download."""
1499
self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1501
def report_extraction(self, video_id):
1502
"""Report information extraction."""
1503
self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1505
def _real_initialize(self):
1508
def _real_extract(self, url):
1509
# Extract id from URL
1510
mobj = re.match(self._VALID_URL, url)
1512
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1515
# At this point we have a new video
1516
self._downloader.increment_downloads()
1517
video_id = mobj.group(1)
1519
video_extension = 'flv'
1521
# Retrieve video webpage to extract further information
1522
request = urllib2.Request(url)
1524
self.report_download_webpage(video_id)
1525
webpage = urllib2.urlopen(request).read()
1526
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1527
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1530
# Extract URL, uploader, and title from webpage
1531
self.report_extraction(video_id)
1532
mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1534
self._downloader.trouble(u'ERROR: unable to extract media URL')
1536
mediaURL = urllib.unquote(mobj.group(1))
1538
video_url = mediaURL
1540
mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1542
self._downloader.trouble(u'ERROR: unable to extract title')
1544
video_title = mobj.group(1).decode('utf-8')
1545
video_title = sanitize_title(video_title)
1546
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1548
video_uploader = mobj.group(2).decode('utf-8')
1551
# Process video information
1552
self._downloader.process_info({
1553
'id': video_id.decode('utf-8'),
1554
'url': video_url.decode('utf-8'),
1555
'uploader': video_uploader,
1556
'upload_date': u'NA',
1557
'title': video_title,
1558
'stitle': simple_title,
1559
'ext': video_extension.decode('utf-8'),
1563
except UnavailableVideoError:
1564
self._downloader.trouble(u'\nERROR: unable to download video')
1567
class YahooIE(InfoExtractor):
1568
"""Information extractor for video.yahoo.com."""
1570
# _VALID_URL matches all Yahoo! Video URLs
1571
# _VPAGE_URL matches only the extractable '/watch/' URLs
1572
_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1573
_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1575
def __init__(self, downloader=None):
1576
InfoExtractor.__init__(self, downloader)
1580
return (re.match(YahooIE._VALID_URL, url) is not None)
1582
def report_download_webpage(self, video_id):
1583
"""Report webpage download."""
1584
self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1586
def report_extraction(self, video_id):
1587
"""Report information extraction."""
1588
self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1590
def _real_initialize(self):
1593
def _real_extract(self, url, new_video=True):
1594
# Extract ID from URL
1595
mobj = re.match(self._VALID_URL, url)
1597
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1600
# At this point we have a new video
1601
self._downloader.increment_downloads()
1602
video_id = mobj.group(2)
1603
video_extension = 'flv'
1605
# Rewrite valid but non-extractable URLs as
1606
# extractable English language /watch/ URLs
1607
if re.match(self._VPAGE_URL, url) is None:
1608
request = urllib2.Request(url)
1610
webpage = urllib2.urlopen(request).read()
1611
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1615
mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1617
self._downloader.trouble(u'ERROR: Unable to extract id field')
1619
yahoo_id = mobj.group(1)
1621
mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1623
self._downloader.trouble(u'ERROR: Unable to extract vid field')
1625
yahoo_vid = mobj.group(1)
1627
url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1628
return self._real_extract(url, new_video=False)
1630
# Retrieve video webpage to extract further information
1631
request = urllib2.Request(url)
1633
self.report_download_webpage(video_id)
1634
webpage = urllib2.urlopen(request).read()
1635
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1636
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1639
# Extract uploader and title from webpage
1640
self.report_extraction(video_id)
1641
mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1643
self._downloader.trouble(u'ERROR: unable to extract video title')
1645
video_title = mobj.group(1).decode('utf-8')
1646
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1648
mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1650
self._downloader.trouble(u'ERROR: unable to extract video uploader')
1652
video_uploader = mobj.group(1).decode('utf-8')
1654
# Extract video thumbnail
1655
mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1657
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1659
video_thumbnail = mobj.group(1).decode('utf-8')
1661
# Extract video description
1662
mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1664
self._downloader.trouble(u'ERROR: unable to extract video description')
1666
video_description = mobj.group(1).decode('utf-8')
1667
if not video_description: video_description = 'No description available.'
1669
# Extract video height and width
1670
mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1672
self._downloader.trouble(u'ERROR: unable to extract video height')
1674
yv_video_height = mobj.group(1)
1676
mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1678
self._downloader.trouble(u'ERROR: unable to extract video width')
1680
yv_video_width = mobj.group(1)
1682
# Retrieve video playlist to extract media URL
1683
# I'm not completely sure what all these options are, but we
1684
# seem to need most of them, otherwise the server sends a 401.
1685
yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1686
yv_bitrate = '700' # according to Wikipedia this is hard-coded
1687
request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1688
'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1689
'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1691
self.report_download_webpage(video_id)
1692
webpage = urllib2.urlopen(request).read()
1693
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1694
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1697
# Extract media URL from playlist XML
1698
mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1700
self._downloader.trouble(u'ERROR: Unable to extract media URL')
1702
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1703
video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1706
# Process video information
1707
self._downloader.process_info({
1708
'id': video_id.decode('utf-8'),
1710
'uploader': video_uploader,
1711
'upload_date': u'NA',
1712
'title': video_title,
1713
'stitle': simple_title,
1714
'ext': video_extension.decode('utf-8'),
1715
'thumbnail': video_thumbnail.decode('utf-8'),
1716
'description': video_description,
1717
'thumbnail': video_thumbnail,
1718
'description': video_description,
1721
except UnavailableVideoError:
1722
self._downloader.trouble(u'\nERROR: unable to download video')
1725
class GenericIE(InfoExtractor):
1726
"""Generic last-resort information extractor."""
1728
def __init__(self, downloader=None):
1729
InfoExtractor.__init__(self, downloader)
1735
def report_download_webpage(self, video_id):
1736
"""Report webpage download."""
1737
self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1738
self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1740
def report_extraction(self, video_id):
1741
"""Report information extraction."""
1742
self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1744
def _real_initialize(self):
1747
def _real_extract(self, url):
1748
# At this point we have a new video
1749
self._downloader.increment_downloads()
1751
video_id = url.split('/')[-1]
1752
request = urllib2.Request(url)
1754
self.report_download_webpage(video_id)
1755
webpage = urllib2.urlopen(request).read()
1756
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1757
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1759
except ValueError, err:
1760
# since this is the last-resort InfoExtractor, if
1761
# this error is thrown, it'll be thrown here
1762
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1765
self.report_extraction(video_id)
1766
# Start with something easy: JW Player in SWFObject
1767
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1769
# Broaden the search a little bit
1770
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1772
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1775
# It's possible that one of the regexes
1776
# matched, but returned an empty group:
1777
if mobj.group(1) is None:
1778
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1781
video_url = urllib.unquote(mobj.group(1))
1782
video_id = os.path.basename(video_url)
1784
# here's a fun little line of code for you:
1785
video_extension = os.path.splitext(video_id)[1][1:]
1786
video_id = os.path.splitext(video_id)[0]
1788
# it's tempting to parse this further, but you would
1789
# have to take into account all the variations like
1790
# Video Title - Site Name
1791
# Site Name | Video Title
1792
# Video Title - Tagline | Site Name
1793
# and so on and so forth; it's just not practical
1794
mobj = re.search(r'<title>(.*)</title>', webpage)
1796
self._downloader.trouble(u'ERROR: unable to extract title')
1798
video_title = mobj.group(1).decode('utf-8')
1799
video_title = sanitize_title(video_title)
1800
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1802
# video uploader is domain name
1803
mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1805
self._downloader.trouble(u'ERROR: unable to extract title')
1807
video_uploader = mobj.group(1).decode('utf-8')
1810
# Process video information
1811
self._downloader.process_info({
1812
'id': video_id.decode('utf-8'),
1813
'url': video_url.decode('utf-8'),
1814
'uploader': video_uploader,
1815
'upload_date': u'NA',
1816
'title': video_title,
1817
'stitle': simple_title,
1818
'ext': video_extension.decode('utf-8'),
1822
except UnavailableVideoError, err:
1823
self._downloader.trouble(u'\nERROR: unable to download video')
1826
class YoutubeSearchIE(InfoExtractor):
1827
"""Information Extractor for YouTube search queries."""
1828
_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1829
_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1830
_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1831
_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1833
_max_youtube_results = 1000
1835
def __init__(self, youtube_ie, downloader=None):
1836
InfoExtractor.__init__(self, downloader)
1837
self._youtube_ie = youtube_ie
1841
return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1843
def report_download_page(self, query, pagenum):
1844
"""Report attempt to download playlist page with given number."""
1845
query = query.decode(preferredencoding())
1846
self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1848
def _real_initialize(self):
1849
self._youtube_ie.initialize()
1851
def _real_extract(self, query):
1852
mobj = re.match(self._VALID_QUERY, query)
1854
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1857
prefix, query = query.split(':')
1859
query = query.encode('utf-8')
1861
self._download_n_results(query, 1)
1863
elif prefix == 'all':
1864
self._download_n_results(query, self._max_youtube_results)
1870
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1872
elif n > self._max_youtube_results:
1873
self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1874
n = self._max_youtube_results
1875
self._download_n_results(query, n)
1877
except ValueError: # parsing prefix as integer fails
1878
self._download_n_results(query, 1)
1881
def _download_n_results(self, query, n):
1882
"""Downloads a specified number of results for a query"""
1885
already_seen = set()
1889
self.report_download_page(query, pagenum)
1890
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1891
request = urllib2.Request(result_url)
1893
page = urllib2.urlopen(request).read()
1894
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1895
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1898
# Extract video identifiers
1899
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1900
video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1901
if video_id not in already_seen:
1902
video_ids.append(video_id)
1903
already_seen.add(video_id)
1904
if len(video_ids) == n:
1905
# Specified n videos reached
1906
for id in video_ids:
1907
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1910
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1911
for id in video_ids:
1912
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1915
pagenum = pagenum + 1
1917
class GoogleSearchIE(InfoExtractor):
1918
"""Information Extractor for Google Video search queries."""
1919
_VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1920
_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1921
_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1922
_MORE_PAGES_INDICATOR = r'<span>Next</span>'
1924
_max_google_results = 1000
1926
def __init__(self, google_ie, downloader=None):
    """Store the GoogleIE used to extract each search result."""
    InfoExtractor.__init__(self, downloader)
    self._google_ie = google_ie
1932
def suitable(self, url):
    """Return True if this IE can handle the given query URL."""
    return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1934
def report_download_page(self, query, pagenum):
1935
"""Report attempt to download playlist page with given number."""
1936
query = query.decode(preferredencoding())
1937
self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1939
def _real_initialize(self):
1940
self._google_ie.initialize()
1942
def _real_extract(self, query):
1943
mobj = re.match(self._VALID_QUERY, query)
1945
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1948
prefix, query = query.split(':')
1950
query = query.encode('utf-8')
1952
self._download_n_results(query, 1)
1954
elif prefix == 'all':
1955
self._download_n_results(query, self._max_google_results)
1961
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1963
elif n > self._max_google_results:
1964
self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1965
n = self._max_google_results
1966
self._download_n_results(query, n)
1968
except ValueError: # parsing prefix as integer fails
1969
self._download_n_results(query, 1)
1972
def _download_n_results(self, query, n):
1973
"""Downloads a specified number of results for a query"""
1976
already_seen = set()
1980
self.report_download_page(query, pagenum)
1981
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1982
request = urllib2.Request(result_url)
1984
page = urllib2.urlopen(request).read()
1985
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1986
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1989
# Extract video identifiers
1990
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1991
video_id = mobj.group(1)
1992
if video_id not in already_seen:
1993
video_ids.append(video_id)
1994
already_seen.add(video_id)
1995
if len(video_ids) == n:
1996
# Specified n videos reached
1997
for id in video_ids:
1998
self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2001
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2002
for id in video_ids:
2003
self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2006
pagenum = pagenum + 1
2008
class YahooSearchIE(InfoExtractor):
2009
"""Information Extractor for Yahoo! Video search queries."""
2010
_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2011
_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2012
_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2013
_MORE_PAGES_INDICATOR = r'\s*Next'
2015
_max_yahoo_results = 1000
2017
def __init__(self, yahoo_ie, downloader=None):
2018
InfoExtractor.__init__(self, downloader)
2019
self._yahoo_ie = yahoo_ie
2023
return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2025
def report_download_page(self, query, pagenum):
2026
"""Report attempt to download playlist page with given number."""
2027
query = query.decode(preferredencoding())
2028
self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2030
def _real_initialize(self):
2031
self._yahoo_ie.initialize()
2033
def _real_extract(self, query):
2034
mobj = re.match(self._VALID_QUERY, query)
2036
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2039
prefix, query = query.split(':')
2041
query = query.encode('utf-8')
2043
self._download_n_results(query, 1)
2045
elif prefix == 'all':
2046
self._download_n_results(query, self._max_yahoo_results)
2052
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2054
elif n > self._max_yahoo_results:
2055
self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2056
n = self._max_yahoo_results
2057
self._download_n_results(query, n)
2059
except ValueError: # parsing prefix as integer fails
2060
self._download_n_results(query, 1)
2063
def _download_n_results(self, query, n):
2064
"""Downloads a specified number of results for a query"""
2067
already_seen = set()
2071
self.report_download_page(query, pagenum)
2072
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2073
request = urllib2.Request(result_url)
2075
page = urllib2.urlopen(request).read()
2076
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2077
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2080
# Extract video identifiers
2081
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2082
video_id = mobj.group(1)
2083
if video_id not in already_seen:
2084
video_ids.append(video_id)
2085
already_seen.add(video_id)
2086
if len(video_ids) == n:
2087
# Specified n videos reached
2088
for id in video_ids:
2089
self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2092
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2093
for id in video_ids:
2094
self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2097
pagenum = pagenum + 1
2099
class YoutubePlaylistIE(InfoExtractor):
2100
"""Information Extractor for YouTube playlists."""
2102
_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2103
_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2104
_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2105
_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2108
def __init__(self, youtube_ie, downloader=None):
2109
InfoExtractor.__init__(self, downloader)
2110
self._youtube_ie = youtube_ie
2114
return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2116
def report_download_page(self, playlist_id, pagenum):
2117
"""Report attempt to download playlist page with given number."""
2118
self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2120
def _real_initialize(self):
2121
self._youtube_ie.initialize()
2123
def _real_extract(self, url):
2124
# Extract playlist id
2125
mobj = re.match(self._VALID_URL, url)
2127
self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2131
if mobj.group(3) is not None:
2132
self._youtube_ie.extract(mobj.group(3))
2135
# Download playlist pages
2136
# prefix is 'p' as default for playlists but there are other types that need extra care
2137
playlist_prefix = mobj.group(1)
2138
if playlist_prefix == 'a':
2139
playlist_access = 'artist'
2141
playlist_prefix = 'p'
2142
playlist_access = 'view_play_list'
2143
playlist_id = mobj.group(2)
2148
self.report_download_page(playlist_id, pagenum)
2149
request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2151
page = urllib2.urlopen(request).read()
2152
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2153
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2156
# Extract video identifiers
2158
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2159
if mobj.group(1) not in ids_in_page:
2160
ids_in_page.append(mobj.group(1))
2161
video_ids.extend(ids_in_page)
2163
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2165
pagenum = pagenum + 1
2167
playliststart = self._downloader.params.get('playliststart', 1) - 1
2168
playlistend = self._downloader.params.get('playlistend', -1)
2169
video_ids = video_ids[playliststart:playlistend]
2171
for id in video_ids:
2172
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2175
class YoutubeUserIE(InfoExtractor):
2176
"""Information Extractor for YouTube users."""
2178
_VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2179
_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2180
_GDATA_PAGE_SIZE = 50
2181
_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2182
_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2185
def __init__(self, youtube_ie, downloader=None):
2186
InfoExtractor.__init__(self, downloader)
2187
self._youtube_ie = youtube_ie
2191
return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2193
def report_download_page(self, username, start_index):
2194
"""Report attempt to download user page."""
2195
self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2196
(username, start_index, start_index + self._GDATA_PAGE_SIZE))
2198
def _real_initialize(self):
2199
self._youtube_ie.initialize()
2201
def _real_extract(self, url):
2203
mobj = re.match(self._VALID_URL, url)
2205
self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2208
username = mobj.group(1)
2210
# Download video ids using YouTube Data API. Result size per
2211
# query is limited (currently to 50 videos) so we need to query
2212
# page by page until there are no video ids - it means we got
2219
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2220
self.report_download_page(username, start_index)
2222
request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2225
page = urllib2.urlopen(request).read()
2226
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2227
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2230
# Extract video identifiers
2233
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2234
if mobj.group(1) not in ids_in_page:
2235
ids_in_page.append(mobj.group(1))
2237
video_ids.extend(ids_in_page)
2239
# A little optimization - if current page is not
2240
# "full", ie. does not contain PAGE_SIZE video ids then
2241
# we can assume that this page is the last one - there
2242
# are no more ids on further pages - no need to query
2245
if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2250
all_ids_count = len(video_ids)
2251
playliststart = self._downloader.params.get('playliststart', 1) - 1
2252
playlistend = self._downloader.params.get('playlistend', -1)
2254
if playlistend == -1:
2255
video_ids = video_ids[playliststart:]
2257
video_ids = video_ids[playliststart:playlistend]
2259
self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2260
(username, all_ids_count, len(video_ids)))
2262
for video_id in video_ids:
2263
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2266
class DepositFilesIE(InfoExtractor):
2267
"""Information extractor for depositfiles.com"""
2269
_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2271
def __init__(self, downloader=None):
2272
InfoExtractor.__init__(self, downloader)
2276
return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2278
def report_download_webpage(self, file_id):
2279
"""Report webpage download."""
2280
self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2282
def report_extraction(self, file_id):
2283
"""Report information extraction."""
2284
self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2286
def _real_initialize(self):
2289
def _real_extract(self, url):
2290
# At this point we have a new file
2291
self._downloader.increment_downloads()
2293
file_id = url.split('/')[-1]
2294
# Rebuild url in english locale
2295
url = 'http://depositfiles.com/en/files/' + file_id
2297
# Retrieve file webpage with 'Free download' button pressed
2298
free_download_indication = { 'gateway_result' : '1' }
2299
request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2301
self.report_download_webpage(file_id)
2302
webpage = urllib2.urlopen(request).read()
2303
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2304
self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2307
# Search for the real file URL
2308
mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2309
if (mobj is None) or (mobj.group(1) is None):
2310
# Try to figure out reason of the error.
2311
mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2312
if (mobj is not None) and (mobj.group(1) is not None):
2313
restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2314
self._downloader.trouble(u'ERROR: %s' % restriction_message)
2316
self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2319
file_url = mobj.group(1)
2320
file_extension = os.path.splitext(file_url)[1][1:]
2322
# Search for file title
2323
mobj = re.search(r'<b title="(.*?)">', webpage)
2325
self._downloader.trouble(u'ERROR: unable to extract title')
2327
file_title = mobj.group(1).decode('utf-8')
2330
# Process file information
2331
self._downloader.process_info({
2332
'id': file_id.decode('utf-8'),
2333
'url': file_url.decode('utf-8'),
2335
'upload_date': u'NA',
2336
'title': file_title,
2337
'stitle': file_title,
2338
'ext': file_extension.decode('utf-8'),
2342
except UnavailableVideoError, err:
2343
self._downloader.trouble(u'ERROR: unable to download file')
2345
class FacebookIE(InfoExtractor):
2346
"""Information Extractor for Facebook"""
2348
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2349
_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2350
_NETRC_MACHINE = 'facebook'
2351
_available_formats = ['highqual', 'lowqual']
2352
_video_extensions = {
2357
def __init__(self, downloader=None):
2358
InfoExtractor.__init__(self, downloader)
2362
return (re.match(FacebookIE._VALID_URL, url) is not None)
2364
def _reporter(self, message):
2365
"""Add header and report message."""
2366
self._downloader.to_screen(u'[facebook] %s' % message)
2368
def report_login(self):
2369
"""Report attempt to log in."""
2370
self._reporter(u'Logging in')
2372
def report_video_webpage_download(self, video_id):
2373
"""Report attempt to download video webpage."""
2374
self._reporter(u'%s: Downloading video webpage' % video_id)
2376
def report_information_extraction(self, video_id):
2377
"""Report attempt to extract video information."""
2378
self._reporter(u'%s: Extracting video information' % video_id)
2380
def _parse_page(self, video_webpage):
2381
"""Extract video information from page"""
2383
data = {'title': r'class="video_title datawrap">(.*?)</',
2384
'description': r'<div class="datawrap">(.*?)</div>',
2385
'owner': r'\("video_owner_name", "(.*?)"\)',
2386
'upload_date': r'data-date="(.*?)"',
2387
'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2390
for piece in data.keys():
2391
mobj = re.search(data[piece], video_webpage)
2392
if mobj is not None:
2393
video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2397
for fmt in self._available_formats:
2398
mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2399
if mobj is not None:
2400
# URL is in a Javascript segment inside an escaped Unicode format within
2401
# the generally utf-8 page
2402
video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2403
video_info['video_urls'] = video_urls
2407
def _real_initialize(self):
2408
if self._downloader is None:
2413
downloader_params = self._downloader.params
2415
# Attempt to use provided username and password or .netrc data
2416
if downloader_params.get('username', None) is not None:
2417
useremail = downloader_params['username']
2418
password = downloader_params['password']
2419
elif downloader_params.get('usenetrc', False):
2421
info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2422
if info is not None:
2426
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2427
except (IOError, netrc.NetrcParseError), err:
2428
self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2431
if useremail is None:
2440
request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2443
login_results = urllib2.urlopen(request).read()
2444
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2445
self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2447
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2448
self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2451
def _real_extract(self, url):
2452
mobj = re.match(self._VALID_URL, url)
2454
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2456
video_id = mobj.group('ID')
2459
self.report_video_webpage_download(video_id)
2460
request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2462
page = urllib2.urlopen(request)
2463
video_webpage = page.read()
2464
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2465
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2468
# Start extracting information
2469
self.report_information_extraction(video_id)
2471
# Extract information
2472
video_info = self._parse_page(video_webpage)
2475
if 'owner' not in video_info:
2476
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2478
video_uploader = video_info['owner']
2481
if 'title' not in video_info:
2482
self._downloader.trouble(u'ERROR: unable to extract video title')
2484
video_title = video_info['title']
2485
video_title = video_title.decode('utf-8')
2486
video_title = sanitize_title(video_title)
2489
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2490
simple_title = simple_title.strip(ur'_')
2493
if 'thumbnail' not in video_info:
2494
self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2495
video_thumbnail = ''
2497
video_thumbnail = video_info['thumbnail']
2501
if 'upload_date' in video_info:
2502
upload_time = video_info['upload_date']
2503
timetuple = email.utils.parsedate_tz(upload_time)
2504
if timetuple is not None:
2506
upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2511
video_description = 'No description available.'
2512
if (self._downloader.params.get('forcedescription', False) and
2513
'description' in video_info):
2514
video_description = video_info['description']
2516
url_map = video_info['video_urls']
2517
if len(url_map.keys()) > 0:
2518
# Decide which formats to download
2519
req_format = self._downloader.params.get('format', None)
2520
format_limit = self._downloader.params.get('format_limit', None)
2522
if format_limit is not None and format_limit in self._available_formats:
2523
format_list = self._available_formats[self._available_formats.index(format_limit):]
2525
format_list = self._available_formats
2526
existing_formats = [x for x in format_list if x in url_map]
2527
if len(existing_formats) == 0:
2528
self._downloader.trouble(u'ERROR: no known formats available for video')
2530
if req_format is None:
2531
video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2532
elif req_format == '-1':
2533
video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2536
if req_format not in url_map:
2537
self._downloader.trouble(u'ERROR: requested format not available')
2539
video_url_list = [(req_format, url_map[req_format])] # Specific format
2541
for format_param, video_real_url in video_url_list:
2543
# At this point we have a new video
2544
self._downloader.increment_downloads()
2547
video_extension = self._video_extensions.get(format_param, 'mp4')
2549
# Find the video URL in fmt_url_map or conn paramters
2551
# Process video information
2552
self._downloader.process_info({
2553
'id': video_id.decode('utf-8'),
2554
'url': video_real_url.decode('utf-8'),
2555
'uploader': video_uploader.decode('utf-8'),
2556
'upload_date': upload_date,
2557
'title': video_title,
2558
'stitle': simple_title,
2559
'ext': video_extension.decode('utf-8'),
2560
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2561
'thumbnail': video_thumbnail.decode('utf-8'),
2562
'description': video_description.decode('utf-8'),
2565
except UnavailableVideoError, err:
2566
self._downloader.trouble(u'\nERROR: unable to download video')
2568
class PostProcessor(object):
    """Base class for post-download processing steps.

    A PostProcessor is registered with a downloader through its
    add_post_processor() method.  After each successful download the
    downloader walks its internal chain of PostProcessors, calling run()
    on every one: first with an initial argument, then with whatever the
    previous processor returned.

    The chain stops as soon as a processor returns None, or when its end
    is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors, extended with a "filepath" field
        pointing at the downloaded file.

        Returning None stops the postprocessing chain; returning an
        information dictionary (possibly the received one with some
        fields changed) passes it to the next processor in the chain.

        This method may also raise a PostProcessingError exception that
        will be taken into account by the downloader.
        """
        return information # by default, do nothing
2614
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that converts a downloaded video into an audio-only
    file with ffmpeg, remuxing losslessly (codec copy) whenever the source
    audio already matches the preferred codec.
    """

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        # One of 'best', 'aac' or 'mp3' (validated by the option parser).
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name reported by ffprobe, or None on failure."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            # BUG FIX: the devnull handle was opened inline and leaked;
            # close it once ffprobe has finished.
            devnull = file(os.path.devnull, 'w')
            try:
                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
                if handle.wait() != 0:
                    return None
            finally:
                devnull.close()
        except (IOError, OSError):
            return None
        # The codec_name of a stream precedes its codec_type line in
        # ffprobe output; report it once an audio stream is confirmed.
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to extract audio; return True on success."""
        try:
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            # BUG FIX: close the devnull handle instead of leaking it.
            devnull = file(os.path.devnull, 'w')
            try:
                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
            finally:
                devnull.close()
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        # Remove the source video only after a successful conversion.
        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
2696
### MAIN PROGRAM ###
2697
if __name__ == '__main__':
2699
# Modules needed only when running the main program
2703
# Function to update the program file with the latest version from the repository.
2704
def update_self(downloader, filename):
2705
# Note: downloader only used for options
2706
if not os.access(filename, os.W_OK):
2707
sys.exit('ERROR: no write permissions on %s' % filename)
2709
downloader.to_screen('Updating to latest stable version...')
2711
latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2712
latest_version = urllib.urlopen(latest_url).read().strip()
2713
prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2714
newcontent = urllib.urlopen(prog_url).read()
2715
except (IOError, OSError), err:
2716
sys.exit('ERROR: unable to download latest version')
2718
stream = open(filename, 'w')
2719
stream.write(newcontent)
2721
except (IOError, OSError), err:
2722
sys.exit('ERROR: unable to overwrite current version')
2723
downloader.to_screen('Updated to version %s' % latest_version)
2725
# Parse command line
2726
parser = optparse.OptionParser(
2727
usage='Usage: %prog [options] url...',
2728
version='2011.08.04',
2729
conflict_handler='resolve',
2732
parser.add_option('-h', '--help',
2733
action='help', help='print this help text and exit')
2734
parser.add_option('-v', '--version',
2735
action='version', help='print program version and exit')
2736
parser.add_option('-U', '--update',
2737
action='store_true', dest='update_self', help='update this program to latest stable version')
2738
parser.add_option('-i', '--ignore-errors',
2739
action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2740
parser.add_option('-r', '--rate-limit',
2741
dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2742
parser.add_option('-R', '--retries',
2743
dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2744
parser.add_option('--playlist-start',
2745
dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2746
parser.add_option('--playlist-end',
2747
dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2748
parser.add_option('--dump-user-agent',
2749
action='store_true', dest='dump_user_agent',
2750
help='display the current browser identification', default=False)
2752
authentication = optparse.OptionGroup(parser, 'Authentication Options')
2753
authentication.add_option('-u', '--username',
2754
dest='username', metavar='USERNAME', help='account username')
2755
authentication.add_option('-p', '--password',
2756
dest='password', metavar='PASSWORD', help='account password')
2757
authentication.add_option('-n', '--netrc',
2758
action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2759
parser.add_option_group(authentication)
2761
video_format = optparse.OptionGroup(parser, 'Video Format Options')
2762
video_format.add_option('-f', '--format',
2763
action='store', dest='format', metavar='FORMAT', help='video format code')
2764
video_format.add_option('--all-formats',
2765
action='store_const', dest='format', help='download all available video formats', const='-1')
2766
video_format.add_option('--max-quality',
2767
action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2768
parser.add_option_group(video_format)
2770
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2771
verbosity.add_option('-q', '--quiet',
2772
action='store_true', dest='quiet', help='activates quiet mode', default=False)
2773
verbosity.add_option('-s', '--simulate',
2774
action='store_true', dest='simulate', help='do not download video', default=False)
2775
verbosity.add_option('-g', '--get-url',
2776
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2777
verbosity.add_option('-e', '--get-title',
2778
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2779
verbosity.add_option('--get-thumbnail',
2780
action='store_true', dest='getthumbnail',
2781
help='simulate, quiet but print thumbnail URL', default=False)
2782
verbosity.add_option('--get-description',
2783
action='store_true', dest='getdescription',
2784
help='simulate, quiet but print video description', default=False)
2785
verbosity.add_option('--get-filename',
2786
action='store_true', dest='getfilename',
2787
help='simulate, quiet but print output filename', default=False)
2788
verbosity.add_option('--no-progress',
2789
action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2790
verbosity.add_option('--console-title',
2791
action='store_true', dest='consoletitle',
2792
help='display progress in console titlebar', default=False)
2793
parser.add_option_group(verbosity)
2795
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2796
filesystem.add_option('-t', '--title',
2797
action='store_true', dest='usetitle', help='use title in file name', default=False)
2798
filesystem.add_option('-l', '--literal',
2799
action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2800
filesystem.add_option('-A', '--auto-number',
2801
action='store_true', dest='autonumber',
2802
help='number downloaded files starting from 00000', default=False)
2803
filesystem.add_option('-o', '--output',
2804
dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2805
filesystem.add_option('-a', '--batch-file',
2806
dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2807
filesystem.add_option('-w', '--no-overwrites',
2808
action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2809
filesystem.add_option('-c', '--continue',
2810
action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2811
filesystem.add_option('--cookies',
2812
dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2813
filesystem.add_option('--no-part',
2814
action='store_true', dest='nopart', help='do not use .part files', default=False)
2815
filesystem.add_option('--no-mtime',
2816
action='store_false', dest='updatetime',
2817
help='do not use the Last-modified header to set the file modification time', default=True)
2818
parser.add_option_group(filesystem)
2820
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2821
postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2822
help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2823
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2824
help='"best", "aac" or "mp3"; best by default')
2825
parser.add_option_group(postproc)
2827
(opts, args) = parser.parse_args()
2829
# Open appropriate CookieJar
2830
if opts.cookiefile is None:
2831
jar = cookielib.CookieJar()
2834
jar = cookielib.MozillaCookieJar(opts.cookiefile)
2835
if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2837
except (IOError, OSError), err:
2838
sys.exit(u'ERROR: unable to open cookie file')
2841
if opts.dump_user_agent:
2842
print std_headers['User-Agent']
2845
# General configuration
2846
cookie_processor = urllib2.HTTPCookieProcessor(jar)
2847
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2848
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2850
# Batch file verification
2852
if opts.batchfile is not None:
2854
if opts.batchfile == '-':
2857
batchfd = open(opts.batchfile, 'r')
2858
batchurls = batchfd.readlines()
2859
batchurls = [x.strip() for x in batchurls]
2860
batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2862
sys.exit(u'ERROR: batch file could not be read')
2863
all_urls = batchurls + args
2865
# Conflicting, missing and erroneous options: reject combinations that
# cannot be honoured, then prompt for anything recoverable.
if opts.usenetrc and (opts.username is not None or opts.password is not None):
    parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
    parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
    parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
    parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
    # Username given without a password: ask for it interactively rather
    # than failing (keeps the password off the command line / process list).
    opts.password = getpass.getpass(u'Type account password and press return:')
2876
if opts.ratelimit is not None:
2877
numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2878
if numeric_limit is None:
2879
parser.error(u'invalid rate limit specified')
2880
opts.ratelimit = numeric_limit
2881
if opts.retries is not None:
2883
opts.retries = long(opts.retries)
2884
except (TypeError, ValueError), err:
2885
parser.error(u'invalid retry count specified')
2887
opts.playliststart = long(opts.playliststart)
2888
if opts.playliststart <= 0:
2890
except (TypeError, ValueError), err:
2891
parser.error(u'invalid playlist start number specified')
2893
opts.playlistend = long(opts.playlistend)
2894
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2896
except (TypeError, ValueError), err:
2897
parser.error(u'invalid playlist end number specified')
2898
if opts.extractaudio:
2899
if opts.audioformat not in ['best', 'aac', 'mp3']:
2900
parser.error(u'invalid audio format specified')
2902
# Information extractors: one instance per supported site/service.  Some
# extractors delegate to another (e.g. playlist/user/search extractors feed
# the plain YouTube extractor), so those receive it as a constructor arg.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
generic_ie = GenericIE()
2919
# File downloader: build the single FileDownloader with all user options.
# Any of the "get*" flags implies quiet simulation (print the value, do not
# download).  The 'outtmpl' chain picks the first applicable template.
# NOTE(review): the closing `})` was dropped in this copy; restored.
fd = FileDownloader({
    'usenetrc': opts.usenetrc,
    'username': opts.username,
    'password': opts.password,
    'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
    'forceurl': opts.geturl,
    'forcetitle': opts.gettitle,
    'forcethumbnail': opts.getthumbnail,
    'forcedescription': opts.getdescription,
    'forcefilename': opts.getfilename,
    'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
    'format': opts.format,
    'format_limit': opts.format_limit,
    'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
        or u'%(id)s.%(ext)s'),
    'ignoreerrors': opts.ignoreerrors,
    'ratelimit': opts.ratelimit,
    'nooverwrites': opts.nooverwrites,
    'retries': opts.retries,
    'continuedl': opts.continue_dl,
    'noprogress': opts.noprogress,
    'playliststart': opts.playliststart,
    'playlistend': opts.playlistend,
    'logtostderr': opts.outtmpl == '-',
    'consoletitle': opts.consoletitle,
    'nopart': opts.nopart,
    'updatetime': opts.updatetime,
    })
2955
# Register the information extractors.  Order matters: more specific
# extractors (search/playlist/user) come before the plain ones they wrap.
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# Post-processors: audio extraction is the only one, enabled on demand.
if opts.extractaudio:
    fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
2978
if opts.update_self:
2979
update_self(fd, sys.argv[0])
2982
if len(all_urls) < 1:
2983
if not opts.update_self:
2984
parser.error(u'you must provide at least one URL')
2987
retcode = fd.download(all_urls)
2989
# Dump cookie jar if requested
2990
if opts.cookiefile is not None:
2993
except (IOError, OSError), err:
2994
sys.exit(u'ERROR: unable to save cookie jar')
2998
except DownloadError:
3000
except SameFileError:
3001
sys.exit(u'ERROR: fixed output name but more than one file to download')
3002
except KeyboardInterrupt:
3003
sys.exit(u'\nERROR: Interrupted by user')