~ubuntu-branches/ubuntu/maverick/youtube-dl/maverick-updates

« back to all changes in this revision

Viewing changes to .pc/01-prefer-open-formats.patch/youtube-dl

  • Committer: Package Import Robot
  • Author(s): Evan Broder
  • Date: 2012-01-11 15:59:23 UTC
  • mfrom: (21.1.8 sid)
  • Revision ID: package-import@ubuntu.com-20120111155923-w53vce5ov71bti3c
Tags: 2011.08.04-1~maverick0.1
Backport new upstream release to Maverick to fix changes in
Youtube. (LP: #915029)

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/env python
 
2
# -*- coding: utf-8 -*-
 
3
# Author: Ricardo Garcia Gonzalez
 
4
# Author: Danny Colligan
 
5
# Author: Benjamin Johnson
 
6
# Author: Vasyl' Vavrychuk
 
7
# Author: Witold Baryluk
 
8
# Author: Paweł Paprota
 
9
# Author: Gergely Imreh
 
10
# License: Public domain code
 
11
import cookielib
 
12
import ctypes
 
13
import datetime
 
14
import email.utils
 
15
import gzip
 
16
import htmlentitydefs
 
17
import httplib
 
18
import locale
 
19
import math
 
20
import netrc
 
21
import os
 
22
import os.path
 
23
import re
 
24
import socket
 
25
import string
 
26
import StringIO
 
27
import subprocess
 
28
import sys
 
29
import time
 
30
import urllib
 
31
import urllib2
 
32
import zlib
 
33
 
 
34
# parse_qs was moved from the cgi module to the urlparse module recently.
 
35
try:
 
36
        from urlparse import parse_qs
 
37
except ImportError:
 
38
        from cgi import parse_qs
 
39
 
 
40
# HTTP headers added to every request by YoutubeDLHandler.http_request().
# A browser-like User-Agent avoids being served degraded or alternate pages.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
 
47
 
 
48
# Characters considered safe for "simple" titles: ASCII letters and digits
# (str.decode('ascii') is Python 2 and yields a unicode string).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
 
49
 
 
50
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported encoding actually works: some broken
                # locales name encodings the codec machinery cannot use.
                u'TEST'.encode(pref)
        except Exception:
                # Fall back to a safe default rather than crash later on
                # every message we try to print.
                pref = 'UTF-8'
        return pref
 
65
 
 
66
def htmlentity_transform(matchobj):
 
67
        """Transforms an HTML entity to a Unicode character.
 
68
 
 
69
        This function receives a match object and is intended to be used with
 
70
        the re.sub() function.
 
71
        """
 
72
        entity = matchobj.group(1)
 
73
 
 
74
        # Known non-numeric HTML entity
 
75
        if entity in htmlentitydefs.name2codepoint:
 
76
                return unichr(htmlentitydefs.name2codepoint[entity])
 
77
 
 
78
        # Unicode character
 
79
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
 
80
        if mobj is not None:
 
81
                numstr = mobj.group(1)
 
82
                if numstr.startswith(u'x'):
 
83
                        base = 16
 
84
                        numstr = u'0%s' % numstr
 
85
                else:
 
86
                        base = 10
 
87
                return unichr(long(numstr, base))
 
88
 
 
89
        # Unknown entity in name, return its literal representation
 
90
        return (u'&%s;' % entity)
 
91
 
 
92
def sanitize_title(utitle):
 
93
        """Sanitizes a video title so it could be used as part of a filename."""
 
94
        utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 
95
        return utitle.replace(unicode(os.sep), u'%')
 
96
 
 
97
def sanitize_open(filename, open_mode):
 
98
        """Try to open the given filename, and slightly tweak it if this fails.
 
99
 
 
100
        Attempts to open the given filename. If this fails, it tries to change
 
101
        the filename slightly, step by step, until it's either able to open it
 
102
        or it fails and raises a final exception, like the standard open()
 
103
        function.
 
104
 
 
105
        It returns the tuple (stream, definitive_file_name).
 
106
        """
 
107
        try:
 
108
                if filename == u'-':
 
109
                        if sys.platform == 'win32':
 
110
                                import msvcrt
 
111
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 
112
                        return (sys.stdout, filename)
 
113
                stream = open(filename, open_mode)
 
114
                return (stream, filename)
 
115
        except (IOError, OSError), err:
 
116
                # In case of error, try to remove win32 forbidden chars
 
117
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 
118
 
 
119
                # An exception here should be caught in the caller
 
120
                stream = open(filename, open_mode)
 
121
                return (stream, filename)
 
122
 
 
123
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz understands the RFC 2822 date format including the
    # numeric timezone offset; it returns None on unparsable input.
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
 
130
 
 
131
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when they are not configured to
        ignore errors; it carries the appropriate error message.
        """
        pass
 
139
 
 
140
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when a fixed output template would
        force several downloads to write to the same file on disk.
        """
        pass
 
147
 
 
148
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to indicate an error in
        the postprocessing task.
        """
        pass
 
155
 
 
156
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not available
        for that video.
        """
        pass
 
163
 
 
164
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when the data actually downloaded
        is smaller than what the server announced first, indicating the
        connection was probably interrupted.
        """
        # Both sizes are in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Deliberately does not call Exception.__init__: callers read
                # the two size attributes directly to build the message.
                self.downloaded = downloaded
                self.expected = expected
 
178
 
 
179
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

          http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send a raw deflate stream (no zlib header),
                # which needs -MAX_WBITS; fall back to a regular zlib stream.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Build an addinfourl that carries the HTTP status code; old
                # Python versions lack the 4-argument constructor/getcode().
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force the std_headers values, replacing any same-named
                # headers already set on the request.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Sentinel header: drop compression support for this request
                # only. (urllib2 capitalizes header names, hence the casing.)
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                old_resp = resp
                # gzip: wrap the raw stream in a GzipFile, preserving the
                # original headers, URL, status code and message.
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate: decompress eagerly and serve from a StringIO.
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
 
236
 
 
237
class FileDownloader(object):
 
238
        """File Downloader class.
 
239
 
 
240
        File downloader objects are the ones responsible of downloading the
 
241
        actual video file and writing it to disk if the user has requested
 
242
        it, among some other tasks. In most cases there should be one per
 
243
        program. As, given a video URL, the downloader doesn't know how to
 
244
        extract all the needed information, task that InfoExtractors do, it
 
245
        has to pass the URL to one of them.
 
246
 
 
247
        For this, file downloader objects have a method that allows
 
248
        InfoExtractors to be registered in a given order. When it is passed
 
249
        a URL, the file downloader handles it to the first InfoExtractor it
 
250
        finds that reports being able to handle it. The InfoExtractor extracts
 
251
        all the information about the video or videos the URL refers to, and
 
252
        asks the FileDownloader to process the video information, possibly
 
253
        downloading the video.
 
254
 
 
255
        File downloaders accept a lot of parameters. In order not to saturate
 
256
        the object constructor with arguments, it receives a dictionary of
 
257
        options instead. These options are available through the params
 
258
        attribute for the InfoExtractors to use. The FileDownloader also
 
259
        registers itself as the downloader in charge for the InfoExtractors
 
260
        that are added to it, so this is a "mutual registration".
 
261
 
 
262
        Available options:
 
263
 
 
264
        username:         Username for authentication purposes.
 
265
        password:         Password for authentication purposes.
 
266
        usenetrc:         Use netrc for authentication instead.
 
267
        quiet:            Do not print messages to stdout.
 
268
        forceurl:         Force printing final URL.
 
269
        forcetitle:       Force printing title.
 
270
        forcethumbnail:   Force printing thumbnail URL.
 
271
        forcedescription: Force printing description.
 
272
        forcefilename:    Force printing final filename.
 
273
        simulate:         Do not download the video files.
 
274
        format:           Video format code.
 
275
        format_limit:     Highest quality format to try.
 
276
        outtmpl:          Template for output names.
 
277
        ignoreerrors:     Do not stop on download errors.
 
278
        ratelimit:        Download speed limit, in bytes/sec.
 
279
        nooverwrites:     Prevent overwriting files.
 
280
        retries:          Number of times to retry for HTTP error 5xx
 
281
        continuedl:       Try to continue downloads if possible.
 
282
        noprogress:       Do not print the progress bar.
 
283
        playliststart:    Playlist item to start at.
 
284
        playlistend:      Playlist item to end at.
 
285
        logtostderr:      Log messages to stderr instead of stdout.
 
286
        consoletitle:     Display progress in console window's titlebar.
 
287
        nopart:           Do not use temporary .part files.
 
288
        updatetime:       Use the Last-modified header to set output file timestamps.
 
289
        """
 
290
 
 
291
        params = None
 
292
        _ies = []
 
293
        _pps = []
 
294
        _download_retcode = None
 
295
        _num_downloads = None
 
296
        _screen_file = None
 
297
 
 
298
        def __init__(self, params):
 
299
                """Create a FileDownloader object with the given options."""
 
300
                self._ies = []
 
301
                self._pps = []
 
302
                self._download_retcode = 0
 
303
                self._num_downloads = 0
 
304
                self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 
305
                self.params = params
 
306
 
 
307
        @staticmethod
 
308
        def pmkdir(filename):
 
309
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
 
310
                components = filename.split(os.sep)
 
311
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 
312
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 
313
                for dir in aggregate:
 
314
                        if not os.path.exists(dir):
 
315
                                os.mkdir(dir)
 
316
 
 
317
        @staticmethod
 
318
        def format_bytes(bytes):
 
319
                if bytes is None:
 
320
                        return 'N/A'
 
321
                if type(bytes) is str:
 
322
                        bytes = float(bytes)
 
323
                if bytes == 0.0:
 
324
                        exponent = 0
 
325
                else:
 
326
                        exponent = long(math.log(bytes, 1024.0))
 
327
                suffix = 'bkMGTPEZY'[exponent]
 
328
                converted = float(bytes) / float(1024**exponent)
 
329
                return '%.2f%s' % (converted, suffix)
 
330
 
 
331
        @staticmethod
 
332
        def calc_percent(byte_counter, data_len):
 
333
                if data_len is None:
 
334
                        return '---.-%'
 
335
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 
336
 
 
337
        @staticmethod
 
338
        def calc_eta(start, now, total, current):
 
339
                if total is None:
 
340
                        return '--:--'
 
341
                dif = now - start
 
342
                if current == 0 or dif < 0.001: # One millisecond
 
343
                        return '--:--'
 
344
                rate = float(current) / dif
 
345
                eta = long((float(total) - float(current)) / rate)
 
346
                (eta_mins, eta_secs) = divmod(eta, 60)
 
347
                if eta_mins > 99:
 
348
                        return '--:--'
 
349
                return '%02d:%02d' % (eta_mins, eta_secs)
 
350
 
 
351
        @staticmethod
 
352
        def calc_speed(start, now, bytes):
 
353
                dif = now - start
 
354
                if bytes == 0 or dif < 0.001: # One millisecond
 
355
                        return '%10s' % '---b/s'
 
356
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 
357
 
 
358
        @staticmethod
 
359
        def best_block_size(elapsed_time, bytes):
 
360
                new_min = max(bytes / 2.0, 1.0)
 
361
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 
362
                if elapsed_time < 0.001:
 
363
                        return long(new_max)
 
364
                rate = bytes / elapsed_time
 
365
                if rate > new_max:
 
366
                        return long(new_max)
 
367
                if rate < new_min:
 
368
                        return long(new_min)
 
369
                return long(rate)
 
370
 
 
371
        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                # Accepts an optional decimal part and an optional 1024-based
                # suffix, case-insensitively (e.g. '50k', '0.5M').
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # An absent suffix is '' whose index is 0, i.e. 1024**0 == 1.
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))
 
380
 
 
381
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration: give the IE a back-reference to this
                # downloader so it can report extracted info and errors.
                ie.set_downloader(self)
 
385
 
 
386
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                # Mutual registration, same as for InfoExtractors.
                pp.set_downloader(self)
 
390
 
 
391
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # skip_eol suppresses the newline so progress
                                # lines can be redrawn in place with '\r'.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        # Callers printing purely informational text may opt to
                        # swallow terminal-encoding failures.
                        if not ignore_encoding_errors:
                                raise
 
401
 
 
402
        def to_stderr(self, message):
                """Print message to stderr."""
                # Encode explicitly: stderr may not accept unicode directly.
                print >>sys.stderr, message.encode(preferredencoding())
 
405
 
 
406
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-compatible escape: OSC 0 sets the window title.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 
416
 
 
417
        def fixed_template(self):
 
418
                """Checks if the output template is fixed."""
 
419
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 
420
 
 
421
        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # Errors are being ignored: remember the failure in the
                # eventual exit code and carry on.
                self._download_retcode = 1
 
433
 
 
434
        def slow_down(self, start_time, byte_counter):
 
435
                """Sleep if the download speed is over the rate limit."""
 
436
                rate_limit = self.params.get('ratelimit', None)
 
437
                if rate_limit is None or byte_counter == 0:
 
438
                        return
 
439
                now = time.time()
 
440
                elapsed = now - start_time
 
441
                if elapsed <= 0.0:
 
442
                        return
 
443
                speed = float(byte_counter) / elapsed
 
444
                if speed > rate_limit:
 
445
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 
446
 
 
447
        def temp_name(self, filename):
 
448
                """Returns a temporary filename for the given filename."""
 
449
                if self.params.get('nopart', False) or filename == u'-' or \
 
450
                                (os.path.exists(filename) and not os.path.isfile(filename)):
 
451
                        return filename
 
452
                return filename + u'.part'
 
453
 
 
454
        def undo_temp_name(self, filename):
 
455
                if filename.endswith(u'.part'):
 
456
                        return filename[:-len(u'.part')]
 
457
                return filename
 
458
 
 
459
        def try_rename(self, old_filename, new_filename):
 
460
                try:
 
461
                        if old_filename == new_filename:
 
462
                                return
 
463
                        os.rename(old_filename, new_filename)
 
464
                except (IOError, OSError), err:
 
465
                        self.trouble(u'ERROR: unable to rename file')
 
466
        
 
467
        def try_utime(self, filename, last_modified_hdr):
 
468
                """Try to set the last-modified time of the given file."""
 
469
                if last_modified_hdr is None:
 
470
                        return
 
471
                if not os.path.isfile(filename):
 
472
                        return
 
473
                timestr = last_modified_hdr
 
474
                if timestr is None:
 
475
                        return
 
476
                filetime = timeconvert(timestr)
 
477
                if filetime is None:
 
478
                        return
 
479
                try:
 
480
                        os.utime(filename,(time.time(), filetime))
 
481
                except:
 
482
                        pass
 
483
 
 
484
        def report_destination(self, filename):
                """Report destination filename."""
                # ignore_encoding_errors: never abort a download just because
                # the filename cannot be rendered on this terminal.
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 
487
 
 
488
        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 
489
                """Report download progress."""
 
490
                if self.params.get('noprogress', False):
 
491
                        return
 
492
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 
493
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 
494
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 
495
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 
496
 
 
497
        def report_resuming_byte(self, resume_len):
 
498
                """Report attempt to resume at given byte."""
 
499
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 
500
 
 
501
        def report_retry(self, count, retries):
 
502
                """Report retry in case of HTTP error 5xx"""
 
503
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 
504
 
 
505
        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Fall back to a message without the (unprintable) name.
                        self.to_screen(u'[download] The file has already been downloaded')
 
511
 
 
512
        def report_unable_to_resume(self):
 
513
                """Report it was impossible to resume download."""
 
514
                self.to_screen(u'[download] Unable to resume')
 
515
 
 
516
        def report_finish(self):
 
517
                """Report download finished."""
 
518
                if self.params.get('noprogress', False):
 
519
                        self.to_screen(u'[download] Download completed')
 
520
                else:
 
521
                        self.to_screen(u'')
 
522
 
 
523
        def increment_downloads(self):
 
524
                """Increment the ordinal that assigns a number to each file."""
 
525
                self._num_downloads += 1
 
526
 
 
527
        def prepare_filename(self, info_dict):
 
528
                """Generate the output filename."""
 
529
                try:
 
530
                        template_dict = dict(info_dict)
 
531
                        template_dict['epoch'] = unicode(long(time.time()))
 
532
                        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 
533
                        filename = self.params['outtmpl'] % template_dict
 
534
                        return filename
 
535
                except (ValueError, KeyError), err:
 
536
                        self.trouble(u'ERROR: invalid system charset or erroneous output template')
 
537
                        return None
 
538
 
 
539
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # The filename is computed even in simulate mode so that
                # --get-filename works.
                filename = self.prepare_filename(info_dict)
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcefilename', False) and filename is not None:
                                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                # prepare_filename() already reported the template problem.
                if filename is None:
                        return
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        # Local I/O failure: report the video as unavailable.
                        raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                return
 
587
 
 
588
        def download(self, url_list):
 
589
                """Download a given list of URLs."""
 
590
                if len(url_list) > 1 and self.fixed_template():
 
591
                        raise SameFileError(self.params['outtmpl'])
 
592
 
 
593
                for url in url_list:
 
594
                        suitable_found = False
 
595
                        for ie in self._ies:
 
596
                                # Go to next InfoExtractor if not suitable
 
597
                                if not ie.suitable(url):
 
598
                                        continue
 
599
 
 
600
                                # Suitable InfoExtractor found
 
601
                                suitable_found = True
 
602
 
 
603
                                # Extract information from URL and process it
 
604
                                ie.extract(url)
 
605
 
 
606
                                # Suitable InfoExtractor had been found; go to next URL
 
607
                                break
 
608
 
 
609
                        if not suitable_found:
 
610
                                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 
611
 
 
612
                return self._download_retcode
 
613
 
 
614
        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                # Work on a copy so the caller's info dict is not mutated.
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        # Each processor receives the dict produced by the
                        # previous one; returning None stops the chain.
                        info = pp.run(info)
                        if info is None:
                                break
 
622
 
 
623
        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an RTMP stream by shelling out to rtmpdump.

                Returns True on success, False otherwise.
                """
                self.report_destination(filename)
                tmpfilename = self.temp_name(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrupted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(tmpfilename)
                        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        # Retry with resume (-e); after exit code 1, also pass
                        # '-k 1' (skip keyframes).
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(tmpfilename)
                        if prevsize == cursize and retval == 1:
                                # No progress since the last attempt: give up.
                                break
                if retval == 0:
                        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
                        self.try_rename(tmpfilename, filename)
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False
 
654
 
 
655
        def _do_download(self, filename, url, player_url):
                """Download url to filename over HTTP (or rtmp via rtmpdump).

                Supports resuming a partial .part file, a configurable number of
                retries, and a progress display. Returns True on success, False
                on a reported failure; raises ContentTooShortError when the
                received byte count disagrees with the server's Content-length.
                """
                # Check file already present
                if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
                        self.report_file_already_downloaded(filename)
                        return True

                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url, player_url)

                tmpfilename = self.temp_name(filename)
                stream = None          # opened lazily, on the first received block
                open_mode = 'wb'

                # Do not include the Accept-Encoding header: this custom marker
                # header is presumably stripped by an opener installed elsewhere
                # in the file — TODO confirm against the opener setup.
                headers = {'Youtubedl-no-compression': 'True'}
                # basic_request is kept without the Range header so the full
                # length can be re-queried if a resume attempt fails with 416.
                basic_request = urllib2.Request(url, None, headers)
                request = urllib2.Request(url, None, headers)

                # Establish possible resume length
                if os.path.isfile(tmpfilename):
                        resume_len = os.path.getsize(tmpfilename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range','bytes=%d-' % resume_len)
                        open_mode = 'ab'

                count = 0
                retries = self.params.get('retries', 0)
                while count <= retries:
                        # Establish connection
                        try:
                                data = urllib2.urlopen(request)
                                break
                        except (urllib2.HTTPError, ), err:
                                if (err.code < 500 or err.code >= 600) and err.code != 416:
                                        # Unexpected HTTP error: only 5xx and 416 are retried/handled.
                                        raise
                                elif err.code == 416:
                                        # Unable to resume (requested range not satisfiable)
                                        try:
                                                # Open the connection again without the range header
                                                data = urllib2.urlopen(basic_request)
                                                content_length = data.info()['Content-Length']
                                        except (urllib2.HTTPError, ), err:
                                                if err.code < 500 or err.code >= 600:
                                                        raise
                                                # NOTE(review): a 5xx here falls through to the
                                                # retry counter below rather than re-raising.
                                        else:
                                                # Examine the reported length
                                                if (content_length is not None and
                                                    (resume_len - 100 < long(content_length) < resume_len + 100)):
                                                        # The file had already been fully downloaded.
                                                        # Explanation to the above condition: in issue #175 it was revealed that
                                                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                                                        # changing the file size slightly and causing problems for some users. So
                                                        # I decided to implement a suggested change and consider the file
                                                        # completely downloaded if the file size differs less than 100 bytes from
                                                        # the one in the hard drive.
                                                        self.report_file_already_downloaded(filename)
                                                        self.try_rename(tmpfilename, filename)
                                                        return True
                                                else:
                                                        # The length does not match, we start the download over
                                                        self.report_unable_to_resume()
                                                        open_mode = 'wb'
                                                        break
                        # Retry
                        count += 1
                        if count <= retries:
                                self.report_retry(count, retries)

                if count > retries:
                        self.trouble(u'ERROR: giving up after %s retries' % retries)
                        return False

                data_len = data.info().get('Content-length', None)
                if data_len is not None:
                        # Server reports the remaining length; add what we already have.
                        data_len = long(data_len) + resume_len
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0 + resume_len
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        if len(data_block) == 0:
                                break
                        byte_counter += len(data_block)

                        # Open file just in time, so that an empty response never
                        # creates (or truncates) the destination file.
                        if stream is None:
                                try:
                                        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                                        filename = self.undo_temp_name(tmpfilename)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        try:
                                stream.write(data_block)
                        except (IOError, OSError), err:
                                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                                return False
                        # Adapt the block size to the observed throughput.
                        block_size = self.best_block_size(after - before, len(data_block))

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
                        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter - resume_len)

                stream.close()
                self.report_finish()
                if data_len is not None and byte_counter != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                self.try_rename(tmpfilename, filename)

                # Update file modification time
                if self.params.get('updatetime', True):
                        self.try_utime(filename, data.info().get('last-modified', None))

                return True
 
786
 
 
787
class InfoExtractor(object):
        """Base class for all information extractors.

        An information extractor takes a URL and extracts information about
        the video (or videos) it refers to: the real video URL, the literal
        and simplified titles, the uploader, and so on. The result is a
        dictionary handed to the FileDownloader, which may then download the
        video to the file system, among other outcomes. Each dictionary must
        include these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        Optional fields, used only when their respective forced printing
        functions are called (e.g. when youtube-dl serves as the backend for
        a video search function such as the one in youtube2mp3):

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should redefine _real_initialize() and _real_extract(),
        as well as the suitable() static method, and are typically
        instantiated and registered with the main downloader.
        """

        # _ready tracks whether lazy initialization has already run;
        # _downloader is the FileDownloader this extractor reports to.
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Return True if this extractor can handle the given URL."""
                return False

        def set_downloader(self, downloader):
                """Attach the downloader this extractor should report to."""
                self._downloader = downloader

        def initialize(self):
                """Run one-time setup (authentication, etc.) at most once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then extract information for url."""
                self.initialize()
                return self._real_extract(url)

        def _real_initialize(self):
                """Actual initialization; subclasses override this."""
                pass

        def _real_extract(self, url):
                """Actual extraction; subclasses override this."""
                pass
 
857
 
 
858
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # Group 1 matches the site prefix (optional for bare video ids),
        # group 2 is the 11-character-style video id; the (?(1).+)? tail only
        # allows trailing junk when a prefix was matched.
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality (itag codes, best first)
        _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
        # Maps itag codes to file extensions; anything absent defaults to 'flv'.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '45': 'webm',
        }

        @staticmethod
        def suitable(url):
                """Return True if url looks like a YouTube video URL."""
                return (re.match(YoutubeIE._VALID_URL, url) is not None)

        def report_lang(self):
                """Report attempt to set language."""
                self._downloader.to_screen(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self._downloader.to_screen(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[youtube] Confirming age')

        def report_video_webpage_download(self, video_id):
                """Report attempt to download video webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_video_info_webpage_download(self, video_id):
                """Report attempt to download video info webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

        def report_unavailable_format(self, video_id, format):
                """Report that the requested format is not available."""
                self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

        def report_rtmp_download(self):
                """Indicate the download will use the RTMP protocol."""
                self._downloader.to_screen(u'[youtube] RTMP download detected')

        def _real_initialize(self):
                """Set the site language and, when credentials are available,
                log in and confirm age. All failures before login are reported
                as warnings and abort initialization without raising."""
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # The login form reappearing in the response means the
                        # credentials were rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                """Extract video information for a YouTube URL and hand each
                selected format to the downloader via process_info(). Errors
                are reported through self._downloader.trouble()."""
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group(2)

                # Get video webpage
                self.report_video_webpage_download(video_id)
                request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
                try:
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                        return

                # Attempt to extract SWF player URL (the URL appears
                # backslash-escaped inside the page's JS config).
                mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
                if mobj is not None:
                        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
                else:
                        player_url = None

                # Get video info: try several 'el' variants until one of them
                # yields a response containing a token.
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                           % (video_id, el_type))
                        request = urllib2.Request(video_info_url)
                        try:
                                video_info_webpage = urllib2.urlopen(request).read()
                                video_info = parse_qs(video_info_webpage)
                                if 'token' in video_info:
                                        break
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                                return
                if 'token' not in video_info:
                        if 'reason' in video_info:
                                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                        else:
                                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
                        return

                # Start extracting information
                self.report_information_extraction(video_id)

                # uploader
                if 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = urllib.unquote_plus(video_info['author'][0])

                # title
                if 'title' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return
                video_title = urllib.unquote_plus(video_info['title'][0])
                video_title = video_title.decode('utf-8')
                video_title = sanitize_title(video_title)

                # simplified title: collapse every run of non-alphanumeric
                # characters to a single underscore
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
                simple_title = simple_title.strip(ur'_')

                # thumbnail image
                if 'thumbnail_url' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
                        video_thumbnail = ''
                else:   # don't panic if we can't find it
                        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

                # upload date: scraped from the watch page and normalized to
                # YYYYMMDD when one of the known date formats matches
                upload_date = u'NA'
                mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
                if mobj is not None:
                        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
                        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
                        for expression in format_expressions:
                                try:
                                        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                                except:
                                        # NOTE(review): bare except; ideally narrowed to ValueError
                                        pass

                # description (only fetched when --get-description was given)
                video_description = 'No description available.'
                if self._downloader.params.get('forcedescription', False):
                        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                        if mobj is not None:
                                video_description = mobj.group(1)

                # token
                # NOTE(review): video_token is not used below in this method.
                video_token = urllib.unquote_plus(video_info['token'][0])

                # Decide which formats to download
                req_format = self._downloader.params.get('format', None)

                if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
                        # Comma-separated list of &-joined key=value descriptors,
                        # one per available format.
                        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
                        url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
                        url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
                        # --max-quality caps the candidate list at format_limit.
                        format_limit = self._downloader.params.get('format_limit', None)
                        if format_limit is not None and format_limit in self._available_formats:
                                format_list = self._available_formats[self._available_formats.index(format_limit):]
                        else:
                                format_list = self._available_formats
                        existing_formats = [x for x in format_list if x in url_map]
                        if len(existing_formats) == 0:
                                self._downloader.trouble(u'ERROR: no known formats available for video')
                                return
                        if req_format is None:
                                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
                        elif req_format == '-1':
                                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                        else:
                                # Specific format
                                if req_format not in url_map:
                                        self._downloader.trouble(u'ERROR: requested format not available')
                                        return
                                video_url_list = [(req_format, url_map[req_format])] # Specific format

                elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                        # RTMP stream: no format selection possible.
                        self.report_rtmp_download()
                        video_url_list = [(None, video_info['conn'][0])]

                else:
                        self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
                        return

                for format_param, video_real_url in video_url_list:
                        # At this point we have a new video
                        self._downloader.increment_downloads()

                        # Extension
                        video_extension = self._video_extensions.get(format_param, 'flv')

                        # Find the video URL in fmt_url_map or conn parameters
                        try:
                                # Process video information
                                self._downloader.process_info({
                                        'id':           video_id.decode('utf-8'),
                                        'url':          video_real_url.decode('utf-8'),
                                        'uploader':     video_uploader.decode('utf-8'),
                                        'upload_date':  upload_date,
                                        'title':        video_title,
                                        'stitle':       simple_title,
                                        'ext':          video_extension.decode('utf-8'),
                                        'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                                        'thumbnail':    video_thumbnail.decode('utf-8'),
                                        'description':  video_description.decode('utf-8'),
                                        'player_url':   player_url,
                                })
                        except UnavailableVideoError, err:
                                self._downloader.trouble(u'\nERROR: unable to download video')
 
1139
 
 
1140
 
 
1141
class MetacafeIE(InfoExtractor):
 
1142
        """Information Extractor for metacafe.com."""
 
1143
 
 
1144
        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 
1145
        _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 
1146
        _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 
1147
        _youtube_ie = None
 
1148
 
 
1149
        def __init__(self, youtube_ie, downloader=None):
 
1150
                InfoExtractor.__init__(self, downloader)
 
1151
                self._youtube_ie = youtube_ie
 
1152
 
 
1153
        @staticmethod
 
1154
        def suitable(url):
 
1155
                return (re.match(MetacafeIE._VALID_URL, url) is not None)
 
1156
 
 
1157
        def report_disclaimer(self):
 
1158
                """Report disclaimer retrieval."""
 
1159
                self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 
1160
 
 
1161
        def report_age_confirmation(self):
 
1162
                """Report attempt to confirm age."""
 
1163
                self._downloader.to_screen(u'[metacafe] Confirming age')
 
1164
 
 
1165
        def report_download_webpage(self, video_id):
 
1166
                """Report webpage download."""
 
1167
                self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 
1168
 
 
1169
        def report_extraction(self, video_id):
 
1170
                """Report information extraction."""
 
1171
                self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 
1172
 
 
1173
        def _real_initialize(self):
 
1174
                # Retrieve disclaimer
 
1175
                request = urllib2.Request(self._DISCLAIMER)
 
1176
                try:
 
1177
                        self.report_disclaimer()
 
1178
                        disclaimer = urllib2.urlopen(request).read()
 
1179
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1180
                        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 
1181
                        return
 
1182
 
 
1183
                # Confirm age
 
1184
                disclaimer_form = {
 
1185
                        'filters': '0',
 
1186
                        'submit': "Continue - I'm over 18",
 
1187
                        }
 
1188
                request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 
1189
                try:
 
1190
                        self.report_age_confirmation()
 
1191
                        disclaimer = urllib2.urlopen(request).read()
 
1192
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1193
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 
1194
                        return
 
1195
 
 
1196
        def _real_extract(self, url):
 
1197
                # Extract id and simplified title from URL
 
1198
                mobj = re.match(self._VALID_URL, url)
 
1199
                if mobj is None:
 
1200
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 
1201
                        return
 
1202
 
 
1203
                video_id = mobj.group(1)
 
1204
 
 
1205
                # Check if video comes from YouTube
 
1206
                mobj2 = re.match(r'^yt-(.*)$', video_id)
 
1207
                if mobj2 is not None:
 
1208
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 
1209
                        return
 
1210
 
 
1211
                # At this point we have a new video
 
1212
                self._downloader.increment_downloads()
 
1213
 
 
1214
                simple_title = mobj.group(2).decode('utf-8')
 
1215
 
 
1216
                # Retrieve video webpage to extract further information
 
1217
                request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 
1218
                try:
 
1219
                        self.report_download_webpage(video_id)
 
1220
                        webpage = urllib2.urlopen(request).read()
 
1221
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1222
                        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 
1223
                        return
 
1224
 
 
1225
                # Extract URL, uploader and title from webpage
 
1226
                self.report_extraction(video_id)
 
1227
                mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 
1228
                if mobj is not None:
 
1229
                        mediaURL = urllib.unquote(mobj.group(1))
 
1230
                        video_extension = mediaURL[-3:]
 
1231
 
 
1232
                        # Extract gdaKey if available
 
1233
                        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 
1234
                        if mobj is None:
 
1235
                                video_url = mediaURL
 
1236
                        else:
 
1237
                                gdaKey = mobj.group(1)
 
1238
                                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 
1239
                else:
 
1240
                        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 
1241
                        if mobj is None:
 
1242
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1243
                                return
 
1244
                        vardict = parse_qs(mobj.group(1))
 
1245
                        if 'mediaData' not in vardict:
 
1246
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1247
                                return
 
1248
                        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 
1249
                        if mobj is None:
 
1250
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1251
                                return
 
1252
                        mediaURL = mobj.group(1).replace('\\/', '/')
 
1253
                        video_extension = mediaURL[-3:]
 
1254
                        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 
1255
 
 
1256
                mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 
1257
                if mobj is None:
 
1258
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1259
                        return
 
1260
                video_title = mobj.group(1).decode('utf-8')
 
1261
                video_title = sanitize_title(video_title)
 
1262
 
 
1263
                mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 
1264
                if mobj is None:
 
1265
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 
1266
                        return
 
1267
                video_uploader = mobj.group(1)
 
1268
 
 
1269
                try:
 
1270
                        # Process video information
 
1271
                        self._downloader.process_info({
 
1272
                                'id':           video_id.decode('utf-8'),
 
1273
                                'url':          video_url.decode('utf-8'),
 
1274
                                'uploader':     video_uploader.decode('utf-8'),
 
1275
                                'upload_date':  u'NA',
 
1276
                                'title':        video_title,
 
1277
                                'stitle':       simple_title,
 
1278
                                'ext':          video_extension.decode('utf-8'),
 
1279
                                'format':       u'NA',
 
1280
                                'player_url':   None,
 
1281
                        })
 
1282
                except UnavailableVideoError:
 
1283
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1284
 
 
1285
 
 
1286
class DailymotionIE(InfoExtractor):
 
1287
        """Information Extractor for Dailymotion"""
 
1288
 
 
1289
        _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
 
1290
 
 
1291
        def __init__(self, downloader=None):
 
1292
                InfoExtractor.__init__(self, downloader)
 
1293
 
 
1294
        @staticmethod
 
1295
        def suitable(url):
 
1296
                return (re.match(DailymotionIE._VALID_URL, url) is not None)
 
1297
 
 
1298
        def report_download_webpage(self, video_id):
 
1299
                """Report webpage download."""
 
1300
                self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 
1301
 
 
1302
        def report_extraction(self, video_id):
 
1303
                """Report information extraction."""
 
1304
                self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 
1305
 
 
1306
        def _real_initialize(self):
 
1307
                return
 
1308
 
 
1309
        def _real_extract(self, url):
 
1310
                # Extract id and simplified title from URL
 
1311
                mobj = re.match(self._VALID_URL, url)
 
1312
                if mobj is None:
 
1313
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 
1314
                        return
 
1315
 
 
1316
                # At this point we have a new video
 
1317
                self._downloader.increment_downloads()
 
1318
                video_id = mobj.group(1)
 
1319
 
 
1320
                simple_title = mobj.group(2).decode('utf-8')
 
1321
                video_extension = 'flv'
 
1322
 
 
1323
                # Retrieve video webpage to extract further information
 
1324
                request = urllib2.Request(url)
 
1325
                try:
 
1326
                        self.report_download_webpage(video_id)
 
1327
                        webpage = urllib2.urlopen(request).read()
 
1328
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1329
                        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 
1330
                        return
 
1331
 
 
1332
                # Extract URL, uploader and title from webpage
 
1333
                self.report_extraction(video_id)
 
1334
                mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
 
1335
                if mobj is None:
 
1336
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1337
                        return
 
1338
                mediaURL = urllib.unquote(mobj.group(1))
 
1339
 
 
1340
                # if needed add http://www.dailymotion.com/ if relative URL
 
1341
 
 
1342
                video_url = mediaURL
 
1343
 
 
1344
                # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
 
1345
                mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
 
1346
                if mobj is None:
 
1347
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1348
                        return
 
1349
                video_title = mobj.group(1).decode('utf-8')
 
1350
                video_title = sanitize_title(video_title)
 
1351
 
 
1352
                mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
 
1353
                if mobj is None:
 
1354
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 
1355
                        return
 
1356
                video_uploader = mobj.group(1)
 
1357
 
 
1358
                try:
 
1359
                        # Process video information
 
1360
                        self._downloader.process_info({
 
1361
                                'id':           video_id.decode('utf-8'),
 
1362
                                'url':          video_url.decode('utf-8'),
 
1363
                                'uploader':     video_uploader.decode('utf-8'),
 
1364
                                'upload_date':  u'NA',
 
1365
                                'title':        video_title,
 
1366
                                'stitle':       simple_title,
 
1367
                                'ext':          video_extension.decode('utf-8'),
 
1368
                                'format':       u'NA',
 
1369
                                'player_url':   None,
 
1370
                        })
 
1371
                except UnavailableVideoError:
 
1372
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1373
 
 
1374
class GoogleIE(InfoExtractor):
 
1375
        """Information extractor for video.google.com."""
 
1376
 
 
1377
        _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 
1378
 
 
1379
        def __init__(self, downloader=None):
 
1380
                InfoExtractor.__init__(self, downloader)
 
1381
 
 
1382
        @staticmethod
 
1383
        def suitable(url):
 
1384
                return (re.match(GoogleIE._VALID_URL, url) is not None)
 
1385
 
 
1386
        def report_download_webpage(self, video_id):
 
1387
                """Report webpage download."""
 
1388
                self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 
1389
 
 
1390
        def report_extraction(self, video_id):
 
1391
                """Report information extraction."""
 
1392
                self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 
1393
 
 
1394
        def _real_initialize(self):
 
1395
                return
 
1396
 
 
1397
        def _real_extract(self, url):
 
1398
                # Extract id from URL
 
1399
                mobj = re.match(self._VALID_URL, url)
 
1400
                if mobj is None:
 
1401
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1402
                        return
 
1403
 
 
1404
                # At this point we have a new video
 
1405
                self._downloader.increment_downloads()
 
1406
                video_id = mobj.group(1)
 
1407
 
 
1408
                video_extension = 'mp4'
 
1409
 
 
1410
                # Retrieve video webpage to extract further information
 
1411
                request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 
1412
                try:
 
1413
                        self.report_download_webpage(video_id)
 
1414
                        webpage = urllib2.urlopen(request).read()
 
1415
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1416
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1417
                        return
 
1418
 
 
1419
                # Extract URL, uploader, and title from webpage
 
1420
                self.report_extraction(video_id)
 
1421
                mobj = re.search(r"download_url:'([^']+)'", webpage)
 
1422
                if mobj is None:
 
1423
                        video_extension = 'flv'
 
1424
                        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 
1425
                if mobj is None:
 
1426
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1427
                        return
 
1428
                mediaURL = urllib.unquote(mobj.group(1))
 
1429
                mediaURL = mediaURL.replace('\\x3d', '\x3d')
 
1430
                mediaURL = mediaURL.replace('\\x26', '\x26')
 
1431
 
 
1432
                video_url = mediaURL
 
1433
 
 
1434
                mobj = re.search(r'<title>(.*)</title>', webpage)
 
1435
                if mobj is None:
 
1436
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1437
                        return
 
1438
                video_title = mobj.group(1).decode('utf-8')
 
1439
                video_title = sanitize_title(video_title)
 
1440
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 
1441
 
 
1442
                # Extract video description
 
1443
                mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 
1444
                if mobj is None:
 
1445
                        self._downloader.trouble(u'ERROR: unable to extract video description')
 
1446
                        return
 
1447
                video_description = mobj.group(1).decode('utf-8')
 
1448
                if not video_description:
 
1449
                        video_description = 'No description available.'
 
1450
 
 
1451
                # Extract video thumbnail
 
1452
                if self._downloader.params.get('forcethumbnail', False):
 
1453
                        request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 
1454
                        try:
 
1455
                                webpage = urllib2.urlopen(request).read()
 
1456
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1457
                                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1458
                                return
 
1459
                        mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 
1460
                        if mobj is None:
 
1461
                                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 
1462
                                return
 
1463
                        video_thumbnail = mobj.group(1)
 
1464
                else:   # we need something to pass to process_info
 
1465
                        video_thumbnail = ''
 
1466
 
 
1467
 
 
1468
                try:
 
1469
                        # Process video information
 
1470
                        self._downloader.process_info({
 
1471
                                'id':           video_id.decode('utf-8'),
 
1472
                                'url':          video_url.decode('utf-8'),
 
1473
                                'uploader':     u'NA',
 
1474
                                'upload_date':  u'NA',
 
1475
                                'title':        video_title,
 
1476
                                'stitle':       simple_title,
 
1477
                                'ext':          video_extension.decode('utf-8'),
 
1478
                                'format':       u'NA',
 
1479
                                'player_url':   None,
 
1480
                        })
 
1481
                except UnavailableVideoError:
 
1482
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1483
 
 
1484
 
 
1485
class PhotobucketIE(InfoExtractor):
 
1486
        """Information extractor for photobucket.com."""
 
1487
 
 
1488
        _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 
1489
 
 
1490
        def __init__(self, downloader=None):
 
1491
                InfoExtractor.__init__(self, downloader)
 
1492
 
 
1493
        @staticmethod
 
1494
        def suitable(url):
 
1495
                return (re.match(PhotobucketIE._VALID_URL, url) is not None)
 
1496
 
 
1497
        def report_download_webpage(self, video_id):
 
1498
                """Report webpage download."""
 
1499
                self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 
1500
 
 
1501
        def report_extraction(self, video_id):
 
1502
                """Report information extraction."""
 
1503
                self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 
1504
 
 
1505
        def _real_initialize(self):
 
1506
                return
 
1507
 
 
1508
        def _real_extract(self, url):
 
1509
                # Extract id from URL
 
1510
                mobj = re.match(self._VALID_URL, url)
 
1511
                if mobj is None:
 
1512
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1513
                        return
 
1514
 
 
1515
                # At this point we have a new video
 
1516
                self._downloader.increment_downloads()
 
1517
                video_id = mobj.group(1)
 
1518
 
 
1519
                video_extension = 'flv'
 
1520
 
 
1521
                # Retrieve video webpage to extract further information
 
1522
                request = urllib2.Request(url)
 
1523
                try:
 
1524
                        self.report_download_webpage(video_id)
 
1525
                        webpage = urllib2.urlopen(request).read()
 
1526
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1527
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1528
                        return
 
1529
 
 
1530
                # Extract URL, uploader, and title from webpage
 
1531
                self.report_extraction(video_id)
 
1532
                mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 
1533
                if mobj is None:
 
1534
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
 
1535
                        return
 
1536
                mediaURL = urllib.unquote(mobj.group(1))
 
1537
 
 
1538
                video_url = mediaURL
 
1539
 
 
1540
                mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 
1541
                if mobj is None:
 
1542
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1543
                        return
 
1544
                video_title = mobj.group(1).decode('utf-8')
 
1545
                video_title = sanitize_title(video_title)
 
1546
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 
1547
 
 
1548
                video_uploader = mobj.group(2).decode('utf-8')
 
1549
 
 
1550
                try:
 
1551
                        # Process video information
 
1552
                        self._downloader.process_info({
 
1553
                                'id':           video_id.decode('utf-8'),
 
1554
                                'url':          video_url.decode('utf-8'),
 
1555
                                'uploader':     video_uploader,
 
1556
                                'upload_date':  u'NA',
 
1557
                                'title':        video_title,
 
1558
                                'stitle':       simple_title,
 
1559
                                'ext':          video_extension.decode('utf-8'),
 
1560
                                'format':       u'NA',
 
1561
                                'player_url':   None,
 
1562
                        })
 
1563
                except UnavailableVideoError:
 
1564
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1565
 
 
1566
 
 
1567
class YahooIE(InfoExtractor):
 
1568
        """Information extractor for video.yahoo.com."""
 
1569
 
 
1570
        # _VALID_URL matches all Yahoo! Video URLs
 
1571
        # _VPAGE_URL matches only the extractable '/watch/' URLs
 
1572
        _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 
1573
        _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 
1574
 
 
1575
        def __init__(self, downloader=None):
 
1576
                InfoExtractor.__init__(self, downloader)
 
1577
 
 
1578
        @staticmethod
 
1579
        def suitable(url):
 
1580
                return (re.match(YahooIE._VALID_URL, url) is not None)
 
1581
 
 
1582
        def report_download_webpage(self, video_id):
 
1583
                """Report webpage download."""
 
1584
                self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 
1585
 
 
1586
        def report_extraction(self, video_id):
 
1587
                """Report information extraction."""
 
1588
                self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 
1589
 
 
1590
        def _real_initialize(self):
 
1591
                return
 
1592
 
 
1593
        def _real_extract(self, url, new_video=True):
 
1594
                # Extract ID from URL
 
1595
                mobj = re.match(self._VALID_URL, url)
 
1596
                if mobj is None:
 
1597
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1598
                        return
 
1599
 
 
1600
                # At this point we have a new video
 
1601
                self._downloader.increment_downloads()
 
1602
                video_id = mobj.group(2)
 
1603
                video_extension = 'flv'
 
1604
 
 
1605
                # Rewrite valid but non-extractable URLs as
 
1606
                # extractable English language /watch/ URLs
 
1607
                if re.match(self._VPAGE_URL, url) is None:
 
1608
                        request = urllib2.Request(url)
 
1609
                        try:
 
1610
                                webpage = urllib2.urlopen(request).read()
 
1611
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1612
                                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1613
                                return
 
1614
 
 
1615
                        mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 
1616
                        if mobj is None:
 
1617
                                self._downloader.trouble(u'ERROR: Unable to extract id field')
 
1618
                                return
 
1619
                        yahoo_id = mobj.group(1)
 
1620
 
 
1621
                        mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 
1622
                        if mobj is None:
 
1623
                                self._downloader.trouble(u'ERROR: Unable to extract vid field')
 
1624
                                return
 
1625
                        yahoo_vid = mobj.group(1)
 
1626
 
 
1627
                        url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 
1628
                        return self._real_extract(url, new_video=False)
 
1629
 
 
1630
                # Retrieve video webpage to extract further information
 
1631
                request = urllib2.Request(url)
 
1632
                try:
 
1633
                        self.report_download_webpage(video_id)
 
1634
                        webpage = urllib2.urlopen(request).read()
 
1635
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1636
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1637
                        return
 
1638
 
 
1639
                # Extract uploader and title from webpage
 
1640
                self.report_extraction(video_id)
 
1641
                mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 
1642
                if mobj is None:
 
1643
                        self._downloader.trouble(u'ERROR: unable to extract video title')
 
1644
                        return
 
1645
                video_title = mobj.group(1).decode('utf-8')
 
1646
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 
1647
 
 
1648
                mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 
1649
                if mobj is None:
 
1650
                        self._downloader.trouble(u'ERROR: unable to extract video uploader')
 
1651
                        return
 
1652
                video_uploader = mobj.group(1).decode('utf-8')
 
1653
 
 
1654
                # Extract video thumbnail
 
1655
                mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 
1656
                if mobj is None:
 
1657
                        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 
1658
                        return
 
1659
                video_thumbnail = mobj.group(1).decode('utf-8')
 
1660
 
 
1661
                # Extract video description
 
1662
                mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 
1663
                if mobj is None:
 
1664
                        self._downloader.trouble(u'ERROR: unable to extract video description')
 
1665
                        return
 
1666
                video_description = mobj.group(1).decode('utf-8')
 
1667
                if not video_description: video_description = 'No description available.'
 
1668
 
 
1669
                # Extract video height and width
 
1670
                mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 
1671
                if mobj is None:
 
1672
                        self._downloader.trouble(u'ERROR: unable to extract video height')
 
1673
                        return
 
1674
                yv_video_height = mobj.group(1)
 
1675
 
 
1676
                mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 
1677
                if mobj is None:
 
1678
                        self._downloader.trouble(u'ERROR: unable to extract video width')
 
1679
                        return
 
1680
                yv_video_width = mobj.group(1)
 
1681
 
 
1682
                # Retrieve video playlist to extract media URL
 
1683
                # I'm not completely sure what all these options are, but we
 
1684
                # seem to need most of them, otherwise the server sends a 401.
 
1685
                yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 
1686
                yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 
1687
                request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 
1688
                                          '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 
1689
                                          '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 
1690
                try:
 
1691
                        self.report_download_webpage(video_id)
 
1692
                        webpage = urllib2.urlopen(request).read()
 
1693
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1694
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1695
                        return
 
1696
 
 
1697
                # Extract media URL from playlist XML
 
1698
                mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 
1699
                if mobj is None:
 
1700
                        self._downloader.trouble(u'ERROR: Unable to extract media URL')
 
1701
                        return
 
1702
                video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 
1703
                video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
 
1704
 
 
1705
                try:
 
1706
                        # Process video information
 
1707
                        self._downloader.process_info({
 
1708
                                'id':           video_id.decode('utf-8'),
 
1709
                                'url':          video_url,
 
1710
                                'uploader':     video_uploader,
 
1711
                                'upload_date':  u'NA',
 
1712
                                'title':        video_title,
 
1713
                                'stitle':       simple_title,
 
1714
                                'ext':          video_extension.decode('utf-8'),
 
1715
                                'thumbnail':    video_thumbnail.decode('utf-8'),
 
1716
                                'description':  video_description,
 
1717
                                'thumbnail':    video_thumbnail,
 
1718
                                'description':  video_description,
 
1719
                                'player_url':   None,
 
1720
                        })
 
1721
                except UnavailableVideoError:
 
1722
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1723
 
 
1724
 
 
1725
class GenericIE(InfoExtractor):
 
1726
        """Generic last-resort information extractor."""
 
1727
 
 
1728
        def __init__(self, downloader=None):
 
1729
                InfoExtractor.__init__(self, downloader)
 
1730
 
 
1731
        @staticmethod
 
1732
        def suitable(url):
 
1733
                return True
 
1734
 
 
1735
        def report_download_webpage(self, video_id):
 
1736
                """Report webpage download."""
 
1737
                self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
 
1738
                self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
 
1739
 
 
1740
        def report_extraction(self, video_id):
 
1741
                """Report information extraction."""
 
1742
                self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
 
1743
 
 
1744
        def _real_initialize(self):
 
1745
                return
 
1746
 
 
1747
        def _real_extract(self, url):
 
1748
                # At this point we have a new video
 
1749
                self._downloader.increment_downloads()
 
1750
 
 
1751
                video_id = url.split('/')[-1]
 
1752
                request = urllib2.Request(url)
 
1753
                try:
 
1754
                        self.report_download_webpage(video_id)
 
1755
                        webpage = urllib2.urlopen(request).read()
 
1756
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1757
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 
1758
                        return
 
1759
                except ValueError, err:
 
1760
                        # since this is the last-resort InfoExtractor, if
 
1761
                        # this error is thrown, it'll be thrown here
 
1762
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1763
                        return
 
1764
 
 
1765
                self.report_extraction(video_id)
 
1766
                # Start with something easy: JW Player in SWFObject
 
1767
                mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
 
1768
                if mobj is None:
 
1769
                        # Broaden the search a little bit
 
1770
                        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
 
1771
                if mobj is None:
 
1772
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1773
                        return
 
1774
 
 
1775
                # It's possible that one of the regexes
 
1776
                # matched, but returned an empty group:
 
1777
                if mobj.group(1) is None:
 
1778
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 
1779
                        return
 
1780
 
 
1781
                video_url = urllib.unquote(mobj.group(1))
 
1782
                video_id  = os.path.basename(video_url)
 
1783
 
 
1784
                # here's a fun little line of code for you:
 
1785
                video_extension = os.path.splitext(video_id)[1][1:]
 
1786
                video_id        = os.path.splitext(video_id)[0]
 
1787
 
 
1788
                # it's tempting to parse this further, but you would
 
1789
                # have to take into account all the variations like
 
1790
                #   Video Title - Site Name
 
1791
                #   Site Name | Video Title
 
1792
                #   Video Title - Tagline | Site Name
 
1793
                # and so on and so forth; it's just not practical
 
1794
                mobj = re.search(r'<title>(.*)</title>', webpage)
 
1795
                if mobj is None:
 
1796
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1797
                        return
 
1798
                video_title = mobj.group(1).decode('utf-8')
 
1799
                video_title = sanitize_title(video_title)
 
1800
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 
1801
 
 
1802
                # video uploader is domain name
 
1803
                mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
 
1804
                if mobj is None:
 
1805
                        self._downloader.trouble(u'ERROR: unable to extract title')
 
1806
                        return
 
1807
                video_uploader = mobj.group(1).decode('utf-8')
 
1808
 
 
1809
                try:
 
1810
                        # Process video information
 
1811
                        self._downloader.process_info({
 
1812
                                'id':           video_id.decode('utf-8'),
 
1813
                                'url':          video_url.decode('utf-8'),
 
1814
                                'uploader':     video_uploader,
 
1815
                                'upload_date':  u'NA',
 
1816
                                'title':        video_title,
 
1817
                                'stitle':       simple_title,
 
1818
                                'ext':          video_extension.decode('utf-8'),
 
1819
                                'format':       u'NA',
 
1820
                                'player_url':   None,
 
1821
                        })
 
1822
                except UnavailableVideoError, err:
 
1823
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
1824
 
 
1825
 
 
1826
class YoutubeSearchIE(InfoExtractor):
 
1827
        """Information Extractor for YouTube search queries."""
 
1828
        _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 
1829
        _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 
1830
        _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 
1831
        _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
 
1832
        _youtube_ie = None
 
1833
        _max_youtube_results = 1000
 
1834
 
 
1835
        def __init__(self, youtube_ie, downloader=None):
 
1836
                InfoExtractor.__init__(self, downloader)
 
1837
                self._youtube_ie = youtube_ie
 
1838
 
 
1839
        @staticmethod
 
1840
        def suitable(url):
 
1841
                return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 
1842
 
 
1843
        def report_download_page(self, query, pagenum):
 
1844
                """Report attempt to download playlist page with given number."""
 
1845
                query = query.decode(preferredencoding())
 
1846
                self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 
1847
 
 
1848
        def _real_initialize(self):
 
1849
                self._youtube_ie.initialize()
 
1850
 
 
1851
        def _real_extract(self, query):
 
1852
                mobj = re.match(self._VALID_QUERY, query)
 
1853
                if mobj is None:
 
1854
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 
1855
                        return
 
1856
 
 
1857
                prefix, query = query.split(':')
 
1858
                prefix = prefix[8:]
 
1859
                query  = query.encode('utf-8')
 
1860
                if prefix == '':
 
1861
                        self._download_n_results(query, 1)
 
1862
                        return
 
1863
                elif prefix == 'all':
 
1864
                        self._download_n_results(query, self._max_youtube_results)
 
1865
                        return
 
1866
                else:
 
1867
                        try:
 
1868
                                n = long(prefix)
 
1869
                                if n <= 0:
 
1870
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 
1871
                                        return
 
1872
                                elif n > self._max_youtube_results:
 
1873
                                        self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
 
1874
                                        n = self._max_youtube_results
 
1875
                                self._download_n_results(query, n)
 
1876
                                return
 
1877
                        except ValueError: # parsing prefix as integer fails
 
1878
                                self._download_n_results(query, 1)
 
1879
                                return
 
1880
 
 
1881
        def _download_n_results(self, query, n):
 
1882
                """Downloads a specified number of results for a query"""
 
1883
 
 
1884
                video_ids = []
 
1885
                already_seen = set()
 
1886
                pagenum = 1
 
1887
 
 
1888
                while True:
 
1889
                        self.report_download_page(query, pagenum)
 
1890
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 
1891
                        request = urllib2.Request(result_url)
 
1892
                        try:
 
1893
                                page = urllib2.urlopen(request).read()
 
1894
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1895
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 
1896
                                return
 
1897
 
 
1898
                        # Extract video identifiers
 
1899
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 
1900
                                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 
1901
                                if video_id not in already_seen:
 
1902
                                        video_ids.append(video_id)
 
1903
                                        already_seen.add(video_id)
 
1904
                                        if len(video_ids) == n:
 
1905
                                                # Specified n videos reached
 
1906
                                                for id in video_ids:
 
1907
                                                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 
1908
                                                return
 
1909
 
 
1910
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
 
1911
                                for id in video_ids:
 
1912
                                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 
1913
                                return
 
1914
 
 
1915
                        pagenum = pagenum + 1
 
1916
 
 
1917
class GoogleSearchIE(InfoExtractor):
 
1918
        """Information Extractor for Google Video search queries."""
 
1919
        _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
 
1920
        _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
 
1921
        _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
 
1922
        _MORE_PAGES_INDICATOR = r'<span>Next</span>'
 
1923
        _google_ie = None
 
1924
        _max_google_results = 1000
 
1925
 
 
1926
        def __init__(self, google_ie, downloader=None):
 
1927
                InfoExtractor.__init__(self, downloader)
 
1928
                self._google_ie = google_ie
 
1929
 
 
1930
        @staticmethod
 
1931
        def suitable(url):
 
1932
                return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
 
1933
 
 
1934
        def report_download_page(self, query, pagenum):
 
1935
                """Report attempt to download playlist page with given number."""
 
1936
                query = query.decode(preferredencoding())
 
1937
                self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
 
1938
 
 
1939
        def _real_initialize(self):
 
1940
                self._google_ie.initialize()
 
1941
 
 
1942
        def _real_extract(self, query):
 
1943
                mobj = re.match(self._VALID_QUERY, query)
 
1944
                if mobj is None:
 
1945
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 
1946
                        return
 
1947
 
 
1948
                prefix, query = query.split(':')
 
1949
                prefix = prefix[8:]
 
1950
                query  = query.encode('utf-8')
 
1951
                if prefix == '':
 
1952
                        self._download_n_results(query, 1)
 
1953
                        return
 
1954
                elif prefix == 'all':
 
1955
                        self._download_n_results(query, self._max_google_results)
 
1956
                        return
 
1957
                else:
 
1958
                        try:
 
1959
                                n = long(prefix)
 
1960
                                if n <= 0:
 
1961
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 
1962
                                        return
 
1963
                                elif n > self._max_google_results:
 
1964
                                        self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
 
1965
                                        n = self._max_google_results
 
1966
                                self._download_n_results(query, n)
 
1967
                                return
 
1968
                        except ValueError: # parsing prefix as integer fails
 
1969
                                self._download_n_results(query, 1)
 
1970
                                return
 
1971
 
 
1972
        def _download_n_results(self, query, n):
 
1973
                """Downloads a specified number of results for a query"""
 
1974
 
 
1975
                video_ids = []
 
1976
                already_seen = set()
 
1977
                pagenum = 1
 
1978
 
 
1979
                while True:
 
1980
                        self.report_download_page(query, pagenum)
 
1981
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 
1982
                        request = urllib2.Request(result_url)
 
1983
                        try:
 
1984
                                page = urllib2.urlopen(request).read()
 
1985
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
1986
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 
1987
                                return
 
1988
 
 
1989
                        # Extract video identifiers
 
1990
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 
1991
                                video_id = mobj.group(1)
 
1992
                                if video_id not in already_seen:
 
1993
                                        video_ids.append(video_id)
 
1994
                                        already_seen.add(video_id)
 
1995
                                        if len(video_ids) == n:
 
1996
                                                # Specified n videos reached
 
1997
                                                for id in video_ids:
 
1998
                                                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
 
1999
                                                return
 
2000
 
 
2001
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
 
2002
                                for id in video_ids:
 
2003
                                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
 
2004
                                return
 
2005
 
 
2006
                        pagenum = pagenum + 1
 
2007
 
 
2008
class YahooSearchIE(InfoExtractor):
 
2009
        """Information Extractor for Yahoo! Video search queries."""
 
2010
        _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
 
2011
        _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
 
2012
        _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
 
2013
        _MORE_PAGES_INDICATOR = r'\s*Next'
 
2014
        _yahoo_ie = None
 
2015
        _max_yahoo_results = 1000
 
2016
 
 
2017
        def __init__(self, yahoo_ie, downloader=None):
 
2018
                InfoExtractor.__init__(self, downloader)
 
2019
                self._yahoo_ie = yahoo_ie
 
2020
 
 
2021
        @staticmethod
 
2022
        def suitable(url):
 
2023
                return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
 
2024
 
 
2025
        def report_download_page(self, query, pagenum):
 
2026
                """Report attempt to download playlist page with given number."""
 
2027
                query = query.decode(preferredencoding())
 
2028
                self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
 
2029
 
 
2030
        def _real_initialize(self):
 
2031
                self._yahoo_ie.initialize()
 
2032
 
 
2033
        def _real_extract(self, query):
 
2034
                mobj = re.match(self._VALID_QUERY, query)
 
2035
                if mobj is None:
 
2036
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 
2037
                        return
 
2038
 
 
2039
                prefix, query = query.split(':')
 
2040
                prefix = prefix[8:]
 
2041
                query  = query.encode('utf-8')
 
2042
                if prefix == '':
 
2043
                        self._download_n_results(query, 1)
 
2044
                        return
 
2045
                elif prefix == 'all':
 
2046
                        self._download_n_results(query, self._max_yahoo_results)
 
2047
                        return
 
2048
                else:
 
2049
                        try:
 
2050
                                n = long(prefix)
 
2051
                                if n <= 0:
 
2052
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 
2053
                                        return
 
2054
                                elif n > self._max_yahoo_results:
 
2055
                                        self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
 
2056
                                        n = self._max_yahoo_results
 
2057
                                self._download_n_results(query, n)
 
2058
                                return
 
2059
                        except ValueError: # parsing prefix as integer fails
 
2060
                                self._download_n_results(query, 1)
 
2061
                                return
 
2062
 
 
2063
        def _download_n_results(self, query, n):
 
2064
                """Downloads a specified number of results for a query"""
 
2065
 
 
2066
                video_ids = []
 
2067
                already_seen = set()
 
2068
                pagenum = 1
 
2069
 
 
2070
                while True:
 
2071
                        self.report_download_page(query, pagenum)
 
2072
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 
2073
                        request = urllib2.Request(result_url)
 
2074
                        try:
 
2075
                                page = urllib2.urlopen(request).read()
 
2076
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
2077
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 
2078
                                return
 
2079
 
 
2080
                        # Extract video identifiers
 
2081
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 
2082
                                video_id = mobj.group(1)
 
2083
                                if video_id not in already_seen:
 
2084
                                        video_ids.append(video_id)
 
2085
                                        already_seen.add(video_id)
 
2086
                                        if len(video_ids) == n:
 
2087
                                                # Specified n videos reached
 
2088
                                                for id in video_ids:
 
2089
                                                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
 
2090
                                                return
 
2091
 
 
2092
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
 
2093
                                for id in video_ids:
 
2094
                                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
 
2095
                                return
 
2096
 
 
2097
                        pagenum = pagenum + 1
 
2098
 
 
2099
class YoutubePlaylistIE(InfoExtractor):
 
2100
        """Information Extractor for YouTube playlists."""
 
2101
 
 
2102
        _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
 
2103
        _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
 
2104
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 
2105
        _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
 
2106
        _youtube_ie = None
 
2107
 
 
2108
        def __init__(self, youtube_ie, downloader=None):
 
2109
                InfoExtractor.__init__(self, downloader)
 
2110
                self._youtube_ie = youtube_ie
 
2111
 
 
2112
        @staticmethod
 
2113
        def suitable(url):
 
2114
                return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 
2115
 
 
2116
        def report_download_page(self, playlist_id, pagenum):
 
2117
                """Report attempt to download playlist page with given number."""
 
2118
                self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 
2119
 
 
2120
        def _real_initialize(self):
 
2121
                self._youtube_ie.initialize()
 
2122
 
 
2123
        def _real_extract(self, url):
 
2124
                # Extract playlist id
 
2125
                mobj = re.match(self._VALID_URL, url)
 
2126
                if mobj is None:
 
2127
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
 
2128
                        return
 
2129
 
 
2130
                # Single video case
 
2131
                if mobj.group(3) is not None:
 
2132
                        self._youtube_ie.extract(mobj.group(3))
 
2133
                        return
 
2134
 
 
2135
                # Download playlist pages
 
2136
                # prefix is 'p' as default for playlists but there are other types that need extra care
 
2137
                playlist_prefix = mobj.group(1)
 
2138
                if playlist_prefix == 'a':
 
2139
                        playlist_access = 'artist'
 
2140
                else:
 
2141
                        playlist_prefix = 'p'
 
2142
                        playlist_access = 'view_play_list'
 
2143
                playlist_id = mobj.group(2)
 
2144
                video_ids = []
 
2145
                pagenum = 1
 
2146
 
 
2147
                while True:
 
2148
                        self.report_download_page(playlist_id, pagenum)
 
2149
                        request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
 
2150
                        try:
 
2151
                                page = urllib2.urlopen(request).read()
 
2152
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 
2153
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 
2154
                                return
 
2155
 
 
2156
                        # Extract video identifiers
 
2157
                        ids_in_page = []
 
2158
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 
2159
                                if mobj.group(1) not in ids_in_page:
 
2160
                                        ids_in_page.append(mobj.group(1))
 
2161
                        video_ids.extend(ids_in_page)
 
2162
 
 
2163
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
 
2164
                                break
 
2165
                        pagenum = pagenum + 1
 
2166
 
 
2167
                playliststart = self._downloader.params.get('playliststart', 1) - 1
 
2168
                playlistend = self._downloader.params.get('playlistend', -1)
 
2169
                video_ids = video_ids[playliststart:playlistend]
 
2170
 
 
2171
                for id in video_ids:
 
2172
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 
2173
                return
 
2174
 
 
2175
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Fetches the list of a user's uploaded videos through the YouTube
	GData API (paged, 50 ids per request) and delegates each video to
	the YouTube IE.
	"""

	_VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# GData caps each uploads-feed response at 50 entries.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Keep a reference to the YouTube IE that actual downloads go through."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL / ytuser: token."""
		return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
					   (username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all video ids of the user, then extract each one."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# playliststart is 1-based in params; convert to 0-based index.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		# -1 means "no end limit"; slicing with [start:-1] would drop
		# the last id, hence the explicit branch.
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
					   (username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
 
2264
 
 
2265
 
 
2266
class DepositFilesIE(InfoExtractor):
 
2267
        """Information extractor for depositfiles.com"""
 
2268
 
 
2269
        _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
 
2270
 
 
2271
        def __init__(self, downloader=None):
 
2272
                InfoExtractor.__init__(self, downloader)
 
2273
 
 
2274
        @staticmethod
 
2275
        def suitable(url):
 
2276
                return (re.match(DepositFilesIE._VALID_URL, url) is not None)
 
2277
 
 
2278
        def report_download_webpage(self, file_id):
 
2279
                """Report webpage download."""
 
2280
                self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
 
2281
 
 
2282
        def report_extraction(self, file_id):
 
2283
                """Report information extraction."""
 
2284
                self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
 
2285
 
 
2286
        def _real_initialize(self):
 
2287
                return
 
2288
 
 
2289
	def _real_extract(self, url):
		"""Fetch the DepositFiles page for *url*, locate the real download
		URL and title, and hand the file information to the downloader.
		Errors are reported through self._downloader.trouble() and the
		method returns None in every case."""
		# At this point we have a new file
		self._downloader.increment_downloads()

		# The file id is simply the last path component of the URL.
		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		# (POSTing gateway_result=1 simulates the button press).
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
		try:
			self.report_download_webpage(file_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
			return

		# Search for the real file URL
		# NOTE(review): the '.' after "fileshare" is an unescaped regex dot,
		# so this also matches e.g. "fileshareX" -- presumably harmless here.
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				# Collapse whitespace of the site's restriction notice.
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			else:
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
			return

		file_url = mobj.group(1)
		# Extension is taken from the download URL, without the leading dot.
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		file_title = mobj.group(1).decode('utf-8')

		try:
			# Process file information
			# (uploader/date/format are not available on DepositFiles pages,
			# hence the u'NA' placeholders.)
			self._downloader.process_info({
				'id':		file_id.decode('utf-8'),
				'url':		file_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	file_title,
				'stitle':	file_title,
				'ext':		file_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
 
2344
 
 
2345
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	# The video id is captured in the named group "ID".
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	# Mobile login endpoint used by _real_initialize().
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'facebook'
	# Quality labels in preference order, best first.
	_available_formats = ['highqual', 'lowqual']
	# File extension for each quality label.
	_video_extensions = {
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True when *url* matches the Facebook video URL pattern."""
		return (re.match(FacebookIE._VALID_URL, url) is not None)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# General data: map of info-dict key -> regex with one capture group.
		data = {'title': r'class="video_title datawrap">(.*?)</',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'upload_date': r'data-date="(.*?)"',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				# Values are JS-escaped and URL-quoted in the page source.
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls: one entry per quality label found in the page.
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info

	def _real_initialize(self):
		"""Log in to Facebook using command-line or .netrc credentials.

		Silently does nothing when no downloader or no credentials are
		available; login failures are reported as warnings, not errors."""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		if useremail is None:
			# Anonymous operation; many videos are accessible without login.
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains a login form, authentication failed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Download the Facebook video page, extract the metadata and the
		format URLs, and pass one info dict per selected format to the
		downloader."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: keep only ASCII letters/digits, collapse the rest to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image (missing thumbnail is only a warning)
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date
		upload_date = u'NA'
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					# parsedate_tz returns a 10-tuple; strftime wants 9 fields.
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except:
					# Unparseable date: keep the u'NA' placeholder.
					pass

		# description (only extracted when explicitly requested)
		video_description = 'No description available.'
		if (self._downloader.params.get('forcedescription', False) and
		    'description' in video_info):
			video_description = video_info['description']

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		# NOTE(review): if url_map is empty, video_url_list is never assigned
		# and the loop below raises NameError -- confirm whether an empty map
		# can reach this point, or guard with an explicit error return.
		for format_param, video_real_url in video_url_list:

			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'mp4')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	None,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
 
2567
 
 
2568
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader with its add_post_processor()
	method.  After each successful download the downloader calls run() on
	every registered PostProcessor in turn, feeding the dictionary returned
	by one processor to the next.  The chain stops when a processor returns
	None or when the end of the chain is reached; run() may also raise
	PostProcessingError, which the calling downloader takes into account.

	The registration scheme mirrors the "mutual registration" used by
	InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		*information* is an InfoExtractor-style dictionary with one extra
		key, "filepath", naming the downloaded file.  The return value is
		passed to the next processor in the chain (it may be the received
		dictionary, possibly with fields changed); returning None stops
		the postprocessing chain.  PostProcessingError may be raised to
		report a failure to the downloader.
		"""
		# Base class behaviour: pass the information through untouched.
		return information
 
2613
 
 
2614
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that extracts the audio track of a downloaded video.

	Relies on the external "ffmpeg" and "ffprobe" programs.  preferredcodec
	may be 'best' (keep aac/mp3 streams untouched when possible, transcode
	to mp3 otherwise), 'aac' or 'mp3'.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of *path* as reported by ffprobe.

		Returns None when ffprobe is missing, exits non-zero, or reports
		no audio stream.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe not installed or not executable
			return None
		audio_codec = None
		for line in output.split('\n'):
			# Within a stream block, codec_name= precedes codec_type=.
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode *path* into *out_path* using *codec*; True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			# ffmpeg not installed or not executable
			return False

	def run(self, information):
		"""Convert information['filepath'] to an audio file and update it.

		Returns the updated information dictionary on success, or None on
		any failure (which stops the post-processing chain).  The original
		video file is removed after a successful conversion.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible: copy the stream instead of transcoding.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable standalone.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# A specific codec was requested: convert the audio (lossy).
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension

		# Bug fix: with a 'copy' codec the target extension can equal the
		# source extension; ffmpeg would then overwrite its own input and the
		# os.remove() below would delete the only copy of the audio.  Skip
		# the conversion entirely in that case.
		if new_path == path:
			self._downloader.to_screen(u'[ffmpeg] File is already in the target audio format, skipping conversion')
			return information

		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
 
2695
 
 
2696
### MAIN PROGRAM ###
 
2697
if __name__ == '__main__':
 
2698
        try:
 
2699
                # Modules needed only when running the main program
 
2700
                import getpass
 
2701
                import optparse
 
2702
 
 
2703
                # Function to update the program file with the latest version from the repository.
 
2704
                def update_self(downloader, filename):
 
2705
                        # Note: downloader only used for options
 
2706
                        if not os.access(filename, os.W_OK):
 
2707
                                sys.exit('ERROR: no write permissions on %s' % filename)
 
2708
 
 
2709
                        downloader.to_screen('Updating to latest stable version...')
 
2710
                        try:
 
2711
                                latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
 
2712
                                latest_version = urllib.urlopen(latest_url).read().strip()
 
2713
                                prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
 
2714
                                newcontent = urllib.urlopen(prog_url).read()
 
2715
                        except (IOError, OSError), err:
 
2716
                                sys.exit('ERROR: unable to download latest version')
 
2717
                        try:
 
2718
                                stream = open(filename, 'w')
 
2719
                                stream.write(newcontent)
 
2720
                                stream.close()
 
2721
                        except (IOError, OSError), err:
 
2722
                                sys.exit('ERROR: unable to overwrite current version')
 
2723
                        downloader.to_screen('Updated to version %s' % latest_version)
 
2724
 
 
2725
                # Parse command line
 
2726
                parser = optparse.OptionParser(
 
2727
                        usage='Usage: %prog [options] url...',
 
2728
                        version='2011.08.04',
 
2729
                        conflict_handler='resolve',
 
2730
                )
 
2731
 
 
2732
                parser.add_option('-h', '--help',
 
2733
                                action='help', help='print this help text and exit')
 
2734
                parser.add_option('-v', '--version',
 
2735
                                action='version', help='print program version and exit')
 
2736
                parser.add_option('-U', '--update',
 
2737
                                action='store_true', dest='update_self', help='update this program to latest stable version')
 
2738
                parser.add_option('-i', '--ignore-errors',
 
2739
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
 
2740
                parser.add_option('-r', '--rate-limit',
 
2741
                                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
 
2742
                parser.add_option('-R', '--retries',
 
2743
                                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
 
2744
                parser.add_option('--playlist-start',
 
2745
                                dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
 
2746
                parser.add_option('--playlist-end',
 
2747
                                dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
 
2748
                parser.add_option('--dump-user-agent',
 
2749
                                action='store_true', dest='dump_user_agent',
 
2750
                                help='display the current browser identification', default=False)
 
2751
 
 
2752
                authentication = optparse.OptionGroup(parser, 'Authentication Options')
 
2753
                authentication.add_option('-u', '--username',
 
2754
                                dest='username', metavar='USERNAME', help='account username')
 
2755
                authentication.add_option('-p', '--password',
 
2756
                                dest='password', metavar='PASSWORD', help='account password')
 
2757
                authentication.add_option('-n', '--netrc',
 
2758
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 
2759
                parser.add_option_group(authentication)
 
2760
 
 
2761
                video_format = optparse.OptionGroup(parser, 'Video Format Options')
 
2762
                video_format.add_option('-f', '--format',
 
2763
                                action='store', dest='format', metavar='FORMAT', help='video format code')
 
2764
                video_format.add_option('--all-formats',
 
2765
                                action='store_const', dest='format', help='download all available video formats', const='-1')
 
2766
                video_format.add_option('--max-quality',
 
2767
                                action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
 
2768
                parser.add_option_group(video_format)
 
2769
 
 
2770
                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
 
2771
                verbosity.add_option('-q', '--quiet',
 
2772
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
 
2773
                verbosity.add_option('-s', '--simulate',
 
2774
                                action='store_true', dest='simulate', help='do not download video', default=False)
 
2775
                verbosity.add_option('-g', '--get-url',
 
2776
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 
2777
                verbosity.add_option('-e', '--get-title',
 
2778
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 
2779
                verbosity.add_option('--get-thumbnail',
 
2780
                                action='store_true', dest='getthumbnail',
 
2781
                                help='simulate, quiet but print thumbnail URL', default=False)
 
2782
                verbosity.add_option('--get-description',
 
2783
                                action='store_true', dest='getdescription',
 
2784
                                help='simulate, quiet but print video description', default=False)
 
2785
                verbosity.add_option('--get-filename',
 
2786
                                action='store_true', dest='getfilename',
 
2787
                                help='simulate, quiet but print output filename', default=False)
 
2788
                verbosity.add_option('--no-progress',
 
2789
                                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
 
2790
                verbosity.add_option('--console-title',
 
2791
                                action='store_true', dest='consoletitle',
 
2792
                                help='display progress in console titlebar', default=False)
 
2793
                parser.add_option_group(verbosity)
 
2794
 
 
2795
                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
 
2796
                filesystem.add_option('-t', '--title',
 
2797
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
 
2798
                filesystem.add_option('-l', '--literal',
 
2799
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 
2800
                filesystem.add_option('-A', '--auto-number',
 
2801
                                action='store_true', dest='autonumber',
 
2802
                                help='number downloaded files starting from 00000', default=False)
 
2803
                filesystem.add_option('-o', '--output',
 
2804
                                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
 
2805
                filesystem.add_option('-a', '--batch-file',
 
2806
                                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
 
2807
                filesystem.add_option('-w', '--no-overwrites',
 
2808
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
 
2809
                filesystem.add_option('-c', '--continue',
 
2810
                                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
 
2811
                filesystem.add_option('--cookies',
 
2812
                                dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
 
2813
                filesystem.add_option('--no-part',
 
2814
                                action='store_true', dest='nopart', help='do not use .part files', default=False)
 
2815
                filesystem.add_option('--no-mtime',
 
2816
                                action='store_false', dest='updatetime',
 
2817
                                help='do not use the Last-modified header to set the file modification time', default=True)
 
2818
                parser.add_option_group(filesystem)
 
2819
 
 
2820
                postproc = optparse.OptionGroup(parser, 'Post-processing Options')
 
2821
                postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
 
2822
                                help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
 
2823
                postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
 
2824
                                help='"best", "aac" or "mp3"; best by default')
 
2825
                parser.add_option_group(postproc)
 
2826
 
 
2827
                (opts, args) = parser.parse_args()
 
2828
 
 
2829
                # Open appropriate CookieJar
 
2830
                if opts.cookiefile is None:
 
2831
                        jar = cookielib.CookieJar()
 
2832
                else:
 
2833
                        try:
 
2834
                                jar = cookielib.MozillaCookieJar(opts.cookiefile)
 
2835
                                if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
 
2836
                                        jar.load()
 
2837
                        except (IOError, OSError), err:
 
2838
                                sys.exit(u'ERROR: unable to open cookie file')
 
2839
 
 
2840
                # Dump user agent
 
2841
                if opts.dump_user_agent:
 
2842
                        print std_headers['User-Agent']
 
2843
                        sys.exit(0)
 
2844
 
 
2845
                # General configuration
 
2846
                cookie_processor = urllib2.HTTPCookieProcessor(jar)
 
2847
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
 
2848
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 
2849
 
 
2850
                # Batch file verification
 
2851
                batchurls = []
 
2852
                if opts.batchfile is not None:
 
2853
                        try:
 
2854
                                if opts.batchfile == '-':
 
2855
                                        batchfd = sys.stdin
 
2856
                                else:
 
2857
                                        batchfd = open(opts.batchfile, 'r')
 
2858
                                batchurls = batchfd.readlines()
 
2859
                                batchurls = [x.strip() for x in batchurls]
 
2860
                                batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
 
2861
                        except IOError:
 
2862
                                sys.exit(u'ERROR: batch file could not be read')
 
2863
                all_urls = batchurls + args
 
2864
 
 
2865
                # Conflicting, missing and erroneous options
 
2866
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
 
2867
                        parser.error(u'using .netrc conflicts with giving username/password')
 
2868
                if opts.password is not None and opts.username is None:
 
2869
                        parser.error(u'account username missing')
 
2870
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
 
2871
                        parser.error(u'using output template conflicts with using title, literal title or auto number')
 
2872
                if opts.usetitle and opts.useliteral:
 
2873
                        parser.error(u'using title conflicts with using literal title')
 
2874
                if opts.username is not None and opts.password is None:
 
2875
                        opts.password = getpass.getpass(u'Type account password and press return:')
 
2876
                if opts.ratelimit is not None:
 
2877
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
 
2878
                        if numeric_limit is None:
 
2879
                                parser.error(u'invalid rate limit specified')
 
2880
                        opts.ratelimit = numeric_limit
 
2881
                if opts.retries is not None:
 
2882
                        try:
 
2883
                                opts.retries = long(opts.retries)
 
2884
                        except (TypeError, ValueError), err:
 
2885
                                parser.error(u'invalid retry count specified')
 
2886
                try:
 
2887
                        opts.playliststart = long(opts.playliststart)
 
2888
                        if opts.playliststart <= 0:
 
2889
                                raise ValueError
 
2890
                except (TypeError, ValueError), err:
 
2891
                        parser.error(u'invalid playlist start number specified')
 
2892
                try:
 
2893
                        opts.playlistend = long(opts.playlistend)
 
2894
                        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
 
2895
                                raise ValueError
 
2896
                except (TypeError, ValueError), err:
 
2897
                        parser.error(u'invalid playlist end number specified')
 
2898
                if opts.extractaudio:
 
2899
                        if opts.audioformat not in ['best', 'aac', 'mp3']:
 
2900
                                parser.error(u'invalid audio format specified')
 
2901
 
 
2902
                # Information extractors
 
2903
                youtube_ie = YoutubeIE()
 
2904
                metacafe_ie = MetacafeIE(youtube_ie)
 
2905
                dailymotion_ie = DailymotionIE()
 
2906
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
 
2907
                youtube_user_ie = YoutubeUserIE(youtube_ie)
 
2908
                youtube_search_ie = YoutubeSearchIE(youtube_ie)
 
2909
                google_ie = GoogleIE()
 
2910
                google_search_ie = GoogleSearchIE(google_ie)
 
2911
                photobucket_ie = PhotobucketIE()
 
2912
                yahoo_ie = YahooIE()
 
2913
                yahoo_search_ie = YahooSearchIE(yahoo_ie)
 
2914
                deposit_files_ie = DepositFilesIE()
 
2915
                facebook_ie = FacebookIE()
 
2916
                generic_ie = GenericIE()
 
2917
 
 
2918
                # File downloader
 
2919
                fd = FileDownloader({
 
2920
                        'usenetrc': opts.usenetrc,
 
2921
                        'username': opts.username,
 
2922
                        'password': opts.password,
 
2923
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
 
2924
                        'forceurl': opts.geturl,
 
2925
                        'forcetitle': opts.gettitle,
 
2926
                        'forcethumbnail': opts.getthumbnail,
 
2927
                        'forcedescription': opts.getdescription,
 
2928
                        'forcefilename': opts.getfilename,
 
2929
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
 
2930
                        'format': opts.format,
 
2931
                        'format_limit': opts.format_limit,
 
2932
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
 
2933
                                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
 
2934
                                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
 
2935
                                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
 
2936
                                or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
 
2937
                                or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
 
2938
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
 
2939
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
 
2940
                                or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
 
2941
                                or u'%(id)s.%(ext)s'),
 
2942
                        'ignoreerrors': opts.ignoreerrors,
 
2943
                        'ratelimit': opts.ratelimit,
 
2944
                        'nooverwrites': opts.nooverwrites,
 
2945
                        'retries': opts.retries,
 
2946
                        'continuedl': opts.continue_dl,
 
2947
                        'noprogress': opts.noprogress,
 
2948
                        'playliststart': opts.playliststart,
 
2949
                        'playlistend': opts.playlistend,
 
2950
                        'logtostderr': opts.outtmpl == '-',
 
2951
                        'consoletitle': opts.consoletitle,
 
2952
                        'nopart': opts.nopart,
 
2953
                        'updatetime': opts.updatetime,
 
2954
                        })
 
2955
                fd.add_info_extractor(youtube_search_ie)
 
2956
                fd.add_info_extractor(youtube_pl_ie)
 
2957
                fd.add_info_extractor(youtube_user_ie)
 
2958
                fd.add_info_extractor(metacafe_ie)
 
2959
                fd.add_info_extractor(dailymotion_ie)
 
2960
                fd.add_info_extractor(youtube_ie)
 
2961
                fd.add_info_extractor(google_ie)
 
2962
                fd.add_info_extractor(google_search_ie)
 
2963
                fd.add_info_extractor(photobucket_ie)
 
2964
                fd.add_info_extractor(yahoo_ie)
 
2965
                fd.add_info_extractor(yahoo_search_ie)
 
2966
                fd.add_info_extractor(deposit_files_ie)
 
2967
                fd.add_info_extractor(facebook_ie)
 
2968
 
 
2969
                # This must come last since it's the
 
2970
                # fallback if none of the others work
 
2971
                fd.add_info_extractor(generic_ie)
 
2972
 
 
2973
                # PostProcessors
 
2974
                if opts.extractaudio:
 
2975
                        fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
 
2976
 
 
2977
                # Update version
 
2978
                if opts.update_self:
 
2979
                        update_self(fd, sys.argv[0])
 
2980
 
 
2981
                # Maybe do nothing
 
2982
                if len(all_urls) < 1:
 
2983
                        if not opts.update_self:
 
2984
                                parser.error(u'you must provide at least one URL')
 
2985
                        else:
 
2986
                                sys.exit()
 
2987
                retcode = fd.download(all_urls)
 
2988
 
 
2989
                # Dump cookie jar if requested
 
2990
                if opts.cookiefile is not None:
 
2991
                        try:
 
2992
                                jar.save()
 
2993
                        except (IOError, OSError), err:
 
2994
                                sys.exit(u'ERROR: unable to save cookie jar')
 
2995
 
 
2996
                sys.exit(retcode)
 
2997
 
 
2998
        except DownloadError:
 
2999
                sys.exit(1)
 
3000
        except SameFileError:
 
3001
                sys.exit(u'ERROR: fixed output name but more than one file to download')
 
3002
        except KeyboardInterrupt:
 
3003
                sys.exit(u'\nERROR: Interrupted by user')