~cairo-dock-team/cairo-dock-plug-ins-extras/git : revision 478

1

#!/usr/bin/env python

2

# -*- coding: utf-8 -*-

3

4

__authors__ = (

5

'Ricardo Garcia Gonzalez',

6

'Danny Colligan',

7

'Benjamin Johnson',

8

'Vasyl\' Vavrychuk',

9

'Witold Baryluk',

10

'Paweł Paprota',

11

'Gergely Imreh',

12

'Rogério Brito',

13

'Philipp Hagemeister',

14

'Sören Schulze',

15

'Kevin Ngo',

16

'Ori Avtalion',

17

'shizeeg',

18

)

19

20

__license__ = 'Public Domain'

21

__version__ = '2012.02.27'

22

23

UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'

24

25

26

import cookielib

27

import datetime

28

import getpass

29

import gzip

30

import htmlentitydefs

31

import HTMLParser

32

import httplib

33

import locale

34

import math

35

import netrc

36

import optparse

37

import os

38

import os.path

39

import re

40

import shlex

41

import socket

42

import string

43

import subprocess

44

import sys

45

import time

46

import urllib

47

import urllib2

48

import warnings

49

import zlib

50

51

if os.name == 'nt':

52

import ctypes

53

54

try:

55

import email.utils

56

except ImportError: # Python 2.4

57

import email.Utils

58

try:

59

import cStringIO as StringIO

60

except ImportError:

61

import StringIO

62

63

# parse_qs was moved from the cgi module to the urlparse module recently.

64

try:

65

from urlparse import parse_qs

66

except ImportError:

67

from cgi import parse_qs

68

69

try:

70

import lxml.etree

71

except ImportError:

72

pass # Handled below

73

74

try:

75

import xml.etree.ElementTree

76

except ImportError: # Python<2.5: Not officially supported, but let it slip

77

warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')

78

79

std_headers = {

80

'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',

81

'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',

82

'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

83

'Accept-Encoding': 'gzip, deflate',

84

'Accept-Language': 'en-us,en;q=0.5',

85

}

86

87

try:

88

import json

89

except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):

90

import re

91

class json(object):

92

@staticmethod

93

def loads(s):

94

s = s.decode('UTF-8')

95

def raiseError(msg, i):

96

raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))

97

def skipSpace(i, expectMore=True):

98

while i < len(s) and s[i] in ' \t\r\n':

99

i += 1

100

if expectMore:

101

if i >= len(s):

102

raiseError('Premature end', i)

103

return i

104

def decodeEscape(match):

105

esc = match.group(1)

106

_STATIC = {

107

'"': '"',

108

'\\': '\\',

109

'/': '/',

110

'b': unichr(0x8),

111

'f': unichr(0xc),

112

'n': '\n',

113

'r': '\r',

114

't': '\t',

115

}

116

if esc in _STATIC:

117

return _STATIC[esc]

118

if esc[0] == 'u':

119

if len(esc) == 1+4:

120

return unichr(int(esc[1:5], 16))

121

if len(esc) == 5+6 and esc[5:7] == '\\u':

122

hi = int(esc[1:5], 16)

123

low = int(esc[7:11], 16)

124

return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)

125

raise ValueError('Unknown escape ' + str(esc))

126

def parseString(i):

127

i += 1

128

e = i

129

while True:

130

e = s.index('"', e)

131

bslashes = 0

132

while s[e-bslashes-1] == '\\':

133

bslashes += 1

134

if bslashes % 2 == 1:

135

e += 1

136

continue

137

break

138

rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')

139

stri = rexp.sub(decodeEscape, s[i:e])

140

return (e+1,stri)

141

def parseObj(i):

142

i += 1

143

res = {}

144

i = skipSpace(i)

145

if s[i] == '}': # Empty dictionary

146

return (i+1,res)

147

while True:

148

if s[i] != '"':

149

raiseError('Expected a string object key', i)

150

i,key = parseString(i)

151

i = skipSpace(i)

152

if i >= len(s) or s[i] != ':':

153

raiseError('Expected a colon', i)

154

i,val = parse(i+1)

155

res[key] = val

156

i = skipSpace(i)

157

if s[i] == '}':

158

return (i+1, res)

159

if s[i] != ',':

160

raiseError('Expected comma or closing curly brace', i)

161

i = skipSpace(i+1)

162

def parseArray(i):

163

res = []

164

i = skipSpace(i+1)

165

if s[i] == ']': # Empty array

166

return (i+1,res)

167

while True:

168

i,val = parse(i)

169

res.append(val)

170

i = skipSpace(i) # Raise exception if premature end

171

if s[i] == ']':

172

return (i+1, res)

173

if s[i] != ',':

174

raiseError('Expected a comma or closing bracket', i)

175

i = skipSpace(i+1)

176

def parseDiscrete(i):

177

for k,v in {'true': True, 'false': False, 'null': None}.items():

178

if s.startswith(k, i):

179

return (i+len(k), v)

180

raiseError('Not a boolean (or null)', i)

181

def parseNumber(i):

182

mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])

183

if mobj is None:

184

raiseError('Not a number', i)

185

nums = mobj.group(1)

186

if '.' in nums or 'e' in nums or 'E' in nums:

187

return (i+len(nums), float(nums))

188

return (i+len(nums), int(nums))

189

CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}

190

def parse(i):

191

i = skipSpace(i)

192

i,res = CHARMAP.get(s[i], parseNumber)(i)

193

i = skipSpace(i, False)

194

return (i,res)

195

i,res = parse(0)

196

if i < len(s):

197

raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')

198

return res

199

200

def preferredencoding():

201

"""Get preferred encoding.

202

203

Returns the best encoding scheme for the system, based on

204

locale.getpreferredencoding() and some further tweaks.

205

"""

206

def yield_preferredencoding():

207

try:

208

pref = locale.getpreferredencoding()

209

u'TEST'.encode(pref)

210

except:

211

pref = 'UTF-8'

212

while True:

213

yield pref

214

return yield_preferredencoding().next()

215

216

217

def htmlentity_transform(matchobj):

218

"""Transforms an HTML entity to a Unicode character.

219

220

This function receives a match object and is intended to be used with

221

the re.sub() function.

222

"""

223

entity = matchobj.group(1)

224

225

# Known non-numeric HTML entity

226

if entity in htmlentitydefs.name2codepoint:

227

return unichr(htmlentitydefs.name2codepoint[entity])

228

229

# Unicode character

230

mobj = re.match(ur'(?u)#(x?\d+)', entity)

231

if mobj is not None:

232

numstr = mobj.group(1)

233

if numstr.startswith(u'x'):

234

base = 16

235

numstr = u'0%s' % numstr

236

else:

237

base = 10

238

return unichr(long(numstr, base))

239

240

# Unknown entity in name, return its literal representation

241

return (u'&%s;' % entity)

242

243

244

def sanitize_title(utitle):

245

"""Sanitizes a video title so it could be used as part of a filename."""

246

utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)

247

return utitle.replace(unicode(os.sep), u'%')

248

249

250

def sanitize_open(filename, open_mode):

251

"""Try to open the given filename, and slightly tweak it if this fails.

252

253

Attempts to open the given filename. If this fails, it tries to change

254

the filename slightly, step by step, until it's either able to open it

255

or it fails and raises a final exception, like the standard open()

256

function.

257

258

It returns the tuple (stream, definitive_file_name).

259

"""

260

try:

261

if filename == u'-':

262

if sys.platform == 'win32':

263

import msvcrt

264

msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

265

return (sys.stdout, filename)

266

stream = open(_encodeFilename(filename), open_mode)

267

return (stream, filename)

268

except (IOError, OSError), err:

269

# In case of error, try to remove win32 forbidden chars

270

filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

271

272

# An exception here should be caught in the caller

273

stream = open(_encodeFilename(filename), open_mode)

274

return (stream, filename)

275

276

277

def timeconvert(timestr):

278

"""Convert RFC 2822 defined time string into system timestamp"""

279

timestamp = None

280

timetuple = email.utils.parsedate_tz(timestr)

281

if timetuple is not None:

282

timestamp = email.utils.mktime_tz(timetuple)

283

return timestamp

284

285

def _simplify_title(title):

286

expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)

287

return expr.sub(u'_', title).strip(u'_')

288

289

def _orderedSet(iterable):

290

""" Remove all duplicates from the input iterable """

291

res = []

292

for el in iterable:

293

if el not in res:

294

res.append(el)

295

return res

296

297

def _unescapeHTML(s):

298

"""

299

@param s a string (of type unicode)

300

"""

301

assert type(s) == type(u'')

302

303

htmlParser = HTMLParser.HTMLParser()

304

return htmlParser.unescape(s)

305

306

def _encodeFilename(s):

307

"""

308

@param s The name of the file (of type unicode)

309

"""

310

311

assert type(s) == type(u'')

312

313

if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:

314

# Pass u'' directly to use Unicode APIs on Windows 2000 and up

315

# (Detecting Windows NT 4 is tricky because 'major >= 4' would

316

# match Windows 9x series as well. Besides, NT 4 is obsolete.)

317

return s

318

else:

319

return s.encode(sys.getfilesystemencoding(), 'ignore')

320

321

class DownloadError(Exception):

322

"""Download Error exception.

323

324

This exception may be thrown by FileDownloader objects if they are not

325

configured to continue on errors. They will contain the appropriate

326

error message.

327

"""

328

pass

329

330

331

class SameFileError(Exception):

332

"""Same File exception.

333

334

This exception will be thrown by FileDownloader objects if they detect

335

multiple files would have to be downloaded to the same file on disk.

336

"""

337

pass

338

339

340

class PostProcessingError(Exception):

341

"""Post Processing exception.

342

343

This exception may be raised by PostProcessor's .run() method to

344

indicate an error in the postprocessing task.

345

"""

346

pass

347

348

class MaxDownloadsReached(Exception):

349

""" --max-downloads limit has been reached. """

350

pass

351

352

353

class UnavailableVideoError(Exception):

354

"""Unavailable Format exception.

355

356

This exception will be thrown when a video is requested

357

in a format that is not available for that video.

358

"""

359

pass

360

361

362

class ContentTooShortError(Exception):

363

"""Content Too Short exception.

364

365

This exception may be raised by FileDownloader objects when a file they

366

download is too small for what the server announced first, indicating

367

the connection was probably interrupted.

368

"""

369

# Both in bytes

370

downloaded = None

371

expected = None

372

373

def __init__(self, downloaded, expected):

374

self.downloaded = downloaded

375

self.expected = expected

376

377

378

class YoutubeDLHandler(urllib2.HTTPHandler):

379

"""Handler for HTTP requests and responses.

380

381

This class, when installed with an OpenerDirector, automatically adds

382

the standard headers to every HTTP request and handles gzipped and

383

deflated responses from web servers. If compression is to be avoided in

384

a particular request, the original request in the program code only has

385

to include the HTTP header "Youtubedl-No-Compression", which will be

386

removed before making the real request.

387

388

Part of this code was copied from:

389

390

http://techknack.net/python-urllib2-handlers/

391

392

Andrew Rowls, the author of that code, agreed to release it to the

393

public domain.

394

"""

395

396

@staticmethod

397

def deflate(data):

398

try:

399

return zlib.decompress(data, -zlib.MAX_WBITS)

400

except zlib.error:

401

return zlib.decompress(data)

402

403

@staticmethod

404

def addinfourl_wrapper(stream, headers, url, code):

405

if hasattr(urllib2.addinfourl, 'getcode'):

406

return urllib2.addinfourl(stream, headers, url, code)

407

ret = urllib2.addinfourl(stream, headers, url)

408

ret.code = code

409

return ret

410

411

def http_request(self, req):

412

for h in std_headers:

413

if h in req.headers:

414

del req.headers[h]

415

req.add_header(h, std_headers[h])

416

if 'Youtubedl-no-compression' in req.headers:

417

if 'Accept-encoding' in req.headers:

418

del req.headers['Accept-encoding']

419

del req.headers['Youtubedl-no-compression']

420

return req

421

422

def http_response(self, req, resp):

423

old_resp = resp

424

# gzip

425

if resp.headers.get('Content-encoding', '') == 'gzip':

426

gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')

427

resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)

428

resp.msg = old_resp.msg

429

# deflate

430

if resp.headers.get('Content-encoding', '') == 'deflate':

431

gz = StringIO.StringIO(self.deflate(resp.read()))

432

resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)

433

resp.msg = old_resp.msg

434

return resp

435

436

437

class FileDownloader(object):

438

"""File Downloader class.

439

440

File downloader objects are the ones responsible of downloading the

441

actual video file and writing it to disk if the user has requested

442

it, among some other tasks. In most cases there should be one per

443

program. As, given a video URL, the downloader doesn't know how to

444

extract all the needed information, task that InfoExtractors do, it

445

has to pass the URL to one of them.

446

447

For this, file downloader objects have a method that allows

448

InfoExtractors to be registered in a given order. When it is passed

449

a URL, the file downloader handles it to the first InfoExtractor it

450

finds that reports being able to handle it. The InfoExtractor extracts

451

all the information about the video or videos the URL refers to, and

452

asks the FileDownloader to process the video information, possibly

453

downloading the video.

454

455

File downloaders accept a lot of parameters. In order not to saturate

456

the object constructor with arguments, it receives a dictionary of

457

options instead. These options are available through the params

458

attribute for the InfoExtractors to use. The FileDownloader also

459

registers itself as the downloader in charge for the InfoExtractors

460

that are added to it, so this is a "mutual registration".

461

462

Available options:

463

464

username: Username for authentication purposes.

465

password: Password for authentication purposes.

466

usenetrc: Use netrc for authentication instead.

467

quiet: Do not print messages to stdout.

468

forceurl: Force printing final URL.

469

forcetitle: Force printing title.

470

forcethumbnail: Force printing thumbnail URL.

471

forcedescription: Force printing description.

472

forcefilename: Force printing final filename.

473

simulate: Do not download the video files.

474

format: Video format code.

475

format_limit: Highest quality format to try.

476

outtmpl: Template for output names.

477

ignoreerrors: Do not stop on download errors.

478

ratelimit: Download speed limit, in bytes/sec.

479

nooverwrites: Prevent overwriting files.

480

retries: Number of times to retry for HTTP error 5xx

481

continuedl: Try to continue downloads if possible.

482

noprogress: Do not print the progress bar.

483

playliststart: Playlist item to start at.

484

playlistend: Playlist item to end at.

485

matchtitle: Download only matching titles.

486

rejecttitle: Reject downloads for matching titles.

487

logtostderr: Log messages to stderr instead of stdout.

488

consoletitle: Display progress in console window's titlebar.

489

nopart: Do not use temporary .part files.

490

updatetime: Use the Last-modified header to set output file timestamps.

491

writedescription: Write the video description to a .description file

492

writeinfojson: Write the video description to a .info.json file

493

"""

494

495

params = None

496

_ies = []

497

_pps = []

498

_download_retcode = None

499

_num_downloads = None

500

_screen_file = None

501

502

def __init__(self, params):

503

"""Create a FileDownloader object with the given options."""

504

self._ies = []

505

self._pps = []

506

self._download_retcode = 0

507

self._num_downloads = 0

508

self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]

509

self.params = params

510

511

@staticmethod

512

def format_bytes(bytes):

513

if bytes is None:

514

return 'N/A'

515

if type(bytes) is str:

516

bytes = float(bytes)

517

if bytes == 0.0:

518

exponent = 0

519

else:

520

exponent = long(math.log(bytes, 1024.0))

521

suffix = 'bkMGTPEZY'[exponent]

522

converted = float(bytes) / float(1024 ** exponent)

523

return '%.2f%s' % (converted, suffix)

524

525

@staticmethod

526

def calc_percent(byte_counter, data_len):

527

if data_len is None:

528

return '---.-%'

529

return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

530

531

@staticmethod

532

def calc_eta(start, now, total, current):

533

if total is None:

534

return '--:--'

535

dif = now - start

536

if current == 0 or dif < 0.001: # One millisecond

537

return '--:--'

538

rate = float(current) / dif

539

eta = long((float(total) - float(current)) / rate)

540

(eta_mins, eta_secs) = divmod(eta, 60)

541

if eta_mins > 99:

542

return '--:--'

543

return '%02d:%02d' % (eta_mins, eta_secs)

544

545

@staticmethod

546

def calc_speed(start, now, bytes):

547

dif = now - start

548

if bytes == 0 or dif < 0.001: # One millisecond

549

return '%10s' % '---b/s'

550

return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

551

552

@staticmethod

553

def best_block_size(elapsed_time, bytes):

554

new_min = max(bytes / 2.0, 1.0)

555

new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB

556

if elapsed_time < 0.001:

557

return long(new_max)

558

rate = bytes / elapsed_time

559

if rate > new_max:

560

return long(new_max)

561

if rate < new_min:

562

return long(new_min)

563

return long(rate)

564

565

@staticmethod

566

def parse_bytes(bytestr):

567

"""Parse a string indicating a byte quantity into a long integer."""

568

matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)

569

if matchobj is None:

570

return None

571

number = float(matchobj.group(1))

572

multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())

573

return long(round(number * multiplier))

574

575

def add_info_extractor(self, ie):

576

"""Add an InfoExtractor object to the end of the list."""

577

self._ies.append(ie)

578

ie.set_downloader(self)

579

580

def add_post_processor(self, pp):

581

"""Add a PostProcessor object to the end of the chain."""

582

self._pps.append(pp)

583

pp.set_downloader(self)

584

585

def to_screen(self, message, skip_eol=False):

586

"""Print message to stdout if not in quiet mode."""

587

assert type(message) == type(u'')

588

if not self.params.get('quiet', False):

589

terminator = [u'\n', u''][skip_eol]

590

output = message + terminator

591

592

if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr

593

output = output.encode(preferredencoding(), 'ignore')

594

self._screen_file.write(output)

595

self._screen_file.flush()

596

597

def to_stderr(self, message):

598

"""Print message to stderr."""

599

print >>sys.stderr, message.encode(preferredencoding())

600

601

def to_cons_title(self, message):

602

"""Set console/terminal window title to message."""

603

if not self.params.get('consoletitle', False):

604

return

605

if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():

606

# c_wchar_p() might not be necessary if `message` is

607

# already of type unicode()

608

ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))

609

elif 'TERM' in os.environ:

610

sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

611

612

def fixed_template(self):

613

"""Checks if the output template is fixed."""

614

return (re.search(ur'(?u)%$.+?$s', self.params['outtmpl']) is None)

615

616

def trouble(self, message=None):

617

"""Determine action to take when a download problem appears.

618

619

Depending on if the downloader has been configured to ignore

620

download errors or not, this method may throw an exception or

621

not when errors are found, after printing the message.

622

"""

623

if message is not None:

624

self.to_stderr(message)

625

if not self.params.get('ignoreerrors', False):

626

raise DownloadError(message)

627

self._download_retcode = 1

628

629

def slow_down(self, start_time, byte_counter):

630

"""Sleep if the download speed is over the rate limit."""

631

rate_limit = self.params.get('ratelimit', None)

632

if rate_limit is None or byte_counter == 0:

633

return

634

now = time.time()

635

elapsed = now - start_time

636

if elapsed <= 0.0:

637

return

638

speed = float(byte_counter) / elapsed

639

if speed > rate_limit:

640

time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

641

642

def temp_name(self, filename):

643

"""Returns a temporary filename for the given filename."""

644

if self.params.get('nopart', False) or filename == u'-' or \

645

(os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):

646

return filename

647

return filename + u'.part'

648

649

def undo_temp_name(self, filename):

650

if filename.endswith(u'.part'):

651

return filename[:-len(u'.part')]

652

return filename

653

654

def try_rename(self, old_filename, new_filename):

655

try:

656

if old_filename == new_filename:

657

return

658

os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))

659

except (IOError, OSError), err:

660

self.trouble(u'ERROR: unable to rename file')

661

662

def try_utime(self, filename, last_modified_hdr):

663

"""Try to set the last-modified time of the given file."""

664

if last_modified_hdr is None:

665

return

666

if not os.path.isfile(_encodeFilename(filename)):

667

return

668

timestr = last_modified_hdr

669

if timestr is None:

670

return

671

filetime = timeconvert(timestr)

672

if filetime is None:

673

return filetime

674

try:

675

os.utime(filename, (time.time(), filetime))

676

except:

677

pass

678

return filetime

679

680

def report_writedescription(self, descfn):

681

""" Report that the description file is being written """

682

self.to_screen(u'[info] Writing video description to: ' + descfn)

683

684

def report_writeinfojson(self, infofn):

685

""" Report that the metadata file has been written """

686

self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)

687

688

def report_destination(self, filename):

689

"""Report destination filename."""

690

self.to_screen(u'[download] Destination: ' + filename)

691

692

def report_progress(self, percent_str, data_len_str, speed_str, eta_str):

693

"""Report download progress."""

694

if self.params.get('noprogress', False):

695

return

696

self.to_screen(u'\r[download] %s of %s at %s ETA %s' %

697

(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

698

self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %

699

(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

700

701

def report_resuming_byte(self, resume_len):

702

"""Report attempt to resume at given byte."""

703

self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

704

705

def report_retry(self, count, retries):

706

"""Report retry in case of HTTP error 5xx"""

707

self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

708

709

def report_file_already_downloaded(self, file_name):

710

"""Report file has already been fully downloaded."""

711

try:

712

self.to_screen(u'[download] %s has already been downloaded' % file_name)

713

except (UnicodeEncodeError), err:

714

self.to_screen(u'[download] The file has already been downloaded')

715

716

def report_unable_to_resume(self):

717

"""Report it was impossible to resume download."""

718

self.to_screen(u'[download] Unable to resume')

719

720

def report_finish(self):

721

"""Report download finished."""

722

if self.params.get('noprogress', False):

723

self.to_screen(u'[download] Download completed')

724

else:

725

self.to_screen(u'')

726

727

def increment_downloads(self):

728

"""Increment the ordinal that assigns a number to each file."""

729

self._num_downloads += 1

730

731

def prepare_filename(self, info_dict):

732

"""Generate the output filename."""

733

try:

734

template_dict = dict(info_dict)

735

template_dict['epoch'] = unicode(long(time.time()))

736

template_dict['autonumber'] = unicode('%05d' % self._num_downloads)

737

filename = self.params['outtmpl'] % template_dict

738

return filename

739

except (ValueError, KeyError), err:

740

self.trouble(u'ERROR: invalid system charset or erroneous output template')

741

return None

742

743

def _match_entry(self, info_dict):

744

""" Returns None iff the file should be downloaded """

745

746

title = info_dict['title']

747

matchtitle = self.params.get('matchtitle', False)

748

if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):

749

return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'

750

rejecttitle = self.params.get('rejecttitle', False)

751

if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):

752

return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'

753

return None

754

755

def process_info(self, info_dict):

756

"""Process a single dictionary returned by an InfoExtractor."""

757

758

reason = self._match_entry(info_dict)

759

if reason is not None:

760

self.to_screen(u'[download] ' + reason)

761

return

762

763

max_downloads = self.params.get('max_downloads')

764

if max_downloads is not None:

765

if self._num_downloads > int(max_downloads):

766

raise MaxDownloadsReached()

767

768

filename = self.prepare_filename(info_dict)

769

770

# Forced printings

771

if self.params.get('forcetitle', False):

772

print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')

773

if self.params.get('forceurl', False):

774

print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')

775

if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:

776

print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')

777

if self.params.get('forcedescription', False) and 'description' in info_dict:

778

print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

779

if self.params.get('forcefilename', False) and filename is not None:

780

print filename.encode(preferredencoding(), 'xmlcharrefreplace')

781

if self.params.get('forceformat', False):

782

print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

783

784

# Do nothing else if in simulate mode

785

if self.params.get('simulate', False):

786

return

787

788

if filename is None:

789

return

790

791

try:

792

dn = os.path.dirname(_encodeFilename(filename))

793

if dn != '' and not os.path.exists(dn): # dn is already encoded

794

os.makedirs(dn)

795

except (OSError, IOError), err:

796

self.trouble(u'ERROR: unable to create directory ' + unicode(err))

797

return

798

799

if self.params.get('writedescription', False):

800

try:

801

descfn = filename + u'.description'

802

self.report_writedescription(descfn)

803

descfile = open(_encodeFilename(descfn), 'wb')

804

try:

805

descfile.write(info_dict['description'].encode('utf-8'))

806

finally:

807

descfile.close()

808

except (OSError, IOError):

809

self.trouble(u'ERROR: Cannot write description file ' + descfn)

810

return

811

812

if self.params.get('writeinfojson', False):

813

infofn = filename + u'.info.json'

814

self.report_writeinfojson(infofn)

815

try:

816

json.dump

817

except (NameError,AttributeError):

818

self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')

819

return

820

try:

821

infof = open(_encodeFilename(infofn), 'wb')

822

try:

823

json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))

824

json.dump(json_info_dict, infof)

825

finally:

826

infof.close()

827

except (OSError, IOError):

828

self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)

829

return

830

831

if not self.params.get('skip_download', False):

832

if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):

833

success = True

834

else:

835

try:

836

success = self._do_download(filename, info_dict)

837

except (OSError, IOError), err:

838

raise UnavailableVideoError

839

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

840

self.trouble(u'ERROR: unable to download video data: %s' % str(err))

841

return

842

except (ContentTooShortError, ), err:

843

self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

844

return

845

846

if success:

847

try:

848

self.post_process(filename, info_dict)

849

except (PostProcessingError), err:

850

self.trouble(u'ERROR: postprocessing: %s' % str(err))

851

return

852

853

def download(self, url_list):

854

"""Download a given list of URLs."""

855

if len(url_list) > 1 and self.fixed_template():

856

raise SameFileError(self.params['outtmpl'])

857

858

for url in url_list:

859

suitable_found = False

860

for ie in self._ies:

861

# Go to next InfoExtractor if not suitable

862

if not ie.suitable(url):

863

continue

864

865

# Suitable InfoExtractor found

866

suitable_found = True

867

868

# Extract information from URL and process it

869

ie.extract(url)

870

871

# Suitable InfoExtractor had been found; go to next URL

872

break

873

874

if not suitable_found:

875

self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

876

877

return self._download_retcode

878

879

def post_process(self, filename, ie_info):

880

"""Run the postprocessing chain on the given file."""

881

info = dict(ie_info)

882

info['filepath'] = filename

883

for pp in self._pps:

884

info = pp.run(info)

885

if info is None:

886

break

887

888

def _download_with_rtmpdump(self, filename, url, player_url):

889

self.report_destination(filename)

890

tmpfilename = self.temp_name(filename)

891

892

# Check for rtmpdump first

893

try:

894

subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)

895

except (OSError, IOError):

896

self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

897

return False

898

899

# Download using rtmpdump. rtmpdump returns exit code 2 when

900

# the connection was interrumpted and resuming appears to be

901

# possible. This is part of rtmpdump's normal usage, AFAIK.

902

basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]

903

args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]

904

if self.params.get('verbose', False):

905

try:

906

import pipes

907

shell_quote = lambda args: ' '.join(map(pipes.quote, args))

908

except ImportError:

909

shell_quote = repr

910

self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))

911

retval = subprocess.call(args)

912

while retval == 2 or retval == 1:

913

prevsize = os.path.getsize(_encodeFilename(tmpfilename))

914

self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)

915

time.sleep(5.0) # This seems to be needed

916

retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])

917

cursize = os.path.getsize(_encodeFilename(tmpfilename))

918

if prevsize == cursize and retval == 1:

919

break

920

# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those

921

if prevsize == cursize and retval == 2 and cursize > 1024:

922

self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')

923

retval = 0

924

break

925

if retval == 0:

926

self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))

927

self.try_rename(tmpfilename, filename)

928

return True

929

else:

930

self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)

931

return False

932

933

def _do_download(self, filename, info_dict):

934

url = info_dict['url']

935

player_url = info_dict.get('player_url', None)

936

937

# Check file already present

938

if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):

939

self.report_file_already_downloaded(filename)

940

return True

941

942

# Attempt to download using rtmpdump

943

if url.startswith('rtmp'):

944

return self._download_with_rtmpdump(filename, url, player_url)

945

946

tmpfilename = self.temp_name(filename)

947

stream = None

948

949

# Do not include the Accept-Encoding header

950

headers = {'Youtubedl-no-compression': 'True'}

951

basic_request = urllib2.Request(url, None, headers)

952

request = urllib2.Request(url, None, headers)

953

954

# Establish possible resume length

955

if os.path.isfile(_encodeFilename(tmpfilename)):

956

resume_len = os.path.getsize(_encodeFilename(tmpfilename))

957

else:

958

resume_len = 0

959

960

open_mode = 'wb'

961

if resume_len != 0:

962

if self.params.get('continuedl', False):

963

self.report_resuming_byte(resume_len)

964

request.add_header('Range','bytes=%d-' % resume_len)

965

open_mode = 'ab'

966

else:

967

resume_len = 0

968

969

count = 0

970

retries = self.params.get('retries', 0)

971

while count <= retries:

972

# Establish connection

973

try:

974

if count == 0 and 'urlhandle' in info_dict:

975

data = info_dict['urlhandle']

976

data = urllib2.urlopen(request)

977

break

978

except (urllib2.HTTPError, ), err:

979

if (err.code < 500 or err.code >= 600) and err.code != 416:

980

# Unexpected HTTP error

981

raise

982

elif err.code == 416:

983

# Unable to resume (requested range not satisfiable)

984

try:

985

# Open the connection again without the range header

986

data = urllib2.urlopen(basic_request)

987

content_length = data.info()['Content-Length']

988

except (urllib2.HTTPError, ), err:

989

if err.code < 500 or err.code >= 600:

990

raise

991

else:

992

# Examine the reported length

993

if (content_length is not None and

994

(resume_len - 100 < long(content_length) < resume_len + 100)):

995

# The file had already been fully downloaded.

996

# Explanation to the above condition: in issue #175 it was revealed that

997

# YouTube sometimes adds or removes a few bytes from the end of the file,

998

# changing the file size slightly and causing problems for some users. So

999

# I decided to implement a suggested change and consider the file

1000

# completely downloaded if the file size differs less than 100 bytes from

1001

# the one in the hard drive.

1002

self.report_file_already_downloaded(filename)

1003

self.try_rename(tmpfilename, filename)

1004

return True

1005

else:

1006

# The length does not match, we start the download over

1007

self.report_unable_to_resume()

1008

open_mode = 'wb'

1009

break

1010

# Retry

1011

count += 1

1012

if count <= retries:

1013

self.report_retry(count, retries)

1014

1015

if count > retries:

1016

self.trouble(u'ERROR: giving up after %s retries' % retries)

1017

return False

1018

1019

data_len = data.info().get('Content-length', None)

1020

if data_len is not None:

1021

data_len = long(data_len) + resume_len

1022

data_len_str = self.format_bytes(data_len)

1023

byte_counter = 0 + resume_len

1024

block_size = 1024

1025

start = time.time()

1026

while True:

1027

# Download and write

1028

before = time.time()

1029

data_block = data.read(block_size)

1030

after = time.time()

1031

if len(data_block) == 0:

1032

break

1033

byte_counter += len(data_block)

1034

1035

# Open file just in time

1036

if stream is None:

1037

try:

1038

(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)

1039

assert stream is not None

1040

filename = self.undo_temp_name(tmpfilename)

1041

self.report_destination(filename)

1042

except (OSError, IOError), err:

1043

self.trouble(u'ERROR: unable to open for writing: %s' % str(err))

1044

return False

1045

try:

1046

stream.write(data_block)

1047

except (IOError, OSError), err:

1048

self.trouble(u'\nERROR: unable to write data: %s' % str(err))

1049

return False

1050

block_size = self.best_block_size(after - before, len(data_block))

1051

1052

# Progress message

1053

speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)

1054

if data_len is None:

1055

self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')

1056

else:

1057

percent_str = self.calc_percent(byte_counter, data_len)

1058

eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)

1059

self.report_progress(percent_str, data_len_str, speed_str, eta_str)

1060

1061

# Apply rate limit

1062

self.slow_down(start, byte_counter - resume_len)

1063

1064

if stream is None:

1065

self.trouble(u'\nERROR: Did not get any data blocks')

1066

return False

1067

stream.close()

1068

self.report_finish()

1069

if data_len is not None and byte_counter != data_len:

1070

raise ContentTooShortError(byte_counter, long(data_len))

1071

self.try_rename(tmpfilename, filename)

1072

1073

# Update file modification time

1074

if self.params.get('updatetime', True):

1075

info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

1076

1077

return True

1078

1079

1080

class InfoExtractor(object):

1081

"""Information Extractor class.

1082

1083

Information extractors are the classes that, given a URL, extract

1084

information from the video (or videos) the URL refers to. This

1085

information includes the real video URL, the video title and simplified

1086

title, author and others. The information is stored in a dictionary

1087

which is then passed to the FileDownloader. The FileDownloader

1088

processes this information possibly downloading the video to the file

1089

system, among other possible outcomes. The dictionaries must include

1090

the following fields:

1091

1092

id: Video identifier.

1093

url: Final video URL.

1094

uploader: Nickname of the video uploader.

1095

title: Literal title.

1096

stitle: Simplified title.

1097

ext: Video filename extension.

1098

format: Video format.

1099

player_url: SWF Player URL (may be None).

1100

1101

The following fields are optional. Their primary purpose is to allow

1102

youtube-dl to serve as the backend for a video search function, such

1103

as the one in youtube2mp3. They are only used when their respective

1104

forced printing functions are called:

1105

1106

thumbnail: Full URL to a video thumbnail image.

1107

description: One-line video description.

1108

1109

Subclasses of this one should re-define the _real_initialize() and

1110

_real_extract() methods and define a _VALID_URL regexp.

1111

Probably, they should also be added to the list of extractors.

1112

"""

1113

1114

_ready = False

1115

_downloader = None

1116

1117

def __init__(self, downloader=None):

1118

"""Constructor. Receives an optional downloader."""

1119

self._ready = False

1120

self.set_downloader(downloader)

1121

1122

def suitable(self, url):

1123

"""Receives a URL and returns True if suitable for this IE."""

1124

return re.match(self._VALID_URL, url) is not None

1125

1126

def initialize(self):

1127

"""Initializes an instance (authentication, etc)."""

1128

if not self._ready:

1129

self._real_initialize()

1130

self._ready = True

1131

1132

def extract(self, url):

1133

"""Extracts URL information and returns it in list of dicts."""

1134

self.initialize()

1135

return self._real_extract(url)

1136

1137

def set_downloader(self, downloader):

1138

"""Sets the downloader for this IE."""

1139

self._downloader = downloader

1140

1141

def _real_initialize(self):

1142

"""Real initialization process. Redefine in subclasses."""

1143

pass

1144

1145

def _real_extract(self, url):

1146

"""Real extraction process. Redefine in subclasses."""

1147

pass

1148

1149

1150

class YoutubeIE(InfoExtractor):

1151

"""Information extractor for youtube.com."""

1152

1153

1154

_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'

1155

_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'

1156

_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'

1157

_NETRC_MACHINE = 'youtube'

1158

# Listed in order of quality

1159

_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']

1160

_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']

1161

_video_extensions = {

1162

'13': '3gp',

1163

'17': 'mp4',

1164

'18': 'mp4',

1165

'22': 'mp4',

1166

'37': 'mp4',

1167

'38': 'video', # You actually don't know if this will be MOV, AVI or whatever

1168

'43': 'webm',

1169

'44': 'webm',

1170

'45': 'webm',

1171

}

1172

_video_dimensions = {

1173

'5': '240x400',

1174

'6': '???',

1175

'13': '???',

1176

'17': '144x176',

1177

'18': '360x640',

1178

'22': '720x1280',

1179

'34': '360x640',

1180

'35': '480x854',

1181

'37': '1080x1920',

1182

'38': '3072x4096',

1183

'43': '360x640',

1184

'44': '480x854',

1185

'45': '720x1280',

1186

}

1187

IE_NAME = u'youtube'

1188

1189

def report_lang(self):

1190

"""Report attempt to set language."""

1191

self._downloader.to_screen(u'[youtube] Setting language')

1192

1193

def report_login(self):

1194

"""Report attempt to log in."""

1195

self._downloader.to_screen(u'[youtube] Logging in')

1196

1197

def report_age_confirmation(self):

1198

"""Report attempt to confirm age."""

1199

self._downloader.to_screen(u'[youtube] Confirming age')

1200

1201

def report_video_webpage_download(self, video_id):

1202

"""Report attempt to download video webpage."""

1203

self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

1204

1205

def report_video_info_webpage_download(self, video_id):

1206

"""Report attempt to download video info webpage."""

1207

self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

1208

1209

def report_information_extraction(self, video_id):

1210

"""Report attempt to extract video information."""

1211

self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

1212

1213

def report_unavailable_format(self, video_id, format):

1214

"""Report extracted video URL."""

1215

self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

1216

1217

def report_rtmp_download(self):

1218

"""Indicate the download will use the RTMP protocol."""

1219

self._downloader.to_screen(u'[youtube] RTMP download detected')

1220

1221

def _print_formats(self, formats):

1222

print 'Available formats:'

1223

for x in formats:

1224

print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

1225

1226

def _real_initialize(self):

1227

if self._downloader is None:

1228

return

1229

1230

username = None

1231

password = None

1232

downloader_params = self._downloader.params

1233

1234

# Attempt to use provided username and password or .netrc data

1235

if downloader_params.get('username', None) is not None:

1236

username = downloader_params['username']

1237

password = downloader_params['password']

1238

elif downloader_params.get('usenetrc', False):

1239

try:

1240

info = netrc.netrc().authenticators(self._NETRC_MACHINE)

1241

if info is not None:

1242

username = info[0]

1243

password = info[2]

1244

else:

1245

raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)

1246

except (IOError, netrc.NetrcParseError), err:

1247

self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

1248

return

1249

1250

# Set language

1251

request = urllib2.Request(self._LANG_URL)

1252

try:

1253

self.report_lang()

1254

urllib2.urlopen(request).read()

1255

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1256

self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

1257

return

1258

1259

# No authentication to be performed

1260

if username is None:

1261

return

1262

1263

# Log in

1264

login_form = {

1265

'current_form': 'loginForm',

1266

'next': '/',

1267

'action_login': 'Log In',

1268

'username': username,

1269

'password': password,

1270

}

1271

request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))

1272

try:

1273

self.report_login()

1274

login_results = urllib2.urlopen(request).read()

1275

if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:

1276

self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')

1277

return

1278

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1279

self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

1280

return

1281

1282

# Confirm age

1283

age_form = {

1284

'next_url': '/',

1285

'action_confirm': 'Confirm',

1286

}

1287

request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))

1288

try:

1289

self.report_age_confirmation()

1290

age_results = urllib2.urlopen(request).read()

1291

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1292

self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

1293

return

1294

1295

def _real_extract(self, url):

1296

# Extract video id from URL

1297

mobj = re.match(self._VALID_URL, url)

1298

if mobj is None:

1299

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

1300

return

1301

video_id = mobj.group(2)

1302

1303

# Get video webpage

1304

self.report_video_webpage_download(video_id)

1305

request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)

1306

try:

1307

video_webpage = urllib2.urlopen(request).read()

1308

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1309

self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

1310

return

1311

1312

# Attempt to extract SWF player URL

1313

mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)

1314

if mobj is not None:

1315

player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

1316

else:

1317

player_url = None

1318

1319

# Get video info

1320

self.report_video_info_webpage_download(video_id)

1321

for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:

1322

video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'

1323

% (video_id, el_type))

1324

request = urllib2.Request(video_info_url)

1325

try:

1326

video_info_webpage = urllib2.urlopen(request).read()

1327

video_info = parse_qs(video_info_webpage)

1328

if 'token' in video_info:

1329

break

1330

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1331

self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

1332

return

1333

if 'token' not in video_info:

1334

if 'reason' in video_info:

1335

self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))

1336

else:

1337

self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

1338

return

1339

1340

# Start extracting information

1341

self.report_information_extraction(video_id)

1342

1343

# uploader

1344

if 'author' not in video_info:

1345

self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

1346

return

1347

video_uploader = urllib.unquote_plus(video_info['author'][0])

1348

1349

# title

1350

if 'title' not in video_info:

1351

self._downloader.trouble(u'ERROR: unable to extract video title')

1352

return

1353

video_title = urllib.unquote_plus(video_info['title'][0])

1354

video_title = video_title.decode('utf-8')

1355

video_title = sanitize_title(video_title)

1356

1357

# simplified title

1358

simple_title = _simplify_title(video_title)

1359

1360

# thumbnail image

1361

if 'thumbnail_url' not in video_info:

1362

self._downloader.trouble(u'WARNING: unable to extract video thumbnail')

1363

video_thumbnail = ''

1364

else: # don't panic if we can't find it

1365

video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

1366

1367

# upload date

1368

upload_date = u'NA'

1369

mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)

1370

if mobj is not None:

1371

upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())

1372

format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']

1373

for expression in format_expressions:

1374

try:

1375

upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

1376

except:

1377

pass

1378

1379

# description

1380

try:

1381

lxml.etree

1382

except NameError:

1383

video_description = u'No description available.'

1384

mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)

1385

if mobj is not None:

1386

video_description = mobj.group(1).decode('utf-8')

1387

else:

1388

html_parser = lxml.etree.HTMLParser(encoding='utf-8')

1389

vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)

1390

video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))

1391

# TODO use another parser

1392

1393

# token

1394

video_token = urllib.unquote_plus(video_info['token'][0])

1395

1396

# Decide which formats to download

1397

req_format = self._downloader.params.get('format', None)

1398

1399

if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):

1400

self.report_rtmp_download()

1401

video_url_list = [(None, video_info['conn'][0])]

1402

elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:

1403

url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')

1404

url_data = [parse_qs(uds) for uds in url_data_strs]

1405

url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)

1406

url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

1407

1408

format_limit = self._downloader.params.get('format_limit', None)

1409

available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats

1410

if format_limit is not None and format_limit in available_formats:

1411

format_list = available_formats[available_formats.index(format_limit):]

1412

else:

1413

format_list = available_formats

1414

existing_formats = [x for x in format_list if x in url_map]

1415

if len(existing_formats) == 0:

1416

self._downloader.trouble(u'ERROR: no known formats available for video')

1417

return

1418

if self._downloader.params.get('listformats', None):

1419

self._print_formats(existing_formats)

1420

return

1421

if req_format is None or req_format == 'best':

1422

video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality

1423

elif req_format == 'worst':

1424

video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality

1425

elif req_format in ('-1', 'all'):

1426

video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats

1427

else:

1428

# Specific formats. We pick the first in a slash-delimeted sequence.

1429

# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.

1430

req_formats = req_format.split('/')

1431

video_url_list = None

1432

for rf in req_formats:

1433

if rf in url_map:

1434

video_url_list = [(rf, url_map[rf])]

1435

break

1436

if video_url_list is None:

1437

self._downloader.trouble(u'ERROR: requested format not available')

1438

return

1439

else:

1440

self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

1441

return

1442

1443

for format_param, video_real_url in video_url_list:

1444

# At this point we have a new video

1445

self._downloader.increment_downloads()

1446

1447

# Extension

1448

video_extension = self._video_extensions.get(format_param, 'flv')

1449

1450

try:

1451

# Process video information

1452

self._downloader.process_info({

1453

'id': video_id.decode('utf-8'),

1454

'url': video_real_url.decode('utf-8'),

1455

'uploader': video_uploader.decode('utf-8'),

1456

'upload_date': upload_date,

1457

'title': video_title,

1458

'stitle': simple_title,

1459

'ext': video_extension.decode('utf-8'),

1460

'format': (format_param is None and u'NA' or format_param.decode('utf-8')),

1461

'thumbnail': video_thumbnail.decode('utf-8'),

1462

'description': video_description,

1463

'player_url': player_url,

1464

})

1465

except UnavailableVideoError, err:

1466

self._downloader.trouble(u'\nERROR: unable to download video')

1467

1468

1469

class MetacafeIE(InfoExtractor):

1470

"""Information Extractor for metacafe.com."""

1471

1472

_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'

1473

_DISCLAIMER = 'http://www.metacafe.com/family_filter/'

1474

_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

1475

_youtube_ie = None

1476

IE_NAME = u'metacafe'

1477

1478

def __init__(self, youtube_ie, downloader=None):

1479

InfoExtractor.__init__(self, downloader)

1480

self._youtube_ie = youtube_ie

1481

1482

def report_disclaimer(self):

1483

"""Report disclaimer retrieval."""

1484

self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

1485

1486

def report_age_confirmation(self):

1487

"""Report attempt to confirm age."""

1488

self._downloader.to_screen(u'[metacafe] Confirming age')

1489

1490

def report_download_webpage(self, video_id):

1491

"""Report webpage download."""

1492

self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

1493

1494

def report_extraction(self, video_id):

1495

"""Report information extraction."""

1496

self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

1497

1498

def _real_initialize(self):

1499

# Retrieve disclaimer

1500

request = urllib2.Request(self._DISCLAIMER)

1501

try:

1502

self.report_disclaimer()

1503

disclaimer = urllib2.urlopen(request).read()

1504

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1505

self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

1506

return

1507

1508

# Confirm age

1509

disclaimer_form = {

1510

'filters': '0',

1511

'submit': "Continue - I'm over 18",

1512

}

1513

request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))

1514

try:

1515

self.report_age_confirmation()

1516

disclaimer = urllib2.urlopen(request).read()

1517

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1518

self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

1519

return

1520

1521

def _real_extract(self, url):

1522

# Extract id and simplified title from URL

1523

mobj = re.match(self._VALID_URL, url)

1524

if mobj is None:

1525

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

1526

return

1527

1528

video_id = mobj.group(1)

1529

1530

# Check if video comes from YouTube

1531

mobj2 = re.match(r'^yt-(.*)$', video_id)

1532

if mobj2 is not None:

1533

self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

1534

return

1535

1536

# At this point we have a new video

1537

self._downloader.increment_downloads()

1538

1539

simple_title = mobj.group(2).decode('utf-8')

1540

1541

# Retrieve video webpage to extract further information

1542

request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)

1543

try:

1544

self.report_download_webpage(video_id)

1545

webpage = urllib2.urlopen(request).read()

1546

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1547

self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

1548

return

1549

1550

# Extract URL, uploader and title from webpage

1551

self.report_extraction(video_id)

1552

mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)

1553

if mobj is not None:

1554

mediaURL = urllib.unquote(mobj.group(1))

1555

video_extension = mediaURL[-3:]

1556

1557

# Extract gdaKey if available

1558

mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)

1559

if mobj is None:

1560

video_url = mediaURL

1561

else:

1562

gdaKey = mobj.group(1)

1563

video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

1564

else:

1565

mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)

1566

if mobj is None:

1567

self._downloader.trouble(u'ERROR: unable to extract media URL')

1568

return

1569

vardict = parse_qs(mobj.group(1))

1570

if 'mediaData' not in vardict:

1571

self._downloader.trouble(u'ERROR: unable to extract media URL')

1572

return

1573

mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])

1574

if mobj is None:

1575

self._downloader.trouble(u'ERROR: unable to extract media URL')

1576

return

1577

mediaURL = mobj.group(1).replace('\\/', '/')

1578

video_extension = mediaURL[-3:]

1579

video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

1580

1581

mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)

1582

if mobj is None:

1583

self._downloader.trouble(u'ERROR: unable to extract title')

1584

return

1585

video_title = mobj.group(1).decode('utf-8')

1586

video_title = sanitize_title(video_title)

1587

1588

mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)

1589

if mobj is None:

1590

self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

1591

return

1592

video_uploader = mobj.group(1)

1593

1594

try:

1595

# Process video information

1596

self._downloader.process_info({

1597

'id': video_id.decode('utf-8'),

1598

'url': video_url.decode('utf-8'),

1599

'uploader': video_uploader.decode('utf-8'),

1600

'upload_date': u'NA',

1601

'title': video_title,

1602

'stitle': simple_title,

1603

'ext': video_extension.decode('utf-8'),

1604

'format': u'NA',

1605

'player_url': None,

1606

})

1607

except UnavailableVideoError:

1608

self._downloader.trouble(u'\nERROR: unable to download video')

1609

1610

1611

class DailymotionIE(InfoExtractor):

1612

"""Information Extractor for Dailymotion"""

1613

1614

_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

1615

IE_NAME = u'dailymotion'

1616

1617

def __init__(self, downloader=None):

1618

InfoExtractor.__init__(self, downloader)

1619

1620

def report_download_webpage(self, video_id):

1621

"""Report webpage download."""

1622

self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

1623

1624

def report_extraction(self, video_id):

1625

"""Report information extraction."""

1626

self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

1627

1628

def _real_extract(self, url):

1629

# Extract id and simplified title from URL

1630

mobj = re.match(self._VALID_URL, url)

1631

if mobj is None:

1632

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

1633

return

1634

1635

# At this point we have a new video

1636

self._downloader.increment_downloads()

1637

video_id = mobj.group(1)

1638

1639

video_extension = 'flv'

1640

1641

# Retrieve video webpage to extract further information

1642

request = urllib2.Request(url)

1643

request.add_header('Cookie', 'family_filter=off')

1644

try:

1645

self.report_download_webpage(video_id)

1646

webpage = urllib2.urlopen(request).read()

1647

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1648

self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

1649

return

1650

1651

# Extract URL, uploader and title from webpage

1652

self.report_extraction(video_id)

1653

mobj = re.search(r'(?i)addVariable$\"sequence\"\s*,\s*\"([^\"]+?)\"$', webpage)

1654

if mobj is None:

1655

self._downloader.trouble(u'ERROR: unable to extract media URL')

1656

return

1657

sequence = urllib.unquote(mobj.group(1))

1658

mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)

1659

if mobj is None:

1660

self._downloader.trouble(u'ERROR: unable to extract media URL')

1661

return

1662

mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

1663

1664

# if needed add http://www.dailymotion.com/ if relative URL

1665

1666

video_url = mediaURL

1667

1668

mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)

1669

if mobj is None:

1670

self._downloader.trouble(u'ERROR: unable to extract title')

1671

return

1672

video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))

1673

video_title = sanitize_title(video_title)

1674

simple_title = _simplify_title(video_title)

1675

1676

mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)

1677

if mobj is None:

1678

self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

1679

return

1680

video_uploader = mobj.group(1)

1681

1682

try:

1683

# Process video information

1684

self._downloader.process_info({

1685

'id': video_id.decode('utf-8'),

1686

'url': video_url.decode('utf-8'),

1687

'uploader': video_uploader.decode('utf-8'),

1688

'upload_date': u'NA',

1689

'title': video_title,

1690

'stitle': simple_title,

1691

'ext': video_extension.decode('utf-8'),

1692

'format': u'NA',

1693

'player_url': None,

1694

})

1695

except UnavailableVideoError:

1696

self._downloader.trouble(u'\nERROR: unable to download video')

1697

1698

1699

class GoogleIE(InfoExtractor):

1700

"""Information extractor for video.google.com."""

1701

1702

_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

1703

IE_NAME = u'video.google'

1704

1705

def __init__(self, downloader=None):

1706

InfoExtractor.__init__(self, downloader)

1707

1708

def report_download_webpage(self, video_id):

1709

"""Report webpage download."""

1710

self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

1711

1712

def report_extraction(self, video_id):

1713

"""Report information extraction."""

1714

self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

1715

1716

def _real_extract(self, url):

1717

# Extract id from URL

1718

mobj = re.match(self._VALID_URL, url)

1719

if mobj is None:

1720

self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

1721

return

1722

1723

# At this point we have a new video

1724

self._downloader.increment_downloads()

1725

video_id = mobj.group(1)

1726

1727

video_extension = 'mp4'

1728

1729

# Retrieve video webpage to extract further information

1730

request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)

1731

try:

1732

self.report_download_webpage(video_id)

1733

webpage = urllib2.urlopen(request).read()

1734

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1735

self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

1736

return

1737

1738

# Extract URL, uploader, and title from webpage

1739

self.report_extraction(video_id)

1740

mobj = re.search(r"download_url:'([^']+)'", webpage)

1741

if mobj is None:

1742

video_extension = 'flv'

1743

mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)

1744

if mobj is None:

1745

self._downloader.trouble(u'ERROR: unable to extract media URL')

1746

return

1747

mediaURL = urllib.unquote(mobj.group(1))

1748

mediaURL = mediaURL.replace('\\x3d', '\x3d')

1749

mediaURL = mediaURL.replace('\\x26', '\x26')

1750

1751

video_url = mediaURL

1752

1753

mobj = re.search(r'<title>(.*)</title>', webpage)

1754

if mobj is None:

1755

self._downloader.trouble(u'ERROR: unable to extract title')

1756

return

1757

video_title = mobj.group(1).decode('utf-8')

1758

video_title = sanitize_title(video_title)

1759

simple_title = _simplify_title(video_title)

1760

1761

# Extract video description

1762

mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)

1763

if mobj is None:

1764

self._downloader.trouble(u'ERROR: unable to extract video description')

1765

return

1766

video_description = mobj.group(1).decode('utf-8')

1767

if not video_description:

1768

video_description = 'No description available.'

1769

1770

# Extract video thumbnail

1771

if self._downloader.params.get('forcethumbnail', False):

1772

request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))

1773

try:

1774

webpage = urllib2.urlopen(request).read()

1775

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1776

self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

1777

return

1778

mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)

1779

if mobj is None:

1780

self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

1781

return

1782

video_thumbnail = mobj.group(1)

1783

else: # we need something to pass to process_info

1784

video_thumbnail = ''

1785

1786

try:

1787

# Process video information

1788

self._downloader.process_info({

1789

'id': video_id.decode('utf-8'),

1790

'url': video_url.decode('utf-8'),

1791

'uploader': u'NA',

1792

'upload_date': u'NA',

1793

'title': video_title,

1794

'stitle': simple_title,

1795

'ext': video_extension.decode('utf-8'),

1796

'format': u'NA',

1797

'player_url': None,

1798

})

1799

except UnavailableVideoError:

1800

self._downloader.trouble(u'\nERROR: unable to download video')

1801

1802

1803

class PhotobucketIE(InfoExtractor):

1804

"""Information extractor for photobucket.com."""

1805

1806

_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

1807

IE_NAME = u'photobucket'

1808

1809

def __init__(self, downloader=None):

1810

InfoExtractor.__init__(self, downloader)

1811

1812

def report_download_webpage(self, video_id):

1813

"""Report webpage download."""

1814

self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

1815

1816

def report_extraction(self, video_id):

1817

"""Report information extraction."""

1818

self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

1819

1820

def _real_extract(self, url):

1821

# Extract id from URL

1822

mobj = re.match(self._VALID_URL, url)

1823

if mobj is None:

1824

self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

1825

return

1826

1827

# At this point we have a new video

1828

self._downloader.increment_downloads()

1829

video_id = mobj.group(1)

1830

1831

video_extension = 'flv'

1832

1833

# Retrieve video webpage to extract further information

1834

request = urllib2.Request(url)

1835

try:

1836

self.report_download_webpage(video_id)

1837

webpage = urllib2.urlopen(request).read()

1838

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1839

self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

1840

return

1841

1842

# Extract URL, uploader, and title from webpage

1843

self.report_extraction(video_id)

1844

mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)

1845

if mobj is None:

1846

self._downloader.trouble(u'ERROR: unable to extract media URL')

1847

return

1848

mediaURL = urllib.unquote(mobj.group(1))

1849

1850

video_url = mediaURL

1851

1852

mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)

1853

if mobj is None:

1854

self._downloader.trouble(u'ERROR: unable to extract title')

1855

return

1856

video_title = mobj.group(1).decode('utf-8')

1857

video_title = sanitize_title(video_title)

1858

simple_title = _simplify_title(vide_title)

1859

1860

video_uploader = mobj.group(2).decode('utf-8')

1861

1862

try:

1863

# Process video information

1864

self._downloader.process_info({

1865

'id': video_id.decode('utf-8'),

1866

'url': video_url.decode('utf-8'),

1867

'uploader': video_uploader,

1868

'upload_date': u'NA',

1869

'title': video_title,

1870

'stitle': simple_title,

1871

'ext': video_extension.decode('utf-8'),

1872

'format': u'NA',

1873

'player_url': None,

1874

})

1875

except UnavailableVideoError:

1876

self._downloader.trouble(u'\nERROR: unable to download video')

1877

1878

1879

class YahooIE(InfoExtractor):

1880

"""Information extractor for video.yahoo.com."""

1881

1882

# _VALID_URL matches all Yahoo! Video URLs

1883

# _VPAGE_URL matches only the extractable '/watch/' URLs

1884

_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'

1885

_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

1886

IE_NAME = u'video.yahoo'

1887

1888

def __init__(self, downloader=None):

1889

InfoExtractor.__init__(self, downloader)

1890

1891

def report_download_webpage(self, video_id):

1892

"""Report webpage download."""

1893

self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

1894

1895

def report_extraction(self, video_id):

1896

"""Report information extraction."""

1897

self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

1898

1899

def _real_extract(self, url, new_video=True):

1900

# Extract ID from URL

1901

mobj = re.match(self._VALID_URL, url)

1902

if mobj is None:

1903

self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

1904

return

1905

1906

# At this point we have a new video

1907

self._downloader.increment_downloads()

1908

video_id = mobj.group(2)

1909

video_extension = 'flv'

1910

1911

# Rewrite valid but non-extractable URLs as

1912

# extractable English language /watch/ URLs

1913

if re.match(self._VPAGE_URL, url) is None:

1914

request = urllib2.Request(url)

1915

try:

1916

webpage = urllib2.urlopen(request).read()

1917

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1918

self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

1919

return

1920

1921

mobj = re.search(r'$"id", "([0-9]+)"$;', webpage)

1922

if mobj is None:

1923

self._downloader.trouble(u'ERROR: Unable to extract id field')

1924

return

1925

yahoo_id = mobj.group(1)

1926

1927

mobj = re.search(r'$"vid", "([0-9]+)"$;', webpage)

1928

if mobj is None:

1929

self._downloader.trouble(u'ERROR: Unable to extract vid field')

1930

return

1931

yahoo_vid = mobj.group(1)

1932

1933

url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)

1934

return self._real_extract(url, new_video=False)

1935

1936

# Retrieve video webpage to extract further information

1937

request = urllib2.Request(url)

1938

try:

1939

self.report_download_webpage(video_id)

1940

webpage = urllib2.urlopen(request).read()

1941

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

1942

self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

1943

return

1944

1945

# Extract uploader and title from webpage

1946

self.report_extraction(video_id)

1947

mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)

1948

if mobj is None:

1949

self._downloader.trouble(u'ERROR: unable to extract video title')

1950

return

1951

video_title = mobj.group(1).decode('utf-8')

1952

simple_title = _simplify_title(video_title)

1953

1954

mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)

1955

if mobj is None:

1956

self._downloader.trouble(u'ERROR: unable to extract video uploader')

1957

return

1958

video_uploader = mobj.group(1).decode('utf-8')

1959

1960

# Extract video thumbnail

1961

mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)

1962

if mobj is None:

1963

self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

1964

return

1965

video_thumbnail = mobj.group(1).decode('utf-8')

1966

1967

# Extract video description

1968

mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)

1969

if mobj is None:

1970

self._downloader.trouble(u'ERROR: unable to extract video description')

1971

return

1972

video_description = mobj.group(1).decode('utf-8')

1973

if not video_description:

1974

video_description = 'No description available.'

1975

1976

# Extract video height and width

1977

mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)

1978

if mobj is None:

1979

self._downloader.trouble(u'ERROR: unable to extract video height')

1980

return

1981

yv_video_height = mobj.group(1)

1982

1983

mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)

1984

if mobj is None:

1985

self._downloader.trouble(u'ERROR: unable to extract video width')

1986

return

1987

yv_video_width = mobj.group(1)

1988

1989

# Retrieve video playlist to extract media URL

1990

# I'm not completely sure what all these options are, but we

1991

# seem to need most of them, otherwise the server sends a 401.

1992

yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents

1993

yv_bitrate = '700' # according to Wikipedia this is hard-coded

1994

request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +

1995

'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +

1996

'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')

1997

try:

1998

self.report_download_webpage(video_id)

1999

webpage = urllib2.urlopen(request).read()

2000

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2001

self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

2002

return

2003

2004

# Extract media URL from playlist XML

2005

mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)

2006

if mobj is None:

2007

self._downloader.trouble(u'ERROR: Unable to extract media URL')

2008

return

2009

video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')

2010

video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

2011

2012

try:

2013

# Process video information

2014

self._downloader.process_info({

2015

'id': video_id.decode('utf-8'),

2016

'url': video_url,

2017

'uploader': video_uploader,

2018

'upload_date': u'NA',

2019

'title': video_title,

2020

'stitle': simple_title,

2021

'ext': video_extension.decode('utf-8'),

2022

'thumbnail': video_thumbnail.decode('utf-8'),

2023

'description': video_description,

2024

'thumbnail': video_thumbnail,

2025

'player_url': None,

2026

})

2027

except UnavailableVideoError:

2028

self._downloader.trouble(u'\nERROR: unable to download video')

2029

2030

2031

class VimeoIE(InfoExtractor):

2032

"""Information extractor for vimeo.com."""

2033

2034

# _VALID_URL matches Vimeo URLs

2035

_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

2036

IE_NAME = u'vimeo'

2037

2038

def __init__(self, downloader=None):

2039

InfoExtractor.__init__(self, downloader)

2040

2041

def report_download_webpage(self, video_id):

2042

"""Report webpage download."""

2043

self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

2044

2045

def report_extraction(self, video_id):

2046

"""Report information extraction."""

2047

self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

2048

2049

def _real_extract(self, url, new_video=True):

2050

# Extract ID from URL

2051

mobj = re.match(self._VALID_URL, url)

2052

if mobj is None:

2053

self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

2054

return

2055

2056

# At this point we have a new video

2057

self._downloader.increment_downloads()

2058

video_id = mobj.group(1)

2059

2060

# Retrieve video webpage to extract further information

2061

request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)

2062

try:

2063

self.report_download_webpage(video_id)

2064

webpage = urllib2.urlopen(request).read()

2065

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2066

self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

2067

return

2068

2069

# Now we begin extracting as much information as we can from what we

2070

# retrieved. First we extract the information common to all extractors,

2071

# and latter we extract those that are Vimeo specific.

2072

self.report_extraction(video_id)

2073

2074

# Extract title

2075

mobj = re.search(r'<caption>(.*?)</caption>', webpage)

2076

if mobj is None:

2077

self._downloader.trouble(u'ERROR: unable to extract video title')

2078

return

2079

video_title = mobj.group(1).decode('utf-8')

2080

simple_title = _simplify_title(video_title)

2081

2082

# Extract uploader

2083

mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)

2084

if mobj is None:

2085

self._downloader.trouble(u'ERROR: unable to extract video uploader')

2086

return

2087

video_uploader = mobj.group(1).decode('utf-8')

2088

2089

# Extract video thumbnail

2090

mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)

2091

if mobj is None:

2092

self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

2093

return

2094

video_thumbnail = mobj.group(1).decode('utf-8')

2095

2096

# # Extract video description

2097

# mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)

2098

# if mobj is None:

2099

# self._downloader.trouble(u'ERROR: unable to extract video description')

2100

# return

2101

# video_description = mobj.group(1).decode('utf-8')

2102

# if not video_description: video_description = 'No description available.'

2103

video_description = 'Foo.'

2104

2105

# Vimeo specific: extract request signature

2106

mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)

2107

if mobj is None:

2108

self._downloader.trouble(u'ERROR: unable to extract request signature')

2109

return

2110

sig = mobj.group(1).decode('utf-8')

2111

2112

# Vimeo specific: extract video quality information

2113

mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)

2114

if mobj is None:

2115

self._downloader.trouble(u'ERROR: unable to extract video quality information')

2116

return

2117

quality = mobj.group(1).decode('utf-8')

2118

2119

if int(quality) == 1:

2120

quality = 'hd'

2121

else:

2122

quality = 'sd'

2123

2124

# Vimeo specific: Extract request signature expiration

2125

mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)

2126

if mobj is None:

2127

self._downloader.trouble(u'ERROR: unable to extract request signature expiration')

2128

return

2129

sig_exp = mobj.group(1).decode('utf-8')

2130

2131

video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)

2132

2133

try:

2134

# Process video information

2135

self._downloader.process_info({

2136

'id': video_id.decode('utf-8'),

2137

'url': video_url,

2138

'uploader': video_uploader,

2139

'upload_date': u'NA',

2140

'title': video_title,

2141

'stitle': simple_title,

2142

'ext': u'mp4',

2143

'thumbnail': video_thumbnail.decode('utf-8'),

2144

'description': video_description,

2145

'thumbnail': video_thumbnail,

2146

'description': video_description,

2147

'player_url': None,

2148

})

2149

except UnavailableVideoError:

2150

self._downloader.trouble(u'ERROR: unable to download video')

2151

2152

2153

class GenericIE(InfoExtractor):

2154

"""Generic last-resort information extractor."""

2155

2156

_VALID_URL = r'.*'

2157

IE_NAME = u'generic'

2158

2159

def __init__(self, downloader=None):

2160

InfoExtractor.__init__(self, downloader)

2161

2162

def report_download_webpage(self, video_id):

2163

"""Report webpage download."""

2164

self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')

2165

self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

2166

2167

def report_extraction(self, video_id):

2168

"""Report information extraction."""

2169

self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

2170

2171

def _real_extract(self, url):

2172

# At this point we have a new video

2173

self._downloader.increment_downloads()

2174

2175

video_id = url.split('/')[-1]

2176

request = urllib2.Request(url)

2177

try:

2178

self.report_download_webpage(video_id)

2179

webpage = urllib2.urlopen(request).read()

2180

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2181

self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

2182

return

2183

except ValueError, err:

2184

# since this is the last-resort InfoExtractor, if

2185

# this error is thrown, it'll be thrown here

2186

self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

2187

return

2188

2189

self.report_extraction(video_id)

2190

# Start with something easy: JW Player in SWFObject

2191

mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)

2192

if mobj is None:

2193

# Broaden the search a little bit

2194

mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)

2195

if mobj is None:

2196

self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

2197

return

2198

2199

# It's possible that one of the regexes

2200

# matched, but returned an empty group:

2201

if mobj.group(1) is None:

2202

self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

2203

return

2204

2205

video_url = urllib.unquote(mobj.group(1))

2206

video_id = os.path.basename(video_url)

2207

2208

# here's a fun little line of code for you:

2209

video_extension = os.path.splitext(video_id)[1][1:]

2210

video_id = os.path.splitext(video_id)[0]

2211

2212

# it's tempting to parse this further, but you would

2213

# have to take into account all the variations like

2214

# Video Title - Site Name

2215

# Site Name | Video Title

2216

# Video Title - Tagline | Site Name

2217

# and so on and so forth; it's just not practical

2218

mobj = re.search(r'<title>(.*)</title>', webpage)

2219

if mobj is None:

2220

self._downloader.trouble(u'ERROR: unable to extract title')

2221

return

2222

video_title = mobj.group(1).decode('utf-8')

2223

video_title = sanitize_title(video_title)

2224

simple_title = _simplify_title(video_title)

2225

2226

# video uploader is domain name

2227

mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)

2228

if mobj is None:

2229

self._downloader.trouble(u'ERROR: unable to extract title')

2230

return

2231

video_uploader = mobj.group(1).decode('utf-8')

2232

2233

try:

2234

# Process video information

2235

self._downloader.process_info({

2236

'id': video_id.decode('utf-8'),

2237

'url': video_url.decode('utf-8'),

2238

'uploader': video_uploader,

2239

'upload_date': u'NA',

2240

'title': video_title,

2241

'stitle': simple_title,

2242

'ext': video_extension.decode('utf-8'),

2243

'format': u'NA',

2244

'player_url': None,

2245

})

2246

except UnavailableVideoError, err:

2247

self._downloader.trouble(u'\nERROR: unable to download video')

2248

2249

2250

class YoutubeSearchIE(InfoExtractor):

2251

"""Information Extractor for YouTube search queries."""

2252

_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'

2253

_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'

2254

_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'

2255

_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

2256

_youtube_ie = None

2257

_max_youtube_results = 1000

2258

IE_NAME = u'youtube:search'

2259

2260

def __init__(self, youtube_ie, downloader=None):

2261

InfoExtractor.__init__(self, downloader)

2262

self._youtube_ie = youtube_ie

2263

2264

def report_download_page(self, query, pagenum):

2265

"""Report attempt to download playlist page with given number."""

2266

query = query.decode(preferredencoding())

2267

self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

2268

2269

def _real_initialize(self):

2270

self._youtube_ie.initialize()

2271

2272

def _real_extract(self, query):

2273

mobj = re.match(self._VALID_URL, query)

2274

if mobj is None:

2275

self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

2276

return

2277

2278

prefix, query = query.split(':')

2279

prefix = prefix[8:]

2280

query = query.encode('utf-8')

2281

if prefix == '':

2282

self._download_n_results(query, 1)

2283

return

2284

elif prefix == 'all':

2285

self._download_n_results(query, self._max_youtube_results)

2286

return

2287

else:

2288

try:

2289

n = long(prefix)

2290

if n <= 0:

2291

self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))

2292

return

2293

elif n > self._max_youtube_results:

2294

self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))

2295

n = self._max_youtube_results

2296

self._download_n_results(query, n)

2297

return

2298

except ValueError: # parsing prefix as integer fails

2299

self._download_n_results(query, 1)

2300

return

2301

2302

def _download_n_results(self, query, n):

2303

"""Downloads a specified number of results for a query"""

2304

2305

video_ids = []

2306

already_seen = set()

2307

pagenum = 1

2308

2309

while True:

2310

self.report_download_page(query, pagenum)

2311

result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)

2312

request = urllib2.Request(result_url)

2313

try:

2314

page = urllib2.urlopen(request).read()

2315

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2316

self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

2317

return

2318

2319

# Extract video identifiers

2320

for mobj in re.finditer(self._VIDEO_INDICATOR, page):

2321

video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]

2322

if video_id not in already_seen:

2323

video_ids.append(video_id)

2324

already_seen.add(video_id)

2325

if len(video_ids) == n:

2326

# Specified n videos reached

2327

for id in video_ids:

2328

self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

2329

return

2330

2331

if re.search(self._MORE_PAGES_INDICATOR, page) is None:

2332

for id in video_ids:

2333

self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

2334

return

2335

2336

pagenum = pagenum + 1

2337

2338

2339

class GoogleSearchIE(InfoExtractor):

2340

"""Information Extractor for Google Video search queries."""

2341

_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'

2342

_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'

2343

_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'

2344

_MORE_PAGES_INDICATOR = r'<span>Next</span>'

2345

_google_ie = None

2346

_max_google_results = 1000

2347

IE_NAME = u'video.google:search'

2348

2349

def __init__(self, google_ie, downloader=None):

2350

InfoExtractor.__init__(self, downloader)

2351

self._google_ie = google_ie

2352

2353

def report_download_page(self, query, pagenum):

2354

"""Report attempt to download playlist page with given number."""

2355

query = query.decode(preferredencoding())

2356

self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

2357

2358

def _real_initialize(self):

2359

self._google_ie.initialize()

2360

2361

def _real_extract(self, query):

2362

mobj = re.match(self._VALID_URL, query)

2363

if mobj is None:

2364

self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

2365

return

2366

2367

prefix, query = query.split(':')

2368

prefix = prefix[8:]

2369

query = query.encode('utf-8')

2370

if prefix == '':

2371

self._download_n_results(query, 1)

2372

return

2373

elif prefix == 'all':

2374

self._download_n_results(query, self._max_google_results)

2375

return

2376

else:

2377

try:

2378

n = long(prefix)

2379

if n <= 0:

2380

self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))

2381

return

2382

elif n > self._max_google_results:

2383

self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))

2384

n = self._max_google_results

2385

self._download_n_results(query, n)

2386

return

2387

except ValueError: # parsing prefix as integer fails

2388

self._download_n_results(query, 1)

2389

return

2390

2391

def _download_n_results(self, query, n):

2392

"""Downloads a specified number of results for a query"""

2393

2394

video_ids = []

2395

already_seen = set()

2396

pagenum = 1

2397

2398

while True:

2399

self.report_download_page(query, pagenum)

2400

result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)

2401

request = urllib2.Request(result_url)

2402

try:

2403

page = urllib2.urlopen(request).read()

2404

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2405

self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

2406

return

2407

2408

# Extract video identifiers

2409

for mobj in re.finditer(self._VIDEO_INDICATOR, page):

2410

video_id = mobj.group(1)

2411

if video_id not in already_seen:

2412

video_ids.append(video_id)

2413

already_seen.add(video_id)

2414

if len(video_ids) == n:

2415

# Specified n videos reached

2416

for id in video_ids:

2417

self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

2418

return

2419

2420

if re.search(self._MORE_PAGES_INDICATOR, page) is None:

2421

for id in video_ids:

2422

self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

2423

return

2424

2425

pagenum = pagenum + 1

2426

2427

2428

class YahooSearchIE(InfoExtractor):

2429

"""Information Extractor for Yahoo! Video search queries."""

2430

_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'

2431

_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'

2432

_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'

2433

_MORE_PAGES_INDICATOR = r'\s*Next'

2434

_yahoo_ie = None

2435

_max_yahoo_results = 1000

2436

IE_NAME = u'video.yahoo:search'

2437

2438

def __init__(self, yahoo_ie, downloader=None):

2439

InfoExtractor.__init__(self, downloader)

2440

self._yahoo_ie = yahoo_ie

2441

2442

def report_download_page(self, query, pagenum):

2443

"""Report attempt to download playlist page with given number."""

2444

query = query.decode(preferredencoding())

2445

self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

2446

2447

def _real_initialize(self):

2448

self._yahoo_ie.initialize()

2449

2450

def _real_extract(self, query):

2451

mobj = re.match(self._VALID_URL, query)

2452

if mobj is None:

2453

self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

2454

return

2455

2456

prefix, query = query.split(':')

2457

prefix = prefix[8:]

2458

query = query.encode('utf-8')

2459

if prefix == '':

2460

self._download_n_results(query, 1)

2461

return

2462

elif prefix == 'all':

2463

self._download_n_results(query, self._max_yahoo_results)

2464

return

2465

else:

2466

try:

2467

n = long(prefix)

2468

if n <= 0:

2469

self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))

2470

return

2471

elif n > self._max_yahoo_results:

2472

self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))

2473

n = self._max_yahoo_results

2474

self._download_n_results(query, n)

2475

return

2476

except ValueError: # parsing prefix as integer fails

2477

self._download_n_results(query, 1)

2478

return

2479

2480

def _download_n_results(self, query, n):

2481

"""Downloads a specified number of results for a query"""

2482

2483

video_ids = []

2484

already_seen = set()

2485

pagenum = 1

2486

2487

while True:

2488

self.report_download_page(query, pagenum)

2489

result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)

2490

request = urllib2.Request(result_url)

2491

try:

2492

page = urllib2.urlopen(request).read()

2493

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2494

self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

2495

return

2496

2497

# Extract video identifiers

2498

for mobj in re.finditer(self._VIDEO_INDICATOR, page):

2499

video_id = mobj.group(1)

2500

if video_id not in already_seen:

2501

video_ids.append(video_id)

2502

already_seen.add(video_id)

2503

if len(video_ids) == n:

2504

# Specified n videos reached

2505

for id in video_ids:

2506

self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

2507

return

2508

2509

if re.search(self._MORE_PAGES_INDICATOR, page) is None:

2510

for id in video_ids:

2511

self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

2512

return

2513

2514

pagenum = pagenum + 1

2515

2516

2517

class YoutubePlaylistIE(InfoExtractor):

2518

"""Information Extractor for YouTube playlists."""

2519

2520

2521

_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'

2522

_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'

2523

_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

2524

_youtube_ie = None

2525

IE_NAME = u'youtube:playlist'

2526

2527

def __init__(self, youtube_ie, downloader=None):

2528

InfoExtractor.__init__(self, downloader)

2529

self._youtube_ie = youtube_ie

2530

2531

def report_download_page(self, playlist_id, pagenum):

2532

"""Report attempt to download playlist page with given number."""

2533

self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

2534

2535

def _real_initialize(self):

2536

self._youtube_ie.initialize()

2537

2538

def _real_extract(self, url):

2539

# Extract playlist id

2540

mobj = re.match(self._VALID_URL, url)

2541

if mobj is None:

2542

self._downloader.trouble(u'ERROR: invalid url: %s' % url)

2543

return

2544

2545

# Single video case

2546

if mobj.group(3) is not None:

2547

self._youtube_ie.extract(mobj.group(3))

2548

return

2549

2550

# Download playlist pages

2551

# prefix is 'p' as default for playlists but there are other types that need extra care

2552

playlist_prefix = mobj.group(1)

2553

if playlist_prefix == 'a':

2554

playlist_access = 'artist'

2555

else:

2556

playlist_prefix = 'p'

2557

playlist_access = 'view_play_list'

2558

playlist_id = mobj.group(2)

2559

video_ids = []

2560

pagenum = 1

2561

2562

while True:

2563

self.report_download_page(playlist_id, pagenum)

2564

url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)

2565

request = urllib2.Request(url)

2566

try:

2567

page = urllib2.urlopen(request).read()

2568

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2569

self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

2570

return

2571

2572

# Extract video identifiers

2573

ids_in_page = []

2574

for mobj in re.finditer(self._VIDEO_INDICATOR, page):

2575

if mobj.group(1) not in ids_in_page:

2576

ids_in_page.append(mobj.group(1))

2577

video_ids.extend(ids_in_page)

2578

2579

if re.search(self._MORE_PAGES_INDICATOR, page) is None:

2580

break

2581

pagenum = pagenum + 1

2582

2583

playliststart = self._downloader.params.get('playliststart', 1) - 1

2584

playlistend = self._downloader.params.get('playlistend', -1)

2585

video_ids = video_ids[playliststart:playlistend]

2586

2587

for id in video_ids:

2588

self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

2589

return

2590

2591

2592

class YoutubeUserIE(InfoExtractor):

2593

"""Information Extractor for YouTube users."""

2594

2595

_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'

2596

_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'

2597

_GDATA_PAGE_SIZE = 50

2598

_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'

2599

_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'

2600

_youtube_ie = None

2601

IE_NAME = u'youtube:user'

2602

2603

def __init__(self, youtube_ie, downloader=None):

2604

InfoExtractor.__init__(self, downloader)

2605

self._youtube_ie = youtube_ie

2606

2607

def report_download_page(self, username, start_index):

2608

"""Report attempt to download user page."""

2609

self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %

2610

(username, start_index, start_index + self._GDATA_PAGE_SIZE))

2611

2612

def _real_initialize(self):

2613

self._youtube_ie.initialize()

2614

2615

def _real_extract(self, url):

2616

# Extract username

2617

mobj = re.match(self._VALID_URL, url)

2618

if mobj is None:

2619

self._downloader.trouble(u'ERROR: invalid url: %s' % url)

2620

return

2621

2622

username = mobj.group(1)

2623

2624

# Download video ids using YouTube Data API. Result size per

2625

# query is limited (currently to 50 videos) so we need to query

2626

# page by page until there are no video ids - it means we got

2627

# all of them.

2628

2629

video_ids = []

2630

pagenum = 0

2631

2632

while True:

2633

start_index = pagenum * self._GDATA_PAGE_SIZE + 1

2634

self.report_download_page(username, start_index)

2635

2636

request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

2637

2638

try:

2639

page = urllib2.urlopen(request).read()

2640

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2641

self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

2642

return

2643

2644

# Extract video identifiers

2645

ids_in_page = []

2646

2647

for mobj in re.finditer(self._VIDEO_INDICATOR, page):

2648

if mobj.group(1) not in ids_in_page:

2649

ids_in_page.append(mobj.group(1))

2650

2651

video_ids.extend(ids_in_page)

2652

2653

# A little optimization - if current page is not

2654

# "full", ie. does not contain PAGE_SIZE video ids then

2655

# we can assume that this page is the last one - there

2656

# are no more ids on further pages - no need to query

2657

# again.

2658

2659

if len(ids_in_page) < self._GDATA_PAGE_SIZE:

2660

break

2661

2662

pagenum += 1

2663

2664

all_ids_count = len(video_ids)

2665

playliststart = self._downloader.params.get('playliststart', 1) - 1

2666

playlistend = self._downloader.params.get('playlistend', -1)

2667

2668

if playlistend == -1:

2669

video_ids = video_ids[playliststart:]

2670

else:

2671

video_ids = video_ids[playliststart:playlistend]

2672

2673

self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %

2674

(username, all_ids_count, len(video_ids)))

2675

2676

for video_id in video_ids:

2677

self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)

2678

2679

2680

class DepositFilesIE(InfoExtractor):

2681

"""Information extractor for depositfiles.com"""

2682

2683

_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

2684

IE_NAME = u'DepositFiles'

2685

2686

def __init__(self, downloader=None):

2687

InfoExtractor.__init__(self, downloader)

2688

2689

def report_download_webpage(self, file_id):

2690

"""Report webpage download."""

2691

self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

2692

2693

def report_extraction(self, file_id):

2694

"""Report information extraction."""

2695

self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

2696

2697

def _real_extract(self, url):

2698

# At this point we have a new file

2699

self._downloader.increment_downloads()

2700

2701

file_id = url.split('/')[-1]

2702

# Rebuild url in english locale

2703

url = 'http://depositfiles.com/en/files/' + file_id

2704

2705

# Retrieve file webpage with 'Free download' button pressed

2706

free_download_indication = { 'gateway_result' : '1' }

2707

request = urllib2.Request(url, urllib.urlencode(free_download_indication))

2708

try:

2709

self.report_download_webpage(file_id)

2710

webpage = urllib2.urlopen(request).read()

2711

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2712

self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

2713

return

2714

2715

# Search for the real file URL

2716

mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)

2717

if (mobj is None) or (mobj.group(1) is None):

2718

# Try to figure out reason of the error.

2719

mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)

2720

if (mobj is not None) and (mobj.group(1) is not None):

2721

restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()

2722

self._downloader.trouble(u'ERROR: %s' % restriction_message)

2723

else:

2724

self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

2725

return

2726

2727

file_url = mobj.group(1)

2728

file_extension = os.path.splitext(file_url)[1][1:]

2729

2730

# Search for file title

2731

mobj = re.search(r'<b title="(.*?)">', webpage)

2732

if mobj is None:

2733

self._downloader.trouble(u'ERROR: unable to extract title')

2734

return

2735

file_title = mobj.group(1).decode('utf-8')

2736

2737

try:

2738

# Process file information

2739

self._downloader.process_info({

2740

'id': file_id.decode('utf-8'),

2741

'url': file_url.decode('utf-8'),

2742

'uploader': u'NA',

2743

'upload_date': u'NA',

2744

'title': file_title,

2745

'stitle': file_title,

2746

'ext': file_extension.decode('utf-8'),

2747

'format': u'NA',

2748

'player_url': None,

2749

})

2750

except UnavailableVideoError, err:

2751

self._downloader.trouble(u'ERROR: unable to download file')

2752

2753

2754

class FacebookIE(InfoExtractor):

2755

"""Information Extractor for Facebook"""

2756

2757

_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'

2758

_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'

2759

_NETRC_MACHINE = 'facebook'

2760

_available_formats = ['video', 'highqual', 'lowqual']

2761

_video_extensions = {

2762

'video': 'mp4',

2763

'highqual': 'mp4',

2764

'lowqual': 'mp4',

2765

}

2766

IE_NAME = u'facebook'

2767

2768

def __init__(self, downloader=None):

2769

InfoExtractor.__init__(self, downloader)

2770

2771

def _reporter(self, message):

2772

"""Add header and report message."""

2773

self._downloader.to_screen(u'[facebook] %s' % message)

2774

2775

def report_login(self):

2776

"""Report attempt to log in."""

2777

self._reporter(u'Logging in')

2778

2779

def report_video_webpage_download(self, video_id):

2780

"""Report attempt to download video webpage."""

2781

self._reporter(u'%s: Downloading video webpage' % video_id)

2782

2783

def report_information_extraction(self, video_id):

2784

"""Report attempt to extract video information."""

2785

self._reporter(u'%s: Extracting video information' % video_id)

2786

2787

def _parse_page(self, video_webpage):

2788

"""Extract video information from page"""

2789

# General data

2790

data = {'title': r'$"video_title", "(.*?)"$',

2791

'description': r'<div class="datawrap">(.*?)</div>',

2792

'owner': r'$"video_owner_name", "(.*?)"$',

2793

'thumbnail': r'$"thumb_url", "(?P<THUMB>.*?)"$',

2794

}

2795

video_info = {}

2796

for piece in data.keys():

2797

mobj = re.search(data[piece], video_webpage)

2798

if mobj is not None:

2799

video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

2800

2801

# Video urls

2802

video_urls = {}

2803

for fmt in self._available_formats:

2804

mobj = re.search(r'$"%s_src\", "(.+?)"$' % fmt, video_webpage)

2805

if mobj is not None:

2806

# URL is in a Javascript segment inside an escaped Unicode format within

2807

# the generally utf-8 page

2808

video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

2809

video_info['video_urls'] = video_urls

2810

2811

return video_info

2812

2813

def _real_initialize(self):

2814

if self._downloader is None:

2815

return

2816

2817

useremail = None

2818

password = None

2819

downloader_params = self._downloader.params

2820

2821

# Attempt to use provided username and password or .netrc data

2822

if downloader_params.get('username', None) is not None:

2823

useremail = downloader_params['username']

2824

password = downloader_params['password']

2825

elif downloader_params.get('usenetrc', False):

2826

try:

2827

info = netrc.netrc().authenticators(self._NETRC_MACHINE)

2828

if info is not None:

2829

useremail = info[0]

2830

password = info[2]

2831

else:

2832

raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)

2833

except (IOError, netrc.NetrcParseError), err:

2834

self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

2835

return

2836

2837

if useremail is None:

2838

return

2839

2840

# Log in

2841

login_form = {

2842

'email': useremail,

2843

'pass': password,

2844

'login': 'Log+In'

2845

}

2846

request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))

2847

try:

2848

self.report_login()

2849

login_results = urllib2.urlopen(request).read()

2850

if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:

2851

self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')

2852

return

2853

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2854

self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

2855

return

2856

2857

def _real_extract(self, url):

2858

mobj = re.match(self._VALID_URL, url)

2859

if mobj is None:

2860

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

2861

return

2862

video_id = mobj.group('ID')

2863

2864

# Get video webpage

2865

self.report_video_webpage_download(video_id)

2866

request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)

2867

try:

2868

page = urllib2.urlopen(request)

2869

video_webpage = page.read()

2870

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

2871

self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

2872

return

2873

2874

# Start extracting information

2875

self.report_information_extraction(video_id)

2876

2877

# Extract information

2878

video_info = self._parse_page(video_webpage)

2879

2880

# uploader

2881

if 'owner' not in video_info:

2882

self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

2883

return

2884

video_uploader = video_info['owner']

2885

2886

# title

2887

if 'title' not in video_info:

2888

self._downloader.trouble(u'ERROR: unable to extract video title')

2889

return

2890

video_title = video_info['title']

2891

video_title = video_title.decode('utf-8')

2892

video_title = sanitize_title(video_title)

2893

2894

simple_title = _simplify_title(video_title)

2895

2896

# thumbnail image

2897

if 'thumbnail' not in video_info:

2898

self._downloader.trouble(u'WARNING: unable to extract video thumbnail')

2899

video_thumbnail = ''

2900

else:

2901

video_thumbnail = video_info['thumbnail']

2902

2903

# upload date

2904

upload_date = u'NA'

2905

if 'upload_date' in video_info:

2906

upload_time = video_info['upload_date']

2907

timetuple = email.utils.parsedate_tz(upload_time)

2908

if timetuple is not None:

2909

try:

2910

upload_date = time.strftime('%Y%m%d', timetuple[0:9])

2911

except:

2912

pass

2913

2914

# description

2915

video_description = video_info.get('description', 'No description available.')

2916

2917

url_map = video_info['video_urls']

2918

if len(url_map.keys()) > 0:

2919

# Decide which formats to download

2920

req_format = self._downloader.params.get('format', None)

2921

format_limit = self._downloader.params.get('format_limit', None)

2922

2923

if format_limit is not None and format_limit in self._available_formats:

2924

format_list = self._available_formats[self._available_formats.index(format_limit):]

2925

else:

2926

format_list = self._available_formats

2927

existing_formats = [x for x in format_list if x in url_map]

2928

if len(existing_formats) == 0:

2929

self._downloader.trouble(u'ERROR: no known formats available for video')

2930

return

2931

if req_format is None:

2932

video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality

2933

elif req_format == 'worst':

2934

video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality

2935

elif req_format == '-1':

2936

video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats

2937

else:

2938

# Specific format

2939

if req_format not in url_map:

2940

self._downloader.trouble(u'ERROR: requested format not available')

2941

return

2942

video_url_list = [(req_format, url_map[req_format])] # Specific format

2943

2944

for format_param, video_real_url in video_url_list:

2945

2946

# At this point we have a new video

2947

self._downloader.increment_downloads()

2948

2949

# Extension

2950

video_extension = self._video_extensions.get(format_param, 'mp4')

2951

2952

try:

2953

# Process video information

2954

self._downloader.process_info({

2955

'id': video_id.decode('utf-8'),

2956

'url': video_real_url.decode('utf-8'),

2957

'uploader': video_uploader.decode('utf-8'),

2958

'upload_date': upload_date,

2959

'title': video_title,

2960

'stitle': simple_title,

2961

'ext': video_extension.decode('utf-8'),

2962

'format': (format_param is None and u'NA' or format_param.decode('utf-8')),

2963

'thumbnail': video_thumbnail.decode('utf-8'),

2964

'description': video_description.decode('utf-8'),

2965

'player_url': None,

2966

})

2967

except UnavailableVideoError, err:

2968

self._downloader.trouble(u'\nERROR: unable to download video')

2969

2970

class BlipTVIE(InfoExtractor):

2971

"""Information extractor for blip.tv"""

2972

2973

_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'

2974

_URL_EXT = r'^.*\.([a-z0-9]+)$'

2975

IE_NAME = u'blip.tv'

2976

2977

def report_extraction(self, file_id):

2978

"""Report information extraction."""

2979

self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

2980

2981

def report_direct_download(self, title):

2982

"""Report information extraction."""

2983

self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

2984

2985

def _real_extract(self, url):

2986

mobj = re.match(self._VALID_URL, url)

2987

if mobj is None:

2988

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

2989

return

2990

2991

if '?' in url:

2992

cchar = '&'

2993

else:

2994

cchar = '?'

2995

json_url = url + cchar + 'skin=json&version=2&no_wrap=1'

2996

request = urllib2.Request(json_url)

2997

self.report_extraction(mobj.group(1))

2998

info = None

2999

try:

3000

urlh = urllib2.urlopen(request)

3001

if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download

3002

basename = url.split('/')[-1]

3003

title,ext = os.path.splitext(basename)

3004

title = title.decode('UTF-8')

3005

ext = ext.replace('.', '')

3006

self.report_direct_download(title)

3007

info = {

3008

'id': title,

3009

'url': url,

3010

'title': title,

3011

'stitle': _simplify_title(title),

3012

'ext': ext,

3013

'urlhandle': urlh

3014

}

3015

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3016

self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

3017

return

3018

if info is None: # Regular URL

3019

try:

3020

json_code = urlh.read()

3021

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3022

self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

3023

return

3024

3025

try:

3026

json_data = json.loads(json_code)

3027

if 'Post' in json_data:

3028

data = json_data['Post']

3029

else:

3030

data = json_data

3031

3032

upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')

3033

video_url = data['media']['url']

3034

umobj = re.match(self._URL_EXT, video_url)

3035

if umobj is None:

3036

raise ValueError('Can not determine filename extension')

3037

ext = umobj.group(1)

3038

3039

info = {

3040

'id': data['item_id'],

3041

'url': video_url,

3042

'uploader': data['display_name'],

3043

'upload_date': upload_date,

3044

'title': data['title'],

3045

'stitle': _simplify_title(data['title']),

3046

'ext': ext,

3047

'format': data['media']['mimeType'],

3048

'thumbnail': data['thumbnailUrl'],

3049

'description': data['description'],

3050

'player_url': data['embedUrl']

3051

}

3052

except (ValueError,KeyError), err:

3053

self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

3054

return

3055

3056

self._downloader.increment_downloads()

3057

3058

try:

3059

self._downloader.process_info(info)

3060

except UnavailableVideoError, err:

3061

self._downloader.trouble(u'\nERROR: unable to download video')

3062

3063

3064

class MyVideoIE(InfoExtractor):

3065

"""Information Extractor for myvideo.de."""

3066

3067

_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'

3068

IE_NAME = u'myvideo'

3069

3070

def __init__(self, downloader=None):

3071

InfoExtractor.__init__(self, downloader)

3072

3073

def report_download_webpage(self, video_id):

3074

"""Report webpage download."""

3075

self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

3076

3077

def report_extraction(self, video_id):

3078

"""Report information extraction."""

3079

self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

3080

3081

def _real_extract(self,url):

3082

mobj = re.match(self._VALID_URL, url)

3083

if mobj is None:

3084

self._download.trouble(u'ERROR: invalid URL: %s' % url)

3085

return

3086

3087

video_id = mobj.group(1)

3088

3089

# Get video webpage

3090

request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)

3091

try:

3092

self.report_download_webpage(video_id)

3093

webpage = urllib2.urlopen(request).read()

3094

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3095

self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

3096

return

3097

3098

self.report_extraction(video_id)

3099

mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',

3100

webpage)

3101

if mobj is None:

3102

self._downloader.trouble(u'ERROR: unable to extract media URL')

3103

return

3104

video_url = mobj.group(1) + ('/%s.flv' % video_id)

3105

3106

mobj = re.search('<title>([^<]+)</title>', webpage)

3107

if mobj is None:

3108

self._downloader.trouble(u'ERROR: unable to extract title')

3109

return

3110

3111

video_title = mobj.group(1)

3112

video_title = sanitize_title(video_title)

3113

3114

simple_title = _simplify_title(video_title)

3115

3116

try:

3117

self._downloader.process_info({

3118

'id': video_id,

3119

'url': video_url,

3120

'uploader': u'NA',

3121

'upload_date': u'NA',

3122

'title': video_title,

3123

'stitle': simple_title,

3124

'ext': u'flv',

3125

'format': u'NA',

3126

'player_url': None,

3127

})

3128

except UnavailableVideoError:

3129

self._downloader.trouble(u'\nERROR: Unable to download video')

3130

3131

class ComedyCentralIE(InfoExtractor):

3132

"""Information extractor for The Daily Show and Colbert Report """

3133

3134

3135

IE_NAME = u'comedycentral'

3136

3137

def report_extraction(self, episode_id):

3138

self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

3139

3140

def report_config_download(self, episode_id):

3141

self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

3142

3143

def report_index_download(self, episode_id):

3144

self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

3145

3146

def report_player_url(self, episode_id):

3147

self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

3148

3149

def _real_extract(self, url):

3150

mobj = re.match(self._VALID_URL, url)

3151

if mobj is None:

3152

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

3153

return

3154

3155

if mobj.group('shortname'):

3156

if mobj.group('shortname') in ('tds', 'thedailyshow'):

3157

url = u'http://www.thedailyshow.com/full-episodes/'

3158

else:

3159

url = u'http://www.colbertnation.com/full-episodes/'

3160

mobj = re.match(self._VALID_URL, url)

3161

assert mobj is not None

3162

3163

dlNewest = not mobj.group('episode')

3164

if dlNewest:

3165

epTitle = mobj.group('showname')

3166

else:

3167

epTitle = mobj.group('episode')

3168

3169

req = urllib2.Request(url)

3170

self.report_extraction(epTitle)

3171

try:

3172

htmlHandle = urllib2.urlopen(req)

3173

html = htmlHandle.read()

3174

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3175

self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

3176

return

3177

if dlNewest:

3178

url = htmlHandle.geturl()

3179

mobj = re.match(self._VALID_URL, url)

3180

if mobj is None:

3181

self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)

3182

return

3183

if mobj.group('episode') == '':

3184

self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)

3185

return

3186

epTitle = mobj.group('episode')

3187

3188

mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

3189

if len(mMovieParams) == 0:

3190

self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

3191

return

3192

3193

playerUrl_raw = mMovieParams[0][0]

3194

self.report_player_url(epTitle)

3195

try:

3196

urlHandle = urllib2.urlopen(playerUrl_raw)

3197

playerUrl = urlHandle.geturl()

3198

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3199

self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

3200

return

3201

3202

uri = mMovieParams[0][1]

3203

indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})

3204

self.report_index_download(epTitle)

3205

try:

3206

indexXml = urllib2.urlopen(indexUrl).read()

3207

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3208

self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

3209

return

3210

3211

idoc = xml.etree.ElementTree.fromstring(indexXml)

3212

itemEls = idoc.findall('.//item')

3213

for itemEl in itemEls:

3214

mediaId = itemEl.findall('./guid')[0].text

3215

shortMediaId = mediaId.split(':')[-1]

3216

showId = mediaId.split(':')[-2].replace('.com', '')

3217

officialTitle = itemEl.findall('./title')[0].text

3218

officialDate = itemEl.findall('./pubDate')[0].text

3219

3220

configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +

3221

urllib.urlencode({'uri': mediaId}))

3222

configReq = urllib2.Request(configUrl)

3223

self.report_config_download(epTitle)

3224

try:

3225

configXml = urllib2.urlopen(configReq).read()

3226

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3227

self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

3228

return

3229

3230

cdoc = xml.etree.ElementTree.fromstring(configXml)

3231

turls = []

3232

for rendition in cdoc.findall('.//rendition'):

3233

finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

3234

turls.append(finfo)

3235

3236

if len(turls) == 0:

3237

self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

3238

continue

3239

3240

# For now, just pick the highest bitrate

3241

format,video_url = turls[-1]

3242

3243

self._downloader.increment_downloads()

3244

3245

effTitle = showId + u'-' + epTitle

3246

info = {

3247

'id': shortMediaId,

3248

'url': video_url,

3249

'uploader': showId,

3250

'upload_date': officialDate,

3251

'title': effTitle,

3252

'stitle': _simplify_title(effTitle),

3253

'ext': 'mp4',

3254

'format': format,

3255

'thumbnail': None,

3256

'description': officialTitle,

3257

'player_url': playerUrl

3258

}

3259

3260

try:

3261

self._downloader.process_info(info)

3262

except UnavailableVideoError, err:

3263

self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)

3264

continue

3265

3266

3267

class EscapistIE(InfoExtractor):

3268

"""Information extractor for The Escapist """

3269

3270

_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'

3271

IE_NAME = u'escapist'

3272

3273

def report_extraction(self, showName):

3274

self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

3275

3276

def report_config_download(self, showName):

3277

self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

3278

3279

def _real_extract(self, url):

3280

htmlParser = HTMLParser.HTMLParser()

3281

3282

mobj = re.match(self._VALID_URL, url)

3283

if mobj is None:

3284

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

3285

return

3286

showName = mobj.group('showname')

3287

videoId = mobj.group('episode')

3288

3289

self.report_extraction(showName)

3290

try:

3291

webPage = urllib2.urlopen(url).read()

3292

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3293

self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

3294

return

3295

3296

descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)

3297

description = htmlParser.unescape(descMatch.group(1))

3298

imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)

3299

imgUrl = htmlParser.unescape(imgMatch.group(1))

3300

playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)

3301

playerUrl = htmlParser.unescape(playerUrlMatch.group(1))

3302

configUrlMatch = re.search('config=(.*)$', playerUrl)

3303

configUrl = urllib2.unquote(configUrlMatch.group(1))

3304

3305

self.report_config_download(showName)

3306

try:

3307

configJSON = urllib2.urlopen(configUrl).read()

3308

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3309

self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

3310

return

3311

3312

# Technically, it's JavaScript, not JSON

3313

configJSON = configJSON.replace("'", '"')

3314

3315

try:

3316

config = json.loads(configJSON)

3317

except (ValueError,), err:

3318

self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

3319

return

3320

3321

playlist = config['playlist']

3322

videoUrl = playlist[1]['url']

3323

3324

self._downloader.increment_downloads()

3325

info = {

3326

'id': videoId,

3327

'url': videoUrl,

3328

'uploader': showName,

3329

'upload_date': None,

3330

'title': showName,

3331

'stitle': _simplify_title(showName),

3332

'ext': 'flv',

3333

'format': 'flv',

3334

'thumbnail': imgUrl,

3335

'description': description,

3336

'player_url': playerUrl,

3337

}

3338

3339

try:

3340

self._downloader.process_info(info)

3341

except UnavailableVideoError, err:

3342

self._downloader.trouble(u'\nERROR: unable to download ' + videoId)

3343

3344

3345

class CollegeHumorIE(InfoExtractor):

3346

"""Information extractor for collegehumor.com"""

3347

3348

_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'

3349

IE_NAME = u'collegehumor'

3350

3351

def report_webpage(self, video_id):

3352

"""Report information extraction."""

3353

self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

3354

3355

def report_extraction(self, video_id):

3356

"""Report information extraction."""

3357

self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

3358

3359

def _real_extract(self, url):

3360

htmlParser = HTMLParser.HTMLParser()

3361

3362

mobj = re.match(self._VALID_URL, url)

3363

if mobj is None:

3364

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

3365

return

3366

video_id = mobj.group('videoid')

3367

3368

self.report_webpage(video_id)

3369

request = urllib2.Request(url)

3370

try:

3371

webpage = urllib2.urlopen(request).read()

3372

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3373

self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

3374

return

3375

3376

m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)

3377

if m is None:

3378

self._downloader.trouble(u'ERROR: Cannot extract internal video ID')

3379

return

3380

internal_video_id = m.group('internalvideoid')

3381

3382

info = {

3383

'id': video_id,

3384

'internal_id': internal_video_id,

3385

}

3386

3387

self.report_extraction(video_id)

3388

xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id

3389

try:

3390

metaXml = urllib2.urlopen(xmlUrl).read()

3391

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3392

self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

3393

return

3394

3395

mdoc = xml.etree.ElementTree.fromstring(metaXml)

3396

try:

3397

videoNode = mdoc.findall('./video')[0]

3398

info['description'] = videoNode.findall('./description')[0].text

3399

info['title'] = videoNode.findall('./caption')[0].text

3400

info['stitle'] = _simplify_title(info['title'])

3401

info['url'] = videoNode.findall('./file')[0].text

3402

info['thumbnail'] = videoNode.findall('./thumbnail')[0].text

3403

info['ext'] = info['url'].rpartition('.')[2]

3404

info['format'] = info['ext']

3405

except IndexError:

3406

self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

3407

return

3408

3409

self._downloader.increment_downloads()

3410

3411

try:

3412

self._downloader.process_info(info)

3413

except UnavailableVideoError, err:

3414

self._downloader.trouble(u'\nERROR: unable to download video')

3415

3416

3417

class XVideosIE(InfoExtractor):

3418

"""Information extractor for xvideos.com"""

3419

3420

_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'

3421

IE_NAME = u'xvideos'

3422

3423

def report_webpage(self, video_id):

3424

"""Report information extraction."""

3425

self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

3426

3427

def report_extraction(self, video_id):

3428

"""Report information extraction."""

3429

self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

3430

3431

def _real_extract(self, url):

3432

htmlParser = HTMLParser.HTMLParser()

3433

3434

mobj = re.match(self._VALID_URL, url)

3435

if mobj is None:

3436

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

3437

return

3438

video_id = mobj.group(1).decode('utf-8')

3439

3440

self.report_webpage(video_id)

3441

3442

request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)

3443

try:

3444

webpage = urllib2.urlopen(request).read()

3445

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3446

self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

3447

return

3448

3449

self.report_extraction(video_id)

3450

3451

3452

# Extract video URL

3453

mobj = re.search(r'flv_url=(.+?)&', webpage)

3454

if mobj is None:

3455

self._downloader.trouble(u'ERROR: unable to extract video url')

3456

return

3457

video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

3458

3459

3460

# Extract title

3461

mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)

3462

if mobj is None:

3463

self._downloader.trouble(u'ERROR: unable to extract video title')

3464

return

3465

video_title = mobj.group(1).decode('utf-8')

3466

3467

3468

# Extract video thumbnail

3469

mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)

3470

if mobj is None:

3471

self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

3472

return

3473

video_thumbnail = mobj.group(1).decode('utf-8')

3474

3475

3476

3477

self._downloader.increment_downloads()

3478

info = {

3479

'id': video_id,

3480

'url': video_url,

3481

'uploader': None,

3482

'upload_date': None,

3483

'title': video_title,

3484

'stitle': _simplify_title(video_title),

3485

'ext': 'flv',

3486

'format': 'flv',

3487

'thumbnail': video_thumbnail,

3488

'description': None,

3489

'player_url': None,

3490

}

3491

3492

try:

3493

self._downloader.process_info(info)

3494

except UnavailableVideoError, err:

3495

self._downloader.trouble(u'\nERROR: unable to download ' + video_id)

3496

3497

3498

class SoundcloudIE(InfoExtractor):

3499

"""Information extractor for soundcloud.com

3500

To access the media, the uid of the song and a stream token

3501

must be extracted from the page source and the script must make

3502

a request to media.soundcloud.com/crossdomain.xml. Then

3503

the media can be grabbed by requesting from an url composed

3504

of the stream token and uid

3505

"""

3506

3507

_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'

3508

IE_NAME = u'soundcloud'

3509

3510

def __init__(self, downloader=None):

3511

InfoExtractor.__init__(self, downloader)

3512

3513

def report_webpage(self, video_id):

3514

"""Report information extraction."""

3515

self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

3516

3517

def report_extraction(self, video_id):

3518

"""Report information extraction."""

3519

self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

3520

3521

def _real_extract(self, url):

3522

htmlParser = HTMLParser.HTMLParser()

3523

3524

mobj = re.match(self._VALID_URL, url)

3525

if mobj is None:

3526

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

3527

return

3528

3529

# extract uploader (which is in the url)

3530

uploader = mobj.group(1).decode('utf-8')

3531

# extract simple title (uploader + slug of song title)

3532

slug_title = mobj.group(2).decode('utf-8')

3533

simple_title = uploader + '-' + slug_title

3534

3535

self.report_webpage('%s/%s' % (uploader, slug_title))

3536

3537

request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))

3538

try:

3539

webpage = urllib2.urlopen(request).read()

3540

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3541

self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

3542

return

3543

3544

self.report_extraction('%s/%s' % (uploader, slug_title))

3545

3546

# extract uid and stream token that soundcloud hands out for access

3547

mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)

3548

if mobj:

3549

video_id = mobj.group(1)

3550

stream_token = mobj.group(2)

3551

3552

# extract unsimplified title

3553

mobj = re.search('"title":"(.*?)",', webpage)

3554

if mobj:

3555

title = mobj.group(1)

3556

3557

# construct media url (with uid/token)

3558

mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"

3559

mediaURL = mediaURL % (video_id, stream_token)

3560

3561

# description

3562

description = u'No description available'

3563

mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)

3564

if mobj:

3565

description = mobj.group(1)

3566

3567

# upload date

3568

upload_date = None

3569

mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)

3570

if mobj:

3571

try:

3572

upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')

3573

except Exception, e:

3574

print str(e)

3575

3576

# for soundcloud, a request to a cross domain is required for cookies

3577

request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

3578

3579

try:

3580

self._downloader.process_info({

3581

'id': video_id.decode('utf-8'),

3582

'url': mediaURL,

3583

'uploader': uploader.decode('utf-8'),

3584

'upload_date': upload_date,

3585

'title': simple_title.decode('utf-8'),

3586

'stitle': simple_title.decode('utf-8'),

3587

'ext': u'mp3',

3588

'format': u'NA',

3589

'player_url': None,

3590

'description': description.decode('utf-8')

3591

})

3592

except UnavailableVideoError:

3593

self._downloader.trouble(u'\nERROR: unable to download video')

3594

3595

3596

class InfoQIE(InfoExtractor):

3597

"""Information extractor for infoq.com"""

3598

3599

_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

3600

IE_NAME = u'infoq'

3601

3602

def report_webpage(self, video_id):

3603

"""Report information extraction."""

3604

self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

3605

3606

def report_extraction(self, video_id):

3607

"""Report information extraction."""

3608

self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

3609

3610

def _real_extract(self, url):

3611

htmlParser = HTMLParser.HTMLParser()

3612

3613

mobj = re.match(self._VALID_URL, url)

3614

if mobj is None:

3615

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

3616

return

3617

3618

self.report_webpage(url)

3619

3620

request = urllib2.Request(url)

3621

try:

3622

webpage = urllib2.urlopen(request).read()

3623

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3624

self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

3625

return

3626

3627

self.report_extraction(url)

3628

3629

3630

# Extract video URL

3631

mobj = re.search(r"jsclassref='([^']*)'", webpage)

3632

if mobj is None:

3633

self._downloader.trouble(u'ERROR: unable to extract video url')

3634

return

3635

video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

3636

3637

3638

# Extract title

3639

mobj = re.search(r'contentTitle = "(.*?)";', webpage)

3640

if mobj is None:

3641

self._downloader.trouble(u'ERROR: unable to extract video title')

3642

return

3643

video_title = mobj.group(1).decode('utf-8')

3644

3645

# Extract description

3646

video_description = u'No description available.'

3647

mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)

3648

if mobj is not None:

3649

video_description = mobj.group(1).decode('utf-8')

3650

3651

video_filename = video_url.split('/')[-1]

3652

video_id, extension = video_filename.split('.')

3653

3654

self._downloader.increment_downloads()

3655

info = {

3656

'id': video_id,

3657

'url': video_url,

3658

'uploader': None,

3659

'upload_date': None,

3660

'title': video_title,

3661

'stitle': _simplify_title(video_title),

3662

'ext': extension,

3663

'format': extension, # Extension is always(?) mp4, but seems to be flv

3664

'thumbnail': None,

3665

'description': video_description,

3666

'player_url': None,

3667

}

3668

3669

try:

3670

self._downloader.process_info(info)

3671

except UnavailableVideoError, err:

3672

self._downloader.trouble(u'\nERROR: unable to download ' + video_url)

3673

3674

class MixcloudIE(InfoExtractor):

3675

"""Information extractor for www.mixcloud.com"""

3676

_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'

3677

IE_NAME = u'mixcloud'

3678

3679

def __init__(self, downloader=None):

3680

InfoExtractor.__init__(self, downloader)

3681

3682

def report_download_json(self, file_id):

3683

"""Report JSON download."""

3684

self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

3685

3686

def report_extraction(self, file_id):

3687

"""Report information extraction."""

3688

self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

3689

3690

def get_urls(self, jsonData, fmt, bitrate='best'):

3691

"""Get urls from 'audio_formats' section in json"""

3692

file_url = None

3693

try:

3694

bitrate_list = jsonData[fmt]

3695

if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:

3696

bitrate = max(bitrate_list) # select highest

3697

3698

url_list = jsonData[fmt][bitrate]

3699

except TypeError: # we have no bitrate info.

3700

url_list = jsonData[fmt]

3701

3702

return url_list

3703

3704

def check_urls(self, url_list):

3705

"""Returns 1st active url from list"""

3706

for url in url_list:

3707

try:

3708

urllib2.urlopen(url)

3709

return url

3710

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3711

url = None

3712

3713

return None

3714

3715

def _print_formats(self, formats):

3716

print 'Available formats:'

3717

for fmt in formats.keys():

3718

for b in formats[fmt]:

3719

try:

3720

ext = formats[fmt][b][0]

3721

print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])

3722

except TypeError: # we have no bitrate info

3723

ext = formats[fmt][0]

3724

print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

3725

break

3726

3727

def _real_extract(self, url):

3728

mobj = re.match(self._VALID_URL, url)

3729

if mobj is None:

3730

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

3731

return

3732

# extract uploader & filename from url

3733

uploader = mobj.group(1).decode('utf-8')

3734

file_id = uploader + "-" + mobj.group(2).decode('utf-8')

3735

3736

# construct API request

3737

file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'

3738

# retrieve .json file with links to files

3739

request = urllib2.Request(file_url)

3740

try:

3741

self.report_download_json(file_url)

3742

jsonData = urllib2.urlopen(request).read()

3743

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3744

self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

3745

return

3746

3747

# parse JSON

3748

json_data = json.loads(jsonData)

3749

player_url = json_data['player_swf_url']

3750

formats = dict(json_data['audio_formats'])

3751

3752

req_format = self._downloader.params.get('format', None)

3753

bitrate = None

3754

3755

if self._downloader.params.get('listformats', None):

3756

self._print_formats(formats)

3757

return

3758

3759

if req_format is None or req_format == 'best':

3760

for format_param in formats.keys():

3761

url_list = self.get_urls(formats, format_param)

3762

# check urls

3763

file_url = self.check_urls(url_list)

3764

if file_url is not None:

3765

break # got it!

3766

else:

3767

if req_format not in formats.keys():

3768

self._downloader.trouble(u'ERROR: format is not available')

3769

return

3770

3771

url_list = self.get_urls(formats, req_format)

3772

file_url = self.check_urls(url_list)

3773

format_param = req_format

3774

3775

# We have audio

3776

self._downloader.increment_downloads()

3777

try:

3778

# Process file information

3779

self._downloader.process_info({

3780

'id': file_id.decode('utf-8'),

3781

'url': file_url.decode('utf-8'),

3782

'uploader': uploader.decode('utf-8'),

3783

'upload_date': u'NA',

3784

'title': json_data['name'],

3785

'stitle': _simplify_title(json_data['name']),

3786

'ext': file_url.split('.')[-1].decode('utf-8'),

3787

'format': (format_param is None and u'NA' or format_param.decode('utf-8')),

3788

'thumbnail': json_data['thumbnail_url'],

3789

'description': json_data['description'],

3790

'player_url': player_url.decode('utf-8'),

3791

})

3792

except UnavailableVideoError, err:

3793

self._downloader.trouble(u'ERROR: unable to download file')

3794

3795

class StanfordOpenClassroomIE(InfoExtractor):

3796

"""Information extractor for Stanford's Open ClassRoom"""

3797

3798

_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'

3799

IE_NAME = u'stanfordoc'

3800

3801

def report_download_webpage(self, objid):

3802

"""Report information extraction."""

3803

self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

3804

3805

def report_extraction(self, video_id):

3806

"""Report information extraction."""

3807

self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

3808

3809

def _real_extract(self, url):

3810

mobj = re.match(self._VALID_URL, url)

3811

if mobj is None:

3812

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

3813

return

3814

3815

if mobj.group('course') and mobj.group('video'): # A specific video

3816

course = mobj.group('course')

3817

video = mobj.group('video')

3818

info = {

3819

'id': _simplify_title(course + '_' + video),

3820

}

3821

3822

self.report_extraction(info['id'])

3823

baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'

3824

xmlUrl = baseUrl + video + '.xml'

3825

try:

3826

metaXml = urllib2.urlopen(xmlUrl).read()

3827

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3828

self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))

3829

return

3830

mdoc = xml.etree.ElementTree.fromstring(metaXml)

3831

try:

3832

info['title'] = mdoc.findall('./title')[0].text

3833

info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text

3834

except IndexError:

3835

self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

3836

return

3837

info['stitle'] = _simplify_title(info['title'])

3838

info['ext'] = info['url'].rpartition('.')[2]

3839

info['format'] = info['ext']

3840

self._downloader.increment_downloads()

3841

try:

3842

self._downloader.process_info(info)

3843

except UnavailableVideoError, err:

3844

self._downloader.trouble(u'\nERROR: unable to download video')

3845

elif mobj.group('course'): # A course page

3846

unescapeHTML = HTMLParser.HTMLParser().unescape

3847

3848

course = mobj.group('course')

3849

info = {

3850

'id': _simplify_title(course),

3851

'type': 'playlist',

3852

}

3853

3854

self.report_download_webpage(info['id'])

3855

try:

3856

coursepage = urllib2.urlopen(url).read()

3857

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3858

self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

3859

return

3860

3861

m = re.search('<h1>([^<]+)</h1>', coursepage)

3862

if m:

3863

info['title'] = unescapeHTML(m.group(1))

3864

else:

3865

info['title'] = info['id']

3866

info['stitle'] = _simplify_title(info['title'])

3867

3868

m = re.search('<description>([^<]+)</description>', coursepage)

3869

if m:

3870

info['description'] = unescapeHTML(m.group(1))

3871

3872

links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))

3873

info['list'] = [

3874

{

3875

'type': 'reference',

3876

'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

3877

}

3878

for vpage in links]

3879

3880

for entry in info['list']:

3881

assert entry['type'] == 'reference'

3882

self.extract(entry['url'])

3883

else: # Root page

3884

unescapeHTML = HTMLParser.HTMLParser().unescape

3885

3886

info = {

3887

'id': 'Stanford OpenClassroom',

3888

'type': 'playlist',

3889

}

3890

3891

self.report_download_webpage(info['id'])

3892

rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'

3893

try:

3894

rootpage = urllib2.urlopen(rootURL).read()

3895

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3896

self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

3897

return

3898

3899

info['title'] = info['id']

3900

info['stitle'] = _simplify_title(info['title'])

3901

3902

links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))

3903

info['list'] = [

3904

{

3905

'type': 'reference',

3906

'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

3907

}

3908

for cpage in links]

3909

3910

for entry in info['list']:

3911

assert entry['type'] == 'reference'

3912

self.extract(entry['url'])

3913

3914

class MTVIE(InfoExtractor):

3915

"""Information extractor for MTV.com"""

3916

3917

_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

3918

IE_NAME = u'mtv'

3919

3920

def report_webpage(self, video_id):

3921

"""Report information extraction."""

3922

self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

3923

3924

def report_extraction(self, video_id):

3925

"""Report information extraction."""

3926

self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

3927

3928

def _real_extract(self, url):

3929

mobj = re.match(self._VALID_URL, url)

3930

if mobj is None:

3931

self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

3932

return

3933

if not mobj.group('proto'):

3934

url = 'http://' + url

3935

video_id = mobj.group('videoid')

3936

self.report_webpage(video_id)

3937

3938

request = urllib2.Request(url)

3939

try:

3940

webpage = urllib2.urlopen(request).read()

3941

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3942

self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

3943

return

3944

3945

mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)

3946

if mobj is None:

3947

self._downloader.trouble(u'ERROR: unable to extract song name')

3948

return

3949

song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))

3950

mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)

3951

if mobj is None:

3952

self._downloader.trouble(u'ERROR: unable to extract performer')

3953

return

3954

performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))

3955

video_title = performer + ' - ' + song_name

3956

3957

mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)

3958

if mobj is None:

3959

self._downloader.trouble(u'ERROR: unable to mtvn_uri')

3960

return

3961

mtvn_uri = mobj.group(1)

3962

3963

mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)

3964

if mobj is None:

3965

self._downloader.trouble(u'ERROR: unable to extract content id')

3966

return

3967

content_id = mobj.group(1)

3968

3969

videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri

3970

self.report_extraction(video_id)

3971

request = urllib2.Request(videogen_url)

3972

try:

3973

metadataXml = urllib2.urlopen(request).read()

3974

except (urllib2.URLError, httplib.HTTPException, socket.error), err:

3975

self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))

3976

return

3977

3978

mdoc = xml.etree.ElementTree.fromstring(metadataXml)

3979

renditions = mdoc.findall('.//rendition')

3980

3981

# For now, always pick the highest quality.

3982

rendition = renditions[-1]

3983

3984

try:

3985

_,_,ext = rendition.attrib['type'].partition('/')

3986

format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']

3987

video_url = rendition.find('./src').text

3988

except KeyError:

3989

self._downloader.trouble('Invalid rendition field.')

3990

return

3991

3992

self._downloader.increment_downloads()

3993

info = {

3994

'id': video_id,

3995

'url': video_url,

3996

'uploader': performer,

3997

'title': video_title,

3998

'stitle': _simplify_title(video_title),

3999

'ext': ext,

4000

'format': format,

4001

}

4002

4003

try:

4004

self._downloader.process_info(info)

4005

except UnavailableVideoError, err:

4006

self._downloader.trouble(u'\nERROR: unable to download ' + video_id)

4007

4008

4009

class PostProcessor(object):

4010

"""Post Processor class.

4011

4012

PostProcessor objects can be added to downloaders with their

4013

add_post_processor() method. When the downloader has finished a

4014

successful download, it will take its internal chain of PostProcessors

4015

and start calling the run() method on each one of them, first with

4016

an initial argument and then with the returned value of the previous

4017

PostProcessor.

4018

4019

The chain will be stopped if one of them ever returns None or the end

4020

of the chain is reached.

4021

4022

PostProcessor objects follow a "mutual registration" process similar

4023

to InfoExtractor objects.

4024

"""

4025

4026

_downloader = None

4027

4028

def __init__(self, downloader=None):

4029

self._downloader = downloader

4030

4031

def set_downloader(self, downloader):

4032

"""Sets the downloader for this PP."""

4033

self._downloader = downloader

4034

4035

def run(self, information):

4036

"""Run the PostProcessor.

4037

4038

The "information" argument is a dictionary like the ones

4039

composed by InfoExtractors. The only difference is that this

4040

one has an extra field called "filepath" that points to the

4041

downloaded file.

4042

4043

When this method returns None, the postprocessing chain is

4044

stopped. However, this method may return an information

4045

dictionary that will be passed to the next postprocessing

4046

object in the chain. It can be the one it received after

4047

changing some fields.

4048

4049

In addition, this method may raise a PostProcessingError

4050

exception that will be taken into account by the downloader

4051

it was called from.

4052

"""

4053

return information # by default, do nothing

4054

4055

class AudioConversionError(BaseException):

4056

def __init__(self, message):

4057

self.message = message

4058

4059

class FFmpegExtractAudioPP(PostProcessor):

4060

4061

def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):

4062

PostProcessor.__init__(self, downloader)

4063

if preferredcodec is None:

4064

preferredcodec = 'best'

4065

self._preferredcodec = preferredcodec

4066

self._preferredquality = preferredquality

4067

self._keepvideo = keepvideo

4068

4069

@staticmethod

4070

def get_audio_codec(path):

4071

try:

4072

cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]

4073

handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)

4074

output = handle.communicate()[0]

4075

if handle.wait() != 0:

4076

return None

4077

except (IOError, OSError):

4078

return None

4079

audio_codec = None

4080

for line in output.split('\n'):

4081

if line.startswith('codec_name='):

4082

audio_codec = line.split('=')[1].strip()

4083

elif line.strip() == 'codec_type=audio' and audio_codec is not None:

4084

return audio_codec

4085

return None

4086

4087

@staticmethod

4088

def run_ffmpeg(path, out_path, codec, more_opts):

4089

if codec is None:

4090

acodec_opts = []

4091

else:

4092

acodec_opts = ['-acodec', codec]

4093

cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]

4094

try:

4095

p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

4096

stdout,stderr = p.communicate()

4097

except (IOError, OSError):

4098

e = sys.exc_info()[1]

4099

if isinstance(e, OSError) and e.errno == 2:

4100

raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')

4101

else:

4102

raise e

4103

if p.returncode != 0:

4104

msg = stderr.strip().split('\n')[-1]

4105

raise AudioConversionError(msg)

4106

4107

def run(self, information):

4108

path = information['filepath']

4109

4110

filecodec = self.get_audio_codec(path)

4111

if filecodec is None:

4112

self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')

4113

return None

4114

4115

more_opts = []

4116

if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):

4117

if self._preferredcodec == 'm4a' and filecodec == 'aac':

4118

# Lossless, but in another container

4119

acodec = 'copy'

4120

extension = self._preferredcodec

4121

more_opts = ['-absf', 'aac_adtstoasc']

4122

elif filecodec in ['aac', 'mp3', 'vorbis']:

4123

# Lossless if possible

4124

acodec = 'copy'

4125

extension = filecodec

4126

if filecodec == 'aac':

4127

more_opts = ['-f', 'adts']

4128

if filecodec == 'vorbis':

4129

extension = 'ogg'

4130

else:

4131

# MP3 otherwise.

4132

acodec = 'libmp3lame'

4133

extension = 'mp3'

4134

more_opts = []

4135

if self._preferredquality is not None:

4136

more_opts += ['-ab', self._preferredquality]

4137

else:

4138

# We convert the audio (lossy)

4139

acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]

4140

extension = self._preferredcodec

4141

more_opts = []

4142

if self._preferredquality is not None:

4143

more_opts += ['-ab', self._preferredquality]

4144

if self._preferredcodec == 'aac':

4145

more_opts += ['-f', 'adts']

4146

if self._preferredcodec == 'm4a':

4147

more_opts += ['-absf', 'aac_adtstoasc']

4148

if self._preferredcodec == 'vorbis':

4149

extension = 'ogg'

4150

if self._preferredcodec == 'wav':

4151

extension = 'wav'

4152

more_opts += ['-f', 'wav']

4153

4154

prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups

4155

new_path = prefix + sep + extension

4156

self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)

4157

try:

4158

self.run_ffmpeg(path, new_path, acodec, more_opts)

4159

except:

4160

etype,e,tb = sys.exc_info()

4161

if isinstance(e, AudioConversionError):

4162

self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)

4163

else:

4164

self._downloader.to_stderr(u'ERROR: error running ffmpeg')

4165

return None

4166

4167

# Try to update the date time for extracted audio file.

4168

if information.get('filetime') is not None:

4169

try:

4170

os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))

4171

except:

4172

self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

4173

4174

if not self._keepvideo:

4175

try:

4176

os.remove(_encodeFilename(path))

4177

except (IOError, OSError):

4178

self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

4179

return None

4180

4181

information['filepath'] = new_path

4182

return information

4183

4184

4185

def updateSelf(downloader, filename):

4186

''' Update the program file with the latest version from the repository '''

4187

# Note: downloader only used for options

4188

if not os.access(filename, os.W_OK):

4189

sys.exit('ERROR: no write permissions on %s' % filename)

4190

4191

downloader.to_screen(u'Updating to latest version...')

4192

4193

try:

4194

try:

4195

urlh = urllib.urlopen(UPDATE_URL)

4196

newcontent = urlh.read()

4197

4198

vmatch = re.search("__version__ = '([^']+)'", newcontent)

4199

if vmatch is not None and vmatch.group(1) == __version__:

4200

downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')

4201

return

4202

finally:

4203

urlh.close()

4204

except (IOError, OSError), err:

4205

sys.exit('ERROR: unable to download latest version')

4206

4207

try:

4208

outf = open(filename, 'wb')

4209

try:

4210

outf.write(newcontent)

4211

finally:

4212

outf.close()

4213

except (IOError, OSError), err:

4214

sys.exit('ERROR: unable to overwrite current version')

4215

4216

downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')

4217

4218

def parseOpts():

4219

def _readOptions(filename_bytes):

4220

try:

4221

optionf = open(filename_bytes)

4222

except IOError:

4223

return [] # silently skip if file is not present

4224

try:

4225

res = []

4226

for l in optionf:

4227

res += shlex.split(l, comments=True)

4228

finally:

4229

optionf.close()

4230

return res

4231

4232

def _format_option_string(option):

4233

''' ('-o', '--option') -> -o, --format METAVAR'''

4234

4235

opts = []

4236

4237

if option._short_opts: opts.append(option._short_opts[0])

4238

if option._long_opts: opts.append(option._long_opts[0])

4239

if len(opts) > 1: opts.insert(1, ', ')

4240

4241

if option.takes_value(): opts.append(' %s' % option.metavar)

4242

4243

return "".join(opts)

4244

4245

def _find_term_columns():

4246

columns = os.environ.get('COLUMNS', None)

4247

if columns:

4248

return int(columns)

4249

4250

try:

4251

sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

4252

out,err = sp.communicate()

4253

return int(out.split()[1])

4254

except:

4255

pass

4256

return None

4257

4258

max_width = 80

4259

max_help_position = 80

4260

4261

# No need to wrap help messages if we're on a wide console

4262

columns = _find_term_columns()

4263

if columns: max_width = columns

4264

4265

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)

4266

fmt.format_option_strings = _format_option_string

4267

4268

kw = {

4269

'version' : __version__,

4270

'formatter' : fmt,

4271

'usage' : '%prog [options] url [url...]',

4272

'conflict_handler' : 'resolve',

4273

}

4274

4275

parser = optparse.OptionParser(**kw)

4276

4277

# option groups

4278

general = optparse.OptionGroup(parser, 'General Options')

4279

selection = optparse.OptionGroup(parser, 'Video Selection')

4280

authentication = optparse.OptionGroup(parser, 'Authentication Options')

4281

video_format = optparse.OptionGroup(parser, 'Video Format Options')

4282

postproc = optparse.OptionGroup(parser, 'Post-processing Options')

4283

filesystem = optparse.OptionGroup(parser, 'Filesystem Options')

4284

verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

4285

4286

general.add_option('-h', '--help',

4287

action='help', help='print this help text and exit')

4288

general.add_option('-v', '--version',

4289

action='version', help='print program version and exit')

4290

general.add_option('-U', '--update',

4291

action='store_true', dest='update_self', help='update this program to latest version')

4292

general.add_option('-i', '--ignore-errors',

4293

action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)

4294

general.add_option('-r', '--rate-limit',

4295

dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')

4296

general.add_option('-R', '--retries',

4297

dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

4298

general.add_option('--dump-user-agent',

4299

action='store_true', dest='dump_user_agent',

4300

help='display the current browser identification', default=False)

4301

general.add_option('--list-extractors',

4302

action='store_true', dest='list_extractors',

4303

help='List all supported extractors and the URLs they would handle', default=False)

4304

4305

selection.add_option('--playlist-start',

4306

dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)

4307

selection.add_option('--playlist-end',

4308

dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)

4309

selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')

4310

selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

4311

selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

4312

4313

authentication.add_option('-u', '--username',

4314

dest='username', metavar='USERNAME', help='account username')

4315

authentication.add_option('-p', '--password',

4316

dest='password', metavar='PASSWORD', help='account password')

4317

authentication.add_option('-n', '--netrc',

4318

action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

4319

4320

4321

video_format.add_option('-f', '--format',

4322

action='store', dest='format', metavar='FORMAT', help='video format code')

4323

video_format.add_option('--all-formats',

4324

action='store_const', dest='format', help='download all available video formats', const='all')

4325

video_format.add_option('--prefer-free-formats',

4326

action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')

4327

video_format.add_option('--max-quality',

4328

action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')

4329

video_format.add_option('-F', '--list-formats',

4330

action='store_true', dest='listformats', help='list all available formats (currently youtube only)')

4331

4332

4333

verbosity.add_option('-q', '--quiet',

4334

action='store_true', dest='quiet', help='activates quiet mode', default=False)

4335

verbosity.add_option('-s', '--simulate',

4336

action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)

4337

verbosity.add_option('--skip-download',

4338

action='store_true', dest='skip_download', help='do not download the video', default=False)

4339

verbosity.add_option('-g', '--get-url',

4340

action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)

4341

verbosity.add_option('-e', '--get-title',

4342

action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)

4343

verbosity.add_option('--get-thumbnail',

4344

action='store_true', dest='getthumbnail',

4345

help='simulate, quiet but print thumbnail URL', default=False)

4346

verbosity.add_option('--get-description',

4347

action='store_true', dest='getdescription',

4348

help='simulate, quiet but print video description', default=False)

4349

verbosity.add_option('--get-filename',

4350

action='store_true', dest='getfilename',

4351

help='simulate, quiet but print output filename', default=False)

4352

verbosity.add_option('--get-format',

4353

action='store_true', dest='getformat',

4354

help='simulate, quiet but print output format', default=False)

4355

verbosity.add_option('--no-progress',

4356

action='store_true', dest='noprogress', help='do not print progress bar', default=False)

4357

verbosity.add_option('--console-title',

4358

action='store_true', dest='consoletitle',

4359

help='display progress in console titlebar', default=False)

4360

verbosity.add_option('-v', '--verbose',

4361

action='store_true', dest='verbose', help='print various debugging information', default=False)

4362

4363

4364

filesystem.add_option('-t', '--title',

4365

action='store_true', dest='usetitle', help='use title in file name', default=False)

4366

filesystem.add_option('-l', '--literal',

4367

action='store_true', dest='useliteral', help='use literal title in file name', default=False)

4368

filesystem.add_option('-A', '--auto-number',

4369

action='store_true', dest='autonumber',

4370

help='number downloaded files starting from 00000', default=False)

4371

filesystem.add_option('-o', '--output',

4372

dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')

4373

filesystem.add_option('-a', '--batch-file',

4374

dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')

4375

filesystem.add_option('-w', '--no-overwrites',

4376

action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)

4377

filesystem.add_option('-c', '--continue',

4378

action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)

4379

filesystem.add_option('--no-continue',

4380

action='store_false', dest='continue_dl',

4381

help='do not resume partially downloaded files (restart from beginning)')

4382

filesystem.add_option('--cookies',

4383

dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')

4384

filesystem.add_option('--no-part',

4385

action='store_true', dest='nopart', help='do not use .part files', default=False)

4386

filesystem.add_option('--no-mtime',

4387

action='store_false', dest='updatetime',

4388

help='do not use the Last-modified header to set the file modification time', default=True)

4389

filesystem.add_option('--write-description',

4390

action='store_true', dest='writedescription',

4391

help='write video description to a .description file', default=False)

4392

filesystem.add_option('--write-info-json',

4393

action='store_true', dest='writeinfojson',

4394

help='write video metadata to a .info.json file', default=False)

4395

4396

4397

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,

4398

help='convert video files to audio-only files (requires ffmpeg and ffprobe)')

4399

postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',

4400

help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')

4401

postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',

4402

help='ffmpeg audio bitrate specification, 128k by default')

4403

postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,

4404

help='keeps the video file on disk after the post-processing; the video is erased by default')

4405

4406

4407

parser.add_option_group(general)

4408

parser.add_option_group(selection)

4409

parser.add_option_group(filesystem)

4410

parser.add_option_group(verbosity)

4411

parser.add_option_group(video_format)

4412

parser.add_option_group(authentication)

4413

parser.add_option_group(postproc)

4414

4415

xdg_config_home = os.environ.get('XDG_CONFIG_HOME')

4416

if xdg_config_home:

4417

userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')

4418

else:

4419

userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')

4420

argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]

4421

opts, args = parser.parse_args(argv)

4422

4423

return parser, opts, args

4424

4425

def gen_extractors():

4426

""" Return a list of an instance of every supported extractor.

4427

The order does matter; the first extractor matched is the one handling the URL.

4428

"""

4429

youtube_ie = YoutubeIE()

4430

google_ie = GoogleIE()

4431

yahoo_ie = YahooIE()

4432

return [

4433

YoutubePlaylistIE(youtube_ie),

4434

YoutubeUserIE(youtube_ie),

4435

YoutubeSearchIE(youtube_ie),

4436

youtube_ie,

4437

MetacafeIE(youtube_ie),

4438

DailymotionIE(),

4439

google_ie,

4440

GoogleSearchIE(google_ie),

4441

PhotobucketIE(),

4442

yahoo_ie,

4443

YahooSearchIE(yahoo_ie),

4444

DepositFilesIE(),

4445

FacebookIE(),

4446

BlipTVIE(),

4447

VimeoIE(),

4448

MyVideoIE(),

4449

ComedyCentralIE(),

4450

EscapistIE(),

4451

CollegeHumorIE(),

4452

XVideosIE(),

4453

SoundcloudIE(),

4454

InfoQIE(),

4455

MixcloudIE(),

4456

StanfordOpenClassroomIE(),

4457

MTVIE(),

4458

4459

GenericIE()

4460

]

4461

4462

def _real_main():

4463

parser, opts, args = parseOpts()

4464

4465

# Open appropriate CookieJar

4466

if opts.cookiefile is None:

4467

jar = cookielib.CookieJar()

4468

else:

4469

try:

4470

jar = cookielib.MozillaCookieJar(opts.cookiefile)

4471

if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):

4472

jar.load()

4473

except (IOError, OSError), err:

4474

sys.exit(u'ERROR: unable to open cookie file')

4475

4476

# Dump user agent

4477

if opts.dump_user_agent:

4478

print std_headers['User-Agent']

4479

sys.exit(0)

4480

4481

# Batch file verification

4482

batchurls = []

4483

if opts.batchfile is not None:

4484

try:

4485

if opts.batchfile == '-':

4486

batchfd = sys.stdin

4487

else:

4488

batchfd = open(opts.batchfile, 'r')

4489

batchurls = batchfd.readlines()

4490

batchurls = [x.strip() for x in batchurls]

4491

batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]

4492

except IOError:

4493

sys.exit(u'ERROR: batch file could not be read')

4494

all_urls = batchurls + args

4495

4496

# General configuration

4497

cookie_processor = urllib2.HTTPCookieProcessor(jar)

4498

proxy_handler = urllib2.ProxyHandler()

4499

opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())

4500

urllib2.install_opener(opener)

4501

socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

4502

4503

if opts.verbose:

4504

print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

4505

4506

extractors = gen_extractors()

4507

4508

if opts.list_extractors:

4509

for ie in extractors:

4510

print(ie.IE_NAME)

4511

matchedUrls = filter(lambda url: ie.suitable(url), all_urls)

4512

all_urls = filter(lambda url: url not in matchedUrls, all_urls)

4513

for mu in matchedUrls:

4514

print(u' ' + mu)

4515

sys.exit(0)

4516

4517

# Conflicting, missing and erroneous options

4518

if opts.usenetrc and (opts.username is not None or opts.password is not None):

4519

parser.error(u'using .netrc conflicts with giving username/password')

4520

if opts.password is not None and opts.username is None:

4521

parser.error(u'account username missing')

4522

if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):

4523

parser.error(u'using output template conflicts with using title, literal title or auto number')

4524

if opts.usetitle and opts.useliteral:

4525

parser.error(u'using title conflicts with using literal title')

4526

if opts.username is not None and opts.password is None:

4527

opts.password = getpass.getpass(u'Type account password and press return:')

4528

if opts.ratelimit is not None:

4529

numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)

4530

if numeric_limit is None:

4531

parser.error(u'invalid rate limit specified')

4532

opts.ratelimit = numeric_limit

4533

if opts.retries is not None:

4534

try:

4535

opts.retries = long(opts.retries)

4536

except (TypeError, ValueError), err:

4537

parser.error(u'invalid retry count specified')

4538

try:

4539

opts.playliststart = int(opts.playliststart)

4540

if opts.playliststart <= 0:

4541

raise ValueError(u'Playlist start must be positive')

4542

except (TypeError, ValueError), err:

4543

parser.error(u'invalid playlist start number specified')

4544

try:

4545

opts.playlistend = int(opts.playlistend)

4546

if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):

4547

raise ValueError(u'Playlist end must be greater than playlist start')

4548

except (TypeError, ValueError), err:

4549

parser.error(u'invalid playlist end number specified')

4550

if opts.extractaudio:

4551

if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:

4552

parser.error(u'invalid audio format specified')

4553

4554

# File downloader

4555

fd = FileDownloader({

4556

'usenetrc': opts.usenetrc,

4557

'username': opts.username,

4558

'password': opts.password,

4559

'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),

4560

'forceurl': opts.geturl,

4561

'forcetitle': opts.gettitle,

4562

'forcethumbnail': opts.getthumbnail,

4563

'forcedescription': opts.getdescription,

4564

'forcefilename': opts.getfilename,

4565

'forceformat': opts.getformat,

4566

'simulate': opts.simulate,

4567

'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),

4568

'format': opts.format,

4569

'format_limit': opts.format_limit,

4570

'listformats': opts.listformats,

4571

'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))

4572

or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')

4573

or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')

4574

or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')

4575

or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')

4576

or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')

4577

or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')

4578

or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')

4579

or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')

4580

or u'%(id)s.%(ext)s'),

4581

'ignoreerrors': opts.ignoreerrors,

4582

'ratelimit': opts.ratelimit,

4583

'nooverwrites': opts.nooverwrites,

4584

'retries': opts.retries,

4585

'continuedl': opts.continue_dl,

4586

'noprogress': opts.noprogress,

4587

'playliststart': opts.playliststart,

4588

'playlistend': opts.playlistend,

4589

'logtostderr': opts.outtmpl == '-',

4590

'consoletitle': opts.consoletitle,

4591

'nopart': opts.nopart,

4592

'updatetime': opts.updatetime,

4593

'writedescription': opts.writedescription,

4594

'writeinfojson': opts.writeinfojson,

4595

'matchtitle': opts.matchtitle,

4596

'rejecttitle': opts.rejecttitle,

4597

'max_downloads': opts.max_downloads,

4598

'prefer_free_formats': opts.prefer_free_formats,

4599

'verbose': opts.verbose,

4600

})

4601

for extractor in extractors:

4602

fd.add_info_extractor(extractor)

4603

4604

# PostProcessors

4605

if opts.extractaudio:

4606

fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

4607

4608

# Update version

4609

if opts.update_self:

4610

updateSelf(fd, sys.argv[0])

4611

4612

# Maybe do nothing

4613

if len(all_urls) < 1:

4614

if not opts.update_self:

4615

parser.error(u'you must provide at least one URL')

4616

else:

4617

sys.exit()

4618

4619

try:

4620

retcode = fd.download(all_urls)

4621

except MaxDownloadsReached:

4622

fd.to_screen(u'--max-download limit reached, aborting.')

4623

retcode = 101

4624

4625

# Dump cookie jar if requested

4626

if opts.cookiefile is not None:

4627

try:

4628

jar.save()

4629

except (IOError, OSError), err:

4630

sys.exit(u'ERROR: unable to save cookie jar')

4631

4632

sys.exit(retcode)

4633

4634

def main():

4635

try:

4636

_real_main()

4637

except DownloadError:

4638

sys.exit(1)

4639

except SameFileError:

4640

sys.exit(u'ERROR: fixed output name but more than one file to download')

4641

except KeyboardInterrupt:

4642

sys.exit(u'\nERROR: Interrupted by user')

4643

4644

if __name__ == '__main__':

4645

main()

4646

4647

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: