~facundo/encuentro/trunk

« back to all changes in this revision

Viewing changes to external/youtube-dl/youtube_dl/extractor/bilibili.py

  • Committer: Facundo Batista
  • Date: 2015-12-27 11:27:15 UTC
  • mto: This revision was merged to the branch mainline in revision 274.
  • Revision ID: facundo@taniquetil.com.ar-20151227112715-ztuasdhqm26hycug
Able to download TEDx.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
# coding: utf-8
 
2
from __future__ import unicode_literals
 
3
 
 
4
import re
 
5
 
 
6
from .common import InfoExtractor
 
7
from ..compat import compat_str
 
8
from ..utils import (
 
9
    int_or_none,
 
10
    unescapeHTML,
 
11
    ExtractorError,
 
12
    xpath_text,
 
13
)
 
14
 
 
15
 
 
16
class BiliBiliIE(InfoExtractor):
    """Information extractor for bilibili ``/video/av<id>`` pages.

    The page URL is only used to obtain the numeric video id and the
    optional page number; all metadata and media URLs come from two API
    endpoints (a JSON "view" call and an XML "v_cdn_play" call).
    """

    # The scheme is irrelevant to extraction (the page itself is never
    # fetched), so both http and https are accepted.  The dot before
    # "html" is escaped so that it only matches a literal ".".
    _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+)\.html)?'

    _TESTS = [{
        'url': 'http://www.bilibili.tv/video/av1074402/',
        'md5': '2c301e4dab317596e837c3e7633e7d86',
        'info_dict': {
            'id': '1554319',
            'ext': 'flv',
            'title': '【金坷垃】金泡沫',
            # 308313 ms as reported in ./timelength, converted to seconds.
            'duration': 308,
            'upload_date': '20140420',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
            'timestamp': 1397983878,
            'uploader': '菊子桑',
        },
    }, {
        'url': 'http://www.bilibili.com/video/av1041170/',
        'info_dict': {
            'id': '1041170',
            'title': '【BD1080P】刀语【诸神&异域】',
            'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~',
            'uploader': '枫叶逝去',
            'timestamp': 1396501299,
        },
        'playlist_count': 9,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video (or multi-part video) at *url*.

        Returns a single-video info dict when the API lists exactly one
        ``durl`` segment, otherwise a ``multi_video`` dict whose ``entries``
        are the individual segments.

        Raises ExtractorError (expected=True) when either API endpoint
        reports an error.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # URLs without an explicit /index_N.html refer to the first page.
        page_num = mobj.group('page_num') or '1'

        view_data = self._download_json(
            'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num),
            video_id)
        if 'error' in view_data:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True)

        cid = view_data['cid']
        title = unescapeHTML(view_data['title'])

        doc = self._download_xml(
            'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid,
            cid,
            # 'pages' is only used for the progress note, so a missing key
            # must not abort the extraction.
            'Downloading page %s/%s' % (page_num, view_data.get('pages', '?'))
        )

        if xpath_text(doc, './result') == 'error':
            raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True)

        entries = []

        for durl in doc.findall('./durl'):
            size = xpath_text(durl, ['./filesize', './size'])
            formats = [{
                'url': durl.find('./url').text,
                'filesize': int_or_none(size),
                'ext': 'flv',
            }]
            backup_urls = durl.find('./backup_url')
            if backup_urls is not None:
                for backup_url in backup_urls.findall('./url'):
                    formats.append({'url': backup_url.text})
            # Reverse so the primary URL ends up last in the list, after
            # the backups.
            formats.reverse()

            entries.append({
                'id': '%s_part%s' % (cid, xpath_text(durl, './order')),
                'title': title,
                # Segment length is scaled down by 1000 (ms -> s).
                'duration': int_or_none(xpath_text(durl, './length'), 1000),
                'formats': formats,
            })

        info = {
            'id': compat_str(cid),
            'title': title,
            'description': view_data.get('description'),
            'thumbnail': view_data.get('pic'),
            'uploader': view_data.get('author'),
            'timestamp': int_or_none(view_data.get('created')),
            'view_count': int_or_none(view_data.get('play')),
            # ./timelength is in milliseconds like the per-segment
            # ./length, so apply the same 1000 scale.  Without it the
            # total duration was reported 1000x too large and, for
            # single-part videos, overwrote the correctly scaled
            # segment duration in the update() below.
            'duration': int_or_none(xpath_text(doc, './timelength'), 1000),
        }

        if len(entries) == 1:
            # Single segment: merge the video-level metadata into it and
            # return it as an ordinary video.
            entries[0].update(info)
            return entries[0]
        else:
            # Several segments: expose them as one multi-part video that
            # is identified by the page's av id rather than the cid.
            info.update({
                '_type': 'multi_video',
                'id': video_id,
                'entries': entries,
            })
            return info