~facundo/encuentro/trunk

« back to all changes in this revision

Viewing changes to external/youtube-dl/youtube_dl/extractor/sexu.py

  • Committer: Facundo Batista
  • Date: 2015-12-27 11:27:15 UTC
  • mto: This revision was merged to the branch mainline in revision 274.
  • Revision ID: facundo@taniquetil.com.ar-20151227112715-ztuasdhqm26hycug
Able to download TEDx.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
from __future__ import unicode_literals
 
2
 
 
3
import re
 
4
 
 
5
from .common import InfoExtractor
 
6
 
 
7
 
 
8
class SexuIE(InfoExtractor):
    """Extractor for video pages whose URL matches ``_VALID_URL`` (sexu.com)."""

    _VALID_URL = r'https?://(?:www\.)?sexu\.com/(?P<id>\d+)'
    _TEST = {
        'url': 'http://sexu.com/961791/',
        'md5': 'ff615aca9691053c94f8f10d96cd7884',
        'info_dict': {
            'id': '961791',
            'ext': 'mp4',
            'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b',
            'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54',
            'categories': list,  # NSFW
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:https?://.*\.jpg$',
            'age_limit': 18,
        }
    }

    @staticmethod
    def _parse_height(label):
        """Return the leading integer of a quality label, or None.

        "720p" -> 720, "1080p" -> 1080; a label that does not start with
        a digit (e.g. "HD") yields None instead of raising.
        """
        mobj = re.match(r'\d+', label)
        return int(mobj.group()) if mobj else None

    def _real_extract(self, url):
        """Download the page at *url* and build the info dict for its video.

        Returns a dict with the keys 'id', 'title', 'description',
        'thumbnail', 'categories', 'formats' and 'age_limit'.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The page embeds the available streams as a "sources: [...]" array;
        # grab the text between the brackets.
        quality_arr = self._search_regex(
            r'sources:\s*\[([^\]]+)\]', webpage, 'format string')

        formats = []
        for file_url, label in re.findall(
                r'"file":"([^"]+)","label":"([^"]+)"', quality_arr):
            fmt = {
                # Strip the backslashes used to escape the URL in the page.
                'url': file_url.replace('\\', ''),
                'format_id': label,
            }
            # Slicing the first three characters would turn "1080p" into 108
            # and crash on non-numeric labels, so parse the digit prefix and
            # omit the height when the label carries none.
            height = self._parse_height(label)
            if height is not None:
                fmt['height'] = height
            formats.append(fmt)
        self._sort_formats(formats)

        title = self._html_search_regex(
            r'<title>([^<]+)\s*-\s*Sexu\.Com</title>', webpage, 'title')

        description = self._html_search_meta(
            'description', webpage, 'description')

        # The thumbnail is optional: fatal=False keeps extraction going
        # when the pattern is not found.
        thumbnail = self._html_search_regex(
            r'image:\s*"([^"]+)"',
            webpage, 'thumbnail', fatal=False)

        # Categories come from the comma-separated "keywords" meta tag.
        categories_str = self._html_search_meta(
            'keywords', webpage, 'categories')
        categories = (
            None if categories_str is None
            else categories_str.split(','))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'categories': categories,
            'formats': formats,
            'age_limit': 18,
        }