# Copyright 2012-2017 Facundo Batista
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 3, as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranties of
# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
# PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# For further info, check  https://launchpad.net/encuentro

"""Main server process to get all info from BACUA web site."""

import logging
import re
import sys

from urllib import request

from bs4 import BeautifulSoup

# we execute this script from inside the directory; pylint: disable=W0403
import helpers
import srv_logger

PAGE_URL = (
    "http://catalogo.bacua.gob.ar/"
    "catalogo.php?buscador=&ordenamiento=title&idTematica=0&page=%s"
)
BACKEND = "http://backend.bacua.gob.ar/video.php?v=_%s"
IMG_URL = "http://backend.bacua.gob.ar/img.php?idvideo=%s"

# matches the duration text between the closing </span> and the </h6> tag
# (kept for reference; not used in this module)
DURACION_REG = re.compile('</span>([^"]*)</h6>')

logger = logging.getLogger("BACUA")


def scrap_list_page(html):
    """Scrape the list page, returning the URL of every catalog page."""
    # find the paragraph with the results info and strip any inner tags
    pagina = re.compile(b'<p class="info_resultado_busca">([^"]*)</p>')
    m = pagina.search(html).group(1)
    s = re.sub(b'<[^<]+?>', b'', m)
    # the first number of two or more digits is the quantity of pages
    t = re.compile(b'[0-9]+[0-9]')
    h = t.search(s).group(0)
    cant_pages = int(h)
    # build the URL for each of those pages
    return [PAGE_URL % i for i in range(1, cant_pages + 1)]
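
# A quick sanity check of the above, kept as a comment so nothing runs on
# import; the HTML shape is assumed and the values are illustrative:
#
#     >>> scrap_list_page(b'<p class="info_resultado_busca">'
#     ...                 b'Hay <b>12</b> paginas</p>')
#     [PAGE_URL % 1, PAGE_URL % 2, ..., PAGE_URL % 12]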


@helpers.retryable(logger)
def get_list_pages():
    """Get list of pages."""
    logger.info("Getting list of pages")
    response = request.urlopen(PAGE_URL)
    html = response.read()
    lista = scrap_list_page(html)
    logger.info("    got %d", len(lista))
    return lista


def scrap_page(html):
    """Scrape one catalog page, returning a data dict per episode found."""
    contents = []
    sanitized = helpers.sanitize(html)
    soup = BeautifulSoup(sanitized, "html.parser")
    for i in soup.findAll("div", {"class": "video_muestra_catalogo"}):
        # only keep the entries with a link to really watch the video
        for a_node in i.find_all("a"):
            onclick = a_node.get("onclick", "")
            if onclick.startswith("javascript:verVideo"):
                break
        else:
            # video not really present for this program
            continue

        # the title, the synopsis (may be absent), and the video id, which
        # is taken from the link in the second list item
        title = i.h4.contents[0].title().strip()
        _sinop_cat = i.find("h5", {"class": "sinopsis_cat"}).contents
        sinopsis = _sinop_cat[0] if _sinop_cat else ""
        id_video = i.findAll("li")[1].a["href"].split("=")[1]
        image_url = IMG_URL % (id_video,)
        video_url = BACKEND % (id_video,)

        d = {"duration": "?", "channel": "Bacua", "section": "Micro",
             "description": sinopsis, "title": title, "url": video_url,
             "episode_id": "bacua_" + id_video, "image_url": image_url,
             "season": None}
        contents.append(d)
    return contents
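
# For reference, a rough sketch (assumed, simplified markup) of one catalog
# entry that scrap_page handles:
#
#     <div class="video_muestra_catalogo">
#         <a onclick="javascript:verVideo('...')">...</a>
#         <h4>Some Title</h4>
#         <h5 class="sinopsis_cat">the synopsis</h5>
#         <ul>
#             <li>...</li>
#             <li><a href="ficha.php?idvideo=1234">...</a></li>
#         </ul>
#     </div>
#
# entries without a "javascript:verVideo" link are skipped, and the video id
# ("1234" here) is used to build both the image and the video URLs.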


@helpers.retryable(logger)
def get_content(page_url):
    """Get content from a page."""
    logger.info("Getting info for page %r", page_url)
    u = request.urlopen(page_url)
    html = u.read()
    contents = scrap_page(html)
    logger.info("    got %d contents", len(contents))
    return contents


def get_all_data():
    """Get everything."""
    all_programs = []
    for page_url in get_list_pages():
        contents = get_content(page_url)
        for content in contents:
            all_programs.append(content)
    logger.info("Done! Total programs: %d", len(all_programs))
    return all_programs


def main():
    """Entry Point."""
    all_data = get_all_data()
    helpers.save_file("bacua-v05", all_data)


if __name__ == '__main__':
    # the optional '--shy' flag is forwarded to the logging setup
    shy = len(sys.argv) > 1 and sys.argv[1] == '--shy'
    srv_logger.setup_log(shy)
    main()
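
# Typical invocation (the script filename here is hypothetical):
#
#     python3 get_bacua.py          # scrape everything, then save the file
#     python3 get_bacua.py --shy    # same, with the 'shy' logging setup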