# Copyright 2012-2017 Facundo Batista
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 3, as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranties of
# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
# PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
#
# For further info, check https://launchpad.net/encuentro
"""Main server process to get all info from BACUA web site."""
import logging
import re
import sys
from urllib import request
from bs4 import BeautifulSoup
# we execute this script from inside the directory; pylint: disable=W0403
import helpers
import srv_logger
PAGE_URL = (
    "http://catalogo.bacua.gob.ar/"
    "catalogo.php?buscador=&ordenamiento=title&idTematica=0&page=%s"
)
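# Example of a fully expanded list-page URL (page number substituted in):
#   http://catalogo.bacua.gob.ar/catalogo.php?buscador=&ordenamiento=title&idTematica=0&page=1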
BACKEND = "http://backend.bacua.gob.ar/video.php?v=_%s"
IMG_URL = 'http://backend.bacua.gob.ar/img.php?idvideo=%s'
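# duration regex for the catalog markup; defined here but not used in this
# script, so "duration" is left as "?" in the scraped items below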
DURACION_REG = re.compile('</span>([^"]*)</h6>')
logger = logging.getLogger("BACUA")
def scrap_list_page(html):
    """Scrap the list page."""
    # locate the paragraph that reports how many result pages there are
    pagina = re.compile(b'<p class="info_resultado_busca">([^"]*)</p>')
    m = pagina.search(html).group(1)
    # strip any remaining tags, then pull out the (multi-digit) page count
    s = re.sub(b'<[^<]+?>', b'', m)
    t = re.compile(b'[0-9]+[0-9]')
    h = t.search(s).group(0)
    cant = int(h) + 1
    # build the URL for every page, 1 .. page count
    lista = []
    for i in range(1, cant):
        lista.append(PAGE_URL % i)
    return lista

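# For instance, if the results paragraph reports 17 pages, scrap_list_page
# returns [PAGE_URL % 1, ..., PAGE_URL % 17] (the count here is
# illustrative, not taken from the real site).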
@helpers.retryable(logger)
def get_list_pages():
    """Get list of pages."""
    logger.info("Getting list of pages")
    # request the first page; any page serves to read the total page count
    response = request.urlopen(PAGE_URL % 1)
    html = response.read()
    lista = scrap_list_page(html)
    logger.info(" got %d", len(lista))
    return lista

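# Approximate shape of one catalog entry, inferred from the selectors in
# scrap_page below; the real BACUA markup may differ in details:
#
#   <div class="video_muestra_catalogo">
#     <h4>Program title</h4>
#     <h5 class="sinopsis_cat">Synopsis text</h5>
#     <a onclick="javascript:verVideo(...)">...</a>
#     <ul>
#       <li>...</li>
#       <li><a href="...=SOMEID">...</a></li>
#     </ul>
#   </div>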
def scrap_page(html):
    """Scrap the page."""
    contents = []
    sanitized = helpers.sanitize(html)
    soup = BeautifulSoup(sanitized, "html.parser")
    for i in soup.find_all("div", {"class": "video_muestra_catalogo"}):
        # only keep entries that actually link to a video: look for an
        # anchor whose onclick launches the player (for/else: the else
        # runs when the loop finishes without hitting the break)
        for a_node in i.find_all("a"):
            onclick = a_node.get("onclick", "")
            if onclick.startswith("javascript:verVideo"):
                break
        else:
            # video not really present for this program
            continue

        title = i.h4.contents[0].title().strip()
        _sinop_cat = i.find("h5", {"class": "sinopsis_cat"}).contents
        sinopsis = _sinop_cat[0] if _sinop_cat else u""

        # the video id is the value after the "=" in the second item's link
        id_video = i.find_all("li")[1].a['href'].split("=")[1]
        image_url = IMG_URL % (id_video,)
        video_url = BACKEND % (id_video,)
        d = {"duration": "?", "channel": "Bacua", "section": "Micro",
             "description": sinopsis, "title": title, "url": video_url,
             "episode_id": 'bacua_' + id_video, "image_url": image_url,
             "season": None}
        contents.append(d)
    return contents

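# A scraped item ends up looking like this (id and texts illustrative):
#   {"duration": "?", "channel": "Bacua", "section": "Micro",
#    "description": "Some synopsis", "title": "Some Title",
#    "url": "http://backend.bacua.gob.ar/video.php?v=_12345",
#    "episode_id": "bacua_12345",
#    "image_url": "http://backend.bacua.gob.ar/img.php?idvideo=12345",
#    "season": None}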
@helpers.retryable(logger)
def get_content(page_url):
    """Get content from a page."""
    logger.info("Getting info for page %r", page_url)
    u = request.urlopen(page_url)
    html = u.read()
    contents = scrap_page(html)
    logger.info(" got %d contents", len(contents))
    return contents

def get_all_data():
    """Get everything."""
    all_programs = []
    for page_url in get_list_pages():
        contents = get_content(page_url)
        all_programs.extend(contents)
    logger.info("Done! Total programs: %d", len(all_programs))
    return all_programs

def main():
    """Entry Point."""
    all_data = get_all_data()
    helpers.save_file("bacua-v05", all_data)


if __name__ == '__main__':
    shy = len(sys.argv) > 1 and sys.argv[1] == '--shy'
    srv_logger.setup_log(shy)
    main()