~facundo/encuentro/trunk

« back to all changes in this revision

Viewing changes to server/get_bacua_episodes.py

Committer: Facundo Batista
Date: 2017-06-23 00:35:18 UTC
Merged to: This revision was merged to the branch mainline in revision 296.
Revision ID: facundo@taniquetil.com.ar-20170623003518-sv60l0eib3rjx882

Huge renewal refactor.

files added:
run

tests/ej-dqsv-11.swf

tests/ej-encuen-bestvideo-1.html

tests/ej-encuen-bestvideo-2.html

tests/ej-encuen-list-1.html

tests/ej-encuen-program-1.html

tests/ej-encuen-program-2.html

tests/ej-encuen-program-3.html

tests/ej-encuen-series-1.html

tests/images/swf_image_11ab.jpeg

tests/images/swf_image_11ac.jpeg

tests/images/swf_image_11lr.jpeg

tests/images/swf_image_11sv.jpeg

files removed:
server/get_cda_episodes.py

server/scrapers_cda.py

tests/ej-cda-main-1.json

tests/ej-encuen-programa_1.html

tests/ej-encuen-programa_2.html

tests/test_cda_scrapers.py

files modified:
.bzrignore

bin/encuentro

encuentro/main.py

encuentro/network.py

encuentro/ui/main.py

encuentro/update.py

requirements_py3.txt

server/get_bacua_episodes.py

server/get_conect_episodes.py

server/get_dqsv_episodes.py

server/get_encuen_episodes.py

server/helpers.py

server/scrapers_conect.py

server/scrapers_dqsv.py

server/scrapers_encuen.py

server/srv_logger.py

test

tests/test_bacua_scrapers.py

tests/test_conect_scrapers.py

tests/test_dqsv_scrapers.py

tests/test_encuen_scrapers.py

tests/test_helpers.py

Show diffs side-by-side

added added

removed removed

server/get_bacua_episodes.py

# -*- coding: utf8 -*-

# This program is free software: you can redistribute it and/or modify it

# under the terms of the GNU General Public License version 3, as published

import logging

import re

import sys

import urllib2

from urllib import request

from bs4 import BeautifulSoup

def scrap_list_page(html):

"""Scrap the list page."""

pagina = re.compile('<p class="info_resultado_busca">([^"]*)</p>')

pagina = re.compile(b'<p class="info_resultado_busca">([^"]*)</p>')

m = pagina.search(html).group(1)

s = re.sub('<[^<]+?>', '', m)

t = re.compile('[0-9]+[0-9]')

s = re.sub(b'<[^<]+?>', b'', m)

t = re.compile(b'[0-9]+[0-9]')

h = t.search(s).group(0)

s = int(h) + 1

lista = []

def get_list_pages():

"""Get list of pages."""

logger.info("Getting list of pages")

response = urllib2.urlopen(PAGE_URL)

response = request.urlopen(PAGE_URL)

html = response.read()

lista = scrap_list_page(html)

logger.info(" got %d", len(lista))

"""Scrap the page."""

contents = []

sanitized = helpers.sanitize(html)

soup = BeautifulSoup(sanitized)

soup = BeautifulSoup(sanitized, "html.parser")

for i in soup.findAll("div", {"class": "video_muestra_catalogo"}):

for a_node in i.find_all("a"):

onclick = a_node.get("onclick", "")

def get_content(page_url):

"""Get content from a page."""

logger.info("Getting info for page %r", page_url)

u = urllib2.urlopen(page_url)

u = request.urlopen(page_url)

html = u.read()

contents = scrap_page(html)

logger.info(" got %d contents", len(contents))

Older »