4
from calibre.web.feeds.news import BasicNewsRecipe
5
from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
6
from calibre import strftime
10
Still can't figure out why I'm getting strange characters, especially in the Great Movies descriptions in the TOC.
11
Can anyone help me figure that out?
14
2011-02-19: Version 2: Added "Oscars" section and fixed date problem
17
class Ebert(BasicNewsRecipe):
19
__author__ = 'Shane Erstad'
21
description = 'Roger Ebert Movie Reviews'
22
publisher = 'Chicago Sun Times'
25
max_articles_per_feed = 100
27
use_embedded_content = False
29
masthead_url = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
31
remove_empty_feeds = False
32
PREFIX = 'http://rogerebert.suntimes.com'
33
patternReviews = r'<span class="*?movietitle"*?>(.*?)</span>.*?<div class="*?headline"*?>(.*?)</div>(.*?)</div>'
34
patternCommentary = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?COMMENTARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
35
patternPeople = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?PEOPLE.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
36
patternOscars = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?OSCARS.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
37
patternGlossary = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?GLOSSARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
41
conversion_options = {
42
'comment' : description
44
, 'publisher' : publisher
45
, 'language' : language
46
, 'linearize_tables' : True
51
(u'Reviews' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews' )
52
,(u'Commentary' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY')
53
,(u'Great Movies' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08')
54
,(u'People' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE')
55
,(u'Oscars' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=OSCARS')
56
,(u'Glossary' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY')
60
preprocess_regexps = [
61
(re.compile(r'<font.*?>.*?This is a printer friendly.*?</font>.*?<hr>', re.DOTALL|re.IGNORECASE),
67
def print_version(self, url):
    """Return ``url`` with the ``&template=printart`` query parameter appended.

    NOTE(review): presumably this selects the site's printer-friendly
    article page -- confirm against the live site.
    """
    suffix = '&template=printart'
    return ''.join((url, suffix))
70
def parse_index(self):
72
lfeeds = self.get_feeds()
73
for feedobj in lfeeds:
74
feedtitle, feedurl = feedobj
75
self.log('\tFeedurl: ', feedurl)
76
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
78
page = urllib2.urlopen(feedurl).read()
80
if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
81
pattern = self.patternReviews
82
elif feedtitle == 'Commentary':
83
pattern = self.patternCommentary
84
elif feedtitle == 'People':
85
pattern = self.patternPeople
86
elif feedtitle == 'Glossary':
87
pattern = self.patternGlossary
88
elif feedtitle == 'Oscars':
89
pattern = self.patternOscars
92
regex = re.compile(pattern, re.IGNORECASE|re.DOTALL)
94
for match in regex.finditer(page):
95
if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
96
movietitle = match.group(1)
97
thislink = match.group(2)
98
description = match.group(3)
99
elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary' or feedtitle == 'Oscars':
100
thislink = match.group(1)
101
description = match.group(2)
105
for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')):
106
thisurl = self.PREFIX + link['href']
107
thislinktext = self.tag_to_string(link)
109
if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
110
thistitle = movietitle
111
elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary' or feedtitle == 'Oscars':
112
thistitle = thislinktext
118
pattern2 = r'AID=\/(.*?)\/'
119
reg2 = re.compile(pattern2, re.IGNORECASE|re.DOTALL)
120
match2 = reg2.search(thisurl)
122
c = time.strptime(match2.group(1),"%Y%m%d")
123
mydate=strftime("%A, %B %d, %Y", c)
125
mydate = strftime("%A, %B %d, %Y")
130
,'date' :' [' + mydate + ']'
132
,'description':description
134
totalfeeds.append((feedtitle, articles))