__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetches the last 7 days of featured articles from slate.com
'''
import re

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
class Slate(BasicNewsRecipe):
    # NOTE(review): some class attributes (e.g. title, timefmt, no_stylesheets)
    # appear to have been dropped from this copy of the file -- restore from
    # upstream if calibre complains.

    # Recipe metadata
    description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
    __author__ = 'GRiker@hotmail.com'
    language = 'en'  # ISO language code; calibre expects a code, not _('English')
    max_articles_per_feed = 40
    simultaneous_downloads = 5

    # Feed-parsing behaviour: let parse_index() fetch article bodies
    use_embedded_content = None

    # HTML pre/post-processing: strip navigation, toolbars, ads, discussion
    # blocks and other non-article chrome before conversion.
    remove_tags = [
        dict(name=['link', 'style']),
        dict(id=['toolbox', 'site_navigation', 'article_bottom_tools_cntr',
                 'article_bottom_tools', 'recommend_tab2', 'bottom_sponsored_links',
                 'fray_article_discussion', 'bizbox_sponsored_links_bottom',
                 'page_rightcol', 'top_banner', 'also_in_slate_bottom', 'articlefooter',
                 'article_top_wedge', 'content-top', 'page-title',
                 'block-today039s-business-press-archives', 'block-blog-roll',
                 'block-also-in-tbm', 'block-most-popular-on-tbm', 'block-the-best-of-tbm',
                 'service-links-bottom', 'comments', 'ft']),
        dict(attrs={'class': ['fray_article_links', 'clearing', 'nav',
                              'service-links service-links-stack', 'yui-b last',
                              'read-more-comments']}),
    ]

    # Same CSS string as the original backslash-continuation form, expressed
    # with implicit string concatenation for readability.
    extra_css = ('.headline {text-align:left;}\n'
                 '.byline {font:monospace; text-align:left; margin-bottom:0pt;}\n'
                 '.dateline {text-align:left; height:0pt;}\n'
                 '.source {align:left;}\n'
                 '.credit {text-align:right;font-size:smaller;}\n')

    baseURL = 'http://slate.com'
def tag_to_strings(self, tag):
    """Return a list of the strings of *tag*'s immediate children
    (companion to BasicNewsRecipe.tag_to_string, which joins them).

    NOTE(review): the accumulator initialisation and return statements were
    missing from this garbled copy and have been reconstructed -- verify
    against the upstream recipe.
    """
    if tag is None:
        return ''
    # A plain string is returned unchanged (py2 basestring covers str/unicode)
    if isinstance(tag, basestring):
        return tag
    strings = []
    for item in tag.contents:
        if isinstance(item, (NavigableString, CData)):
            strings.append(item.string)
        elif isinstance(item, Tag):
            # Nested tags are flattened to their joined string form
            res = self.tag_to_string(item)
            if res:
                strings.append(res)
    return strings
def extract_sections(self):
    """Collect the per-day article <ul> lists from the slate.com front page.

    Side effect: (re)builds self.section_dates, one date string per section,
    consumed later by extract_section_articles().

    NOTE(review): the self.section_dates initialisation, the sections
    accumulator and the return were missing from this garbled copy and have
    been reconstructed -- verify against the upstream recipe.
    """
    soup = self.index_to_soup(self.baseURL)

    # Headline block and the table-of-contents container
    soup_top_stories = soup.find(True, attrs={'class': 'tap2_topic entry-content'})
    soup = soup.find(True, attrs={'id': 'toc_links_container'})

    todays_section = soup.find(True, attrs={'class': 'todaydateline'})
    self.section_dates = []
    # Today's date is appended twice: once for the prepended headline block,
    # once for today's own article list.
    self.section_dates.append(self.tag_to_string(todays_section, use_alt=False))
    self.section_dates.append(self.tag_to_string(todays_section, use_alt=False))

    older_section_dates = soup.findAll(True, attrs={'class': 'maindateline'})
    for older_section in older_section_dates:
        self.section_dates.append(self.tag_to_string(older_section, use_alt=False))

    headline_stories = soup_top_stories.find('ul')
    section_lists = soup.findAll('ul')
    # Prepend the headlines to the first section
    section_lists[0].insert(0, headline_stories)

    sections = []
    for section in section_lists:
        sections.append(section)
    return sections
def extract_section_articles(self, sections_html) :
# Walk the dated section <ul> lists and build calibre's {feed: [article
# dicts]} structure, skipping excluded and duplicate items.
# NOTE(review): the bare numeric lines below are line-number artifacts from
# a garbled extraction, and several statements (e.g. 'articles = {}',
# 'ans = []', the 'continue' after each keyword-exclusion check, and the
# trailing 'return ans') appear to have been dropped. Restore from the
# upstream recipe before relying on this method.
98
soup = self.index_to_soup(str(sections_html))
99
sections = soup.findAll('ul')
104
# self.section_dates (built by extract_sections) is indexed in parallel
# with the <ul> sections found here.
for (i,section) in enumerate(sections) :
106
# Get the section name
107
if section.has_key('id') :
108
key = self.section_dates[i]
114
# Get the section article_list
115
article_list = section.findAll('li')
117
# Keyword blacklists: any match causes the article to be skipped.
excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
118
excludedTitleKeywords = ['Gabfest','Slate V']
119
excludedAuthorKeywords = ['Prudence']
121
# Extract the article attributes
122
for article in article_list :
123
# tag_to_strings yields the <li>'s child strings: title/description/author
bylines = self.tag_to_strings(article)
124
url = article.a['href']
126
full_title = self.tag_to_string(article)
132
# Two strings plus a "Today's Papers" marker: use the canned description
if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 :
133
description = "A summary of what's in the major U.S. newspapers."
135
if len(bylines) == 3 :
136
author = bylines[2].strip()
137
# Strip embedded CR/LF/tab runs and commas from the author string
author = re.sub('[\r][\n][\t][\t\t]','', author)
138
author = re.sub(',','', author)
139
if bylines[1] is not None :
140
description = bylines[1]
141
full_byline = self.tag_to_string(article)
142
if full_byline.find('major U.S. newspapers') > 0 :
143
description = "A summary of what's in the major U.S. newspapers."
146
# Author name split across multiple strings: reassemble it
if len(bylines) > 3 and author is not None:
148
for (i,substring) in enumerate(bylines[3:]) :
149
#print "substring: %s" % substring.encode('cp1252')
150
author += substring.strip()
151
if i < len(bylines[3:]) :
154
# Skip articles whose descriptions contain excluded keywords
155
if description is not None :
156
excluded = re.compile('|'.join(excludedDescriptionKeywords))
157
found_excluded = excluded.search(description)
161
# Skip articles whose title contain excluded keywords
162
if full_title is not None :
163
excluded = re.compile('|'.join(excludedTitleKeywords))
164
#self.log("evaluating full_title: %s" % full_title)
165
found_excluded = excluded.search(full_title)
169
# Skip articles whose author contain excluded keywords
170
if author is not None :
171
excluded = re.compile('|'.join(excludedAuthorKeywords))
172
found_excluded = excluded.search(author)
176
skip_this_article = False
177
# Check to make sure we're not adding a duplicate
# NOTE(review): this loop variable 'article' shadows the outer 'article'
# loop variable -- rename one of them when restoring the missing lines.
178
for article in articles[key] :
179
if article['url'] == url :
180
skip_this_article = True
183
if skip_this_article :
186
# Build the dictionary entry for this article
188
if not articles.has_key(feed) :
190
articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
191
author=author, content=''))
192
# Promote 'newspapers' to top
193
for (i,article) in enumerate(articles[feed]) :
194
if article['description'] is not None :
195
if article['description'].find('newspapers') > 0 :
196
articles[feed].insert(0,articles[feed].pop(i))
199
# Keep only the sections that actually collected articles, then dedupe
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
200
ans = self.remove_duplicates(ans)
203
def flatten_document(self, ans):
    """Collapse a list of (section, articles) pairs into a single
    'All Articles' section, preserving article order.

    NOTE(review): the accumulator initialisation and return were missing
    from this garbled copy and have been restored.
    """
    flat_articles = []
    for section in ans:
        # section[1] is the list of article dicts for that section
        for article in section[1]:
            flat_articles.append(article)
    flat_section = ['All Articles', flat_articles]
    flat_ans = [flat_section]
    return flat_ans
def remove_duplicates(self, ans):
    """Remove articles from later sections whose URL already appeared in an
    earlier section, keeping the first occurrence. Returns the pruned list.

    Fixes: the original deleted by index from the list it was enumerating
    (del subsequent_section[1][k]), which skips the element following each
    deletion; it also lost its 'return ans' in the garbled extraction.
    """
    for (i, section) in enumerate(ans):
        for article in section[1]:
            url = article['url']
            # Rebuild every later section's article list without this URL
            # (in-place slice assignment keeps the same list object).
            for subsequent_section in ans[i + 1:]:
                subsequent_section[1][:] = [
                    a for a in subsequent_section[1] if a['url'] != url
                ]
    return ans
def print_version(self, url):
    """Map an article URL to its single-page ('show all') variant.

    (Only change from the original: removed a stray line-number artifact
    embedded in the method body by the garbled extraction.)
    """
    return url + 'pagenum/all/'
def parse_index(self):
    """Calibre entry point: build and return the feed/article structure.

    Fixes: the final 'return' was missing from this garbled copy; calibre
    requires parse_index() to return the list of (section, articles) pairs.
    """
    sections = self.extract_sections()
    section_list = self.extract_section_articles(sections)
    # Slate reads better as one flat 'All Articles' section
    section_list = self.flatten_document(section_list)
    return section_list
def postprocess_html(self, soup, first_fetch) :
# Per-article HTML cleanup: restyle the department kicker, demote the
# headline, split byline/dateline, and italicise captions.
# NOTE(review): the bare numeric lines below are line-number artifacts from
# a garbled extraction; at least the 'result' accumulator lines in the
# headline loop are missing and must be restored from upstream.
234
# Fix up dept_kicker as <h3><em>
235
dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
236
if dept_kicker is not None :
237
kicker_strings = self.tag_to_strings(dept_kicker)
238
kicker = kicker_strings[2] + kicker_strings[3]
239
# NOTE(review): pattern '.' matches EVERY character, so this empties the
# kicker string entirely -- presumably r'\.' (strip periods) was intended.
kicker = re.sub('.','',kicker)
240
h3Tag = Tag(soup, "h3")
241
emTag = Tag(soup, "em")
242
h3Tag.insert(0, emTag)
243
emTag.insert(0,kicker)
244
dept_kicker.replaceWith(h3Tag)
246
# Change <h1> to <h2>
247
headline = soup.find("h1")
248
if headline is not None :
249
h2tag = Tag(soup, "h2")
250
h2tag['class'] = "headline"
251
strs = self.tag_to_strings(headline)
253
# NOTE(review): 'result' is built across the loop below, but its
# initialisation and accumulation lines were lost in extraction.
for (i,substr) in enumerate(strs) :
255
if i < len(strs) -1 :
257
h2tag.insert(0, result)
258
headline.replaceWith(h2tag)
260
# Fix up the concatenated byline and dateline
261
byline = soup.find(True,attrs={'class':'byline'})
262
if byline is not None :
263
bylineTag = Tag(soup,'div')
264
bylineTag['class'] = 'byline'
265
bylineTag.insert(0,self.tag_to_string(byline))
266
byline.replaceWith(bylineTag)
268
dateline = soup.find(True, attrs={'class':'dateline'})
269
if dateline is not None :
270
datelineTag = Tag(soup, 'div')
271
datelineTag['class'] = 'dateline'
272
datelineTag.insert(0,self.tag_to_string(dateline))
273
dateline.replaceWith(datelineTag)
275
# Change captions to italic, add <hr>
276
for caption in soup.findAll(True, {'class':'caption'}) :
277
if caption is not None:
278
emTag = Tag(soup, "em")
279
emTag.insert(0, '<br />' + self.tag_to_string(caption))
280
hrTag = Tag(soup, 'hr')
281
emTag.insert(1, hrTag)
282
caption.replaceWith(emTag)
286
def postprocess_book(self, oeb, opts, log) :
# After conversion, backfill missing author/description metadata on each
# TOC entry by re-parsing the generated article HTML.
# NOTE(review): the bare numeric lines below are line-number artifacts from
# a garbled extraction; several lines are missing (e.g. the fall-through
# 'return None' branches of the two helpers, the image removal inside
# extract_description, and the line-continuation at 'startswith' is split
# by an artifact). Restore from the upstream recipe.
288
# Helper: pull the byline text out of one manifest item, if present.
def extract_byline(href) :
289
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
290
byline = soup.find(True,attrs={'class':'byline'})
291
if byline is not None:
292
return self.tag_to_string(byline,use_alt=False)
296
# Helper: use the first non-byline/non-dateline paragraph as a
# ~200-character description.
def extract_description(href) :
297
soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
298
paragraphs = soup.findAll('p')
299
for p in paragraphs :
300
if self.tag_to_string(p,use_alt=False).startswith('By ') or \
301
self.tag_to_string(p,use_alt=False).startswith('Posted '):
304
images = p.findAll(True, attrs={'class':'imagewrapper'})
305
for image in images :
307
return self.tag_to_string(p,use_alt=False)[:200] + '...'
311
# Flat TOC: articles are direct children
if oeb.toc.depth() == 2 :
312
for article in oeb.toc :
313
if article.author is None :
314
article.author = extract_byline(article.href)
316
if article.description is None :
317
article.description = extract_description(article.href)
320
# Nested TOC: sections contain articles
elif oeb.toc.depth() == 3 :
321
for section in oeb.toc :
322
for article in section :
323
if article.author is None :
324
article.author = extract_byline(article.href)
326
if article.description is None :
327
article.description = extract_description(article.href)