1
1
#!/usr/bin/env python
2
3
__license__ = 'GPL v3'
3
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
4
__docformat__ = 'restructuredtext en'
4
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
10
from calibre.web.feeds.news import BasicNewsRecipe
13
class NYTimesMobile(BasicNewsRecipe):
15
title = 'The New York Times'
16
__author__ = 'Kovid Goyal'
9
from calibre import entity_to_unicode
10
from calibre.web.feeds.recipes import BasicNewsRecipe
11
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
13
class NYTimes(BasicNewsRecipe):
15
title = 'New York Times Top Stories'
17
17
language = _('English')
18
description = 'Daily news from the New York Times (mobile version)'
19
timefmt = ' [%a, %d %b, %Y]'
20
multithreaded_fetch = True
21
max_articles_per_feed = 15
18
description = 'Top Stories from the New York Times'
20
# List of sections typically included in Top Stories. Use a keyword from the
21
# right column in the excludeSectionKeywords[] list to skip downloading that section
24
'business' : 'Business',
25
'diningwine' : 'Dining & Wine',
26
'editorials' : 'Editorials',
28
'magazine' : 'Magazine',
29
'mediaadvertising' : 'Media & Advertising',
30
'newyorkregion' : 'New York/Region',
32
'politics' : 'Politics',
33
'science' : 'Science',
35
'technology' : 'Technology',
36
'topstories' : 'Top Stories',
42
# By default, no sections are skipped.
43
excludeSectionKeywords = []
45
# To skip sections containing the word 'Sports' or 'Dining', use:
46
# excludeSectionKeywords = ['Sports', 'Dining']
48
# Fetch only Business and Technology
49
#excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
51
# Fetch only Top Stories
52
#excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
54
# The maximum number of articles that will be downloaded
55
max_articles_per_feed = 50
58
needs_subscription = True
59
remove_tags_after = dict(attrs={'id':['comments']})
60
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
61
'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
62
'columnGroup','entry-meta','entry-response module','jumpLink','nav',
63
'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
64
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
65
'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
66
'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
68
dict(name=['script', 'noscript', 'style','hr'])]
22
70
no_stylesheets = True
24
.h1 { font-size: x-large; font-weight: bold; font-family: sans-serif; text-align: left }
25
.h2 { font-size: large; font-weight: bold }
26
.credit { font-size: small }
27
.aut { font-weight: bold }
28
.bodycontent { font-family: serif }
32
dict(name='div', attrs={'class':['banner center', 'greyBackBlackTop', 'c bB']}),
33
dict(name='a', href='/main')
36
dict(name='a', attrs={'name': 'bottom'})
39
def image_url_processor(self, baseurl, url):
    """Strip query-style junk from an image URL before it is fetched."""
    # Everything from the first ampersand onward is discarded.
    cleaned = re.sub(r'(&|&).*', '', url)
    return cleaned
71
extra_css = '.headline {text-align:left;}\n\
72
.byline {font:monospace; margin-bottom:0px;}\n\
73
.source {align:left;}\n\
74
.credit {text-align:right;font-size:smaller;}\n'
42
76
def get_browser(self):
    """Return a browser configured with a mobile user-agent."""
    # The recipe scrapes mobile.nytimes.com, which serves the lightweight
    # markup this parser expects only to mobile browsers.
    browser = BasicNewsRecipe.get_browser(mobile_browser=True)
    return browser
45
def download(self, for_lrf=False):
    """Cap the per-feed article count, then delegate to the base class."""
    # Override whatever limit was configured before the fetch starts.
    self.max_articles_per_feed = 10
    return BasicNewsRecipe.download(self, for_lrf=for_lrf)
50
def process_section(self, href):
# NOTE(review): this region of the file is corrupted -- stray line-number
# tokens (51, 54, ...) are interleaved and several statements are missing
# (e.g. the initialisation of `articles`/`append`, the loop binding `x`,
# the pagination `while`, and the final return). Code left byte-identical.
51
# Fetch the mobile section listing; only the query string of `href` is reused.
raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
54
root = html.fromstring(raw)
55
# Collect articles from the page, apparently de-duplicating by title.
for art in self.find_articles(root):
58
if x['title'] == art['title']:
61
if append: articles.append(art)
62
# Follow the "MORE" pagination link to the next page of this section.
more = root.xpath('//a[starts-with(@href, "section") and contains(text(), "MORE")]')
65
href = more[0].get('href')
66
raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
70
def find_articles(self, root):
71
for a in root.xpath('//a[@accesskey]'):
73
if href.startswith('http://'):
77
br = BasicNewsRecipe.get_browser()
78
if self.username is not None and self.password is not None:
79
br.open('http://www.nytimes.com/auth/login')
80
br.select_form(name='login')
81
br['USERID'] = self.username
82
br['PASSWORD'] = self.password
86
def index_to_soup(self, url_or_raw, raw=False):
88
OVERRIDE of class method
89
deals with various page encodings between index and articles
91
def get_the_soup(docEncoding, url_or_raw, raw=False) :
92
if re.match(r'\w+://', url_or_raw):
93
f = self.browser.open(url_or_raw)
97
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
76
url = 'http://mobile.nytimes.com/article' + href[href.find('?'):]+'&single=1',
78
'title': a.text.strip(),
103
if not isinstance(_raw, unicode) and self.encoding:
104
_raw = _raw.decode(docEncoding, 'replace')
105
massage = list(BeautifulSoup.MARKUP_MASSAGE)
106
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
107
return BeautifulSoup(_raw, markupMassage=massage)
110
soup = get_the_soup( self.encoding, url_or_raw )
111
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
112
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
113
if docEncoding == '' :
114
docEncoding = self.encoding
116
if docEncoding != self.encoding :
117
soup = get_the_soup(docEncoding, url_or_raw)
85
121
def parse_index(self):
# NOTE(review): corrupted fragment -- stray line-number tokens are
# interleaved and the trailing `return feeds` appears to be missing.
# Code left byte-identical.
86
# Build the front-page feed first, then one feed per linked section.
raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
87
root = html.fromstring(raw)
88
feeds = [('Latest news', list(self.find_articles(root)))]
90
for a in root.xpath('//a[starts-with(@href, "section")]'):
91
# Strip the trailing raquo marker from the section link text.
title = a.text.replace('»', '').replace(u'\xbb', '').strip()
92
print 'Processing section:', title
93
articles = self.process_section(a.get('href'))
94
feeds.append((title, articles))
98
def postprocess_html(self, soup, first_fetch):
# NOTE(review): corrupted fragment -- the statements between the width
# lookup and the style deletion (and any return) are missing from this
# view. Code left byte-identical.
99
# Normalise images that declare an explicit width attribute.
for img in soup.findAll('img', width=True):
101
width = int(img['width'].replace('px', ''))
109
del img.parent['style']
125
feed = key = 'All Top Stories'
129
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
131
# Fetch the outer table
132
table = soup.find('table')
133
previousTable = table
136
# Find the deepest table containing the stories
138
table = table.find('table')
139
if table.find(text=re.compile('top stories start')) :
140
previousTable = table
143
table = previousTable
146
# There are multiple subtables, find the one containing the stories
147
for block in table.findAll('table') :
148
if block.find(text=re.compile('top stories start')) :
154
# Again there are multiple subtables, find the one containing the stories
155
for storyblock in table.findAll('table') :
156
if storyblock.find(text=re.compile('top stories start')) :
161
skipThisSection = False
163
# Within this table are <font face="times new roman, times, san serif"> entries
164
for tr in storyblock.findAllNext('tr'):
165
if tr.find('span') is not None :
167
sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
168
'times new roman,times, sans serif',
169
'times new roman, times, sans serif']})
175
# Get the Section title
176
for (x,i) in enumerate(sectionblock.contents) :
177
skipThisSection = False
178
# Extract the section title
179
if ('Comment' in str(i.__class__)) :
180
if 'start(name=' in i :
181
section = i[i.find('=')+1:-2]
183
if not self.sections.has_key(section) :
184
skipThisSection = True
187
# Check for excluded section
188
if len(self.excludeSectionKeywords):
189
key = self.sections[section]
190
excluded = re.compile('|'.join(self.excludeSectionKeywords))
191
if excluded.search(key) or articles.has_key(key):
192
if self.verbose : self.log("Skipping section %s" % key)
193
skipThisSection = True
196
# Get the bylines and descriptions
197
if not skipThisSection :
198
for (x,i) in enumerate(sectionblock.contents) :
200
# Extract the bylines and descriptions
201
if (i.string is not None) and \
202
(i.string.strip() > "") and \
203
not ('Comment' in str(i.__class__)) :
205
contentString = i.strip().encode('utf-8')
206
if contentString[0:3] == 'By ' :
207
bylines.append(contentString)
209
descriptions.append(contentString)
211
# Fetch the article titles and URLs
212
articleCount = len(sectionblock.findAll('span'))
213
for (i,span) in enumerate(sectionblock.findAll('span')) :
214
a = span.find('a', href=True)
217
url = re.sub(r'\?.*', '', a['href'])
218
url += '?pagewanted=all'
220
title = self.tag_to_string(a, use_alt=True)
221
# prepend the section name
222
title = self.sections[section] + " · " + title
224
if not isinstance(title, unicode):
225
title = title.decode('utf-8', 'replace')
227
description = descriptions[i]
229
if len(bylines) == articleCount :
234
# Check for duplicates
235
duplicateFound = False
236
if len(articles[feed]) > 1:
237
#print articles[feed]
238
for article in articles[feed] :
239
#print "comparing %s\n %s\n" % (url, article['url'])
240
if url == article['url'] :
241
duplicateFound = True
248
if not articles.has_key(feed):
250
articles[feed].append(
251
dict(title=title, url=url, date=pubdate,
252
description=description, author=author, content=''))
254
ans = self.sort_index_by(ans, {'Top Stories':-1})
255
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
259
def preprocess_html(self, soup):
# NOTE(review): corrupted fragment -- the guard that returns `soup` when no
# refresh meta tag exists appears to be missing; as written,
# `refresh.get(...)` would raise AttributeError when `refresh` is None.
# Code left byte-identical.
260
# Follow a <meta http-equiv="refresh"> redirect to the real article page.
refresh = soup.find('meta', {'http-equiv':'refresh'})
263
# The redirect target is everything after the '=' in the content attribute.
content = refresh.get('content').partition('=')[2]
264
raw = self.browser.open('http://www.nytimes.com'+content).read()
265
# Page bytes are decoded as cp1252, replacing undecodable characters.
return BeautifulSoup(raw.decode('cp1252', 'replace'))
267
def postprocess_html(self, soup, first_fetch):
    """Clean up a fetched NYT article page for e-book output.

    Restructures NYT-specific markup into plain HTML headings and
    emphasis so downstream conversion produces readable output.

    Bug fixed: the original signature was ``def postprocess_html(self,
    soup, True):`` -- ``True`` is not a legal parameter name (SyntaxError).
    The calibre hook passes ``(soup, first_fetch)``, so the parameter is
    restored to ``first_fetch``.  A trailing ``return soup`` is also added,
    as the hook contract expects the processed soup back.
    """
    # Promote the section "kicker" line to a visible <h3> heading.
    kicker = soup.find(True, {'class':'kicker'})
    if kicker is not None:
        h3Tag = Tag(soup, "h3")
        h3Tag.insert(0, self.tag_to_string(kicker))
        kicker.replaceWith(h3Tag)

    # Render photo captions in italics, each followed by a rule.
    for caption in soup.findAll(True, {'class':'caption'}):
        if caption is not None:
            emTag = Tag(soup, "em")
            emTag.insert(0, self.tag_to_string(caption))
            hrTag = Tag(soup, 'hr')
            emTag.insert(1, hrTag)
            caption.replaceWith(emTag)

    # Change the proprietary <nyt_headline> element to a styled <h2>.
    headline = soup.find("nyt_headline")
    if headline is not None:
        h2tag = Tag(soup, "h2")
        h2tag['class'] = "headline"
        h2tag.insert(0, self.tag_to_string(headline))
        headline.replaceWith(h2tag)

    # Demote the page masthead <h1> (used in editorial blogs) to <h3>,
    # dropping its link so it is not clickable in the e-book.
    masthead = soup.find("h1")
    if masthead is not None:
        if masthead.a is not None:
            del(masthead.a['href'])
        h3tag = Tag(soup, "h3")
        h3tag.insert(0, self.tag_to_string(masthead))
        masthead.replaceWith(h3tag)

    # Change <span class="bold"> subheads to real <b> tags.
    for subhead in soup.findAll(True, {'class':'bold'}):
        bTag = Tag(soup, "b")
        bTag.insert(0, self.tag_to_string(subhead))
        subhead.replaceWith(bTag)

    # The postprocess_html hook must return the soup it was given.
    return soup