2
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
4
Profile to download CNN
7
from calibre.ebooks.lrf.web.profiles import DefaultProfile
9
class CNN(DefaultProfile):
13
# Date pattern in strftime-style notation: day, abbreviated month name and
# four-digit year, wrapped in brackets with a leading space.
# NOTE(review): where the base profile applies this is not visible here.
timefmt = ' [%d %b %Y]'
# NOTE(review): presumably tells the base profile that feed descriptions
# contain HTML -- confirm against DefaultProfile.
html_description = True
18
# Regex clean-up rules for downloaded article HTML.  Each entry is a
# (compiled pattern, replacement callable) pair.  Every pattern is compiled
# with IGNORECASE and DOTALL, so matching ignores case and ``.`` also spans
# newlines.
# NOTE(review): the rules are presumably applied in list order by the base
# profile -- confirm against DefaultProfile.
preprocess_regexps = [
    (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
    for pattern, replacement in [
        # drop everything between <head> and the <title> element
        (r'<head>.*?<title', lambda match: '<head><title'),
        # drop everything between </title> and </head>
        (r'</title>.*?</head>', lambda match: '</title></head>'),
        # drop everything from <body up to and including the opening
        # "<!--Article ..." marker
        (r'<body.*?<\!\-\-Article.*?>', lambda match: ''),
        # drop everything after the "<!--Article End-->" marker, keeping
        # only the closing </body>
        (r'<\!\-\-Article End\-\->.*?</body>', lambda match: '</body>'),
        # drop story highlights
        (r'(</h\d>)<ul>.*?</ul>', lambda match: match.group(1)),
        # sports uses h2 for main title and h1 for subtitle (???)
        # switch these around
        (r'<h2>(.*?)</h2><h1>(.*?)</h1>',
         lambda match: '<h1>' + match.group(1) + '</h1><h2>' + match.group(2) + '</h2>'),
        # drop 'watch more' links
        (r'<span class="cnnEmbeddedMosLnk">.*?</span>', lambda match: ''),
        # drop sports photos
        (r'(<div class="cnnstorybody">).*?(<p)',
         lambda match: match.group(1) + match.group(2)),
        # drop table formatting
        (r'</?table.*?>|</?tr.*?>|</?td.*?>', lambda match: ''),
        # drop extra business links
        (r'<div class="cnnendofstorycontent".*?>.*?</div>', lambda match: ''),
        # drop business 'to top' link
        (r'<a href="#TOP">.*?</a>', lambda match: ''),
    ]
]
32
def print_version(self, url):
    """Return the printthis.clickability.com "print this" address for ``url``.

    :param url: address of the article page, as a string.
    :return: the print-service address with ``url`` appended verbatim as the
        value of its ``url`` query parameter (no escaping is applied).
    """
    return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url
37
('Top News', 'http://rss.cnn.com/rss/cnn_topstories.rss'),
38
('World', 'http://rss.cnn.com/rss/cnn_world.rss'),
39
('U.S.', 'http://rss.cnn.com/rss/cnn_us.rss'),
40
('Sports', 'http://rss.cnn.com/rss/si_topstories.rss'),
41
('Business', 'http://rss.cnn.com/rss/money_latest.rss'),
42
('Politics', 'http://rss.cnn.com/rss/cnn_allpolitics.rss'),
43
('Law', 'http://rss.cnn.com/rss/cnn_law.rss'),
44
('Technology', 'http://rss.cnn.com/rss/cnn_tech.rss'),
45
('Science & Space', 'http://rss.cnn.com/rss/cnn_space.rss'),
46
('Health', 'http://rss.cnn.com/rss/cnn_health.rss'),
47
('Entertainment', 'http://rss.cnn.com/rss/cnn_showbiz.rss'),
48
('Education', 'http://rss.cnn.com/rss/cnn_education.rss'),
49
('Offbeat', 'http://rss.cnn.com/rss/cnn_offbeat.rss'),
50
('Most Popular', 'http://rss.cnn.com/rss/cnn_mostpopular.rss')