1
## Copyright (C) 2008 B.Scott Wxby [bswxby] &
2
## Copyright (C) 2007 David Chen SonyReader<at>DaveChen<dot>org
4
## This program is free software; you can redistribute it and/or modify
5
## it under the terms of the GNU General Public License as published by
6
## the Free Software Foundation; either version 2 of the License, or
7
## (at your option) any later version.
9
## This program is distributed in the hope that it will be useful,
10
## but WITHOUT ANY WARRANTY; without even the implied warranty of
11
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
## GNU General Public License for more details.
14
## Version 0.3-2008_2_28
15
## Based on WIRED.py by David Chen, 2007, and newsweek.py, bbc.py, nytimes.py by Kovid Goyal
16
## https://calibre.kovidgoyal.net/wiki/UserProfiles
19
## >web2lrf --user-profile nasa.py
20
## Comment out the RSS feeds you don't want in the last section below
23
## NASA [YearMonthDate Time].lrf
26
Custom User Profile to download RSS News Feeds and Articles from NASA.gov
31
from calibre.ebooks.lrf.web.profiles import DefaultProfile
33
class NASA(DefaultProfile):
37
timefmt = ' [%Y%b%d %H%M]'
38
html_description = True
41
## Don't grab articles more than 30 days old
44
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
46
## Fix the encoding to UTF-8
47
(r'<meta http-equiv="Content-Type" content="text/html; charset=(\S+)"', lambda match : match.group().replace(match.group(1), 'UTF-8')),
49
## Remove any banners/links/ads/cruft before the body of the article.
50
(r'<body.*?((<div id="article_body">)|(<div id="st-page-maincontent">)|(<div id="containermain">)|(<p class="ap-story-p">)|(<!-- img_nav -->))', lambda match: '<body><div>'),
52
## Remove any links/ads/comments/cruft from the end of the body of the article.
53
(r'((<!-- end article content -->)|(<div id="st-custom-afterpagecontent">)|(<p class="ap-story-p">©)|(<div class="entry-footer">)|(<div id="see_also">)|(<p>Via <a href=)|(<div id="ss_nav">)).*?</html>', lambda match : '</div></body></html>'),
55
## Correctly embed in-line images by removing the surrounding javascript that will be ignored in the conversion
56
(r'<a.*?onclick.*?>.*?(<img .*?>)', lambda match: match.group(1),),
58
## This removes header and footer information from each print version.
59
(re.compile(r'<!-- Top Header starts -->.*?<!-- Body starts -->', re.IGNORECASE | re.DOTALL), lambda match : '<New Stuff>'),
60
(re.compile(r'<hr align="center" width="200"><p align="center">.*?<!-- Press Release standard text ends -->', re.IGNORECASE | re.DOTALL), lambda match : '<New Stuff>'),
61
(re.compile(r'<!-- Top Header starts -->.*?<!---->', re.IGNORECASE | re.DOTALL), lambda match : '<New Stuff>'),
63
## This removes the "download image" of various sizes from the Image of the day.
64
(re.compile(r'(?is)<div id="download_image_box_print">.*?<div id="caption_region_print">'), lambda match : '<New Stuff>'),
70
## NASA's print pages differ from the regular article pages only by the ending "_prt.htm" (instead of ".html"), so the URL ending is swapped below.
72
def print_version(self, url):
    """Return the URL of the printer-friendly version of an article.

    The print page is addressed by swapping the page's '.html' ending
    for '_prt.htm'.

    url -- address of the regular article page.

    Returns the rewritten URL, or `url` unchanged when its path does not
    end in '.html'.
    """
    ## Split off any '?query' or '#fragment' so that only the page path is
    ## examined and the trailing part is carried over untouched.
    cut = len(url)
    for marker in ('?', '#'):
        pos = url.find(marker)
        if pos != -1 and pos < cut:
            cut = pos
    path, rest = url[:cut], url[cut:]

    ## Swap the extension only; a blanket str.replace would also rewrite
    ## any '.html' appearing earlier in the path or inside the query.
    suffix = '.html'
    if path.endswith(suffix):
        path = path[:-len(suffix)] + '_prt.htm'
    return path + rest
75
## Comment out the feeds you don't want retrieved.
76
## Or add any new RSS feed URLs here; they are sorted alphabetically when converted to LRF
77
## If you want one of these at the top, put a space in front of the name.
82
(' Breaking News', 'http://www.nasa.gov/rss/breaking_news.rss'),
83
('Image of the Day', 'http://www.nasa.gov/rss/image_of_the_day.rss'),
84
('Moon and Mars Exploration', 'http://www.nasa.gov/rss/moon_mars.rss'),
85
('Shuttle and Station News', 'http://www.nasa.gov/rss/shuttle_station.rss'),
86
('Solar System News', 'http://www.nasa.gov/rss/solar_system.rss'),
87
('Universe News', 'http://www.nasa.gov/rss/universe.rss'),
88
('Earth News', 'http://www.nasa.gov/rss/earth.rss'),
89
('Aeronautics News', 'http://www.nasa.gov/rss/aeronautics.rss'),