__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Contains the Base Profiles that can be used to easily create profiles to download
particular websites.
'''

import tempfile, time, calendar, re, operator, atexit, shutil, os

from htmlentitydefs import name2codepoint
from email.utils import formatdate

from calibre import __appname__, iswindows, browser, strftime
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag

class DefaultProfile(object):

    #: The title to use for the LRF file
    title = 'Default Profile'

    #: Maximum number of articles to download from each feed
    max_articles_per_feed = 10

    #: If True process the <description> element of the feed as HTML
    html_description = True

    #: How many days old should the oldest article downloaded from the feeds be
    oldest_article = 7

    #: Recommended frequency at which to download this profile. In days.
    recommended_frequency = 7

    #: Number of levels of links to follow
    max_recursions = 1

    #: Maximum number of files to download
    max_files = 3000

    #: Delay between consecutive downloads in seconds
    delay = 0

    #: Timeout for fetching files from server in seconds
    timeout = 10

    #: The format string for the date shown on the first page
    timefmt = ' [%a %d %b %Y]'

    #: The order of elements to search for a URL when parsing the RSS feed. You
    #: can replace these elements by completely arbitrary elements to customize
    #: feed parsing.
    #: @type: list of strings
    url_search_order = ['guid', 'link']
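
    #: For example, a hypothetical feed that puts the real article link in a
    #: custom <origlink> element could use::
    #:
    #:   url_search_order = ['origlink', 'guid', 'link']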

    #: The format string used to parse the publication date in the RSS feed.
    #: If set to None, some default heuristics are used; these may fail,
    #: in which case set this to the correct string or re-implement
    #: L{DefaultProfile.strptime} in your subclass.
    #: @type: string or None
    pubdate_fmt = None

    #: If True will look for a publication date for each article.
    #: If False assumes the publication date is the current time.
    use_pubdate = True

    #: Max number of characters in the short description.
    #: Used by L{FullContentProfile}
    summary_length = 500

    #: If True, stylesheets are not downloaded and processed.
    #: A convenient flag to disable loading of stylesheets for websites
    #: that have overly complex stylesheets unsuitable for conversion.
    no_stylesheets = False

    #: If False, articles with the same title in the same feed
    #: are not downloaded multiple times
    allow_duplicates = False

    #: If True, the GUI will ask the user for a username and password
    #: to use while downloading
    needs_subscription = False

    #: Specify an override encoding for sites that have an incorrect
    #: charset specification. The most common case is a site specifying
    #: latin1 while actually using cp1252.
    encoding = None

    #: List of regular expressions that determine which links to follow.
    #: If empty, it is ignored.
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
    #: @type: list of strings
    match_regexps = []

    #: List of regular expressions that determine which links to ignore.
    #: If empty, it is ignored.
    #: Only one of L{match_regexps} or L{filter_regexps} should be defined
    #: @type: list of strings
    filter_regexps = []

    #: List of options to pass to html2lrf, to customize conversion
    #: @type: list of strings
    html2lrf_options = []
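
    #: For example (hypothetical flag shown purely for illustration; consult
    #: ``html2lrf --help`` for the actual option list)::
    #:
    #:   html2lrf_options = ['--base-font-size=8']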

    #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
    #: list should be a two element tuple. The first element of the tuple should
    #: be a compiled regular expression and the second a callable that takes
    #: a single match object and returns a string to replace the match.
    #: @type: list of tuples
    preprocess_regexps = []
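
    #: For example, the following rule (illustrative only) would strip inline
    #: <script> blocks from every downloaded page::
    #:
    #:   preprocess_regexps = [
    #:       (re.compile(r'<script.*?</script>', re.DOTALL|re.IGNORECASE),
    #:        lambda match: ''),
    #:   ]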

    # See the built-in profiles for examples of these settings.

    #: The URL of the website
    url = ''

    CDATA_PAT = re.compile(r'<!\[CDATA\[(.*?)\]\]>', re.DOTALL)

    def get_feeds(self):
        '''
        Return a list of RSS feeds to fetch for this profile. Each element of the list
        must be a 2-element tuple of the form (title, url).
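
        For example, a hypothetical two-feed profile might use::

            return [('Front Page', 'http://www.example.com/rss/frontpage.xml'),
                    ('Sports', 'http://www.example.com/rss/sports.xml')]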
        '''
        raise NotImplementedError

    @classmethod
    def print_version(cls, url):
        '''
        Take a URL pointing to an article and return the URL pointing to the
        print version of the article.
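
        For example, a hypothetical site that serves print versions under a
        /printable/ path could be handled with::

            return url.replace('/article/', '/printable/')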
        '''

    @classmethod
    def get_browser(cls):
        '''
        Return a browser instance used to fetch documents from the web.

        If your profile requires that you login first, override this method
        in your subclass. See for example the nytimes profile.
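
        A minimal sketch of such an override, with hypothetical URL, form and
        field names::

            @classmethod
            def get_browser(cls):
                br = DefaultProfile.get_browser()
                br.open('http://www.example.com/login')
                br.select_form(nr=0)   # first form on the login page
                br['username'] = 'user'
                br['password'] = 'pass'
                br.submit()
                return br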
        '''
        return browser()

    def __init__(self, logger, verbose=False, username=None, password=None, lrf=True):
        self.logger = logger
        self.username = username
        self.password = password
        self.verbose = verbose
        self.lrf = lrf
        self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
        self.browser = self.get_browser()
        try:
            self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
        except NotImplementedError:
            pass
        atexit.register(cleanup, self.temp_dir)

    def build_index(self):
        '''Build an RSS based index.html'''
        articles = self.parse_feeds()
        encoding = 'utf-8' if self.encoding is None else self.encoding

        def build_sub_index(title, items):
            ilist = ''
            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
                 u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
            for item in items:
                if not item.has_key('date'):
                    item['date'] = time.strftime('%a, %d %b', time.localtime())
                ilist += li%item
            return u'''\
                <html>
                <body>
                <h2>%(title)s</h2>
                <ul>
                %(items)s
                </ul>
                </body>
                </html>
                '''%dict(title=title, items=ilist.rstrip())

        cnum = 0
        clist = u''
        categories = articles.keys()
        for category in categories:
            cnum += 1
            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
            prefix = 'file:' if iswindows else ''
            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
            src = build_sub_index(category, articles[category])
            open(cfile, 'wb').write(src.encode(encoding))

        title = self.title
        if not isinstance(title, unicode):
            title = unicode(title, 'utf-8', 'replace')
        src = u'''\
            <html>
            <body>
            <h1>%(title)s</h1>
            <div style='text-align: right; font-weight: bold'>%(date)s</div>
            <ul>
            %(categories)s
            </ul>
            </body>
            </html>
            '''%dict(date=strftime('%a, %d %B, %Y'),
                     categories=clist, title=title)
        index = os.path.join(self.temp_dir, 'index.html')
        open(index, 'wb').write(src.encode(encoding))
        return index

    @classmethod
    def tag_to_string(cls, tag, use_alt=True):
        '''
        Convenience method to take a BeautifulSoup Tag and extract the text from it
        recursively, including any CDATA sections and alt tag attributes.
        @param use_alt: If True try to use the alt attribute for tags that don't have any textual content
        @type use_alt: boolean
        @return: A unicode (possibly empty) object
        @rtype: unicode string
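
        For example (illustrative)::

            soup = BeautifulStoneSoup('<b>one <i>two</i></b>')
            DefaultProfile.tag_to_string(soup.find('b')) # returns u'one two'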
        '''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = cls.tag_to_string(item)
                if res:
                    strings.append(res)
                elif use_alt and item.has_key('alt'):
                    strings.append(item['alt'])
        return u''.join(strings)

    def get_article_url(self, item):
        '''
        Return the article URL given an item Tag from a feed, or None if no valid URL is found.
        @type item: BeautifulSoup.Tag
        @param item: A BeautifulSoup Tag instance corresponding to the <item> tag from a feed.
        @rtype: string or None
        '''
        url = None
        for element in self.url_search_order:
            url = item.find(element.lower())
            if url:
                break
        return url

    def parse_feeds(self, require_url=True):
        '''
        Create a list of articles from a list of feeds.
        @param require_url: If True, skip articles that don't have a link to an HTML page with the full article contents.
        @type require_url: boolean
        @return: A dictionary whose keys are feed titles and whose values are each
        a list of dictionaries. Each list contains dictionaries of the form::

            {
             'title'       : article title,
             'url'         : URL of print version,
             'date'        : The publication date of the article as a string,
             'description' : A summary of the article,
             'content'     : The full article (can be an empty string). This is used by L{FullContentProfile}.
            }
        '''
        added_articles = {}
        articles = {}
        feeds = self.get_feeds()
        for title, url in feeds:
            try:
                src = self.browser.open(url).read()
            except Exception, err:
                self.logger.error('Could not fetch feed: %s\nError: %s'%(url, err))
                if self.verbose:
                    self.logger.exception(' ')
                continue

            articles[title] = []
            added_articles[title] = []
            soup = BeautifulStoneSoup(src)
            for item in soup.findAll('item'):
                try:
                    atitle = item.find('title')
                    if not atitle:
                        continue
                    atitle = self.tag_to_string(atitle)

                    pubdate = item.find('pubdate')
                    if not pubdate:
                        pubdate = item.find('dc:date')
                    if not pubdate or not pubdate.string:
                        pubdate = formatdate()
                    pubdate = self.tag_to_string(pubdate)
                    pubdate = pubdate.replace('+0000', 'GMT')

                    url = self.get_article_url(item)
                    url = self.tag_to_string(url)
                    if require_url and not url:
                        self.logger.debug('Skipping article %s as it does not have a link url'%atitle)
                        continue
                    try:
                        purl = self.print_version(url)
                    except Exception, err:
                        self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
                        continue

                    content = item.find('content:encoded')
                    if not content:
                        content = item.find('description')
                    if content:
                        content = self.process_html_description(content, strip_links=False)
                    else:
                        content = ''

                    d = {
                         'title'     : atitle,
                         'url'       : purl,
                         'timestamp' : self.strptime(pubdate) if self.use_pubdate else time.time(),
                         'date'      : pubdate if self.use_pubdate else formatdate(),
                         'content'   : content,
                        }
                    delta = time.time() - d['timestamp']
                    if not self.allow_duplicates:
                        if d['title'] in added_articles[title]:
                            continue
                        added_articles[title].append(d['title'])
                    if delta > self.oldest_article*3600*24:
                        continue
                except Exception, err:
                    if self.verbose:
                        self.logger.exception('Error parsing article:\n%s'%(item,))
                    continue
                try:
                    desc = ''
                    for c in item.findAll('description'):
                        desc = self.tag_to_string(c)
                        if desc:
                            break
                    d['description'] = self.process_html_description(desc) if self.html_description else desc
                except Exception:
                    d['description'] = ''
                articles[title].append(d)
            articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
            articles[title] = articles[title][:self.max_articles_per_feed]
            #for item in articles[title]:
            #    item.pop('timestamp')
            if not articles[title]:
                del articles[title]
        return articles

    def cleanup(self):
        '''
        Called after the LRF file has been generated. Use it to do any cleanup like
        logging out of subscription sites, etc.
        '''
        pass

    @classmethod
    def process_html_description(cls, tag, strip_links=True):
        '''
        Process a <description> tag that contains HTML markup, either
        entity encoded or escaped in a CDATA section.
        '''
        src = '\n'.join(tag.contents) if hasattr(tag, 'contents') else tag
        match = cls.CDATA_PAT.match(src.lstrip())
        if match:
            src = match.group(1)
        replaced_entities = ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']
        for e in replaced_entities:
            ent = '&'+e+';'
            src = src.replace(ent, unichr(name2codepoint[e]))
        if strip_links:
            src = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
        return src

    DAY_MAP        = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
    FULL_DAY_MAP   = dict(Sunday=0, Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6)
    MONTH_MAP      = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
    FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
                          July=7, August=8, September=9, October=10,
                          November=11, December=12)

    @classmethod
    def strptime(cls, src):
        '''
        Take a string and return the date that string represents, in UTC as
        an epoch (i.e. number of seconds since Jan 1, 1970). This function uses
        a bunch of heuristics and is a prime candidate for being overridden in a
        subclass.
        @param src: Timestamp as a string
        @type src: string
        @return: time as an epoch
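
        For example::

            DefaultProfile.strptime('Wed, 02 Oct 2002 13:00:00 GMT') # returns 1033563600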
        '''
        delta = 0
        zone = re.search(r'\s*([+-]\d\d:?\d\d)', src)
        if zone:
            delta = zone.group(1)
            hrs, mins = int(delta[1:3]), int(delta[-2:])
            delta = 60*(hrs*60 + mins) * (-1 if delta.startswith('-') else 1)
            src = src.replace(zone.group(), '')
        if cls.pubdate_fmt is None:
            src = src.strip().split()
            # Replace day and month names with numbers so a single format
            # string can parse both abbreviated and full names
            try:
                src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
            except KeyError:
                src[0] = str(cls.FULL_DAY_MAP[src[0][:-1]])+','
            try:
                src[2] = str(cls.MONTH_MAP[src[2]])
            except KeyError:
                src[2] = str(cls.FULL_MONTH_MAP[src[2]])
            fmt = '%w, %d %m %Y %H:%M:%S'
            src = src[:5] # Discard extra information
            try:
                time_t = time.strptime(' '.join(src), fmt)
            except ValueError:
                time_t = time.strptime(' '.join(src), fmt.replace('%Y', '%y'))
            return calendar.timegm(time_t)-delta
        else:
            return calendar.timegm(time.strptime(src, cls.pubdate_fmt))

    def command_line_options(self):
        args = []
        args.append('--max-recursions='+str(self.max_recursions))
        args.append('--delay='+str(self.delay))
        args.append('--max-files='+str(self.max_files))
        for i in self.match_regexps:
            args.append('--match-regexp="'+i+'"')
        for i in self.filter_regexps:
            args.append('--filter-regexp="'+i+'"')
        return args

class FullContentProfile(DefaultProfile):
    '''
    This profile is designed for feeds that embed the full article content in the RSS file.
    '''

    article_counter = 0

    def build_index(self):
        '''Build an RSS based index.html.'''
        articles = self.parse_feeds(require_url=False)

        def build_sub_index(title, items):
            ilist = ''
            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
                 u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
            for item in items:
                content = item['content']
                if not content:
                    self.logger.debug('Skipping article as it has no content:%s'%item['title'])
                    continue
                item['description'] = cutoff(item['description'], self.summary_length)+'…'
                self.article_counter += 1
                url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter)
                open(url, 'wb').write((u'''\
                    <html>
                    <body>
                    <h2>%s</h2>
                    %s
                    </body>
                    </html>'''%(item['title'], content)).encode('utf-8'))
                item['url'] = url
                ilist += li%item
            return u'''\
                <html>
                <body>
                <h2>%(title)s</h2>
                <ul>
                %(items)s
                </ul>
                </body>
                </html>
                '''%dict(title=title, items=ilist.rstrip())

        cnum = 0
        clist = u''
        categories = articles.keys()
        for category in categories:
            cnum += 1
            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
            prefix = 'file:' if iswindows else ''
            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
            src = build_sub_index(category, articles[category])
            open(cfile, 'wb').write(src.encode('utf-8'))

        src = u'''\
            <html>
            <body>
            <h1>%(title)s</h1>
            <div style='text-align: right; font-weight: bold'>%(date)s</div>
            <ul>
            %(categories)s
            </ul>
            </body>
            </html>
            '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
                     categories=clist, title=self.title)
        index = os.path.join(self.temp_dir, 'index.html')
        open(index, 'wb').write(src.encode('utf-8'))
        return index

def cutoff(src, pos, fuzz=50):
    '''Truncate src just after the entity (;) or tag (>) boundary found within
    fuzz characters after pos, so entities and tags are not cut in half. If no
    boundary is found, truncate at pos.'''
    si = src.find(';', pos)
    if si > 0 and si-pos > fuzz:
        si = -1
    gi = src.find('>', pos)
    if gi > 0 and gi-pos > fuzz:
        gi = -1
    npos = max(si, gi)
    if npos < 0:
        npos = pos
    return src[:npos+1]
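
# For example, cutoff(u'a&amp;b and more text', 3) returns u'a&amp;' -- the
# truncation point is moved to just past the ';' that closes the entity.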

def create_class(src):
    '''Compile the profile source code in src and return the profile class it defines.'''
    environment = {'FullContentProfile': FullContentProfile, 'DefaultProfile': DefaultProfile}
    exec src in environment
    for item in environment.values():
        if hasattr(item, 'build_index'):
            if item.__name__ not in ['DefaultProfile', 'FullContentProfile']:
                return item
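
# A minimal usage sketch (hypothetical file name):
#
#   src = open('my_profile.py', 'rb').read()
#   profile_class = create_class(src)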

def cleanup(tdir):
    try:
        if os.path.isdir(tdir):
            shutil.rmtree(tdir)
    except:
        pass