@@ -7 +7 @@
 __docformat__ = "restructuredtext en"
@@ -10 +10 @@
-import logging, os, cStringIO, time, traceback, re, urlparse, sys
+import os, time, traceback, re, urlparse, sys
 from collections import defaultdict
 from functools import partial
 from contextlib import nested, closing
@@ -15 +15 @@
 from PyQt4.Qt import QApplication, QFile, QIODevice
@@ -18 +18 @@
-from calibre import browser, __appname__, iswindows, LoggingInterface, \
+from calibre import browser, __appname__, iswindows, \
        strftime, __version__, preferred_encoding
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre.ebooks.lrf import entity_to_unicode
+from calibre import entity_to_unicode
+from calibre.web import Recipe
 from calibre.ebooks import render_html
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
@@ -63 +64 @@
 #: Delay between consecutive downloads in seconds
@@ +68 @@
+#: Set to newspaper, magazine or blog
+publication_type = 'unknown'
@@ -66 +71 @@
 #: Number of simultaneous downloads. Set to 1 if the server is picky.
 #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
 simultaneous_downloads = 5
@@ -155 +160 @@
 #: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
 filter_regexps = []
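In a recipe these class attributes are simply overridden on the subclass; a minimal, illustrative sketch (the class name, title and feed URL are placeholders, not part of this change)::

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleNews(BasicNewsRecipe):
        title                  = 'Example News'       # placeholder
        publication_type       = 'newspaper'          # newspaper, magazine or blog
        delay                  = 1                    # seconds between downloads
        simultaneous_downloads = 1                    # reduced to 1 anyway when delay > 0
        feeds                  = [('Front page', 'http://example.com/rss')]  # placeholder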
@@ -158 @@
-#: List of options to pass to html2lrf, to customize generation of LRF ebooks.
-html2lrf_options = []
@@ -161 @@
-#: Options to pass to html2epub to customize generation of EPUB ebooks.
-html2epub_options = ''
-#: Options to pass to oeb2mobi to customize generation of MOBI ebooks.
-oeb2mobi_options = ''
@@ +163 @@
+#: Recipe specific options to control the conversion of the downloaded
+#: content into an e-book. These will override any user or plugin specified
+#: values, so only use if absolutely necessary. For example::
+#:   conversion_options = {
+#:     'base_font_size'   : 16,
+#:     'tags'             : 'mytag1,mytag2',
+#:     'title'            : 'My Title',
+#:     'linearize_tables' : True,
+#:   }
@@ +173 @@
+conversion_options = {}
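For recipes that previously tuned output through html2lrf_options and the related attributes removed above, the equivalent knob is now the conversion_options dict; a hedged sketch of the migration, using only option names that appear in this change::

    class ExampleNews(BasicNewsRecipe):
        title = 'Example News'

        # Old style, removed by this change:
        #   html2lrf_options = ['--base-font-size', '12']
        # New style, handed to the conversion pipeline as a dict:
        conversion_options = {
            'base_font_size'   : 12,
            'linearize_tables' : True,
        }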
@@ -166 +175 @@
 #: List of tags to be removed. Specified tags are removed from downloaded HTML.
 #: A tag is specified as a dictionary of the form::
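By way of illustration, recipes usually build those dictionaries with dict(), naming the tag and the attributes to match; a sketch continuing the ExampleNews class above (the class value 'advert' is a placeholder)::

    class ExampleNews(BasicNewsRecipe):
        title = 'Example News'
        # Strip advertising divs and all <script> tags from every article.
        remove_tags = [
            dict(name='div', attrs={'class': 'advert'}),
            dict(name='script'),
        ]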
@@ -230 +239 @@
 #: use :member:`extra_css` in your recipe to customize look and feel.
 template_css = u'''
@@ -233 +242 @@
-font-size: x-small; color: gray; font-family: monospace;
+color: gray; font-family: monospace;
@@ -236 +245 @@
 .article_description {
-font-size: small; font-family: sans; text-indent: 0pt;
+font-family: sans; text-indent: 0pt;
@@ -241 @@
-font-weight: bold; font-size: large;
@@ -245 @@
-font-weight: bold; font-size: large;
@@ -249 +258 @@
-font-family:monospace; font-size:8pt
+font-family:monospace;
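As the note above says, individual recipes should not edit template_css; they set extra_css, which the feeds2index and FeedTemplate changes further down now append to template_css when the index and feed pages are rendered. A small sketch continuing ExampleNews (the rules themselves are illustrative)::

    class ExampleNews(BasicNewsRecipe):
        title = 'Example News'
        extra_css = '''
            .article_description { font-style: italic; }
            h2 { text-transform: uppercase; }
        '''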
@@ -443 +452 @@
 raise NotImplementedError
@@ -445 +454 @@
-def __init__(self, options, parser, progress_reporter):
+def postprocess_book(self, oeb, opts, log):
@@ +456 @@
+Run any needed post processing on the parsed downloaded e-book.
@@ +458 @@
+:param oeb: An OEBBook object
+:param opts: Conversion options
@@ +463 @@
+def __init__(self, options, log, progress_reporter):
@@ -447 +465 @@
 Initialize the recipe.
 :param options: Parsed commandline options
 :param parser: Command line option parser. Used to intelligently merge options.
 :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
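The new postprocess_book() hook runs after the downloaded feeds have been parsed into a book, giving a recipe one last chance to inspect or rewrite it before output. A minimal sketch of an override, assuming only the signature shown above (the body is illustrative and does not modify the book)::

    class ExampleNews(BasicNewsRecipe):
        title = 'Example News'

        def postprocess_book(self, oeb, opts, log):
            # ``oeb`` is the parsed OEBBook, ``opts`` the conversion options.
            # A real recipe would edit ``oeb`` here; this override only logs.
            log.debug('postprocess_book ran for', self.title)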
@@ -452 +471 @@
-LoggingInterface.__init__(self, logging.getLogger('feeds2disk'))
 if not isinstance(self.title, unicode):
     self.title = unicode(self.title, 'utf-8', 'replace')
@@ -456 +474 @@
-for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
-    setattr(self, attr, getattr(options, attr))
+self.debug = options.verbose > 1
+self.output_dir = os.getcwd()
+self.verbose = options.verbose
+self.test = options.test
+self.username = options.username
+self.password = options.password
+self.lrf = options.lrf
+self.include_navbars = not options.no_inline_navbars
@@ -458 +483 @@
 self.output_dir = os.path.abspath(self.output_dir)
@@ -460 +485 @@
 self.max_articles_per_feed = 2
 self.simultaneous_downloads = min(4, self.simultaneous_downloads)
@@ -465 +489 @@
-logging.getLogger('feeds2disk').setLevel(logging.DEBUG)
 self.verbose = True
 self.report_progress = progress_reporter
@@ -469 @@
-self.username = self.password = None
-#: If True optimize downloading for eventual conversion to LRF
@@ -472 @@
-defaults = parser.get_default_values()
@@ -474 @@
-for opt in options.__dict__.keys():
-    if getattr(options, opt) != getattr(defaults, opt, None):
-        setattr(self, opt, getattr(options, opt))
@@ -478 +492 @@
 if isinstance(self.feeds, basestring):
     self.feeds = eval(self.feeds)
 if isinstance(self.feeds, basestring):
@@ -491 +505 @@
 '--timeout', str(self.timeout),
 '--max-recursions', str(self.recursions),
 '--delay', str(self.delay),
-'--timeout', str(self.timeout),
@@ -496 +509 @@
 if self.encoding is not None:
     web2disk_cmdline.extend(['--encoding', self.encoding])
@@ -518 +531 @@
 self.simultaneous_downloads = 1
@@ -520 +533 @@
 self.navbar = templates.NavBarTemplate()
-self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine', '--header', '--encoding', 'utf-8'])
-if '--base-font-size' not in self.html2lrf_options:
-    self.html2lrf_options.extend(['--base-font-size', '12'])
 self.failed_downloads = []
 self.partial_failures = []
@@ -539 +549 @@
 if first_fetch and job_info:
     url, f, a, feed_len = job_info
     body = soup.find('body')
@@ +552 @@
+    if body is not None and self.include_navbars:
@@ -543 +553 @@
         templ = self.navbar.generate(False, f, a, feed_len,
                                      not self.has_single_feed,
                                      url, __appname__,
@@ -567 +577 @@
 res = self.build_index()
 self.report_progress(1, _('Download finished'))
 if self.failed_downloads:
-    self.log_warning(_('Failed to download the following articles:'))
+    self.log.warning(_('Failed to download the following articles:'))
     for feed, article, debug in self.failed_downloads:
-        self.log_warning(article.title+_(' from ')+feed.title)
-        self.log_debug(article.url)
-        self.log_debug(debug)
+        self.log.warning(article.title, 'from', feed.title)
+        self.log.debug(article.url)
+        self.log.debug(debug)
 if self.partial_failures:
-    self.log_warning(_('Failed to download parts of the following articles:'))
+    self.log.warning(_('Failed to download parts of the following articles:'))
     for feed, atitle, aurl, debug in self.partial_failures:
-        self.log_warning(atitle + _(' from ') + feed)
+        self.log.warning(atitle + _(' from ') + feed)
@@ -580 +590 @@
-        self.log_warning(_('\tFailed links:'))
+        self.log.warning(_('\tFailed links:'))
         for l, tb in debug:
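Note the change of calling convention that comes with the move from LoggingInterface to self.log: the old log_warning/log_debug helpers took a single pre-formatted string, while the new log object joins multiple positional arguments, as the rewritten warnings above show. Recipe code that logs, for instance from parse_index, can rely on the same behaviour; a short illustrative method on the ExampleNews sketch::

    def parse_index(self):
        articles = []
        # self.log replaces the old LoggingInterface helpers; debug/warning/
        # error all accept several arguments and join them for output.
        self.log.debug('Building index for', self.title)
        return articles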
@@ -588 +598 @@
 def feeds2index(self, feeds):
     templ = templates.IndexTemplate()
+    css = self.template_css + '\n\n' +(self.extra_css if self.extra_css else '')
     return templ.generate(self.title, self.timefmt, feeds,
-                extra_css=self.extra_css).render(doctype='xhtml')
+                extra_css=css).render(doctype='xhtml')
@@ -594 +605 @@
 def description_limiter(cls, src):
@@ -641 +652 @@
     templ = templates.FeedTemplate()
+    css = self.template_css + '\n\n' +(self.extra_css if self.extra_css else '')
     return templ.generate(feed, self.description_limiter,
-                extra_css=self.extra_css).render(doctype='xhtml')
+                extra_css=css).render(doctype='xhtml')
@@ -646 @@
-def create_logger(self, feed_number, article_number):
-    logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
-    out = cStringIO.StringIO()
-    handler = logging.StreamHandler(out)
-    handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
-    handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
@@ -653 @@
-    handler.setLevel(logging.DEBUG)
-    logger.addHandler(handler)
@@ -657 +658 @@
-def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+def _fetch_article(self, url, dir, f, a, num_of_feeds):
     self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
-    fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
+    fetcher = RecursiveFetcher(self.web2disk_options, self.log,
+                               self.image_map, self.css_map,
+                               (url, f, a, num_of_feeds))
     fetcher.base_dir = dir
     fetcher.current_dir = dir
     fetcher.show_progress = False
@@ -668 +671 @@
 raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
 return res, path, failures
@@ -671 +674 @@
-def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
-    return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+def fetch_article(self, url, dir, f, a, num_of_feeds):
+    return self._fetch_article(url, dir, f, a, num_of_feeds)
@@ -674 +677 @@
-def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds):
-    path = os.path.abspath(self.get_obfuscated_article(url, logger))
+def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
+    path = os.path.abspath(self.get_obfuscated_article(url))
     url = ('file:'+path) if iswindows else ('file://'+path)
-    return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+    return self._fetch_article(url, dir, f, a, num_of_feeds)
@@ -679 +682 @@
-def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
+def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
     templ = templates.EmbeddedContent()
     raw = templ.generate(article).render('html')
     with PersistentTemporaryFile('_feeds2disk.html') as pt:
@@ -684 +687 @@
 url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
-return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+return self._fetch_article(url, dir, f, a, num_of_feeds)
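fetch_obfuscated_article() now calls get_obfuscated_article(url) without a logger, so recipes that set articles_are_obfuscated only need to accept the URL and return the path of a local copy of the page. A hedged sketch (the de-obfuscation step is site specific and omitted; PersistentTemporaryFile comes from calibre.ptempfile)::

    from calibre.ptempfile import PersistentTemporaryFile
    from calibre.web.feeds.news import BasicNewsRecipe

    class ObfuscatedExample(BasicNewsRecipe):
        title = 'Obfuscated Example'
        articles_are_obfuscated = True

        def get_obfuscated_article(self, url):
            br = self.get_browser()
            raw = br.open(url).read()
            # Site specific clean-up of ``raw`` would go here.
            with PersistentTemporaryFile('_obfuscated.html') as pt:
                pt.write(raw)
            # fetch_obfuscated_article() wraps this in os.path.abspath().
            return pt.name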
@@ -688 +691 @@
 def build_index(self):
@@ -733 +735 @@
 func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
             ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
             else self.fetch_article), url)
-req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
+req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                  {}, (f, a), self.article_downloaded,
                  self.error_in_article_download)
@@ -741 +742 @@
 req.article = article
 req.feed_dir = feed_dir
@@ -848 +849 @@
 f.write(html.encode('utf-8'))
 renderer = render_html(hf)
 if renderer.tb is not None:
-    self.logger.warning('Failed to render default cover')
-    self.logger.debug(renderer.tb)
+    self.log.warning('Failed to render default cover')
+    self.log.debug(renderer.tb)
@@ -854 +855 @@
 cover_file.write(renderer.data)
 cover_file.flush()
@@ -861 +862 @@
 mi = MetaInformation(self.title + strftime(self.timefmt), [__appname__])
 mi.publisher = __appname__
 mi.author_sort = __appname__
+mi.publication_type = 'periodical:'+self.publication_type
 opf_path = os.path.join(dir, 'index.opf')
 ncx_path = os.path.join(dir, 'index.ncx')
 opf = OPFCreator(dir, mi)
@@ -870 +872 @@
 manifest.append(os.path.join(dir, 'index.ncx'))
 cpath = getattr(self, 'cover_path', None)
 if cpath is None:
-    pf = PersistentTemporaryFile('_recipe_cover.jpg')
+    pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
     self.default_cover(pf)
@@ -876 +878 @@
 if cpath is not None and os.access(cpath, os.R_OK):
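The mi.publication_type line above ties the new publication_type attribute into the generated OPF metadata: a recipe declaring publication_type = 'newspaper' is recorded as 'periodical:newspaper'. A trivial sketch continuing the examples above::

    class ExampleNewspaper(BasicNewsRecipe):
        title = 'Example Newspaper'
        publication_type = 'newspaper'

    # While the OPF is created, the metadata object ends up with
    #   mi.publication_type == 'periodical:newspaper'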
@@ -891 +893 @@
 for j, a in enumerate(f):
     if getattr(a, 'downloaded', False):
         adir = 'feed_%d/article_%d/'%(num, j)
@@ +899 @@
+        desc = a.text_summary
@@ -894 +902 @@
         entries.append('%sindex.html'%adir)
         po = self.play_order_map.get(entries[-1], None)
@@ -897 +905 @@
         self.play_order_counter += 1
         po = self.play_order_counter
         parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+                        play_order=po, author=auth, description=desc)
@@ -901 +909 @@
         last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
         for sp in a.sub_pages:
             prefix = os.path.commonprefix([opf_path, sp])
@@ -912 +920 @@
 body = soup.find('body')
 if body is not None:
     prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
-    templ = self.navbar.generate(True, num, j, len(f),
-                                 not self.has_single_feed,
-                                 a.orig_url, __appname__, prefix=prefix,
-                                 center=self.center_navbar)
-    elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
-    body.insert(len(body.contents), elem)
+    if self.include_navbars:
+        templ = self.navbar.generate(True, num, j, len(f),
+                                     not self.has_single_feed,
+                                     a.orig_url, __appname__, prefix=prefix,
+                                     center=self.center_navbar)
+        elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+        body.insert(len(body.contents), elem)
 with open(last, 'wb') as fi:
     fi.write(unicode(soup).encode('utf-8'))
@@ -929 +938 @@
 self.play_order_counter += 1
 po = self.play_order_counter
-feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title, play_order=po))
+auth = getattr(f, 'author', None)
@@ +943 @@
+desc = getattr(f, 'description', None)
@@ +946 @@
+feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                           f.title, play_order=po, description=desc, author=auth))
@@ -933 +950 @@
 entries.append('feed_%d/index.html'%0)
 feed_index(0, toc)
@@ -951 +968 @@
 a = request.requestID[1]
@@ -953 +970 @@
 article = request.article
-self.log_debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
+self.log.debug('Downloaded article:', article.title, 'from', article.url)
 article.orig_url = article.url
 article.url = 'article_%d/index.html'%a
 article.downloaded = True
 article.sub_pages = result[1][1:]
 self.jobs_done += 1
-self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
+self.report_progress(float(self.jobs_done)/len(self.jobs),
+        _(u'Article downloaded: %s')%repr(article.title))
@@ -962 +980 @@
 self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))
@@ -964 +982 @@
 def error_in_article_download(self, request, traceback):
     self.jobs_done += 1
-    self.log_error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
-    debug = request.stream.getvalue().decode('utf-8', 'ignore')
-    self.log_debug(debug)
-    self.log_debug(traceback)
+    self.log.error('Failed to download article:', request.article.title,
+                   'from', request.article.url)
+    self.log.debug(traceback)
@@ -971 +988 @@
-    self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
-    self.failed_downloads.append((request.feed, request.article, debug))
+    self.report_progress(float(self.jobs_done)/len(self.jobs),
+            _('Article download failed: %s')%repr(request.article.title))
+    self.failed_downloads.append((request.feed, request.article, traceback))
@@ -974 +992 @@
 def parse_feeds(self):
@@ -1091 +1109 @@
 index = os.path.abspath(self.custom_index())
 url = 'file:'+index if iswindows else 'file://'+index
 self.web2disk_options.browser = self.browser
-fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+fetcher = RecursiveFetcher(self.web2disk_options, self.log)
 fetcher.base_dir = self.output_dir
 fetcher.current_dir = self.output_dir
 fetcher.show_progress = False
@@ -1104 +1122 @@
 keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
@@ -1106 +1124 @@
-def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
+def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
     if self.use_embedded_content:
         self.web2disk_options.keep_only_tags = []
-    return BasicNewsRecipe.fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds)
+    return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)