~ubuntu-branches/ubuntu/karmic/calibre/karmic

« back to all changes in this revision

Viewing changes to src/calibre/web/feeds/news.py

  • Committer: Bazaar Package Importer
  • Author(s): Martin Pitt
  • Date: 2009-07-30 12:49:41 UTC
  • mfrom: (1.3.2 upstream)
  • Revision ID: james.westby@ubuntu.com-20090730124941-qjdsmri25zt8zocn
Tags: 0.6.3+dfsg-0ubuntu1
* New upstream release. Please see http://calibre.kovidgoyal.net/new_in_6/
  for the list of new features and changes.
* remove_postinstall.patch: Update for new version.
* build_debug.patch: Does not apply any more, disable for now. Might not be
  necessary any more.
* debian/copyright: Fix reference to versionless GPL.
* debian/rules: Drop obsolete dh_desktop call.
* debian/rules: Add workaround for weird Python 2.6 setuptools behaviour of
  putting compiled .so files into src/calibre/plugins/calibre/plugins
  instead of src/calibre/plugins.
* debian/rules: Drop hal fdi moving, new upstream version does not use hal
  any more. Drop hal dependency, too.
* debian/rules: Install udev rules into /lib/udev/rules.d.
* Add debian/calibre.preinst: Remove unmodified
  /etc/udev/rules.d/95-calibre.rules on upgrade.
* debian/control: Bump Python dependencies to 2.6, since upstream needs
  it now.

Show diffs side-by-side

added added

removed removed

Lines of Context:
7
7
__docformat__ = "restructuredtext en"
8
8
 
9
9
 
10
 
import logging, os, cStringIO, time, traceback, re, urlparse, sys
 
10
import os, time, traceback, re, urlparse, sys
11
11
from collections import defaultdict
12
12
from functools import partial
13
13
from contextlib import nested, closing
15
15
from PyQt4.Qt import QApplication, QFile, QIODevice
16
16
 
17
17
 
18
 
from calibre import browser, __appname__, iswindows, LoggingInterface, \
 
18
from calibre import browser, __appname__, iswindows, \
19
19
                    strftime, __version__, preferred_encoding
20
20
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
21
21
from calibre.ebooks.metadata.opf2 import OPFCreator
22
 
from calibre.ebooks.lrf import entity_to_unicode
 
22
from calibre import entity_to_unicode
 
23
from calibre.web import Recipe
23
24
from calibre.ebooks import render_html
24
25
from calibre.ebooks.metadata.toc import TOC
25
26
from calibre.ebooks.metadata import MetaInformation
31
32
                              PersistentTemporaryDirectory
32
33
 
33
34
 
34
 
class BasicNewsRecipe(object, LoggingInterface):
 
35
class BasicNewsRecipe(Recipe):
35
36
    '''
36
37
    Abstract base class that contains logic needed in all feed fetchers.
37
38
    '''
63
64
    #: Delay between consecutive downloads in seconds
64
65
    delay                  = 0
65
66
 
 
67
    #: Publication type
 
68
    #: Set to newspaper, magazine or blog
 
69
    publication_type = 'unknown'
 
70
 
66
71
    #: Number of simultaneous downloads. Set to 1 if the server is picky.
67
72
    #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
68
73
    simultaneous_downloads = 5
155
160
    #: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
156
161
    filter_regexps        = []
157
162
 
158
 
    #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
159
 
    html2lrf_options      = []
160
 
 
161
 
    #: Options to pass to html2epub to customize generation of EPUB ebooks.
162
 
    html2epub_options     = ''
163
 
    #: Options to pass to oeb2mobi to customize generation of MOBI ebooks.
164
 
    oeb2mobi_options     = ''
 
163
    #: Recipe specific options to control the conversion of the downloaded
 
164
    #: content into an e-book. These will override any user or plugin specified
 
165
    #: values, so only use if absolutely necessary. For example::
 
166
    #:   conversion_options = {
 
167
    #:     'base_font_size'   : 16,
 
168
    #:     'tags'             : 'mytag1,mytag2',
 
169
    #:     'title'            : 'My Title',
 
170
    #:     'linearize_tables' : True,
 
171
    #:   }
 
172
    #:
 
173
    conversion_options = {}
165
174
 
166
175
    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
167
176
    #: A tag is specified as a dictionary of the form::
230
239
    #: use :member:`extra_css` in your recipe to customize look and feel.
231
240
    template_css = u'''
232
241
            .article_date {
233
 
                font-size: x-small; color: gray; font-family: monospace;
 
242
                color: gray; font-family: monospace;
234
243
            }
235
244
 
236
245
            .article_description {
237
 
                font-size: small; font-family: sans; text-indent: 0pt;
 
246
                font-family: sans; text-indent: 0pt;
238
247
            }
239
248
 
240
249
            a.article {
241
 
                font-weight: bold; font-size: large;
 
250
                font-weight: bold;
242
251
            }
243
252
 
244
253
            a.feed {
245
 
                font-weight: bold; font-size: large;
 
254
                font-weight: bold;
246
255
            }
247
256
 
248
257
            .navbar {
249
 
                font-family:monospace; font-size:8pt
 
258
                font-family:monospace;
250
259
            }
251
260
'''
252
261
 
429
438
        '''
430
439
        raise NotImplementedError
431
440
 
432
 
    def get_obfuscated_article(self, url, logger):
 
441
    def get_obfuscated_article(self, url):
433
442
        '''
434
443
        If you set :member:`articles_are_obfuscated` this method is called with
435
444
        every article URL. It should return the path to a file on the filesystem
442
451
        '''
443
452
        raise NotImplementedError
444
453
 
445
 
    def __init__(self, options, parser, progress_reporter):
 
454
    def postprocess_book(self, oeb, opts, log):
 
455
        '''
 
456
        Run any needed post processing on the parsed downloaded e-book.
 
457
 
 
458
        :param oeb: An OEBBook object
 
459
        :param opts: Conversion options
 
460
        '''
 
461
        pass
 
462
 
 
463
    def __init__(self, options, log, progress_reporter):
446
464
        '''
447
465
        Initialize the recipe.
448
466
        :param options: Parsed commandline options
449
467
        :param parser:  Command line option parser. Used to intelligently merge options.
450
468
        :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
451
469
        '''
452
 
        LoggingInterface.__init__(self, logging.getLogger('feeds2disk'))
 
470
        self.log = log
453
471
        if not isinstance(self.title, unicode):
454
472
            self.title = unicode(self.title, 'utf-8', 'replace')
455
473
 
456
 
        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
457
 
            setattr(self, attr, getattr(options, attr))
 
474
        self.debug = options.verbose > 1
 
475
        self.output_dir = os.getcwd()
 
476
        self.verbose = options.verbose
 
477
        self.test = options.test
 
478
        self.username = options.username
 
479
        self.password = options.password
 
480
        self.lrf = options.lrf
 
481
        self.include_navbars = not options.no_inline_navbars
 
482
 
458
483
        self.output_dir = os.path.abspath(self.output_dir)
459
484
        if options.test:
460
485
            self.max_articles_per_feed = 2
461
486
            self.simultaneous_downloads = min(4, self.simultaneous_downloads)
462
487
 
463
 
 
464
488
        if self.debug:
465
 
            logging.getLogger('feeds2disk').setLevel(logging.DEBUG)
466
489
            self.verbose = True
467
490
        self.report_progress = progress_reporter
468
491
 
469
 
        self.username = self.password = None
470
 
        #: If True optimize downloading for eventual conversion to LRF
471
 
        self.lrf = False
472
 
        defaults = parser.get_default_values()
473
 
 
474
 
        for opt in options.__dict__.keys():
475
 
            if getattr(options, opt) != getattr(defaults, opt, None):
476
 
                setattr(self, opt, getattr(options, opt))
477
 
 
478
492
        if isinstance(self.feeds, basestring):
479
493
            self.feeds = eval(self.feeds)
480
494
            if isinstance(self.feeds, basestring):
491
505
            '--timeout', str(self.timeout),
492
506
            '--max-recursions', str(self.recursions),
493
507
            '--delay', str(self.delay),
494
 
            '--timeout', str(self.timeout),
495
508
            ]
496
509
        if self.encoding is not None:
497
510
            web2disk_cmdline.extend(['--encoding', self.encoding])
518
531
            self.simultaneous_downloads = 1
519
532
 
520
533
        self.navbar = templates.NavBarTemplate()
521
 
        self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine', '--header', '--encoding', 'utf-8'])
522
 
        if '--base-font-size' not in self.html2lrf_options:
523
 
            self.html2lrf_options.extend(['--base-font-size', '12'])
524
534
        self.failed_downloads = []
525
535
        self.partial_failures = []
526
536
 
539
549
        if first_fetch and job_info:
540
550
            url, f, a, feed_len = job_info
541
551
            body = soup.find('body')
542
 
            if body is not None:
 
552
            if body is not None and self.include_navbars:
543
553
                templ = self.navbar.generate(False, f, a, feed_len,
544
554
                                             not self.has_single_feed,
545
555
                                             url, __appname__,
555
565
        return self.postprocess_html(soup, first_fetch)
556
566
 
557
567
 
558
 
    def download(self, for_lrf=False):
 
568
    def download(self):
559
569
        '''
560
570
        Download and pre-process all articles from the feeds in this recipe.
561
571
        This method should be called only once on a particular Recipe instance.
567
577
            res = self.build_index()
568
578
            self.report_progress(1, _('Download finished'))
569
579
            if self.failed_downloads:
570
 
                self.log_warning(_('Failed to download the following articles:'))
 
580
                self.log.warning(_('Failed to download the following articles:'))
571
581
                for feed, article, debug in self.failed_downloads:
572
 
                    self.log_warning(article.title+_(' from ')+feed.title)
573
 
                    self.log_debug(article.url)
574
 
                    self.log_debug(debug)
 
582
                    self.log.warning(article.title, 'from', feed.title)
 
583
                    self.log.debug(article.url)
 
584
                    self.log.debug(debug)
575
585
            if self.partial_failures:
576
 
                self.log_warning(_('Failed to download parts of the following articles:'))
 
586
                self.log.warning(_('Failed to download parts of the following articles:'))
577
587
                for feed, atitle, aurl, debug in self.partial_failures:
578
 
                    self.log_warning(atitle + _(' from ') + feed)
579
 
                    self.log_debug(aurl)
580
 
                    self.log_warning(_('\tFailed links:'))
 
588
                    self.log.warning(atitle + _(' from ') + feed)
 
589
                    self.log.debug(aurl)
 
590
                    self.log.warning(_('\tFailed links:'))
581
591
                    for l, tb in debug:
582
 
                        self.log_warning(l)
583
 
                        self.log_debug(tb)
 
592
                        self.log.warning(l)
 
593
                        self.log.debug(tb)
584
594
            return res
585
595
        finally:
586
596
            self.cleanup()
587
597
 
588
598
    def feeds2index(self, feeds):
589
599
        templ = templates.IndexTemplate()
 
600
        css = self.template_css + '\n\n' +(self.extra_css if self.extra_css else '')
590
601
        return templ.generate(self.title, self.timefmt, feeds,
591
 
                              extra_css=self.extra_css).render(doctype='xhtml')
 
602
                              extra_css=css).render(doctype='xhtml')
592
603
 
593
604
    @classmethod
594
605
    def description_limiter(cls, src):
639
650
 
640
651
 
641
652
        templ = templates.FeedTemplate()
 
653
        css = self.template_css + '\n\n' +(self.extra_css if self.extra_css else '')
642
654
        return templ.generate(feed, self.description_limiter,
643
 
                              extra_css=self.extra_css).render(doctype='xhtml')
644
 
 
645
 
 
646
 
    def create_logger(self, feed_number, article_number):
647
 
        logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
648
 
        out = cStringIO.StringIO()
649
 
        handler = logging.StreamHandler(out)
650
 
        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
651
 
        handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
652
 
        if self.debug:
653
 
            handler.setLevel(logging.DEBUG)
654
 
        logger.addHandler(handler)
655
 
        return logger, out
656
 
 
657
 
    def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
 
655
                              extra_css=css).render(doctype='xhtml')
 
656
 
 
657
 
 
658
    def _fetch_article(self, url, dir, f, a, num_of_feeds):
658
659
        self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
659
 
        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
 
660
        fetcher = RecursiveFetcher(self.web2disk_options, self.log,
 
661
                self.image_map, self.css_map,
 
662
                (url, f, a, num_of_feeds))
660
663
        fetcher.base_dir = dir
661
664
        fetcher.current_dir = dir
662
665
        fetcher.show_progress = False
668
671
            raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
669
672
        return res, path, failures
670
673
 
671
 
    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
672
 
        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
 
674
    def fetch_article(self, url, dir, f, a, num_of_feeds):
 
675
        return self._fetch_article(url, dir, f, a, num_of_feeds)
673
676
 
674
 
    def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds):
675
 
        path = os.path.abspath(self.get_obfuscated_article(url, logger))
 
677
    def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
 
678
        path = os.path.abspath(self.get_obfuscated_article(url))
676
679
        url = ('file:'+path) if iswindows else ('file://'+path)
677
 
        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
 
680
        return self._fetch_article(url, dir, f, a, num_of_feeds)
678
681
 
679
 
    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
 
682
    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
680
683
        templ = templates.EmbeddedContent()
681
684
        raw = templ.generate(article).render('html')
682
685
        with PersistentTemporaryFile('_feeds2disk.html') as pt:
683
686
            pt.write(raw)
684
687
            url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
685
 
        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
 
688
        return self._fetch_article(url, dir,  f, a, num_of_feeds)
686
689
 
687
690
 
688
691
    def build_index(self):
723
726
                art_dir = os.path.join(feed_dir, 'article_%d'%a)
724
727
                if not os.path.isdir(art_dir):
725
728
                    os.makedirs(art_dir)
726
 
                logger, stream = self.create_logger(f, a)
727
729
                try:
728
730
                    url = self.print_version(article.url)
729
731
                except NotImplementedError:
733
735
                func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
734
736
                            ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
735
737
                              else self.fetch_article), url)
736
 
                req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
 
738
                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
737
739
                                      {}, (f, a), self.article_downloaded,
738
740
                                      self.error_in_article_download)
739
 
                req.stream = stream
740
741
                req.feed = feed
741
742
                req.article = article
742
743
                req.feed_dir = feed_dir
775
776
            cu = self.get_cover_url()
776
777
        except Exception, err:
777
778
            cu = None
778
 
            self.log_error(_('Could not download cover: %s')%str(err))
779
 
            self.log_debug(traceback.format_exc())
 
779
            self.log.error(_('Could not download cover: %s')%str(err))
 
780
            self.log.debug(traceback.format_exc())
780
781
        if cu is not None:
781
782
            ext = cu.rpartition('.')[-1]
782
783
            if '?' in ext:
848
849
            f.write(html.encode('utf-8'))
849
850
        renderer = render_html(hf)
850
851
        if renderer.tb is not None:
851
 
            self.logger.warning('Failed to render default cover')
852
 
            self.logger.debug(renderer.tb)
 
852
            self.log.warning('Failed to render default cover')
 
853
            self.log.debug(renderer.tb)
853
854
        else:
854
855
            cover_file.write(renderer.data)
855
856
            cover_file.flush()
861
862
        mi = MetaInformation(self.title + strftime(self.timefmt), [__appname__])
862
863
        mi.publisher = __appname__
863
864
        mi.author_sort = __appname__
 
865
        mi.publication_type = 'periodical:'+self.publication_type
864
866
        opf_path = os.path.join(dir, 'index.opf')
865
867
        ncx_path = os.path.join(dir, 'index.ncx')
866
868
        opf = OPFCreator(dir, mi)
870
872
        manifest.append(os.path.join(dir, 'index.ncx'))
871
873
        cpath = getattr(self, 'cover_path', None)
872
874
        if cpath is None:
873
 
            pf = PersistentTemporaryFile('_recipe_cover.jpg')
 
875
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
874
876
            self.default_cover(pf)
875
877
            cpath =  pf.name
876
878
        if cpath is not None and os.access(cpath, os.R_OK):
891
893
            for j, a in enumerate(f):
892
894
                if getattr(a, 'downloaded', False):
893
895
                    adir = 'feed_%d/article_%d/'%(num, j)
 
896
                    auth = a.author
 
897
                    if not auth:
 
898
                        auth = None
 
899
                    desc = a.text_summary
 
900
                    if not desc:
 
901
                        desc = None
894
902
                    entries.append('%sindex.html'%adir)
895
903
                    po = self.play_order_map.get(entries[-1], None)
896
904
                    if po is None:
897
905
                        self.play_order_counter += 1
898
906
                        po = self.play_order_counter
899
907
                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
900
 
                                    play_order=po)
 
908
                                    play_order=po, author=auth, description=desc)
901
909
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
902
910
                    for sp in a.sub_pages:
903
911
                        prefix = os.path.commonprefix([opf_path, sp])
912
920
                        body = soup.find('body')
913
921
                        if body is not None:
914
922
                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
915
 
                            templ = self.navbar.generate(True, num, j, len(f),
916
 
                                             not self.has_single_feed,
917
 
                                             a.orig_url, __appname__, prefix=prefix,
918
 
                                             center=self.center_navbar)
919
 
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
920
 
                            body.insert(len(body.contents), elem)
 
923
                            if self.include_navbars:
 
924
                                templ = self.navbar.generate(True, num, j, len(f),
 
925
                                                not self.has_single_feed,
 
926
                                                a.orig_url, __appname__, prefix=prefix,
 
927
                                                center=self.center_navbar)
 
928
                                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
 
929
                                body.insert(len(body.contents), elem)
921
930
                            with open(last, 'wb') as fi:
922
931
                                fi.write(unicode(soup).encode('utf-8'))
923
932
 
928
937
                if po is None:
929
938
                    self.play_order_counter += 1
930
939
                    po = self.play_order_counter
931
 
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title, play_order=po))
 
940
                auth = getattr(f, 'author', None)
 
941
                if not auth:
 
942
                    auth = None
 
943
                desc = getattr(f, 'description', None)
 
944
                if not desc:
 
945
                    desc = None
 
946
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
 
947
                    f.title, play_order=po, description=desc, author=auth))
 
948
 
932
949
        else:
933
950
            entries.append('feed_%d/index.html'%0)
934
951
            feed_index(0, toc)
951
968
        a = request.requestID[1]
952
969
 
953
970
        article = request.article
954
 
        self.log_debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
 
971
        self.log.debug('Downloaded article:', article.title, 'from', article.url)
955
972
        article.orig_url = article.url
956
973
        article.url = 'article_%d/index.html'%a
957
974
        article.downloaded = True
958
975
        article.sub_pages  = result[1][1:]
959
976
        self.jobs_done += 1
960
 
        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
 
977
        self.report_progress(float(self.jobs_done)/len(self.jobs),
 
978
            _(u'Article downloaded: %s')%repr(article.title))
961
979
        if result[2]:
962
980
            self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))
963
981
 
964
982
    def error_in_article_download(self, request, traceback):
965
983
        self.jobs_done += 1
966
 
        self.log_error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
967
 
        debug = request.stream.getvalue().decode('utf-8', 'ignore')
968
 
        self.log_debug(debug)
969
 
        self.log_debug(traceback)
970
 
        self.log_debug('\n')
971
 
        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
972
 
        self.failed_downloads.append((request.feed, request.article, debug))
 
984
        self.log.error('Failed to download article:', request.article.title,
 
985
        'from', request.article.url)
 
986
        self.log.debug(traceback)
 
987
        self.log.debug('\n')
 
988
        self.report_progress(float(self.jobs_done)/len(self.jobs),
 
989
                _('Article download failed: %s')%repr(request.article.title))
 
990
        self.failed_downloads.append((request.feed, request.article, traceback))
973
991
 
974
992
    def parse_feeds(self):
975
993
        '''
997
1015
                feed.populate_from_preparsed_feed(msg, [])
998
1016
                feed.description = unicode(err)
999
1017
                parsed_feeds.append(feed)
1000
 
                self.log_exception(msg)
 
1018
                self.log.exception(msg)
1001
1019
 
1002
1020
 
1003
1021
        remove = [f for f in parsed_feeds if len(f) == 0 and
1091
1109
        index = os.path.abspath(self.custom_index())
1092
1110
        url = 'file:'+index if iswindows else 'file://'+index
1093
1111
        self.web2disk_options.browser = self.browser
1094
 
        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
 
1112
        fetcher = RecursiveFetcher(self.web2disk_options, self.log)
1095
1113
        fetcher.base_dir = self.output_dir
1096
1114
        fetcher.current_dir = self.output_dir
1097
1115
        fetcher.show_progress = False
1103
1121
 
1104
1122
    keep_only_tags = [dict(name=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
1105
1123
 
1106
 
    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
 
1124
    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
1107
1125
        if self.use_embedded_content:
1108
1126
            self.web2disk_options.keep_only_tags = []
1109
 
        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds)
 
1127
        return BasicNewsRecipe.fetch_embedded_article(self, article, dir, f, a, num_of_feeds)