~ubuntu-branches/debian/sid/calibre/sid

Viewing changes to recipes/barrons.recipe

  • Committer: Package Import Robot
  • Author(s): Martin Pitt
  • Date: 2014-05-14 18:17:50 UTC
  • mfrom: (1.5.10)
  • Revision ID: package-import@ubuntu.com-20140514181750-xyrxqa47dbw0qfhu
Tags: 1.36.0+dfsg-1
* New upstream release:
  - Fixes editing of metadata (Closes: #741638)

Diff (removed lines are prefixed with '-', added lines with '+', unchanged context lines with a space):
-##
-##    web2lrf profile to download articles from Barrons.com
-##    can download subscriber-only content if username and
-##    password are supplied.
-##
-'''
-'''
-
-import re
-
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Barrons(BasicNewsRecipe):
 
-        title = 'Barron\'s'
-        max_articles_per_feed = 50
-        needs_subscription    = True
-        language = 'en'
-
-        __author__ = 'Kovid Goyal and Sujata Raman'
-        description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
-        timefmt  = ' [%a, %b %d, %Y]'
-        use_embedded_content   = False
-        no_stylesheets = True
-        match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
-        conversion_options = {'linearize_tables': True}
-        ##delay = 1
-
-        ## Don't grab articles more than 7 days old
-        oldest_article = 7
-        use_javascript_to_login = True
-        requires_version = (0, 9, 16)
-
-        extra_css = '''
-                    .datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
-                    h3{font-family:Georgia,"Times New Roman",Times,serif; }
-                    h2{font-family:Georgia,"Times New Roman",Times,serif; }
-                    h1{ font-family:Georgia,"Times New Roman",Times,serif; }
-                    .byline{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
-                    .subhead{font-family:Georgia,"Times New Roman",Times,serif; font-size: small;}
-                    .articlePage{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
-                    .insettipUnit{font-size: x-small;}
-                    '''
-        remove_tags = [
-                           dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
-                           dict(name = 'a', attrs ={'class':'insetClose'})
-                        ]
-
-        preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                [
-                ## Remove anything before the body of the article.
-                (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
-
-                ## Remove any insets from the body of the article.
-                (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
-
-                ## Remove any reprint info from the body of the article.
-                (r'<hr size.*?<p', lambda match : '<p'),
-
-                ## Remove anything after the end of the article.
-                (r'<!-- article end.*?</body>', lambda match : '</body>'),
-                ]
+    title = 'Barron\'s'
+    max_articles_per_feed = 50
+    needs_subscription    = True
+    language = 'en'
+
+    __author__ = 'Kovid Goyal'
+    description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
+    timefmt  = ' [%a, %b %d, %Y]'
+    use_embedded_content   = False
+    no_stylesheets = True
+    match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
+    conversion_options = {'linearize_tables': True}
+    ##delay = 1
+
+    # Don't grab articles more than 7 days old
+    oldest_article = 7
+    use_javascript_to_login = True
+    requires_version = (0, 9, 16)
+
+    keep_only_tags = [dict(attrs={'class':lambda x: x and (x.startswith('sector one column') or x.startswith('sector two column'))})]
+    remove_tags = [
+        dict(name='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
+        dict(attrs={'class':['insetButton', 'insettipBox', 'insetClose']}),
+        dict(attrs={'data-module-name':['resp.module.trendingNow.BarronsDesktop', 'resp.module.share_tools.ShareTools']}),
+        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
+    ]
+
+    def javascript_login(self, br, username, password):
+        br.visit('http://commerce.barrons.com/auth/login')
+        f = br.select_form(nr=0)
+        f['username'] = username
+        f['password'] = password
+        br.submit(timeout=120)
+
+    # Use the print version of a page when available.
+    def print_version(self, url):
+        main, sep, rest = url.rpartition('?')
+        return main + '#text.print'
+
+    def preprocess_html(self, soup):
+        # Remove thumbnail for zoomable images
+        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
+            img = div.find('img')
+            if img is not None:
+                img.extract()
+
+        return soup
+
+# Comment out the feeds you don't want retrieved.
+# Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
+
+    def get_feeds(self):
+        return [
+        ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
+        ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
+        ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
+        ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
+        ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
+        ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
         ]
 
-        def javascript_login(self, br, username, password):
-            br.visit('http://commerce.barrons.com/auth/login')
-            f = br.select_form(nr=0)
-            f['username'] = username
-            f['password'] = password
-            br.submit(timeout=120)
-
-        ## Use the print version of a page when available.
-        def print_version(self, url):
-               main, sep, rest = url.rpartition('?')
-               return main + '#text.print'
-
-        def postprocess_html(self, soup, first):
-
-               for tag in soup.findAll(name=['ul', 'li']):
-                    tag.name = 'div'
-               for tag in soup.findAll(name ='div', attrs={'id': "articleThumbnail_1"}):
-                  tag.extract()
-
-               return soup
-
-## Comment out the feeds you don't want retrieved.
-## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
-
-        def get_feeds(self):
-                return  [
-                ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
-                ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
-                ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
-                ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
-                ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
-                ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
-                ]
-
-        def get_article_url(self, article):
-            return article.get('link', None)
-
-
-        def get_cover_url(self):
-            cover_url = None
-            index = 'http://online.barrons.com/home-page'
-            soup = self.index_to_soup(index)
-            link_item = soup.find('ul',attrs={'class':'newsItem barronsMag'})
-            if link_item:
-               cover_url = link_item.img['src']
-            return cover_url
-
-
-        ## Logout of website
-        ## NOT CURRENTLY WORKING
-        # def cleanup(self):
-            # try:
-                # self.browser.set_debug_responses(True)
-                # import sys, logging
-                # logger = logging.getLogger("mechanize")
-                # logger.addHandler(logging.StreamHandler(sys.stdout))
-                # logger.setLevel(logging.INFO)
-
-                # res = self.browser.open('http://online.barrons.com/logout')
-            # except:
-                # import traceback
-                # traceback.print_exc()
-
+    def get_article_url(self, article):
+        return article.get('link', None)
+
+    def get_cover_url(self):
+        cover_url = None
+        index = 'http://online.barrons.com/home-page'
+        soup = self.index_to_soup(index)
+        link_item = soup.find('ul',attrs={'class':'newsItem barronsMag'})
+        if link_item:
+            cover_url = link_item.img['src']
+        return cover_url
 
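
For readers not familiar with calibre recipes, the following is a minimal standalone sketch (plain Python, no calibre required) of the URL rewrite that the new print_version() above performs; the sample article URL is hypothetical and only mimics the '?mod=...' query strings the recipe's match_regexps expects.

# Standalone sketch of the print_version() rewrite shown in the diff above.
# The sample URL is hypothetical; it mimics the '?mod=...' links the recipe matches.
def print_version(url):
    # Drop the query string and append the fragment that selects the print layout.
    # (Like the recipe, this assumes the article URL actually contains a '?'.)
    main, sep, rest = url.rpartition('?')
    return main + '#text.print'

if __name__ == '__main__':
    sample = 'http://online.barrons.com/article/SB1234567890?mod=BOL_hps_highlight'  # hypothetical
    print(print_version(sample))
    # -> http://online.barrons.com/article/SB1234567890#text.print

To exercise the full recipe, the file can also be passed directly to calibre's ebook-convert tool, supplying the subscriber username and password on the command line; the exact options available depend on the installed calibre version.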