2
## web2lrf profile to download articles from Barrons.com
3
## can download subscriber-only content if username and
4
## password are supplied.
11
1
from calibre.web.feeds.news import BasicNewsRecipe
13
3
class Barrons(BasicNewsRecipe):
16
max_articles_per_feed = 50
17
needs_subscription = True
20
__author__ = 'Kovid Goyal and Sujata Raman'
21
description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
22
timefmt = ' [%a, %b %d, %Y]'
23
use_embedded_content = False
25
# URL patterns for links to follow: Barrons article pages carrying a
# "?mod=" query string, or local "file:" URLs.
# Raw string so that "\?" reaches the regex engine as an escaped literal
# "?" without relying on Python's handling of an unknown string escape.
match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
26
conversion_options = {'linearize_tables': True}
29
## Don't grab articles more than 7 days old
31
use_javascript_to_login = True
32
requires_version = (0, 9, 16)
35
.datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
36
h3{font-family:Georgia,"Times New Roman",Times,serif; }
37
h2{font-family:Georgia,"Times New Roman",Times,serif; }
38
h1{ font-family:Georgia,"Times New Roman",Times,serif; }
39
.byline{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
40
.subhead{font-family:Georgia,"Times New Roman",Times,serif; font-size: small;}
41
.articlePage{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
42
.insettipUnit{font-size: x-small;}
45
dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
46
dict(name = 'a', attrs ={'class':'insetClose'})
49
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
51
## Remove anything before the body of the article.
52
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
54
## Remove any insets from the body of the article.
55
(r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
57
## Remove any reprint info from the body of the article.
58
(r'<hr size.*?<p', lambda match : '<p'),
60
## Remove anything after the end of the article.
61
(r'<!-- article end.*?</body>', lambda match : '</body>'),
6
max_articles_per_feed = 50
7
needs_subscription = True
10
__author__ = 'Kovid Goyal'
11
description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
12
timefmt = ' [%a, %b %d, %Y]'
13
use_embedded_content = False
15
# URL patterns for links to follow: Barrons article pages carrying a
# "?mod=" query string, or local "file:" URLs.
# Raw string so that "\?" reaches the regex engine as an escaped literal
# "?" without relying on Python's handling of an unknown string escape.
match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
16
conversion_options = {'linearize_tables': True}
19
# Don't grab articles more than 7 days old
21
use_javascript_to_login = True
22
requires_version = (0, 9, 16)
24
keep_only_tags = [dict(attrs={'class':lambda x: x and (x.startswith('sector one column') or x.startswith('sector two column'))})]
26
dict(name='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
27
dict(attrs={'class':['insetButton', 'insettipBox', 'insetClose']}),
28
dict(attrs={'data-module-name':['resp.module.trendingNow.BarronsDesktop', 'resp.module.share_tools.ShareTools']}),
29
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
32
def javascript_login(self, br, username, password):
    """Log in to Barrons through the supplied browser object.

    Visits the login page, fills the first form on it with the given
    credentials and submits it.

    :param br: browser object offering ``visit``, ``select_form`` and
        ``submit``.
    :param username: value stored in the form's ``username`` field.
    :param password: value stored in the form's ``password`` field.
    """
    br.visit('http://commerce.barrons.com/auth/login')
    # The login form is taken to be the first form on the page.
    login_form = br.select_form(nr=0)
    credentials = (('username', username), ('password', password))
    for field_name, field_value in credentials:
        login_form[field_name] = field_value
    # timeout=120 is passed straight through (presumably seconds).
    br.submit(timeout=120)
39
# Use the print version of a page when available.
40
def print_version(self, url):
    """Return the print-view URL for an article URL.

    Everything from the last ``?`` onwards is dropped and the
    ``#text.print`` fragment is appended.

    :param url: article URL, with or without a query string.
    :return: the URL without its query string, suffixed with
        ``#text.print``.
    """
    main, sep, rest = url.rpartition('?')
    if not sep:
        # No '?' present: rpartition leaves the whole URL in the last
        # slot and an empty head, so use the full URL instead of ''.
        main = rest
    return main + '#text.print'
44
def preprocess_html(self, soup):
45
# Remove thumbnail for zoomable images
46
for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
53
# Comment out the feeds you don't want retrieved.
54
# Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
58
('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
59
('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
60
('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
61
('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
62
('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
63
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
65
def javascript_login(self, br, username, password):
    """Log in to Barrons through the supplied browser object.

    Visits the login page, fills the first form on it with the given
    credentials and submits it.

    :param br: browser object offering ``visit``, ``select_form`` and
        ``submit``.
    :param username: value stored in the form's ``username`` field.
    :param password: value stored in the form's ``password`` field.
    """
    br.visit('http://commerce.barrons.com/auth/login')
    # The login form is taken to be the first form on the page.
    login_form = br.select_form(nr=0)
    credentials = (('username', username), ('password', password))
    for field_name, field_value in credentials:
        login_form[field_name] = field_value
    # timeout=120 is passed straight through (presumably seconds).
    br.submit(timeout=120)
72
## Use the print version of a page when available.
73
def print_version(self, url):
    """Return the print-view URL for an article URL.

    Everything from the last ``?`` onwards is dropped and the
    ``#text.print`` fragment is appended.

    :param url: article URL, with or without a query string.
    :return: the URL without its query string, suffixed with
        ``#text.print``.
    """
    main, sep, rest = url.rpartition('?')
    if not sep:
        # No '?' present: rpartition leaves the whole URL in the last
        # slot and an empty head, so use the full URL instead of ''.
        main = rest
    return main + '#text.print'
77
def postprocess_html(self, soup, first):
79
for tag in soup.findAll(name=['ul', 'li']):
81
for tag in soup.findAll(name ='div', attrs={'id': "articleThumbnail_1"}):
86
## Comment out the feeds you don't want retrieved.
87
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
91
('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
92
('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
93
('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
94
('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
95
('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
96
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
99
def get_article_url(self, article):
    """Return the URL stored under the article's ``link`` key.

    :param article: mapping-like feed entry supporting ``get``.
    :return: the ``link`` value, or ``None`` when the entry has none.
    """
    link = article.get('link')
    return link
103
def get_cover_url(self):
105
index = 'http://online.barrons.com/home-page'
106
soup = self.index_to_soup(index)
107
link_item = soup.find('ul',attrs={'class':'newsItem barronsMag'})
109
cover_url = link_item.img['src']
114
## NOT CURRENTLY WORKING
117
# self.browser.set_debug_responses(True)
118
# import sys, logging
119
# logger = logging.getLogger("mechanize")
120
# logger.addHandler(logging.StreamHandler(sys.stdout))
121
# logger.setLevel(logging.INFO)
123
# res = self.browser.open('http://online.barrons.com/logout')
126
# traceback.print_exc()
66
def get_article_url(self, article):
    """Return the URL stored under the article's ``link`` key.

    :param article: mapping-like feed entry supporting ``get``.
    :return: the ``link`` value, or ``None`` when the entry has none.
    """
    link = article.get('link')
    return link
69
def get_cover_url(self):
71
index = 'http://online.barrons.com/home-page'
72
soup = self.index_to_soup(index)
73
link_item = soup.find('ul',attrs={'class':'newsItem barronsMag'})
75
cover_url = link_item.img['src']