11
11
from calibre import strftime
13
12
from calibre.web.feeds.news import BasicNewsRecipe
13
from calibre.ebooks.BeautifulSoup import Tag
15
15
class Harpers_full(BasicNewsRecipe):
16
16
title = u"Harper's Magazine - articles from printed edition"
17
17
__author__ = u'Darko Miletic'
18
18
description = u"Harper's Magazine: Founded June 1850."
19
19
publisher = "Harpers's"
20
category = 'news, politics, USA'
20
category = 'news, politics, USA'
21
21
oldest_article = 30
22
22
max_articles_per_feed = 100
23
23
no_stylesheets = True
24
24
use_embedded_content = False
25
simultaneous_downloads = 1
27
26
language = _('English')
28
27
needs_subscription = True
29
28
INDEX = strftime('http://www.harpers.org/archive/%Y/%m')
30
29
LOGIN = 'http://www.harpers.org'
31
30
cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')
32
remove_javascript = True
34
32
html2lrf_options = [
35
33
'--comment', description
36
34
, '--category', category
37
35
, '--publisher', publisher
40
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
38
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
42
40
keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
44
dict(name='table', attrs={'class':'rcnt'})
45
,dict(name='table', attrs={'class':'rcnt topline'})
42
dict(name='table', attrs={'class':['rcnt','rcnt topline']})
48
46
def get_browser(self):
54
52
br['password'] = self.password
58
56
def parse_index(self):
60
58
print 'Processing ' + self.INDEX
61
59
soup = self.index_to_soup(self.INDEX)
62
60
for item in soup.findAll('div', attrs={'class':'title'}):
63
text_link = item.parent.find('img',attrs={'alt':'Text'})
61
text_link = item.parent.find('img',attrs={'alt':'Text'})
65
63
url = self.LOGIN + item.a['href']
66
64
title = item.a.contents[0]
74
72
return [(soup.head.title.string, articles)]
74
def preprocess_html(self, soup):
75
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
76
soup.head.insert(1,mcharset)
77
for item in soup.findAll(style=True):
79
for item in soup.findAll(xmlns=True):