# Recipe option; presumably calibre's `use_embedded_content` setting --
# TODO confirm against the enclosing recipe class, which is not visible here.
use_embedded_content = False

# Site root. parse_index() fetches this page and also prepends it to the
# relative hrefs it scrapes in order to build absolute URLs.
INDEX = 'http://sportsillustrated.cnn.com/'

# Vault cover index page; parse_index() scans it for the "latest" cover
# thumbnails (div ids starting with "ecomthumb_latest_").
INDEX2 = 'http://sportsillustrated.cnn.com/vault/cover/home/index.htm'

def parse_index(self):
# Collects the articles of a Sports Illustrated issue: each headline in the
# issue's article list becomes a dict with 'title', 'date', 'url' and
# 'description' keys, and the result is appended to `answer` as a
# [feedTitle, articles] pair.
#
# NOTE(review): the visible text looks like residue of a diff between two
# revisions of this method -- statements from both revisions are interleaved,
# indentation is gone, and bare integer lines (apparently diff line numbers)
# sit between statements. `parent`, `articles` and `answer` are used below but
# never assigned in the visible lines, and no `return` is visible. Confirm
# against version control before relying on this text.
22
soup = self.index_to_soup(self.INDEX)
23
# Find the link to the current issue on the front page. SI Cover
24
cover = soup.find('img', attrs = {'alt' : 'Read All Articles', 'style' : 'vertical-align:bottom;'})
26
currentIssue = cover.parent['href']
28
# Open the index of current issue
30
index = self.index_to_soup(currentIssue)
31
self.log('\tLooking for current issue in: ' + currentIssue)
32
# Now let us see if they updated their frontpage
33
nav = index.find('div', attrs = {'class': 'siv_trav_top'})
35
img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_next_v2.jpg'})
38
# NOTE(review): `parent` is not assigned in the visible lines; presumably it is
# the parent tag of `img` -- confirm.
if parent.name == 'a':
39
# They didn't update their frontpage; Load the next issue from here
40
href = self.INDEX + parent['href']
41
index = self.index_to_soup(href)
42
self.log('\tLooking for current issue in: ' + href)
24
soup = self.index_to_soup(self.INDEX2)
26
#Loop through all of the "latest" covers until we find one that actually has articles
27
for item in soup.findAll('div', attrs={'id': re.compile("ecomthumb_latest_*")}):
28
# Extract the digits that follow "ecomthumb_latest_" in the tag's markup; they
# are used as the issue number in the table-of-contents URL below.
regex = re.compile('ecomthumb_latest_(\d*)')
29
result = regex.search(str(item))
30
current_issue_number = str(result.group(1))
31
current_issue_link = 'http://sportsillustrated.cnn.com/vault/cover/toc/' + current_issue_number + '/index.htm'
32
self.log('Checking this link for a TOC: ', current_issue_link)
34
index = self.index_to_soup(current_issue_link)
44
36
# A 'siv_noArticleMessage' div on the issue page means the issue has no
# articles listed (see the log messages and comment below).
if index.find('div', 'siv_noArticleMessage'):
45
nav = index.find('div', attrs = {'class': 'siv_trav_top'})
47
# Their frontpage points to an issue without any articles; Use the previous issue
48
img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_previous_v2.jpg'})
51
if parent.name == 'a':
52
href = self.INDEX + parent['href']
53
index = self.index_to_soup(href)
54
self.log('\tLooking for current issue in: ' + href)
58
# NOTE(review): the name `list` shadows the Python builtin within this method.
list = index.find('div', attrs = {'class' : 'siv_artList'})
61
# Get all the articles ready for calibre.
62
for headline in list.findAll('div', attrs = {'class' : 'headline'}):
63
# Title is the headline link text plus, on a second line, the text of the
# following 'info' div.
title = self.tag_to_string(headline.a) + '\n' + self.tag_to_string(headline.findNextSibling('div', attrs = {'class' : 'info'}))
64
url = self.INDEX + headline.a['href']
65
description = self.tag_to_string(headline.findNextSibling('a').div)
66
article = {'title' : title, 'date' : u'', 'url' : url, 'description' : description}
68
articles.append(article)
70
# See if we can find a meaningful title
71
feedTitle = 'Current Issue'
72
hasTitle = index.find('div', attrs = {'class' : 'siv_imageText_head'})
74
feedTitle = self.tag_to_string(hasTitle.h1)
76
answer.append([feedTitle, articles])
37
self.log('No TOC for this one. Skipping...')
39
self.log('Found a TOC... Using this link')
43
list = index.find('div', attrs = {'class' : 'siv_artList'})
45
self.log ('found siv_artList')
47
# Get all the articles ready for calibre.
49
for headline in list.findAll('div', attrs = {'class' : 'headline'}):
51
title = self.tag_to_string(headline.a) + '\n' + self.tag_to_string(headline.findNextSibling('div', attrs = {'class' : 'info'}))
52
url = self.INDEX + headline.a['href']
53
description = self.tag_to_string(headline.findNextSibling('a').div)
54
article = {'title' : title, 'date' : u'', 'url' : url, 'description' : description}
55
articles.append(article)
59
# See if we can find a meaningful title
60
# 'Current Issue' is the fallback title; it is overwritten with the <h1> text
# of the 'siv_imageText_head' div.
# NOTE(review): no guard for a missing div is visible here -- confirm whether
# an `if hasTitle:` line was lost.
feedTitle = 'Current Issue'
61
hasTitle = index.find('div', attrs = {'class' : 'siv_imageText_head'})
63
feedTitle = self.tag_to_string(hasTitle.h1)
65
answer.append([feedTitle, articles])