4
from collections import OrderedDict
6
from calibre.web.feeds.news import BasicNewsRecipe
8
class StrangeHorizons(BasicNewsRecipe):
    """Calibre news recipe for the magazine Strange Horizons.

    The table of contents is scraped from the page at ``INDEX``: the first
    ``h2`` names the issue, and each following ``h3`` (up to the next ``h2``)
    describes one article or the issue's group of reviews.
    """

    # Any issue archive page is an acceptable index as well.
    # However, reviews will not be included in older issues.
    # (Using the reviews archive instead of the recent reviews page would fix.)
    INDEX = 'http://www.strangehorizons.com/'
    # Recent Reviews page; reviews are looked up here because the index
    # does not link to them directly.
    REVIEWS_URL = 'http://www.strangehorizons.com/reviews/'
    title = 'Strange Horizons'
    description = 'A magazine of speculative fiction and related nonfiction. Best downloaded on weekends'
    masthead_url = 'http://strangehorizons.com/images/sh_head.gif'
    publication_type = 'magazine'
    __author__ = 'Jim DeVona'

    # Keep only the main content area and drop navigation cruft.
    keep_only_tags = [dict(name='div', id='content')]
    remove_tags = [
        dict(name='p', attrs={'class': 'forum-links'}),
        dict(name='p', attrs={'class': 'top-link'}),
    ]
    remove_tags_after = [dict(name='p', attrs={'class': 'author-bio'})]

    # Stylesheet applied to downloaded articles (kept as a single literal so
    # the runtime value is unchanged).
    extra_css = '''div.image-left { margin: 0.5em auto 1em auto; } div.image-right { margin: 0.5em auto 1em auto; } div.illustration { margin: 0.5em auto 1em auto; text-align: center; } p.image-caption { margin-top: 0.25em; margin-bottom: 1em; font-size: 75%; text-align: center; } h1 { font-size: 160%; } h2 { font-size: 110%; } h3 { font-size: 85%; } h4 { font-size: 80%; } p { font-size: 90%; margin: 1em 1em 1em 15px; } p.author-bio { font-size: 75%; font-style: italic; margin: 1em 1em 1em 15px; } p.author-bio i, p.author-bio cite, p.author-bio .foreign { font-style: normal; } p.author-copyright { font-size: 75%; text-align: center; margin: 3em 1em 1em 15px; } p.content-date { font-weight: bold; } p.dedication { font-style: italic; } div.stanza { margin-bottom: 1em; } div.stanza p { margin: 0px 1em 0px 15px; font-size: 90%; } p.verse-line { margin-bottom: 0px; margin-top: 0px; } p.verse-line-indent-1 { margin-bottom: 0px; margin-top: 0px; text-indent: 2em; } p.verse-line-indent-2 { margin-bottom: 0px; margin-top: 0px; text-indent: 4em; } p.verse-stanza-break { margin-bottom: 0px; margin-top: 0px; } .foreign { font-style: italic; } .thought { font-style: italic; } .thought cite { font-style: normal; } .thought em { font-style: normal; } blockquote { font-size: 90%; font-style: italic; } blockquote cite { font-style: normal; } blockquote em { font-style: normal; } blockquote .foreign { font-style: normal; } blockquote .thought { font-style: normal; } .speaker { font-weight: bold; } pre { margin-left: 15px; } div.screenplay { font-family: monospace; } blockquote.screenplay-dialogue { font-style: normal; font-size: 100%; } .screenplay p.dialogue-first { margin-top: 0; } .screenplay p.speaker { margin-bottom: 0; text-align: center; font-weight: normal; } blockquote.typed-letter { font-style: normal; font-size: 100%; font-family: monospace; } .no-italics { font-style: normal; }'''

    def parse_index(self):
        """Build the article list for the issue shown on ``INDEX``.

        Side effect: appends the issue heading text to ``self.title``.

        Returns:
            A list of ``(section_title, articles)`` tuples in the order the
            sections were first seen, followed by a fixed "About" section.
            Each article is a dict with 'title', 'author', 'url' and
            'description' keys (scraped articles also carry 'date').

        Raises:
            ValueError: if the index page has no ``h2`` issue heading.
        """
        # Imported here so the method works on both Python 3 and Python 2
        # without depending on a module-level ``import urlparse``.
        try:
            from urllib.parse import urljoin
        except ImportError:  # Python 2
            from urlparse import urljoin

        sections = OrderedDict()
        strange_soup = self.index_to_soup(self.INDEX)

        # Find the heading that marks the start of this issue.
        issue_heading = strange_soup.find('h2')
        if issue_heading is None:
            raise ValueError('No issue heading (h2) found at ' + self.INDEX)
        issue_date = self.tag_to_string(issue_heading)
        self.title = self.title + " - " + issue_date

        # Examine subsequent headings for information about this issue.
        heading_tag = issue_heading.findNextSibling(['h2', 'h3'])
        while heading_tag is not None:

            # An h2 indicates the start of the next issue.
            if heading_tag.name == 'h2':
                break

            # The heading begins with a word indicating the article category.
            section = self.tag_to_string(heading_tag).split(':', 1)[0].title()

            # Reviews aren't linked from the index, so we need to look them up
            # separately. Currently using Recent Reviews page. The reviews
            # archive page lists all reviews, but is >500k.
            if section == 'Review':

                # Get the list of recent reviews.
                review_soup = self.index_to_soup(self.REVIEWS_URL)
                review_titles = review_soup.findAll('p', attrs={'class': 'contents-title'})

                # Get the list of reviews included in this issue. (Kludgey.)
                # Each <br> is turned into a '----' marker so the paragraph
                # text can be split into one line per review.
                reviews_summary = heading_tag.findNextSibling('p', attrs={'class': 'contents-pullquote'})
                if reviews_summary is None:
                    self.log("No review summary found for heading:", self.tag_to_string(heading_tag))
                    review_lines = []
                else:
                    for br in reviews_summary.findAll('br'):
                        br.replaceWith('----')
                    review_summary_text = self.tag_to_string(reviews_summary)
                    review_lines = review_summary_text.split(' ----')

                # Look for each of the needed reviews (there are 3, right?)...
                for review_info in review_lines[0:3]:

                    # Get the review's release day (unused), title, and author.
                    try:
                        _day, tna = review_info.split(': ', 1)
                        article_title, article_author = tna.split(', reviewed by ')
                    except ValueError:
                        # Line is not "<day>: <title>, reviewed by <author>".
                        self.log("Could not parse review summary line:", review_info)
                        continue

                    # ... in the list of recent reviews.
                    for review_title_tag in review_titles:
                        review_title = self.tag_to_string(review_title_tag)
                        if review_title != article_title:
                            continue

                        # Extract review information from heading and surrounding text.
                        article_summary = self.tag_to_string(review_title_tag.findNextSibling('p', attrs={'class': 'contents-pullquote'}))
                        review_date = self.tag_to_string(review_title_tag.findNextSibling('p', attrs={'class': 'contents-date'}))
                        # Resolve against the reviews page; absolute hrefs
                        # pass through urljoin unchanged.
                        article_url = urljoin(self.REVIEWS_URL, review_title_tag.find('a')['href'])

                        # Add this review to the Review section.
                        sections.setdefault(section, []).append({
                            'title': article_title,
                            'author': article_author,
                            'url': article_url,
                            'description': article_summary,
                            'date': review_date,
                        })
                        break

                    else:
                        # No matching title on the Recent Reviews page.
                        # Try http://www.strangehorizons.com/reviews/archives.shtml
                        self.log("Review not found in Recent Reviews:", article_title)

            else:

                # Extract article information from the heading and surrounding text.
                link = heading_tag.find('a')
                article_title = self.tag_to_string(link)
                article_url = urljoin(self.INDEX, link['href'])
                # The text node right after the link holds ", by <author>".
                article_author = link.nextSibling.replace(', by ', '')
                article_summary = self.tag_to_string(heading_tag.findNextSibling('p', attrs={'class': 'contents-pullquote'}))

                # Add article to the appropriate collection of sections.
                sections.setdefault(section, []).append({
                    'title': article_title,
                    'author': article_author,
                    'url': article_url,
                    'description': article_summary,
                    'date': issue_date,
                })

            heading_tag = heading_tag.findNextSibling(['h2', 'h3'])

        # Manually insert standard info about the magazine.
        sections['About'] = [{
            'title': 'Strange Horizons',
            'author': 'Niall Harrison, Editor-in-Chief',
            'url': 'http://www.strangehorizons.com/AboutUs.shtml',
            'description': 'Strange Horizons is a magazine of and about speculative fiction and related nonfiction. Speculative fiction includes science fiction, fantasy, horror, slipstream, and all other flavors of fantastika. Work published in Strange Horizons has been shortlisted for or won Hugo, Nebula, Rhysling, Theodore Sturgeon, James Tiptree Jr., and World Fantasy Awards.',
        }]

        # Materialize the view so the result is a real list on Python 3 too.
        return list(sections.items())