~ubuntu-branches/debian/experimental/calibre/experimental

« back to all changes in this revision

Viewing changes to recipes/strange_horizons.recipe

  • Committer: Package Import Robot
  • Author(s): Martin Pitt
  • Date: 2012-02-10 07:35:00 UTC
  • mfrom: (1.3.30)
  • Revision ID: package-import@ubuntu.com-20120210073500-9hx5hpketc9hb59i
Tags: 0.8.38+dfsg-1
* New upstream release.
* debian/control: Bump Standards-Version to 3.9.2. No changes necessary.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/env python
 
2
 
 
3
import urlparse
 
4
from collections import OrderedDict
 
5
 
 
6
from calibre.web.feeds.news import BasicNewsRecipe
 
7
 
 
8
class StrangeHorizons(BasicNewsRecipe):
    """Recipe that builds an e-book from the current issue of Strange Horizons.

    The issue's table of contents is scraped from ``INDEX``. Reviews are not
    linked from that page, so they are looked up by title on the Recent
    Reviews page (``REVIEWS_INDEX``).
    """

    # Recipe metadata
    # Any issue archive page is an acceptable index as well.
    # However, reviews will not be included in older issues.
    # (Using the reviews archive instead of the recent reviews page would fix.)
    INDEX = 'http://www.strangehorizons.com/'
    # Recent Reviews page. The reviews archive page lists all reviews,
    # but is >500k.
    REVIEWS_INDEX = 'http://www.strangehorizons.com/reviews/'
    # Number of review lines taken from an issue's review summary.
    MAX_REVIEWS_PER_ISSUE = 3
    title = 'Strange Horizons'
    description = 'A magazine of speculative fiction and related nonfiction. Best downloaded on weekends'
    masthead_url = 'http://strangehorizons.com/images/sh_head.gif'
    publication_type = 'magazine'
    language = 'en'
    __author__ = 'Jim DeVona'
    __version__ = '1.0'

    # Cruft filters
    keep_only_tags = [dict(name='div', id='content')]
    remove_tags = [
        dict(name='p', attrs={'class': 'forum-links'}),
        dict(name='p', attrs={'class': 'top-link'}),
    ]
    remove_tags_after = [dict(name='p', attrs={'class': 'author-bio'})]

    # Styles
    no_stylesheets = True
    extra_css = '''div.image-left { margin: 0.5em auto 1em auto; } div.image-right { margin: 0.5em auto 1em auto; } div.illustration { margin: 0.5em auto 1em auto; text-align: center; } p.image-caption { margin-top: 0.25em; margin-bottom: 1em; font-size: 75%; text-align: center; } h1 { font-size: 160%; } h2 { font-size: 110%; } h3 { font-size: 85%; } h4 { font-size: 80%; } p { font-size: 90%; margin: 1em 1em 1em 15px; } p.author-bio { font-size: 75%; font-style: italic; margin: 1em 1em 1em 15px; } p.author-bio i, p.author-bio cite, p.author-bio .foreign { font-style: normal; } p.author-copyright { font-size: 75%; text-align: center; margin: 3em 1em 1em 15px; } p.content-date { font-weight: bold; } p.dedication { font-style: italic; } div.stanza { margin-bottom: 1em; } div.stanza p { margin: 0px 1em 0px 15px; font-size: 90%; } p.verse-line { margin-bottom: 0px; margin-top: 0px; } p.verse-line-indent-1 { margin-bottom: 0px; margin-top: 0px; text-indent: 2em; } p.verse-line-indent-2 { margin-bottom: 0px; margin-top: 0px; text-indent: 4em; } p.verse-stanza-break { margin-bottom: 0px; margin-top: 0px; } .foreign { font-style: italic; } .thought { font-style: italic; } .thought cite { font-style: normal; } .thought em { font-style: normal; } blockquote { font-size: 90%; font-style: italic; } blockquote cite { font-style: normal; } blockquote em { font-style: normal; } blockquote .foreign { font-style: normal; } blockquote .thought { font-style: normal; } .speaker { font-weight: bold; } pre { margin-left: 15px; } div.screenplay { font-family: monospace; } blockquote.screenplay-dialogue { font-style: normal; font-size: 100%; } .screenplay p.dialogue-first { margin-top: 0; } .screenplay p.speaker { margin-bottom: 0; text-align: center; font-weight: normal; } blockquote.typed-letter { font-style: normal; font-size: 100%; font-family: monospace; } .no-italics { font-style: normal; }'''

    def parse_index(self):
        """Build the list of sections and articles for the current issue.

        Appends the issue date to ``self.title`` as a side effect.

        Returns:
            A list of ``(section_name, articles)`` pairs in the order the
            sections first appear on the index page, followed by a fixed
            'About' section. Each article is a dict with the keys 'title',
            'author', 'url', 'description' and 'date'.

        Raises:
            ValueError: if the index page contains no ``h2`` issue heading.
        """
        sections = OrderedDict()
        strange_soup = self.index_to_soup(self.INDEX)

        # Find the heading that marks the start of this issue.
        issue_heading = strange_soup.find('h2')
        if issue_heading is None:
            raise ValueError('No issue heading (h2) found at ' + self.INDEX)
        issue_date = self.tag_to_string(issue_heading)
        self.title = self.title + " - " + issue_date

        # Examine subsequent headings for information about this issue.
        # An h2 indicates the start of the next issue, so stop there.
        heading_tag = issue_heading.findNextSibling(['h2', 'h3'])
        while heading_tag is not None and heading_tag.name != 'h2':

            # The heading begins with a word indicating the article category.
            section = self.tag_to_string(heading_tag).split(':', 1)[0].title()

            if section == 'Review':
                articles = self._find_reviews(heading_tag)
            else:
                articles = self._parse_article(heading_tag, issue_date)

            # Only create a section once there is something to put in it.
            if articles:
                sections.setdefault(section, []).extend(articles)

            heading_tag = heading_tag.findNextSibling(['h2', 'h3'])

        # Manually insert standard info about the magazine.
        sections['About'] = [{
            'title': 'Strange Horizons',
            'author': 'Niall Harrison, Editor-in-Chief',
            'url': 'http://www.strangehorizons.com/AboutUs.shtml',
            'description': 'Strange Horizons is a magazine of and about speculative fiction and related nonfiction. Speculative fiction includes science fiction, fantasy, horror, slipstream, and all other flavors of fantastika. Work published in Strange Horizons has been shortlisted for or won Hugo, Nebula, Rhysling, Theodore Sturgeon, James Tiptree Jr., and World Fantasy Awards.',
            'date': '',
        }]

        # Materialize so the result is a real list on Python 3 as well.
        return list(sections.items())

    def _find_reviews(self, heading_tag):
        """Return article dicts for the reviews announced under *heading_tag*.

        The index only summarizes reviews as lines of the form
        ``<day>: <title>, reviewed by <author>`` separated by ``<br>`` tags.
        Each title is matched against the Recent Reviews page to obtain its
        link, summary and date. Lines that do not follow that form, and
        titles absent from Recent Reviews, are logged and skipped.
        """
        reviews_summary = heading_tag.findNextSibling(
            'p', attrs={'class': 'contents-pullquote'})
        if reviews_summary is None:
            self.log("Review summary not found after heading:",
                     self.tag_to_string(heading_tag))
            return []

        # Get the list of recent reviews.
        review_soup = self.index_to_soup(self.REVIEWS_INDEX)
        review_titles = review_soup.findAll('p', attrs={'class': 'contents-title'})

        # Get the list of reviews included in this issue. (Kludgey.)
        # Line breaks are turned into a text marker so that they survive the
        # conversion of the paragraph to a plain string.
        for br in reviews_summary.findAll('br'):
            br.replaceWith('----')
        review_lines = self.tag_to_string(reviews_summary).split(' ----')

        reviews = []
        for review_info in review_lines[:self.MAX_REVIEWS_PER_ISSUE]:

            # Get the review's release day (unused), title, and author.
            day_and_rest = review_info.split(': ', 1)
            if len(day_and_rest) != 2:
                self.log("Unrecognized review summary line:", review_info)
                continue
            # The author is the trailing component, so split from the right.
            title_and_author = day_and_rest[1].rsplit(', reviewed by ', 1)
            if len(title_and_author) != 2:
                self.log("Unrecognized review summary line:", review_info)
                continue
            article_title, article_author = title_and_author

            # Look for this review in the list of recent reviews.
            for review_title_tag in review_titles:
                if self.tag_to_string(review_title_tag) != article_title:
                    continue
                link = review_title_tag.find('a')
                if link is None or not link.get('href'):
                    continue

                # Extract review information from heading and surrounding text.
                reviews.append({
                    'title': article_title,
                    'author': article_author,
                    # Resolve against the reviews page in case the link is
                    # relative; absolute links pass through unchanged.
                    'url': urlparse.urljoin(self.REVIEWS_INDEX, link['href']),
                    'description': self.tag_to_string(
                        review_title_tag.findNextSibling(
                            'p', attrs={'class': 'contents-pullquote'})),
                    'date': self.tag_to_string(
                        review_title_tag.findNextSibling(
                            'p', attrs={'class': 'contents-date'})),
                })
                break
            else:
                # Try http://www.strangehorizons.com/reviews/archives.shtml
                self.log("Review not found in Recent Reviews:", article_title)

        return reviews

    def _parse_article(self, heading_tag, issue_date):
        """Return a one-element list describing the article in *heading_tag*.

        Returns an empty list (after logging) when the heading carries no
        usable link.
        """
        link = heading_tag.find('a')
        if link is None or not link.get('href'):
            self.log("No article link found in heading:",
                     self.tag_to_string(heading_tag))
            return []

        # The byline is the text node right after the link (", by <author>").
        # Anything else (no sibling, or an element) yields an empty author.
        byline = link.nextSibling
        if isinstance(byline, type(u'')):
            article_author = byline.replace(', by ', '')
        else:
            article_author = ''

        return [{
            'title': self.tag_to_string(link),
            'author': article_author,
            'url': urlparse.urljoin(self.INDEX, link['href']),
            'description': self.tag_to_string(
                heading_tag.findNextSibling(
                    'p', attrs={'class': 'contents-pullquote'})),
            'date': issue_date,
        }]
 
133