~stub/ubuntu/precise/calibre/devel

« back to all changes in this revision

Viewing changes to resources/recipes/roger_ebert_blog.recipe

  • Committer: Bazaar Package Importer
  • Author(s): Martin Pitt
  • Date: 2011-04-12 11:29:25 UTC
  • mfrom: (42.1.2 sid)
  • Revision ID: james.westby@ubuntu.com-20110412112925-c7171kt2bb5rmft4
Tags: 0.7.50+dfsg-2
* debian/control: Build with libpodofo-dev to enable PDF metadata.
  (Closes: #619632)
* debian/control: Add libboost1.42-dev build dependency. Apparently it is
  needed in some setups. (Closes: #619807)
* debian/rules: Call dh_sip to generate a proper sip API dependency, to
  prevent crashes like #616372 for partial upgrades.
* debian/control: Bump python-qt4 dependency to >= 4.8.3-2, which reportedly
  fixes crashes on startup. (Closes: #619701, #620125)

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
import re
 
2
import urllib2
 
3
import time
 
4
from calibre.web.feeds.news import BasicNewsRecipe
 
5
from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
 
6
from calibre import strftime
 
7
 
 
8
'''
 
9
      Help Needed:
 
10
       Still can't figure out why I'm getting strange characters.  Esp. the Great Movies descriptions in the TOC.
 
11
       Anyone help me figure that out?
 
12
 
 
13
      Change Log:
 
14
       2011-02-19:  Version 2:  Added "Oscars" section and fixed date problem
 
15
'''
 
16
 
 
17
class Ebert(BasicNewsRecipe):
    """Recipe that builds its feeds by scraping rogerebert.suntimes.com.

    The URLs in ``feeds`` are section listing pages, not RSS feeds.
    ``parse_index`` downloads each page and extracts the articles with the
    ``pattern*`` regular expressions defined below.
    """

    title                 = 'Roger Ebert'
    __author__            = 'Shane Erstad'
    version               = 2
    description           = 'Roger Ebert Movie Reviews'
    publisher             = 'Chicago Sun Times'
    category              = 'movies'
    oldest_article        = 8
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'UTF-8'
    masthead_url          = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
    language              = 'en'
    remove_empty_feeds    = False

    # Prepended to the site-relative hrefs found on the section pages.
    PREFIX                = 'http://rogerebert.suntimes.com'

    # Review-style pages: group 1 = movie title, group 2 = headline markup
    # (holds the article link), group 3 = description.
    patternReviews        = r'<span class="*?movietitle"*?>(.*?)</span>.*?<div class="*?headline"*?>(.*?)</div>(.*?)</div>'

    # Other section pages: group 1 = the article <a> element, group 2 = blurb.
    patternCommentary     = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?COMMENTARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
    patternPeople         = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?PEOPLE.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
    patternOscars         = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?OSCARS.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
    patternGlossary       = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?GLOSSARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'

    conversion_options = {
        'comment'          : description,
        'tags'             : category,
        'publisher'        : publisher,
        'language'         : language,
        'linearize_tables' : True,
    }

    # (feed title, section page URL); the titles select the regex used in
    # parse_index, so the two must be kept in sync.
    feeds = [
        (u'Reviews'      , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews'),
        (u'Commentary'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY'),
        (u'Great Movies' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08'),
        (u'People'       , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE'),
        (u'Oscars'       , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=OSCARS'),
        (u'Glossary'     , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY'),
    ]

    # Strip the "This is a printer friendly..." banner (through the following
    # <hr>) from the downloaded article pages.
    preprocess_regexps = [
        (re.compile(r'<font.*?>.*?This is a printer friendly.*?</font>.*?<hr>', re.DOTALL|re.IGNORECASE),
            lambda m: '')
    ]

    def print_version(self, url):
        """Return ``url`` with the ``template=printart`` query parameter appended."""
        return url + '&template=printart'

    def parse_index(self):
        """Scrape every section page and return a list of ``(feed title, articles)``.

        Each article is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``.  Feeds whose title has no associated pattern are
        logged and skipped.  Articles without a title or without a link
        target are skipped.
        """
        # Feed title -> regex that extracts the articles from that page.
        section_patterns = {
            'Reviews'      : self.patternReviews,
            'Great Movies' : self.patternReviews,
            'Commentary'   : self.patternCommentary,
            'People'       : self.patternPeople,
            'Glossary'     : self.patternGlossary,
            'Oscars'       : self.patternOscars,
        }
        # Feeds whose pattern captures the movie title as an extra first group.
        review_sections = ('Reviews', 'Great Movies')
        # The path segment right after "AID=/" is read as a YYYYMMDD date.
        # Compiled once here instead of once per link.
        date_regex = re.compile(r'AID=\/(.*?)\/', re.IGNORECASE|re.DOTALL)
        date_format = "%A, %B %d, %Y"

        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.log('\tFeedurl: ', feedurl)
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))

            pattern = section_patterns.get(feedtitle)
            if pattern is None:
                # Without this guard an unknown title would silently reuse the
                # previous feed's pattern (or fail with a NameError).
                self.log('\tNo pattern defined for feed, skipping: ', feedtitle)
                continue
            is_review = feedtitle in review_sections

            # Always close the HTTP response, even if reading fails.
            response = urllib2.urlopen(feedurl)
            try:
                page = response.read()
            finally:
                response.close()
            # NOTE(review): page is used as returned by read(), without
            # decoding; the "strange characters" mentioned in the file header
            # may come from here -- confirm the site's actual encoding.

            regex = re.compile(pattern, re.IGNORECASE|re.DOTALL)
            articles = []

            for match in regex.finditer(page):
                if is_review:
                    movietitle, thislink, description = match.group(1, 2, 3)
                else:
                    movietitle = None
                    thislink, description = match.group(1, 2)

                self.log(thislink)

                for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')):
                    try:
                        href = link['href']
                    except KeyError:
                        # An anchor without a target cannot become an article.
                        continue
                    thisurl = self.PREFIX + href

                    # Reviews carry their own title; elsewhere the link text
                    # is the title.
                    thistitle = movietitle if is_review else self.tag_to_string(link)
                    if not thistitle:
                        continue

                    parsed_date = None
                    date_match = date_regex.search(thisurl)
                    if date_match:
                        try:
                            parsed_date = time.strptime(date_match.group(1), "%Y%m%d")
                        except ValueError:
                            # The segment is not a date; use today's date
                            # rather than aborting the whole download.
                            parsed_date = None

                    if parsed_date is not None:
                        mydate = strftime(date_format, parsed_date)
                    else:
                        mydate = strftime(date_format)
                    self.log(mydate)

                    articles.append({
                        'title'       : thistitle,
                        'date'        : '  [' + mydate + ']',
                        'url'         : thisurl,
                        'description' : description,
                    })

            totalfeeds.append((feedtitle, articles))

        return totalfeeds