~ubuntu-branches/ubuntu/karmic/calibre/karmic

« back to all changes in this revision

Viewing changes to src/calibre/web/feeds/recipes/recipe_nytimes_sub.py

  • Committer: Bazaar Package Importer
  • Author(s): Martin Pitt
  • Date: 2009-07-30 12:49:41 UTC
  • mto: This revision was merged to the branch mainline in revision 13.
  • Revision ID: james.westby@ubuntu.com-20090730124941-kviipg9ypwgppulc
Tags: upstream-0.6.3+dfsg
Import upstream version 0.6.3+dfsg

Show diffs side-by-side

added added

removed removed

Lines of Context:
11
11
from calibre.ebooks.BeautifulSoup import BeautifulSoup
12
12
 
13
13
class NYTimes(BasicNewsRecipe):
14
 
    
 
14
 
15
15
    title       = 'The New York Times (subscription)'
16
16
    __author__  = 'Kovid Goyal'
17
17
    language = _('English')
18
18
    description = 'Daily news from the New York Times (subscription version)'
19
 
    timefmt = ' [%a, %d %b, %Y]'
 
19
    timefmt = ''
20
20
    needs_subscription = True
21
21
    remove_tags_before = dict(id='article')
22
22
    remove_tags_after  = dict(id='article')
23
 
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), 
24
 
                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), 
 
23
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
 
24
                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
25
25
                   dict(name=['script', 'noscript', 'style'])]
26
26
    encoding = 'cp1252'
27
27
    no_stylesheets = True
28
28
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
29
 
    
 
29
 
30
30
    def get_browser(self):
31
31
        br = BasicNewsRecipe.get_browser()
32
32
        if self.username is not None and self.password is not None:
36
36
            br['PASSWORD'] = self.password
37
37
            br.submit()
38
38
        return br
39
 
    
 
39
 
40
40
    def parse_index(self):
41
41
        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
42
 
        
 
42
 
43
43
        def feed_title(div):
44
44
            return ''.join(div.findAll(text=True, recursive=False)).strip()
45
 
        
 
45
 
46
46
        articles = {}
47
47
        key = None
48
48
        ans = []
49
 
        for div in soup.findAll(True, 
 
49
        allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
 
50
                              'New York','Business Day','Sports','Dining','Arts','Home','Styles']
 
51
        excludeSectionKeywords = ['Dining','Styles']
 
52
 
 
53
 
 
54
        # Find each instance of class="section-headline", class="story", class="story headline"
 
55
        for div in soup.findAll(True,
50
56
            attrs={'class':['section-headline', 'story', 'story headline']}):
51
 
            
 
57
 
52
58
            if div['class'] == 'section-headline':
53
59
                key = string.capwords(feed_title(div))
 
60
                excluded = re.compile('|'.join(excludeSectionKeywords))
 
61
                if excluded.search(key):
 
62
                    self.log("Skipping section %s" % key)
 
63
                    continue
 
64
 
54
65
                articles[key] = []
55
66
                ans.append(key)
56
 
            
57
 
            elif div['class'] in ['story', 'story headline']:
 
67
 
 
68
            elif div['class'] in ['story', 'story headline'] :
58
69
                a = div.find('a', href=True)
59
70
                if not a:
60
71
                    continue
61
72
                url = re.sub(r'\?.*', '', a['href'])
62
73
                url += '?pagewanted=all'
63
74
                title = self.tag_to_string(a, use_alt=True).strip()
 
75
 
64
76
                description = ''
65
77
                pubdate = strftime('%a, %d %b')
66
78
                summary = div.find(True, attrs={'class':'summary'})
67
79
                if summary:
68
80
                    description = self.tag_to_string(summary, use_alt=False)
69
 
                
 
81
 
 
82
                author = ''
 
83
                authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
 
84
                if authorAttribution:
 
85
                    author = self.tag_to_string(authorAttribution, use_alt=False)
 
86
                else:
 
87
                    authorAttribution = div.find(True, attrs={'class':'byline'})
 
88
                    if authorAttribution:
 
89
                        author = self.tag_to_string(authorAttribution, use_alt=False)
 
90
 
70
91
                feed = key if key is not None else 'Uncategorized'
71
92
                if not articles.has_key(feed):
72
93
                    articles[feed] = []
73
94
                if not 'podcasts' in url:
74
95
                    articles[feed].append(
75
 
                                  dict(title=title, url=url, date=pubdate, 
76
 
                                       description=description,
 
96
                                  dict(title=title, url=url, date=pubdate,
 
97
                                       description=description, author=author,
77
98
                                       content=''))
78
 
        ans = self.sort_index_by(ans, {'The Front Page':-1, 
79
 
                                       'Dining In, Dining Out':1, 
 
99
        ans = self.sort_index_by(ans, {'The Front Page':-1,
 
100
                                       'Dining In, Dining Out':1,
80
101
                                       'Obituaries':2})
81
102
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
 
103
 
82
104
        return ans
83
 
    
 
105
 
84
106
    def preprocess_html(self, soup):
85
107
        refresh = soup.find('meta', {'http-equiv':'refresh'})
86
108
        if refresh is None: