~ubuntu-branches/debian/sid/calibre/sid

« back to all changes in this revision

Viewing changes to recipes/ap.recipe

  • Committer: Package Import Robot
  • Author(s): Martin Pitt
  • Date: 2014-02-27 07:48:06 UTC
  • mto: This revision was merged to the branch mainline in revision 74.
  • Revision ID: package-import@ubuntu.com-20140227074806-64wdebb3ptosxhhx
Tags: upstream-1.25.0+dfsg
Import upstream version 1.25.0+dfsg

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
import re
2
1
from calibre.web.feeds.news import BasicNewsRecipe
3
2
 
4
3
 
6
5
 
7
6
    title = u'Associated Press'
8
7
    description = 'Global news'
9
 
    __author__ = 'Kovid Goyal and Sujata Raman'
 
8
    __author__ = 'Krittika Goyal'
10
9
    use_embedded_content   = False
11
10
    language = 'en'
12
11
    no_stylesheets = True
13
 
    auto_cleanup = True
14
 
#    auto_cleanup_keep = '//td[@class="ap-smallphoto-td-image"]'
15
 
    max_articles_per_feed = 15
16
 
 
17
 
 
18
 
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
19
 
[
20
 
         (r'<span class="entry-content">', lambda match : '<div class="entry-content">'),
21
 
    ]
22
 
    ]
23
 
 
24
 
 
25
 
    #keep_only_tags = [ dict(name='table', attrs={'class':['ap-story-table hnews hentry item']}),
26
 
                       ##dict(name='div', attrs={'class':['entry-content']}),
27
 
                       #]
28
 
    #remove_tags = [dict(name='td', attrs={'class':['ap-mediabox-td']}),
29
 
                   #dict(name='table', attrs={'class':['ap-htmltable-table', 'ap-htmltable-table', 'ap-mediabox-table']}),
30
 
                   ##dict(name='td', attrs={'bgcolor':['#333333']}),
31
 
                  #]
32
 
    extra_css = '''
33
 
               .headline{font-family:Verdana,Arial,Helvetica,sans-serif;font-weight:bold;}
34
 
               .bline{color:#003366;}
35
 
                body{font-family:Arial,Helvetica,sans-serif;}
36
 
                '''
37
 
 
38
 
 
39
 
    feeds = [
40
 
                   ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'),
41
 
                   ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'),
42
 
                   ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'),
43
 
                   ('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'),
44
 
                   ('AP Washington State News', 'http://hosted.ap.org/lineups/WASHINGTONHEADS-rss_2.0.xml?SITE=NYPLA&SECTION=HOME'),
45
 
                   ('AP Technology News', 'http://hosted.ap.org/lineups/TECHHEADS-rss_2.0.xml?SITE=CTNHR&SECTION=HOME'),
46
 
                   ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'),
47
 
                   ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'),
48
 
                   ('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'),
49
 
    ]
50
 
 
 
12
    conversion_options = {
 
13
        'linearize_tables' : True
 
14
    }
 
15
    keep_only_tags = {'name':'table', 'attrs':{'class':lambda x: x and 'ap-story-table' in x.split()}}
 
16
    remove_tags = [
 
17
        {'class':['ap-mediabox-table']},
 
18
        {'name':'img', 'src':lambda x: x and '//analytics.' in x},
 
19
    ]
 
20
 
 
21
    def parse_index(self):
 
22
        feeds = []
 
23
        fronts = ('HOME', 'US', 'WORLD', 'BUSINESS', 'TECHNOLOGY', 'SPORTS', 'ENTERTAINMENT', 'HEALTH', 'SCIENCE', 'POLITICS')
 
24
        for front in fronts:
 
25
            feeds.append([front.capitalize(), self.parse_section(front)])
 
26
        feeds[0][0] = 'Top Stories'
 
27
        return feeds
 
28
 
 
29
    def parse_section(self, front):
 
30
        self.log('Processing section:', front)
 
31
        soup = self.index_to_soup('http://hosted.ap.org/dynamic/fronts/%s?SITE=AP' % front)
 
32
 
 
33
        articles = []
 
34
        for x in soup.findAll('p', attrs={'class':['ap-newsbriefitem-p', 'ap-topheadlineitem-p']}):
 
35
            a = x.find('a', href=True)
 
36
            title = self.tag_to_string(a)
 
37
            url = "http://hosted.ap.org" + a['href']
 
38
            p = x.find(attrs={'class':'topheadlinebody'})
 
39
            desc = ''
 
40
            if p is not None:
 
41
                desc = self.tag_to_string(p)
 
42
            self.log('\tFound article:', title, '\n\t\t', desc)
 
43
            articles.append({'title':title, 'url':url})
 
44
 
 
45
        self.log('\n\n')
 
46
 
 
47
        return articles