~ubuntu-branches/ubuntu/karmic/calibre/karmic-updates

« back to all changes in this revision

Viewing changes to src/calibre/web/feeds/recipes/recipe_zaobao.py

  • Committer: Bazaar Package Importer
  • Author(s): Martin Pitt
  • Date: 2009-04-05 18:42:16 UTC
  • mfrom: (1.1.7 sid)
  • Revision ID: james.westby@ubuntu.com-20090405184216-cyb0x4edrwjcaw33
Tags: 0.5.9+dfsg-1
* New upstream release. (Closes: #525339)
* manpages-installation.patch: Encode generated manpages as UTF-8, to avoid
  UnicodeDecodeErrors when writing them out to files.
* debian/control: Demote calibre dependency of calibre-bin to Recommends:,
  which is sufficient and avoids a circular dependency. (Closes: #522059)
* debian/control: Drop build dependency help2man, current version does not
  need it any more.
* debian/control: Drop versioned build dependency on python-mechanize,
  current sid version is enough.
* debian/rules: Copy "setup.py install" command from cdbs'
  python-distutils.mk, since the current version broke this. This is a
  hackish workaround until #525436 gets fixed.
* debian/rules: Drop using $(wildcard ), use `ls`; the former does not work
  any more.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/env  python
 
2
 
 
3
__license__   = 'GPL v3'
 
4
__copyright__ = '2009, Pu Bo <pubo at pubolab.com>'
 
5
'''
 
6
zaobao.com
 
7
'''
 
8
import time, os, traceback, sys
 
9
from calibre.web.feeds.news import BasicNewsRecipe
 
10
from calibre.web.feeds import feeds_from_index, Feed, Article
 
11
from BeautifulSoup import Tag
 
12
 
 
13
class ZAOBAO(BasicNewsRecipe):
    """Fetch news from zaobao.com (Lianhe Zaobao).

    In addition to the regular RSS feeds, this recipe scrapes the static
    photo-news index pages listed in ``INDEXES`` and appends them as extra
    feeds.  It also works around feeds whose GBK text was not decoded
    correctly by the upstream feed parser (see ``parse_feeds``).
    """

    title          = u'\u8054\u5408\u65e9\u62a5\u7f51 zaobao.com'
    __author__     = 'Pu Bo'
    description    = 'News from zaobao.com'
    no_stylesheets = True
    recursions     = 1
    language = _('Chinese')
    # Site is served as GBK; used below to repair mis-decoded feed text.
    encoding     = 'gbk'
#    multithreaded_fetch = True

    keep_only_tags    = [
        dict(name='table', attrs={'cellpadding':'9'}),
        dict(name='table', attrs={'class':'cont'}),
        dict(name='div', attrs={'id':'content'}),
        dict(name='span', attrs={'class':'page'}),
    ]

    remove_tags    = [
        dict(name='table', attrs={'cellspacing':'9'}),
    ]

    extra_css      = '\
            @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}\n\
            body{font-family: serif1, serif}\n\
            .article_description{font-family: serif1, serif}\n\
            p{font-family: serif1, serif}\n\
            h1 {font-weight: bold; font-size: large;}\n\
            h2 {font-size: large;}\n\
            .title {font-size: large;}\n\
            .article {font-size:medium}\n\
            .navbar {font-size: small}\n\
            .feed{font-size: medium}\n\
                        .small{font-size: small; padding-right: 8%}\n'

    # Static (non-RSS) index pages scraped in parse_feeds(): (title, url).
    INDEXES                = [
                       (u'\u65b0\u95fb\u56fe\u7247', u'http://www.zaobao.com/photoweb/photoweb_idx.shtml')
                    ]
    # Cap on how many links to take from each index page.
    MAX_ITEMS_IN_INDEX = 10

    # Site name in Chinese; doubles as a sentinel: a correctly decoded feed
    # description must contain this string.
    DESC_SENSE     = u'\u8054\u5408\u65e9\u62a5\u7f51'

    feeds          = [
                      (u'\u5373\u65f6\u62a5\u9053', u'http://realtime.zaobao.com/news.xml'),
                      (u'\u4e2d\u56fd\u65b0\u95fb', u'http://www.zaobao.com/zg/zg.xml'),
                      (u'\u56fd\u9645\u65b0\u95fb', u'http://www.zaobao.com/gj/gj.xml'),
                      (u'\u4e16\u754c\u62a5\u520a\u6587\u8403', u'http://www.zaobao.com/wencui/wencui.xml'),
                      (u'\u4e1c\u5357\u4e9a\u65b0\u95fb', u'http://www.zaobao.com/yx/yx.xml'),
                      (u'\u65b0\u52a0\u5761\u65b0\u95fb', u'http://www.zaobao.com/sp/sp.xml'),
                      (u'\u4eca\u65e5\u89c2\u70b9', u'http://www.zaobao.com/yl/yl.xml'),
                      (u'\u4e2d\u56fd\u8d22\u7ecf', u'http://www.zaobao.com/cz/cz.xml'),
                      (u'\u72ee\u57ce\u8d22\u7ecf', u'http://www.zaobao.com/cs/cs.xml'),
                      (u'\u5168\u7403\u8d22\u7ecf', u'http://www.zaobao.com/cg/cg.xml'),
                      (u'\u65e9\u62a5\u4f53\u80b2', u'http://www.zaobao.com/ty/ty.xml'),
                      (u'\u65e9\u62a5\u526f\u520a', u'http://www.zaobao.com/fk/fk.xml'),
                    ]

    def postprocess_html(self, soup, first):
        """Flatten table markup: rename table/tr/td tags to div so the
        keep_only_tags table containers render as plain blocks."""
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'
        return soup

    def parse_feeds(self):
        """Parse the RSS feeds, append feeds scraped from ``INDEXES``,
        then repair any feed/article text that was not decoded from
        ``self.encoding`` (GBK) correctly.

        Returns the list of parsed Feed objects.
        """
        self.log_debug(_('ZAOBAO overrided parse_feeds()'))
        parsed_feeds = BasicNewsRecipe.parse_feeds(self)

        # Scrape each static index page and append it as an extra feed.
        for id, obj in enumerate(self.INDEXES):
            title, url = obj
            articles = []
            soup = self.index_to_soup(url)

            for i, item in enumerate(soup.findAll('li')):
                if i >= self.MAX_ITEMS_IN_INDEX:
                    break
                a = item.find('a')
                # has_key() checks tag attributes in BeautifulSoup 3
                # ('in' would search the tag's contents instead).
                if a and a.has_key('href'):
                    a_url = a['href']
                    a_title = self.tag_to_string(a)
                    date = ''
                    description = ''
                    self.log_debug(_('adding %s at %s')%(a_title,a_url))
                    articles.append({
                                    'title':a_title,
                                    'date':date,
                                    'url':a_url,
                                    'description':description
                                    })

            pfeeds = feeds_from_index([(title, articles)], oldest_article=self.oldest_article,
                                     max_articles_per_feed=self.max_articles_per_feed)

            self.log_debug(_('adding %s to feed')%(title))
            for feed in pfeeds:
                self.log_debug(_('adding feed: %s')%(feed.title))
                feed.description = self.DESC_SENSE
                parsed_feeds.append(feed)
                for a, article in enumerate(feed):
                    self.log_debug(_('added article %s from %s')%(article.title, article.url))
                self.log_debug(_('added feed %s')%(feed.title))

        for i, feed in enumerate(parsed_feeds):
            # Work around a strange problem: sometimes the XML encoding is
            # not applied correctly by parse().
            weird_encoding_detected = False
            # BUGFIX: test feed.description for truthiness BEFORE calling
            # methods on it — the original called .find() in the elif
            # condition first, raising AttributeError on a None description.
            if feed.description and self.encoding and not isinstance(feed.description, unicode):
                self.log_debug(_('Feed %s is not encoded correctly, manually replace it')%(feed.title))
                feed.description = feed.description.decode(self.encoding, 'replace')
            elif feed.description and self.encoding and feed.description.find(self.DESC_SENSE) == -1:
                # Description decoded with the wrong codec: round-trip it
                # through cp1252 to recover the raw bytes, then decode as GBK.
                self.log_debug(_('Feed %s is weired encoded, manually redo all')%(feed.title))
                feed.description = feed.description.encode('cp1252', 'replace').decode(self.encoding, 'replace')
                weird_encoding_detected = True

            for a, article in enumerate(feed):
                # BUGFIX: guard against a None/empty title before decoding,
                # matching the guards on summary and text_summary below.
                if article.title and self.encoding and not isinstance(article.title, unicode):
                    article.title = article.title.decode(self.encoding, 'replace')
                if not isinstance(article.summary, unicode) and self.encoding and article.summary:
                    article.summary = article.summary.decode(self.encoding, 'replace')
                    article.text_summary = article.summary
                if not isinstance(article.text_summary, unicode) and self.encoding and article.text_summary:
                    article.text_summary = article.text_summary.decode(self.encoding, 'replace')
                    article.summary = article.text_summary
                if weird_encoding_detected:
                    if article.title:
                        article.title = article.title.encode('cp1252', 'replace').decode(self.encoding, 'replace')
                    if article.summary:
                        article.summary = article.summary.encode('cp1252', 'replace').decode(self.encoding, 'replace')
                    if article.text_summary:
                        article.text_summary = article.text_summary.encode('cp1252', 'replace').decode(self.encoding, 'replace')

            # BUGFIX: remove empty articles in a separate pass over a
            # snapshot.  The original deleted from feed.articles while
            # enumerating the feed, which skips the element following
            # each removal.
            for article in list(feed.articles):
                if article.title == "Untitled article":
                    self.log_debug(_('Removing empty article %s from %s')%(article.title, article.url))
                    feed.articles.remove(article)
        return parsed_feeds

    def get_browser(self):
        """Return the recipe browser with caching disabled, so the
        frequently-updated realtime feed is always fetched fresh."""
        br = BasicNewsRecipe.get_browser()
        br.addheaders.append(('Pragma', 'no-cache'))
        return br
 
 
b'\\ No newline at end of file'