13
13
from lxml import html
15
15
class Article(object):
17
17
time_offset = datetime.now() - datetime.utcnow()
19
def __init__(self, id, title, url, summary, published, content):
19
def __init__(self, id, title, url, author, summary, published, content):
20
20
self.downloaded = False
22
self.title = title.strip() if title else title
22
self._title = title.strip() if title else title
24
self.title = re.sub(r'&(\S+);',
25
entity_to_unicode, self.title)
24
self._title = re.sub(r'&(\S+);',
25
entity_to_unicode, self._title)
28
if not isinstance(self._title, unicode):
29
self._title = self._title.decode('utf-8', 'replace')
32
if author and not isinstance(author, unicode):
33
author = author.decode('utf-8', 'replace')
29
34
self.summary = summary
30
35
if summary and not isinstance(summary, unicode):
31
36
summary = summary.decode('utf-8', 'replace')
39
44
traceback.print_exc()
41
46
self.text_summary = summary
42
48
self.content = content
43
49
self.date = published
44
50
self.utctime = datetime(*self.date[:6])
45
51
self.localtime = self.utctime + self.time_offset
57
if not isinstance(t, unicode) and hasattr(t, 'decode'):
58
t = t.decode('utf-8', 'replace')
62
return property(fget=fget, fset=fset)
48
65
def __repr__(self):
56
'''%(self.title, self.url, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
74
'''%(self.title, self.url, self.author, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
57
75
bool(self.content))).encode('utf-8')
62
80
def is_same_as(self, other_article):
63
81
#if self.title != getattr(other_article, 'title', False):
66
84
return self.url == getattr(other_article, 'url', False)
67
85
return self.content == getattr(other_article, 'content', False)
70
88
class Feed(object):
76
94
self.logger = logging.getLogger('feeds2disk')
77
95
self.get_article_url = get_article_url
79
def populate_from_feed(self, feed, title=None, oldest_article=7,
97
def populate_from_feed(self, feed, title=None, oldest_article=7,
80
98
max_articles_per_feed=100):
81
99
entries = feed.entries
87
105
self.image_width = image.get('width', 88)
88
106
self.image_height = image.get('height', 31)
89
107
self.image_alt = image.get('title', '')
91
109
self.articles = []
92
110
self.id_counter = 0
93
111
self.added_articles = []
95
113
self.oldest_article = oldest_article
97
115
for item in entries:
98
116
if len(self.articles) >= max_articles_per_feed:
100
118
self.parse_article(item)
103
def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
121
def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
104
122
max_articles_per_feed=100):
105
123
self.title = title if title else _('Unknown feed')
106
124
self.descrition = ''
107
125
self.image_url = None
108
126
self.articles = []
109
127
self.added_articles = []
111
129
self.oldest_article = oldest_article
112
130
self.id_counter = 0
114
132
for item in articles:
115
133
if len(self.articles) >= max_articles_per_feed:
124
142
link = item.get('url', None)
125
143
description = item.get('description', '')
126
144
content = item.get('content', '')
127
article = Article(id, title, link, description, published, content)
145
author = item.get('author', '')
146
article = Article(id, title, link, author, description, published, content)
128
147
delta = datetime.utcnow() - article.utctime
129
148
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
130
149
self.articles.append(article)
132
151
self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
135
154
def parse_article(self, item):
136
155
id = item.get('id', 'internal id#'+str(self.id_counter))
137
156
if id in self.added_articles:
141
160
published = time.gmtime()
142
161
self.id_counter += 1
143
162
self.added_articles.append(id)
145
164
title = item.get('title', _('Untitled article'))
147
166
link = self.get_article_url(item)
149
168
self.logger.warning('Failed to get link for %s'%title)
150
169
self.logger.debug(traceback.format_exc())
152
172
description = item.get('summary', None)
154
content = '\n'.join(i.value for i in item.get('content', []))
173
author = item.get('author', None)
175
content = [i.value for i in item.get('content', []) if i.value]
176
content = [i if isinstance(i, unicode) else i.decode('utf-8', 'replace')
178
content = u'\n'.join(content)
155
179
if not content.strip():
157
181
if not link and not content:
159
article = Article(id, title, link, description, published, content)
183
article = Article(id, title, link, author, description, published, content)
160
184
delta = datetime.utcnow() - article.utctime
161
185
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
162
186
self.articles.append(article)
167
191
if not isinstance(title, unicode):
168
192
title = title.decode('utf-8', 'replace')
169
193
self.logger.debug('Skipping article %s as it is too old'%title)
171
195
def __iter__(self):
172
196
return iter(self.articles)
174
198
def __len__(self):
175
199
return len(self.articles)
177
201
def __repr__(self):
178
202
res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]
180
204
return '\n'+'\n'.join(res)+'\n'
182
206
def __str__(self):
183
207
return repr(self)
185
209
def __bool__(self):
186
210
for article in self:
187
211
if getattr(article, 'downloaded', False):
191
215
def has_embedded_content(self):
194
218
if a.content or a.summary:
195
length += max(len(a.content if a.content else ''),
219
length += max(len(a.content if a.content else ''),
196
220
len(a.summary if a.summary else ''))
198
222
return length > 2000 * len(self)
200
224
def has_article(self, article):
202
226
if a.is_same_as(article):
206
230
def find(self, article):
207
231
for i, a in enumerate(self):
208
232
if a.is_same_as(article):
212
236
def remove(self, article):
213
237
i = self.index(article)
215
239
self.articles[i:i+1] = []
217
241
class FeedCollection(list):
219
243
def __init__(self, feeds):
220
244
list.__init__(self, [f for f in feeds if len(f.articles) > 0])
221
245
found_articles = set([])
222
246
duplicates = set([])
224
248
def in_set(s, a):
226
250
if a.is_same_as(x):
230
254
print '#feeds', len(self)
231
255
print map(len, self)
240
264
found_articles.add(a)
242
266
f.articles.remove(x)
244
268
self.duplicates = duplicates
245
269
print len(duplicates)
246
270
print map(len, self)
249
273
def find_article(self, article):
250
274
for j, f in enumerate(self):
251
275
for i, a in enumerate(f):
255
279
def restore_duplicates(self):
257
281
for article, feed in self.duplicates:
261
285
temp.append((feed, art))
262
286
for feed, art in temp:
263
287
feed.articles.append(art)
266
def feed_from_xml(raw_xml, title=None, oldest_article=7,
290
def feed_from_xml(raw_xml, title=None, oldest_article=7,
267
291
max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
268
292
feed = parse(raw_xml)
269
293
pfeed = Feed(get_article_url=get_article_url)
270
pfeed.populate_from_feed(feed, title=title,
294
pfeed.populate_from_feed(feed, title=title,
271
295
oldest_article=oldest_article,
272
296
max_articles_per_feed=max_articles_per_feed)
282
306
for title, articles in index:
284
pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
308
pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
285
309
max_articles_per_feed=max_articles_per_feed)
286
310
feeds.append(pfeed)
b'\\ No newline at end of file'