# Miro - an RSS based video player application
# Copyright (C) 2005-2010 Participatory Culture Foundation
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# In addition, as a special exception, the copyright holders give
# permission to link the code of portions of this program with the OpenSSL
# library.
#
# You must obey the GNU General Public License in all respects for all of
# the code used other than OpenSSL. If you modify file(s) with this
# exception, you may extend this exception to your version of the file(s),
# but you are not obligated to do so. If you do not wish to do so, delete
# this exception statement from your version. If you delete this exception
# statement from all source files in the program, then also delete it here.

"""``miro.feed`` -- Holds ``Feed`` class and related things.

FIXME - talk about Feed architecture here
"""

from HTMLParser import HTMLParser, HTMLParseError
from cStringIO import StringIO
from datetime import datetime, timedelta
from miro.gtcache import gettext as _
from miro.feedparser import FeedParserDict
from urlparse import urljoin
from miro.xhtmltools import (unescape, xhtmlify, fix_xml_header,
                             fix_html_header, urlencode)
import logging
import os
import re
import xml.sax

from miro.database import DDBObject, ObjectNotFoundError
from miro.httpclient import grab_url
from miro import app
from miro import autodler
from miro import config
from miro import iconcache
from miro import databaselog
from miro import dialogs
from miro import download_utils
from miro import eventloop
from miro import feedupdate
from miro import flashscraper
from miro import models
from miro import prefs
from miro.plat import resources
from miro import downloader
from miro.util import (returns_unicode, returns_filename, unicodify, check_u,
                       check_f, quote_unicode_url, escape, to_uni,
                       stringify)
from miro import fileutil
from miro.plat.utils import filename_to_unicode, make_url_safe, unmake_url_safe
from miro import filetypes
from miro.item import FeedParserValues
from miro import searchengines
from miro.clock import clock

WHITESPACE_PATTERN = re.compile(r"^[ \t\r\n]*$")

DEFAULT_FEED_ICON = "images/feedicon.png"
DEFAULT_FEED_ICON_TABLIST = "images/icon-rss.png"

def default_feed_icon_url():
    return resources.url(DEFAULT_FEED_ICON)

def default_feed_icon_path():
    return resources.path(DEFAULT_FEED_ICON)

def default_tablist_feed_icon_path():
    return resources.path(DEFAULT_FEED_ICON_TABLIST)

# Notes on character set encoding of feeds:
#
# The parsing libraries built into Python mostly use byte strings
# instead of unicode strings. However, sometimes they get "smart" and
# try to convert the byte stream to a unicode stream automatically.
#
# Which library does what, and when, isn't clearly documented.
#
# We use the function to_uni() to fix those smart conversions.
#
# If you run into Unicode crashes, adding that function in the
# appropriate place should fix it.
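#
# A minimal sketch of the idea (the key name here is hypothetical;
# to_uni() decodes byte strings to unicode and leaves unicode strings
# untouched):
#
#     title = to_uni(parsed['feed']['title'], 'utf-8')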

# Universal Feed Parser http://feedparser.org/
# Licensed under Python license
from miro import feedparser

def add_feed_from_file(fn):
    """Adds a new feed using USM"""
    d = feedparser.parse(fn)
    if d.feed.has_key('links'):
        for link in d.feed['links']:
            if link['rel'] == 'start' or link['rel'] == 'self':
                Feed(link['href'])
                return
    if d.feed.has_key('link'):
        add_feed_from_web_page(d.feed.link)

def add_feed_from_web_page(url):
    """Adds a new feed based on a link tag in a web page"""
    def callback(info):
        url = HTMLFeedURLParser().get_link(info['updated-url'], info['body'])
        if url:
            Feed(url)
    def errback(error):
        logging.warning("unhandled error in add_feed_from_web_page: %s", error)
    grab_url(url, callback, errback)

FILE_MATCH_RE = re.compile(r"^file://.")
SEARCH_URL_MATCH_RE = re.compile(r'^dtv:savedsearch/(.*)\?q=(.*)')
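
# For illustration, SEARCH_URL_MATCH_RE captures the engine name and the
# query: a hypothetical URL 'dtv:savedsearch/youtube?q=cats' matches with
# group(1) == 'youtube' and group(2) == 'cats'.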

def validate_feed_url(url):
    """URL validation and normalization"""
    if FILE_MATCH_RE.match(url) is not None:
        return True

def normalize_feed_url(url):
    originalURL = url
    # Valid URLs are returned as-is
    if validate_feed_url(url):
        return url

    # Check valid schemes with invalid separator
    match = re.match(r"^(http|https):/*(.*)$", url)
    if match is not None:
        url = "%s://%s" % match.group(1, 2)

    # Replace invalid schemes by http
    match = re.match(r"^(([A-Za-z]*):/*)*(.*)$", url)
    if match and match.group(2) in ['feed', 'podcast', 'fireant', None]:
        url = "http://%s" % match.group(3)
    elif match and match.group(2) == 'feeds':
        url = "https://%s" % match.group(3)

    # Make sure there is a leading / character in the path
    match = re.match(r"^(http|https)://[^/]*$", url)
    if match is not None:
        url = url + "/"

    url = quote_unicode_url(url)

    if not validate_feed_url(url):
        logging.info("unable to normalize URL %s", originalURL)
        return None
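
# Examples of what normalize_feed_url() produces (illustrative; derived
# from the regexes above, the URLs themselves are hypothetical):
#     "http:/example.com/feed"  -> "http://example.com/feed"
#     "feed://example.com/rss"  -> "http://example.com/rss"
#     "http://example.com"      -> "http://example.com/"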

def make_search_url(engine, term):
    """Create a URL for a search feed.
    return u'dtv:savedsearch/%s?q=%s' % (engine, term)

def _config_change(key, value):
    """Handle configuration changes so we can update feed update
    frequencies.
    """
    if key == prefs.CHECK_CHANNELS_EVERY_X_MN.key:
        for feed in Feed.make_view():
            try:
                update_freq = feed.parsed["feed"]["ttl"]
            except (AttributeError, KeyError):
                update_freq = 0
            feed.set_update_frequency(update_freq)

config.add_change_callback(_config_change)

# Wait X seconds before updating the feeds at startup
INITIAL_FEED_UPDATE_DELAY = 5.0

class FeedImpl(DDBObject):
    """Actual implementation of a basic feed."""

    def setup_new(self, url, ufeed, title=None):
        check_u(url)
        if title:
            check_u(title)
        self.url = url
        self.ufeed = ufeed
        self.ufeed_id = ufeed.id
        self.title = title
        self.created = datetime.now()
        self.updating = False
        self.thumbURL = None
        self.initialUpdate = True
        self.updateFreq = config.get(prefs.CHECK_CHANNELS_EVERY_X_MN) * 60

    @classmethod
    def orphaned_view(cls):
        table_name = app.db.table_name(cls)
        return cls.make_view("feed.id is NULL",
                             joins={'feed': 'feed.feed_impl_id=%s.id' % table_name})
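
    # For illustration, assuming make_view() emits LEFT JOIN semantics for
    # its joins argument, orphaned_view() is roughly:
    #     SELECT <table>.id FROM <table>
    #     LEFT JOIN feed ON feed.feed_impl_id = <table>.id
    #     WHERE feed.id IS NULL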

    def _get_items(self):
        return self.ufeed.items
    items = property(_get_items)

    def on_signal_change(self):
        self.ufeed.signal_change()

    def get_base_href(self):
        """Get a URL to use in the <base> tag for this channel.  This is
        used for relative links in this channel's items.
        """
        return escape(self.url)

    def set_update_frequency(self, frequency):
        """Sets the update frequency (in minutes).

        A frequency of -1 means that auto-update is disabled.
        """
        frequency = int(frequency)
        self.cancel_update_events()
        new_freq = max(config.get(prefs.CHECK_CHANNELS_EVERY_X_MN),
                       frequency) * 60
        if new_freq != self.updateFreq:
            self.updateFreq = new_freq
            self.schedule_update_events(-1)
        self.ufeed.signal_change()

    def schedule_update_events(self, firstTriggerDelay):
        self.cancel_update_events()
        if firstTriggerDelay >= 0:
            self.scheduler = eventloop.add_timeout(
                firstTriggerDelay, self.update,
                "Feed update (%s)" % self.get_title())
        else:
            if self.updateFreq > 0:
                self.scheduler = eventloop.add_timeout(
                    self.updateFreq, self.update,
                    "Feed update (%s)" % self.get_title())

    def cancel_update_events(self):
        if hasattr(self, 'scheduler') and self.scheduler is not None:
            self.scheduler.cancel()
            self.scheduler = None

    def update(self):
        """Subclasses should override this."""
        self.schedule_update_events(-1)

    def default_thumbnail_path(self):
        """Get the path to our thumbnail when there isn't a downloaded icon"""
        return default_feed_icon_path()

    def get_title(self):
        """Returns the title of the feed"""
        try:
            title = self.title
            if title is None or WHITESPACE_PATTERN.match(title):
                if self.ufeed.baseTitle is not None:
                    title = self.ufeed.baseTitle
            return title
        except AttributeError:
            return u""

    def get_url(self):
        """Returns the URL of the feed"""
        return self.url

    def get_base_url(self):
        """Returns the URL of the feed"""
        try:
            return self.url
        except AttributeError:
            return u""

    def get_link(self):
        """Returns a link to a webpage associated with the feed"""
        return self.ufeed.get_base_href()

    def get_thumbnail_url(self):
        """Returns the URL of a thumbnail associated with the feed"""
        return self.thumbURL

    def get_license(self):
        """Returns URL of license associated with the feed"""
        return u""

    def setup_restored(self):
        self.updating = False

    def remove(self):
        DDBObject.remove(self)

    def on_remove(self):
        """Called when the feed using this FeedImpl is removed from the DB.
        Subclasses can perform cleanup here."""

    def __str__(self):
        return "%s - %s" % (self.__class__.__name__, stringify(self.get_title()))

    def clean_old_items(self):
        """
        Called to remove old items which are no longer in the feed.

        Items that are currently in the feed should always be kept.
        """

class Feed(DDBObject, iconcache.IconCacheOwnerMixin):
    """This class is a magic class that can become any type of feed it wants.

    It works by passing on attributes to the actual feed.
    """
    ICON_CACHE_VITAL = True

    def setup_new(self, url, initiallyAutoDownloadable=None,
                  section=u'video', search_term=None, title=None):
        if initiallyAutoDownloadable is None:
            mode = config.get(prefs.CHANNEL_AUTO_DEFAULT)
            # note that this is somewhat duplicated in
            # set_auto_download_mode
            if mode == u'all':
                self.getEverything = True
                self.autoDownloadable = True
            elif mode == u'new':
                self.getEverything = False
                self.autoDownloadable = True
            elif mode == u'off':
                self.getEverything = False
                self.autoDownloadable = False
            else:
                raise ValueError("Bad auto-download mode: %s" % mode)
        else:
            self.autoDownloadable = initiallyAutoDownloadable
            self.getEverything = False
        self.origURL = url
        self.section = section
        self.maxOldItems = None
        self.expire = u"system"
        self.expireTime = None
        self.last_viewed = datetime.min
        self.baseTitle = None
        self.visible = True
        self.errorState = False
        self.loading = True
        self._actualFeed = None
        self._set_feed_impl(FeedImpl(url, self, title))
        self.setup_new_icon_cache()
        self.informOnError = True
        self.folder_id = None
        self.searchTerm = search_term
        self.userTitle = None
        self.setup_common()

    def setup_restored(self):
        restored_feeds.append(self)
        self._actualFeed = None
        self.informOnError = False
        self.setup_common()

    def setup_common(self):
        self.create_signal('update-finished')
        self.download = None
        self.wasUpdating = False
        self.inlineSearchTerm = None
        self.calc_item_list()

    def _get_actual_feed(self):
        # first try to load from actualFeed from the DB
        if self._actualFeed is None:
            for klass in (FeedImpl, RSSFeedImpl, SavedSearchFeedImpl,
                          ScraperFeedImpl, SearchFeedImpl, DirectoryFeedImpl,
                          DirectoryWatchFeedImpl, SearchDownloadsFeedImpl,
                          ManualFeedImpl, SingleFeedImpl):
                try:
                    self._actualFeed = klass.get_by_id(self.feed_impl_id)
                    self._actualFeed.ufeed = self
                    break
                except ObjectNotFoundError:
                    pass
            # otherwise, make a new FeedImpl
            if self._actualFeed is None:
                self._set_feed_impl(FeedImpl(self.origURL, self))
        return self._actualFeed

    actualFeed = property(_get_actual_feed)

    @classmethod
    def get_by_url(cls, url):
        return cls.make_view('origURL=?', (url,)).get_singleton()

    @classmethod
    def get_by_url_and_search(cls, url, searchTerm):
        if searchTerm is not None:
            view = cls.make_view('origURL=? AND searchTerm=?',
                                 (url, searchTerm))
        else:
            view = cls.make_view('origURL=? AND searchTerm IS NULL', (url,))
        return view.get_singleton()

    @classmethod
    def get_manual_feed(cls):
        return cls.get_by_url('dtv:manualFeed')

    @classmethod
    def get_directory_feed(cls):
        return cls.get_by_url('dtv:directoryfeed')

    @classmethod
    def get_search_feed(cls):
        return cls.get_by_url('dtv:search')

    @classmethod
    def get_search_downloads_feed(cls):
        return cls.get_by_url('dtv:searchDownloads')

    @classmethod
    def folder_view(cls, id):
        return cls.make_view('folder_id=?', (id,))

    @classmethod
    def visible_video_view(cls):
        return cls.make_view("visible AND section='video'")

    @classmethod
    def watched_folder_view(cls):
        return cls.make_view("origURL LIKE 'dtv:directoryfeed:%'")

    @classmethod
    def visible_audio_view(cls):
        return cls.make_view("visible AND section='audio'")

    def on_db_insert(self):
        self.generate_feed(True)

    def in_folder(self):
        return self.folder_id is not None

    def _set_feed_impl(self, feed_impl):
        if self._actualFeed is not None:
            self._actualFeed.remove()
        self._actualFeed = feed_impl
        self.feed_impl_id = feed_impl.id

    def signal_change(self, needs_save=True, needs_signal_folder=False):
        if needs_signal_folder:
            folder = self.get_folder()
            if folder is not None:
                folder.signal_change(needs_save=False)
        DDBObject.signal_change(self, needs_save=needs_save)

    def on_signal_change(self):
        is_updating = bool(self.actualFeed.updating)
        if self.wasUpdating and not is_updating:
            self.emit('update-finished')
        self.wasUpdating = is_updating

    def calc_item_list(self):
        self.items = models.Item.feed_view(self.id)
        self.visible_items = models.Item.visible_feed_view(self.id)
        self.downloaded_items = models.Item.feed_downloaded_view(self.id)
        self.downloading_items = models.Item.feed_downloading_view(self.id)
        self.available_items = models.Item.feed_available_view(self.id)
        self.auto_pending_items = models.Item.feed_auto_pending_view(self.id)
        self.unwatched_items = models.Item.feed_unwatched_view(self.id)

    def update_after_restore(self):
        if self.actualFeed.__class__ == FeedImpl:
            # Our initial FeedImpl was never updated, call
            # generate_feed again
            self.loading = True
            eventloop.add_idle(lambda: self.generate_feed(True),
                               "generate_feed")
        else:
            self.schedule_update_events(INITIAL_FEED_UPDATE_DELAY)

    def clean_old_items(self):
        if self.actualFeed:
            return self.actualFeed.clean_old_items()

    def invalidate_counts(self):
        for cached_count_attr in ('_num_available', '_num_unwatched',
                                  '_num_downloaded', '_num_downloading'):
            if cached_count_attr in self.__dict__:
                del self.__dict__[cached_count_attr]

    def recalc_counts(self):
        self.invalidate_counts()
        self.signal_change(needs_save=False)
        if self.in_folder():
            self.get_folder().signal_change(needs_save=False)

    def num_downloaded(self):
        """Returns the number of downloaded items in the feed."""
        try:
            return self._num_downloaded
        except AttributeError:
            self._num_downloaded = self.downloaded_items.count()
            return self._num_downloaded

    def num_downloading(self):
        """Returns the number of downloading items in the feed."""
        try:
            return self._num_downloading
        except AttributeError:
            self._num_downloading = self.downloading_items.count()
            return self._num_downloading

    def num_unwatched(self):
        """Returns the number of unwatched videos in the feed."""
        try:
            return self._num_unwatched
        except AttributeError:
            self._num_unwatched = self.unwatched_items.count()
            return self._num_unwatched

    def num_available(self):
        """Returns the number of available videos in the feed."""
        try:
            return self._num_available
        except AttributeError:
            self._num_available = (self.available_items.count() -
                                   self.auto_pending_items.count())
            return self._num_available

    def get_viewed(self):
        """Returns true iff this feed has been looked at"""
        return self.last_viewed != datetime.min

    def mark_as_viewed(self):
        """Sets the last time the feed was viewed to now"""
        self.last_viewed = datetime.now()
        try:
            del self._num_available
        except AttributeError:
            pass
        if self.in_folder():
            self.get_folder().signal_change()
        self.signal_change()

    def start_manual_download(self):
        next_ = None
        for item in self.items:
            if item.is_pending_manual_download():
                if next_ is None:
                    next_ = item
                elif item.get_pub_date_parsed() > next_.get_pub_date_parsed():
                    next_ = item
        if next_ is not None:
            next_.download(autodl=False)

    def start_auto_download(self):
        next = None
        for item in self.items:
            if item.is_eligible_for_auto_download():
                if next is None:
                    next = item
                elif item.get_pub_date_parsed() > next.get_pub_date_parsed():
                    next = item
        if next is not None:
            next.download(autodl=True)

    def expiring_items(self):
        # items in watched folders never expire
        if self.is_watched_folder():
            return models.Item.make_view('0')
        if self.expire == u'never':
            return models.Item.make_view('0')
        elif self.expire == u'system':
            expire_after_x_days = config.get(prefs.EXPIRE_AFTER_X_DAYS)
            if expire_after_x_days == -1:
                return models.Item.make_view('0')
            delta = timedelta(days=expire_after_x_days)
        else:
            delta = self.expireTime
        return models.Item.feed_expiring_view(self.id, datetime.now() - delta)
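
    # Worked example (hypothetical setting): with expire == u'system' and
    # EXPIRE_AFTER_X_DAYS == 6, delta is timedelta(days=6), so the view
    # above matches items older than datetime.now() - timedelta(days=6).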

    def expire_items(self):
        """Expires items from the feed that are ready to expire."""
        for item in self.expiring_items():
            item.expire()

    def signal_items(self):
        for item in self.items:
            item.signal_change(needs_save=False)

    def icon_changed(self):
        """See item.get_thumbnail to figure out which items to send
        signals for.
        """
        self.signal_change(needs_save=False)
        for item in self.items:
            if not (item.icon_cache.isValid() or
                    item.screenshot or
                    item.isContainerItem):
                item.signal_change(needs_save=False)

    def get_id(self):
        return DDBObject.get_id(self)

    def get_search_term(self):
        self.confirm_db_thread()
        return self.searchTerm

    def is_updating(self):
        return self.loading or (self.actualFeed and self.actualFeed.updating)

    def get_title(self):
        if self.userTitle is not None:
            return self.userTitle
        title = self.actualFeed.get_title()
        if self.searchTerm is not None:
            title = u"%s for '%s'" % (title, self.searchTerm)
        return title

    def has_original_title(self):
        return self.userTitle is None

    def set_title(self, title):
        self.confirm_db_thread()
        self.userTitle = title
        self.signal_change()

    def revert_title(self):
        self.set_title(None)

    def set_visible(self, visible):
        if self.visible == visible:
            return
        self.visible = visible
        self.signal_change()

    def get_autodownload_mode(self):
        self.confirm_db_thread()
        if self.autoDownloadable:
            if self.getEverything:
                return u'all'
            else:
                return u'new'
        else:
            return u'off'

    def set_auto_download_mode(self, mode):
        # note that this is somewhat duplicated in setup_new
        if mode == u'all':
            self.getEverything = True
            self.autoDownloadable = True
        elif mode == u'new':
            self.getEverything = False
            self.autoDownloadable = True
        elif mode == u'off':
            self.autoDownloadable = False
        else:
            raise ValueError("Bad auto-download mode: %s" % mode)
        self.signal_change()

    def set_expiration(self, type_, time_):
        """Sets the expiration attributes.  Valid types are u'system',
        u'feed' and u'never'.

        Expiration time is in hour(s).
        """
        self.confirm_db_thread()
        self.expire = type_
        self.expireTime = timedelta(hours=time_)

        if self.expire == u"never":
            for item in self.items:
                if item.is_downloaded():
                    item.save()

        self.signal_change()
        for item in self.items:
            item.signal_change(needs_save=False)

    def set_max_new(self, max_new):
        """Sets the maxNew attribute.  -1 means unlimited."""
        self.confirm_db_thread()
        oldMaxNew = self.maxNew
        self.maxNew = max_new
        self.signal_change()
        if self.maxNew >= oldMaxNew or self.maxNew < 0:
            autodler.AUTO_DOWNLOADER.start_downloads()

    def set_max_old_items(self, maxOldItems):
        self.confirm_db_thread()
        oldMaxOldItems = self.maxOldItems
        if maxOldItems == -1:
            maxOldItems = None
        self.maxOldItems = maxOldItems
        self.signal_change()
        if (maxOldItems is not None and
                (oldMaxOldItems is None or oldMaxOldItems > maxOldItems)):
            # the actual feed updating code takes care of expiring the old
            # items
            self.actualFeed.clean_old_items()

    def update(self):
        self.confirm_db_thread()
        if not self.id_exists():
            return
        if self.loading:
            return
        elif self.errorState:
            self.loading = True
            self.errorState = False
            self.signal_change()
            return self.generate_feed()
        self.actualFeed.update()

    def get_folder(self):
        self.confirm_db_thread()
        if self.in_folder():
            return models.ChannelFolder.get_by_id(self.folder_id)
        else:
            return None

    def set_folder(self, new_folder, update_trackers=True):
        self.confirm_db_thread()
        old_folder = self.get_folder()
        if new_folder is old_folder:
            return
        if new_folder is not None:
            self.folder_id = new_folder.get_id()
        else:
            self.folder_id = None
        self.signal_change()
        if update_trackers:
            models.Item.update_folder_trackers()
        if new_folder is not None:
            new_folder.signal_change(needs_save=False)
        if old_folder is not None:
            old_folder.signal_change(needs_save=False)

    @staticmethod
    def bulk_set_folders(folder_assignments):
        """Set the folders for multiple feeds at once.

        This method is optimized to be a bit faster than calling
        set_folder() for each individual folder.
        """
        for child, folder in folder_assignments:
            child.set_folder(folder, update_trackers=False)
        models.Item.update_folder_trackers()
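
    # Usage sketch (the feed and folder objects here are hypothetical):
    #     Feed.bulk_set_folders([(feed_a, folder), (feed_b, folder)])
    # update_folder_trackers() then runs once instead of once per feed.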

    def generate_feed(self, removeOnError=False):
        newFeed = None
        if self.origURL == u"dtv:directoryfeed":
            newFeed = DirectoryFeedImpl(self)
        elif self.origURL.startswith(u"dtv:directoryfeed:"):
            url = self.origURL[len(u"dtv:directoryfeed:"):]
            dir_ = unmake_url_safe(url)
            newFeed = DirectoryWatchFeedImpl(self, dir_)
        elif self.origURL == u"dtv:search":
            newFeed = SearchFeedImpl(self)
        elif self.origURL == u"dtv:searchDownloads":
            newFeed = SearchDownloadsFeedImpl(self)
        elif self.origURL == u"dtv:manualFeed":
            newFeed = ManualFeedImpl(self)
        elif self.origURL == u"dtv:singleFeed":
            newFeed = SingleFeedImpl(self)
        elif SEARCH_URL_MATCH_RE.match(self.origURL):
            newFeed = SavedSearchFeedImpl(self.origURL, self)
        else:
            self.download = grab_url(self.origURL,
                    lambda info: self._generate_feed_callback(info, removeOnError),
                    lambda error: self._generate_feed_errback(error, removeOnError),
                    default_mime_type=u'application/rss+xml')
            logging.debug("added async callback to create feed %s", self.origURL)
        if newFeed:
            self.finish_generate_feed(newFeed)

    def is_watched_folder(self):
        return self.origURL.startswith("dtv:directoryfeed:")

    def _handle_feed_loading_error(self, errorDescription):
        self.errorState = True
        if self.informOnError:
            title = _('Error loading feed')
            description = _(
                "Couldn't load the feed at %(url)s (%(errordescription)s)."
            ) % {"url": self.url, "errordescription": errorDescription}
            description += "\n\n"
            description += _("Would you like to keep the feed?")
            d = dialogs.ChoiceDialog(title, description, dialogs.BUTTON_KEEP,
                                     dialogs.BUTTON_DELETE)
            def callback(dialog):
                if dialog.choice == dialogs.BUTTON_DELETE and self.id_exists():
                    self.remove()
            d.run(callback)
            self.informOnError = False
        delay = config.get(prefs.CHECK_CHANNELS_EVERY_X_MN)
        eventloop.add_timeout(delay, self.update, "update failed feed")

    def _generate_feed_errback(self, error, removeOnError):
        if not self.id_exists():
            return
        logging.info("Warning couldn't load feed at %s (%s)",
                     self.origURL, error)
        self._handle_feed_loading_error(error.getFriendlyDescription())

    def _generate_feed_callback(self, info, removeOnError):
        """This is called by grab_url to generate a feed based on
        the type of data found at the given URL
        """
        # FIXME: This probably should be split up a bit. The logic is
        # very complex here.
        # Note that all of the raw XML and HTML in this function is in
        # byte string format
        if not self.id_exists():
            return
        if info['updated-url'] != self.origURL and \
                not self.origURL.startswith('dtv:'): # we got redirected
            f = lookup_feed(info['updated-url'], self.searchTerm)
            if f is not None: # already have this feed, so delete us
                self.remove()
                return
        modified = unicodify(info.get('last-modified'))
        etag = unicodify(info.get('etag'))
        contentType = unicodify(info.get('content-type', u'text/html'))

        # Some smarty pants serve RSS feeds with a text/html content-type...
        # So let's do some really simple sniffing first.
        apparentlyRSS = filetypes.is_maybe_rss(info['body'])
        old_title = self.actualFeed.title

        # Definitely an HTML feed
        if (((contentType.startswith(u'text/html') or
              contentType.startswith(u'application/xhtml+xml'))
             and not apparentlyRSS)):
            #print "Scraping HTML"
            html = info['body']
            if info.has_key('charset'):
                html = fix_html_header(html, info['charset'])
                charset = unicodify(info['charset'])
            else:
                charset = None
            self.ask_for_scrape(info, html, charset)
        #It's some sort of feed we don't know how to scrape
        elif (contentType.startswith(u'application/rdf+xml')
              or contentType.startswith(u'application/atom+xml')):
            html = info['body']
            if info.has_key('charset'):
                xmldata = fix_xml_header(html, info['charset'])
            else:
                xmldata = html
            self.finish_generate_feed(RSSFeedImpl(unicodify(info['updated-url']),
                initialHTML=xmldata, etag=etag, modified=modified, ufeed=self,
                title=old_title))
        # If it's not HTML, we can't be sure what it is.
        #
        # If we get generic XML, it's probably RSS, but it still could
        # be XHTML.
        #
        # application/rss+xml links are definitely feeds. However, they
        # might be pre-enclosure RSS, so we still have to download them
        # and parse them before we can deal with them correctly.
        elif (apparentlyRSS or
              contentType.startswith(u'application/rss+xml') or
              contentType.startswith(u'application/podcast+xml') or
              contentType.startswith(u'text/xml') or
              contentType.startswith(u'application/xml') or
              (contentType.startswith(u'text/plain') and
               (unicodify(info['updated-url']).endswith(u'.xml') or
                unicodify(info['updated-url']).endswith(u'.rss')))):
            #print " It doesn't look like HTML..."
            html = info['body']
            if info.has_key('charset'):
                xmldata = fix_xml_header(html, info['charset'])
                html = fix_html_header(html, info['charset'])
                charset = unicodify(info['charset'])
            else:
                xmldata = html
                charset = None
            # FIXME html and xmldata can be non-unicode at this point
            parser = xml.sax.make_parser()
            parser.setFeature(xml.sax.handler.feature_namespaces, 1)
            parser.setFeature(xml.sax.handler.feature_external_ges, 0)
            handler = RSSLinkGrabber(unicodify(info['redirected-url']), charset)
            parser.setContentHandler(handler)
            parser.setErrorHandler(handler)
            try:
                parser.parse(StringIO(xmldata))
            except UnicodeDecodeError:
                logging.exception("Unicode issue parsing... %s",
                                  xmldata[0:300])
                self.finish_generate_feed(None)
                if removeOnError:
                    self.remove()
            except (SystemExit, KeyboardInterrupt):
                raise
            except:
                #it doesn't parse as RSS, so it must be HTML
                #print " Nevermind! it's HTML"
                self.ask_for_scrape(info, html, charset)
            else:
                #print " It's RSS with enclosures"
                self.finish_generate_feed(RSSFeedImpl(
                    unicodify(info['updated-url']),
                    initialHTML=xmldata, etag=etag, modified=modified,
                    ufeed=self, title=old_title))
        else:
            self._handle_feed_loading_error(_("Bad content-type"))

    def finish_generate_feed(self, feedImpl):
        self.confirm_db_thread()
        self.loading = False
        if feedImpl is not None:
            self._set_feed_impl(feedImpl)
            self.errorState = False
        else:
            self.errorState = True
        self.signal_change()

    def ask_for_scrape(self, info, initialHTML, charset):
        title = _("Channel is not compatible with %(appname)s",
                  {"appname": config.get(prefs.SHORT_APP_NAME)})
        description = _(
            "This channel is not compatible with %(appname)s "
            "but we'll try our best to grab the files.  It may take extra time "
            "to list the videos, and descriptions may look funny.\n"
            "\n"
            "Please contact the publishers of %(url)s and ask if they can supply a "
            "feed in a format that will work with %(appname)s.\n"
            "\n"
            "Do you want to try to load this channel anyway?",
            {"url": info["updated-url"],
             "appname": config.get(prefs.SHORT_APP_NAME)}
        )
        dialog = dialogs.ChoiceDialog(title, description, dialogs.BUTTON_YES,
                                      dialogs.BUTTON_NO)

        def callback(dialog):
            if not self.id_exists():
                return
            if dialog.choice == dialogs.BUTTON_YES:
                uinfo = unicodify(info)
                impl = ScraperFeedImpl(uinfo['updated-url'],
                                       initialHTML=initialHTML,
                                       etag=uinfo.get('etag'),
                                       modified=uinfo.get('modified'),
                                       charset=charset, ufeed=self)
                self.finish_generate_feed(impl)

        dialog.run(callback)

    def get_actual_feed(self):
        return self.actualFeed

    # Many attributes come from whatever FeedImpl subclass we're using.
    def attr_from_feed_impl(name):
        def getter(self):
            return getattr(self.actualFeed, name)
        return property(getter)

    for name in ('set_update_frequency', 'schedule_update_events',
                 'cancel_update_events',
                 'get_url', 'get_base_url',
                 'get_base_href', 'get_link',
                 'get_thumbnail_url', 'get_license', 'url', 'title', 'created',
                 'thumbURL', 'dir', 'preserve_downloads', 'lookup', 'reset',
                 ):
        locals()[name] = attr_from_feed_impl(name)
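
    # With the delegation above, calls such as feed.get_url() and
    # attribute lookups like feed.title are forwarded to the active
    # FeedImpl subclass at lookup time.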

    def get_expiration_type(self):
        """Returns "feed," "system," or "never"."""
        self.confirm_db_thread()
        return self.expire

    def get_max_new(self):
        """Returns "unlimited" or the maximum number of items this
        feed can download.
        """
        self.confirm_db_thread()
        if self.maxNew == -1:
            return u"unlimited"
        else:
            return self.maxNew

    def get_max_old_items(self):
        """Returns the number of items to remember past the current
        contents of the feed.  If self.maxOldItems is None, then this
        returns "system" indicating that the caller should look up the
        default in prefs.MAX_OLD_ITEMS_DEFAULT.
        """
        self.confirm_db_thread()
        if self.maxOldItems is None:
            return u"system"
        return self.maxOldItems

    def get_expiration_time(self):
        """Returns the total absolute expiration time in hours.

        WARNING: 'system' and 'never' expiration types return 0
        """
        self.confirm_db_thread()
        expireAfterSetting = config.get(prefs.EXPIRE_AFTER_X_DAYS)
        if ((self.expireTime is None or self.expire == 'never'
             or (self.expire == 'system' and expireAfterSetting <= 0))):
            return 0
        else:
            return (self.expireTime.days * 24 +
                    self.expireTime.seconds / 3600)
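
    # Worked example: expireTime == timedelta(days=2, hours=5) yields
    # 2 * 24 + 18000 / 3600 == 48 + 5 == 53 hours.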

    def is_autodownloadable(self):
        """Returns True iff the feed is autodownloadable."""
        self.confirm_db_thread()
        return self.autoDownloadable

    def remove(self, move_items_to=None):
        """
        If move_items_to is None (the default), the items in this feed
        will be removed too.  If move_items_to is given, the items in
        this feed will be moved to that feed.
        """
        self.confirm_db_thread()

        if isinstance(self.actualFeed, DirectoryWatchFeedImpl):
            move_items_to = None
        self.cancel_update_events()
        if self.download is not None:
            self.download.cancel()
            self.download = None
        to_remove = []
        for item in self.items:
            if move_items_to is not None and item.is_downloaded():
                item.set_feed(move_items_to.get_id())
            else:
                to_remove.append(item)
        app.bulk_sql_manager.start()
        try:
            for item in to_remove:
                item.remove()
        finally:
            app.bulk_sql_manager.finish()
        self.remove_icon_cache()
        DDBObject.remove(self)
        self.actualFeed.remove()

    def thumbnail_valid(self):
        return self.icon_cache and self.icon_cache.isValid()

    def calc_tablist_thumbnail(self):
        if self.thumbnail_valid():
            return fileutil.expand_filename(self.icon_cache.get_filename())
        else:
            return default_tablist_feed_icon_path()

    def get_thumbnail_path(self):
        self.confirm_db_thread()
        if self.thumbnail_valid():
            return fileutil.expand_filename(self.icon_cache.get_filename())
        else:
            return self.actualFeed.default_thumbnail_path()

    def has_downloaded_items(self):
        return self.num_downloaded() > 0

    def has_downloading_items(self):
        return self.num_downloading() > 0

    def __str__(self):
        return "Feed - %s" % stringify(self.get_title())

class ThrottledUpdateFeedImpl(FeedImpl):
    """Feed Impl that uses the feedupdate module to schedule its
    updates.  Only a limited number of ThrottledUpdateFeedImpl objects
    will be updating at any given time.
    """

    def schedule_update_events(self, firstTriggerDelay):
        feedupdate.cancel_update(self.ufeed)
        if firstTriggerDelay >= 0:
            feedupdate.schedule_update(firstTriggerDelay, self.ufeed,
                                       self.update)
        else:
            if self.updateFreq > 0:
                feedupdate.schedule_update(self.updateFreq, self.ufeed,
                                           self.update)

class RSSFeedImplBase(ThrottledUpdateFeedImpl):
    """
    Base class from which RSSFeedImpl and SavedSearchFeedImpl derive.
    """

    def setup_new(self, url, ufeed, title):
        FeedImpl.setup_new(self, url, ufeed, title)
        self.schedule_update_events(0)

    def _handle_new_entry(self, entry, fp_values, channel_title):
        """Handle getting a new entry from a feed."""
        enclosure = fp_values.first_video_enclosure
        if ((self.url.startswith('file://') and enclosure
             and enclosure['url'].startswith('file://'))):
            path = download_utils.get_file_url_path(enclosure['url'])
            item = models.FileItem(path, fp_values=fp_values,
                                   feed_id=self.ufeed.id,
                                   channel_title=channel_title)
        else:
            item = models.Item(fp_values, feed_id=self.ufeed.id,
                               eligibleForAutoDownload=not self.initialUpdate,
                               channel_title=channel_title)
        if not item.matches_search(self.ufeed.searchTerm):
            item.remove()

    def remember_old_items(self):
        self.old_items = set(self.items)

    def create_items_for_parsed(self, parsed):
        """Update the feed using parsed XML passed in"""
        app.bulk_sql_manager.start()
        try:
            self._create_items_for_parsed(parsed)
        finally:
            app.bulk_sql_manager.finish()

    def _create_items_for_parsed(self, parsed):
        # This is a HACK for Yahoo! search which doesn't provide
        # enclosures
        for entry in parsed['entries']:
            if 'enclosures' not in entry:
                try:
                    url = entry['link']
                except KeyError:
                    continue
                mimetype = filetypes.guess_mime_type(url)
                if mimetype is not None:
                    entry['enclosures'] = [{'url': to_uni(url),
                                            'type': to_uni(mimetype)}]
                elif flashscraper.is_maybe_flashscrapable(url):
                    entry['enclosures'] = [{'url': to_uni(url),
                                            'type': to_uni("video/flv")}]
                else:
                    logging.info('unknown url type %s, not generating '
                                 'enclosure', url)

        try:
            channelTitle = parsed["feed"]["title"]
        except KeyError:
            try:
                channelTitle = parsed["channel"]["title"]
            except KeyError:
                channelTitle = None

        if channelTitle is not None and self._allow_feed_to_override_title():
            self.title = channelTitle
        if (parsed.feed.has_key('image') and
                parsed.feed.image.has_key('url') and
                self._allow_feed_to_override_thumbnail()):
            self.thumbURL = parsed.feed.image.url
            self.ufeed.icon_cache.request_update(is_vital=True)

        items_byid = {}
        items_byURLTitle = {}
        items_nokey = []
        for item in self.items:
            try:
                items_byid[item.get_rss_id()] = item
            except KeyError:
                items_nokey.append(item)
            by_url_title_key = (item.url, item.entry_title)
            if by_url_title_key != (None, None):
                items_byURLTitle[by_url_title_key] = item
        for entry in parsed.entries:
            entry = self.add_scraped_thumbnail(entry)
            fp_values = FeedParserValues(entry)
            new = True
            if fp_values.data['rss_id'] is not None:
                id_ = fp_values.data['rss_id']
                if items_byid.has_key(id_):
                    item = items_byid[id_]
                    if not fp_values.compare_to_item(item):
                        item.update_from_feed_parser_values(fp_values)
                    new = False
                    self.old_items.discard(item)
            if new:
                by_url_title_key = (fp_values.data['url'],
                                    fp_values.data['entry_title'])
                if by_url_title_key != (None, None):
                    if items_byURLTitle.has_key(by_url_title_key):
                        item = items_byURLTitle[by_url_title_key]
                        if not fp_values.compare_to_item(item):
                            item.update_from_feed_parser_values(fp_values)
                        new = False
                        self.old_items.discard(item)
            if new:
                for item in items_nokey:
                    if fp_values.compare_to_item(item):
                        new = False
                    else:
                        try:
                            if fp_values.compare_to_item_enclosures(item):
                                item.update_from_feed_parser_values(fp_values)
                                new = False
                                self.old_items.discard(item)
                        except (SystemExit, KeyboardInterrupt):
                            raise
                        except:
                            pass
            if new and fp_values.first_video_enclosure is not None:
                self._handle_new_entry(entry, fp_values, channelTitle)

    def _allow_feed_to_override_title(self):
        """Should the RSS feed override the default title?

        Subclasses can override this method to change our behavior when
        parsing feed entries.
        """
        return True

    def _allow_feed_to_override_thumbnail(self):
        """Should the RSS thumbnail override the default thumbnail?

        Subclasses can override this method to change our behavior when
        parsing feed entries.
        """
        return True

    def update_finished(self):
        """
        Called by subclasses to finish the update.
        """
        if self.initialUpdate:
            self.initialUpdate = False
            for latest in models.Item.latest_in_feed_view(self.ufeed_id):
                latest.eligibleForAutoDownload = True
                latest.signal_change()
            if self.ufeed.is_autodownloadable():
                self.ufeed.mark_as_viewed()
                self.ufeed.signal_change()

        self.ufeed.recalc_counts()
        if hasattr(self, "old_items"):
            self.truncate_old_items()
            del self.old_items
        self.signal_change()

    def truncate_old_items(self):
        """Truncate items so that the number of items in this feed doesn't
        exceed self.get_max_old_items()

        Items are only truncated if they don't exist in the feed anymore, and
        if the user hasn't downloaded them.
        """
        limit = self.ufeed.get_max_old_items()
        if limit == u"system":
            limit = config.get(prefs.MAX_OLD_ITEMS_DEFAULT)

        item_count = self.items.count()
        if item_count > config.get(prefs.TRUNCATE_CHANNEL_AFTER_X_ITEMS):
            truncate = item_count - config.get(
                prefs.TRUNCATE_CHANNEL_AFTER_X_ITEMS)
            if truncate > len(self.old_items):
                truncate = 0
            limit = min(limit, truncate)
        extra = len(self.old_items) - limit
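
        # Worked example (hypothetical numbers, assuming at least 50 old
        # items): 250 items with TRUNCATE_CHANNEL_AFTER_X_ITEMS == 200
        # gives truncate == 50; with MAX_OLD_ITEMS_DEFAULT == 30, limit
        # becomes min(30, 50) == 30 and extra == len(self.old_items) - 30.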
        if extra <= 0:
            return

        candidates = []
        for item in self.old_items:
            if item.downloader is None:
                candidates.append((item.creationTime, item))
        candidates.sort()
        for time, item in candidates[:extra]:
            item.remove()

    def add_scraped_thumbnail(self, entry):
        # skip this if the entry already has a thumbnail.
        if entry.has_key('thumbnail'):
            return entry
        if entry.has_key('enclosures'):
            for enc in entry['enclosures']:
                if enc.has_key('thumbnail'):
                    return entry

class RSSFeedImpl(RSSFeedImplBase):

    def setup_new(self, url, ufeed, title=None, initialHTML=None, etag=None,
                  modified=None):
        RSSFeedImplBase.setup_new(self, url, ufeed, title)
        self.initialHTML = initialHTML
        self.etag = etag
        self.modified = modified
        self.download = None

    def get_base_href(self):
        try:
            return escape(self.parsed.link)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            return FeedImpl.get_base_href(self)

    def get_link(self):
        """Returns a link to a webpage associated with the feed"""
        self.ufeed.confirm_db_thread()
        try:
            return self.parsed.link
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            return u""

    def feedparser_finished(self):
        self.updating = False
        self.schedule_update_events(-1)
        self.update_finished()

    def feedparser_errback(self, e):
        if not self.ufeed.id_exists():
            return
        logging.info("Error updating feed: %s: %s", self.url, e)
        self.feedparser_finished()

    def feedparser_callback(self, parsed):
        self.ufeed.confirm_db_thread()
        if not self.ufeed.id_exists():
            return
        if len(parsed.entries) == len(parsed.feed) == 0:
            logging.warn("Empty feed, not updating: %s", self.url)
            self.feedparser_finished()
            return
        start = clock()
        parsed = self.parsed = unicodify(parsed)
        self.remember_old_items()
        self.create_items_for_parsed(parsed)

        try:
            updateFreq = self.parsed["feed"]["ttl"]
        except (AttributeError, KeyError):
            updateFreq = 0
        self.set_update_frequency(updateFreq)

        self.feedparser_finished()
        end = clock()
        if end - start > 1.0:
            logging.timing("feed update for: %s too slow (%.3f secs)",
                           self.url, end - start)

    def call_feedparser(self, html):
        self.ufeed.confirm_db_thread()
        eventloop.call_in_thread(self.feedparser_callback,
                                 self.feedparser_errback,
                                 feedparser.parse,
                                 "Feedparser callback - %s" % self.url, html)

    def update(self):
        self.ufeed.confirm_db_thread()
        if not self.ufeed.id_exists():
            return
        if self.updating:
            return
        self.updating = True
        self.ufeed.signal_change(needs_save=False)
        if hasattr(self, 'initialHTML') and self.initialHTML is not None:
            html = self.initialHTML
            self.initialHTML = None
            self.call_feedparser(html)
        else:
            try:
                etag = self.etag
            except AttributeError:
                etag = None
            try:
                modified = self.modified
            except AttributeError:
                modified = None
            logging.info("updating %s", self.url)
            self.download = grab_url(self.url, self._update_callback,
                                     self._update_errback, etag=etag,
                                     modified=modified,
                                     default_mime_type=u'application/rss+xml')

    def _update_errback(self, error):
        if not self.ufeed.id_exists():
            return
        logging.warn("WARNING: error in Feed.update for %s -- %s",
                     self.ufeed, stringify(error))
        self.schedule_update_events(-1)
        self.updating = False
        self.ufeed.signal_change(needs_save=False)

    def _update_callback(self, info):
        if not self.ufeed.id_exists():
            return
        if info.get('status') == 304:
            self.schedule_update_events(-1)
            self.updating = False
            self.ufeed.signal_change()
            return
        html = info['body']
        if info.has_key('charset'):
            html = fix_xml_header(html, info['charset'])

        # FIXME HTML can be non-unicode here --NN
        self.url = unicodify(info['updated-url'])
        if info.has_key('etag'):
            self.etag = unicodify(info['etag'])
        else:
            self.etag = None
        if info.has_key('last-modified'):
            self.modified = unicodify(info['last-modified'])
        else:
            self.modified = None
        self.call_feedparser(html)

    def get_license(self):
        """Returns the URL of the license associated with the feed"""
        try:
            return self.parsed["feed"]["license"]
        except (AttributeError, KeyError):
            pass
        return u""

    def on_remove(self):
        if self.download is not None:
            self.download.cancel()
            self.download = None

    def setup_restored(self):
        """Called by pickle during deserialization"""
        FeedImpl.setup_restored(self)
        self.download = None

    def clean_old_items(self):
        self.modified = None
        self.etag = None
        self.update()

class RSSMultiFeedBase(RSSFeedImplBase):
    def setup_new(self, url, ufeed, title):
        RSSFeedImplBase.setup_new(self, url, ufeed, title)
        self.etag = {}
        self.modified = {}
        self.download_dc = {}
        self.updating = 0
        self.urls = self.calc_urls()

    def setup_restored(self):
        """Called by pickle during deserialization"""
        RSSFeedImplBase.setup_restored(self)
        self.download_dc = {}
        self.updating = 0
        self.urls = self.calc_urls()

    def calc_urls(self):
        """Calculate the list of URLs to parse.

        Subclasses must define this method.
        """
        raise NotImplementedError()

    def check_update_finished(self):
        if self.updating == 0:
            self.update_finished()
            self.schedule_update_events(-1)

    def _allow_feed_to_override_title(self):
        return False

    def feedparser_finished(self, url, needs_save=False):
        if not self.ufeed.id_exists():
            return
        self.updating -= 1
        self.check_update_finished()
        del self.download_dc[url]

    def feedparser_errback(self, e, url):
        if not self.ufeed.id_exists() or url not in self.download_dc:
            return
        if e:
            logging.info("Error updating feed: %s (%s): %s", self.url, url, e)
        else:
            logging.info("Error updating feed: %s (%s)", self.url, url)
        self.feedparser_finished(url, True)
1563
def feedparser_callback(self, parsed, url):
1564
self.ufeed.confirm_db_thread()
1565
if not self.ufeed.id_exists() or url not in self.download_dc:
1568
parsed = unicodify(parsed)
1569
self.create_items_for_parsed(parsed)
1570
self.feedparser_finished(url)
1572
if end - start > 1.0:
1573
logging.timing("feed update for: %s too slow (%.3f secs)",
1574
self.url, end - start)
1576
def call_feedparser(self, html, url):
1577
self.ufeed.confirm_db_thread()
1581
parsed = feedparser.parse(html)
1582
self.feedparser_callback(parsed, url)
1583
except (SystemExit, KeyboardInterrupt):
1586
self.feedparser_errback(self, None, url)
1589
eventloop.call_in_thread(
1590
lambda parsed, url=url: self.feedparser_callback(parsed, url),
1591
lambda e, url=url: self.feedparser_errback(e, url),
1592
feedparser.parse, "Feedparser callback - %s" % url, html)

    def update(self):
        self.ufeed.confirm_db_thread()
        if not self.ufeed.id_exists():
            return
        if self.updating:
            return
        self.remember_old_items()
        for url in self.urls:
            etag = self.etag.get(url)
            modified = self.modified.get(url)
            self.download_dc[url] = grab_url(
                url,
                lambda x, url=url: self._update_callback(x, url),
                lambda x, url=url: self._update_errback(x, url),
                etag=etag, modified=modified,
                default_mime_type=u'application/rss+xml',)
            self.updating += 1

    def _update_errback(self, error, url):
        if not self.ufeed.id_exists():
            return
        logging.warn("WARNING: error in Feed.update for %s (%s) -- %s",
                     self.ufeed, stringify(url), stringify(error))
        self.schedule_update_events(-1)
        self.updating -= 1
        self.check_update_finished()
        self.ufeed.signal_change(needs_save=False)

    def _update_callback(self, info, url):
        if not self.ufeed.id_exists():
            return
        if info.get('status') == 304:
            self.schedule_update_events(-1)
            self.updating -= 1
            self.check_update_finished()
            self.ufeed.signal_change()
            return
        html = info['body']
        if info.has_key('charset'):
            html = fix_xml_header(html, info['charset'])

        # FIXME HTML can be non-unicode here --NN
        if info.get('updated-url') and url in self.urls:
            index = self.urls.index(url)
            self.urls[index] = unicodify(info['updated-url'])

        if info.has_key('etag'):
            self.etag[url] = unicodify(info['etag'])
        else:
            self.etag[url] = None
        if info.has_key('last-modified'):
            self.modified[url] = unicodify(info['last-modified'])
        else:
            self.modified[url] = None
        self.call_feedparser(html, url)

    def on_remove(self):
        self._cancel_all_downloads()

    def _cancel_all_downloads(self):
        for dc in self.download_dc.values():
            dc.cancel()
        self.download_dc = {}
        self.updating = 0

    def clean_old_items(self):
        self.modified = {}
        self.etag = {}
        self.update()

class SavedSearchFeedImpl(RSSMultiFeedBase):
    def setup_new(self, url, ufeed):
        self.parse_url(url)
        info = searchengines.get_engine_for_name(self.engine)
        title = to_uni(_("%(engine)s for '%(query)s'",
                         {'engine': info.title, 'query': self.query}))
        RSSMultiFeedBase.setup_new(self, url, ufeed, title)

    def default_thumbnail_path(self):
        info = searchengines.get_engine_for_name(self.engine)
        return searchengines.icon_path_for_engine(info)

    def setup_restored(self):
        self.parse_url(self.url)
        RSSMultiFeedBase.setup_restored(self)

    def _allow_feed_to_override_thumbnail(self):
        return False

    def parse_url(self, url):
        m = SEARCH_URL_MATCH_RE.match(url)
        self.engine = m.group(1)
        self.query = m.group(2)

    def calc_urls(self):
        return searchengines.get_request_urls(self.engine, self.query)

class ScraperFeedImpl(ThrottledUpdateFeedImpl):
    """A feed based on an unformatted HTML or pre-enclosure RSS page"""

    def setup_new(self, url, ufeed, title=None, initialHTML=None, etag=None,
                  modified=None, charset=None):
        FeedImpl.setup_new(self, url, ufeed, title)
        self.initialHTML = initialHTML
        self.initialCharset = charset
        self.linkHistory = {}
        self.linkHistory[url] = {}
        self.tempHistory = {}
        if etag is not None:
            self.linkHistory[url]['etag'] = unicodify(etag)
        if modified is not None:
            self.linkHistory[url]['modified'] = unicodify(modified)
        self.downloads = set()

        self.set_update_frequency(360)
        self.schedule_update_events(0)

    def save_cache_history(self):
        """This puts all of the caching information in tempHistory into the
        linkHistory.  This should be called at the end of an update so that
        the next time we update we don't unnecessarily follow old links.
        """
        self.ufeed.confirm_db_thread()
        for url in self.tempHistory.keys():
            self.linkHistory[url] = self.tempHistory[url]
        self.tempHistory = {}

    def get_html(self, urlList, depth=0, linkNumber=0, top=False):
        """Grabs HTML at the given URL, then processes it"""
        url = urlList.pop(0)
        #print "Grabbing %s" % url
        etag = None
        modified = None
        if self.linkHistory.has_key(url):
            etag = self.linkHistory[url].get('etag', None)
            modified = self.linkHistory[url].get('modified', None)
        def callback(info):
            if not self.ufeed.id_exists():
                return
            self.downloads.discard(download)
            try:
                self.process_downloaded_html(info, urlList, depth, linkNumber,
                                             top)
            finally:
                self.check_done()
        def errback(error):
            if not self.ufeed.id_exists():
                return
            self.downloads.discard(download)
            logging.info("WARNING unhandled error for ScraperFeedImpl.get_html: %s",
                         error)
            self.check_done()
        download = grab_url(url, callback, errback, etag=etag,
                            modified=modified, default_mime_type='text/html')
        self.downloads.add(download)

    def process_downloaded_html(self, info, urlList, depth, linkNumber,
                                top=False):
        self.ufeed.confirm_db_thread()
        #print "Done grabbing %s" % info['updated-url']

        if not self.tempHistory.has_key(info['updated-url']):
            self.tempHistory[info['updated-url']] = {}
        if info.has_key('etag'):
            self.tempHistory[info['updated-url']]['etag'] = unicodify(info['etag'])
        if info.has_key('last-modified'):
            self.tempHistory[info['updated-url']]['modified'] = unicodify(info['last-modified'])

        if info['status'] != 304 and info.has_key('body'):
            if info.has_key('charset'):
                subLinks = self.scrape_links(info['body'], info['redirected-url'], charset=info['charset'], setTitle=top)
            else:
                subLinks = self.scrape_links(info['body'], info['redirected-url'], setTitle=top)
            if top:
                self.process_links(subLinks, 0, linkNumber)
            else:
                self.process_links(subLinks, depth + 1, linkNumber)
        if len(urlList) > 0:
            self.get_html(urlList, depth, linkNumber)

    def check_done(self):
        if len(self.downloads) == 0:
            self.save_cache_history()
            self.updating = False
            self.ufeed.signal_change()
            self.schedule_update_events(-1)

    def add_video_item(self, link, dict_, linkNumber):
        link = unicodify(link.strip())
        if dict_.has_key('title'):
            title = dict_['title']
        else:
            title = link
        for item in self.items:
            if item.get_url() == link:
                return
        # Anywhere we call this, we need to convert the input back to unicode
        title = feedparser.sanitizeHTML(title, "utf-8").decode('utf-8')
        if dict_.has_key('thumbnail'):
            fp_dict = FeedParserDict({'title': title,
                'enclosures': [FeedParserDict({'url': link,
                    'thumbnail': FeedParserDict({'url': dict_['thumbnail']})
                })]
            })
        else:
            fp_dict = FeedParserDict({'title': title,
                'enclosures': [FeedParserDict({'url': link})]
            })
        i = models.Item(FeedParserValues(fp_dict),
                        linkNumber=linkNumber, feed_id=self.ufeed.id,
                        eligibleForAutoDownload=False)
        if ((self.ufeed.searchTerm is not None
             and not i.matches_search(self.ufeed.searchTerm))):
            i.remove()
            return

    def process_links(self, links, depth=0, linkNumber=0):
        # FIXME: compound names for titles at each depth??
        maxDepth = 2
        urls = links[0]
        links = links[1]
        # List of URLs that should be downloaded
        newURLs = []

        if depth < maxDepth:
            for link in urls:
                linkNumber += 1
                #print "Processing %s (%d)" % (link, linkNumber)

                # FIXME: Using file extensions totally breaks the
                # standard and won't work with Broadcast Machine or
                # Blog Torrent. However, it's also a hell of a lot
                # faster than checking the mime type for every single
                # file, so for now, we're being bad boys. Uncomment
                # the elif to make this use mime types for HTTP GET URLs

                mimetype = filetypes.guess_mime_type(link)
                if mimetype is None:
                    mimetype = 'text/html'

                #This is text of some sort: HTML, XML, etc.
                if ((mimetype.startswith('text/html') or
                     mimetype.startswith('application/xhtml+xml') or
                     mimetype.startswith('text/xml') or
                     mimetype.startswith('application/xml') or
                     mimetype.startswith('application/rss+xml') or
                     mimetype.startswith('application/podcast+xml') or
                     mimetype.startswith('application/atom+xml') or
                     mimetype.startswith('application/rdf+xml')) and
                        depth < maxDepth - 1):
                    newURLs.append(link)

                #This is a video
                elif (mimetype.startswith('video/') or
                      mimetype.startswith('audio/') or
                      mimetype == "application/ogg" or
                      mimetype == "application/x-annodex" or
                      mimetype == "application/x-bittorrent"):
                    self.add_video_item(link, links[link], linkNumber)
        if len(newURLs) > 0:
            self.get_html(newURLs, depth, linkNumber)

    def on_remove(self):
        for download in self.downloads:
            download.cancel()
        self.downloads = set()

    def update(self):
        # FIXME: go through and add error handling
        self.ufeed.confirm_db_thread()
        if not self.ufeed.id_exists():
            return
        if self.updating:
            return
        self.updating = True
        self.ufeed.signal_change(needs_save=False)

        if self.initialHTML is not None:
            html = self.initialHTML
            self.initialHTML = None
            redirURL = self.url
            charset = self.initialCharset
            self.initialCharset = None
            subLinks = self.scrape_links(html, redirURL, charset=charset,
                                         setTitle=True)
            self.process_links(subLinks, 0, 0)
            self.check_done()
        else:
            self.get_html([self.url], top=True)

    def scrape_links(self, html, baseurl, setTitle=False, charset=None):
        try:
            if charset is not None:
                html = fix_html_header(html, charset)
            xmldata = html
            parser = xml.sax.make_parser()
            parser.setFeature(xml.sax.handler.feature_namespaces, 1)
            try:
                parser.setFeature(xml.sax.handler.feature_external_ges, 0)
            except (SystemExit, KeyboardInterrupt):
                raise
            except:
                pass
            if charset is not None:
                handler = RSSLinkGrabber(baseurl, charset)
            else:
                handler = RSSLinkGrabber(baseurl)
            parser.setContentHandler(handler)
            try:
                parser.parse(StringIO(xmldata))
            except AttributeError:
                # bug in the python standard library causes this to be raised
                # sometimes. See #3201.
                pass
            links = handler.links
            linkDict = {}
            for link in links:
                if ((link[0].startswith('http://')
                     or link[0].startswith('https://'))):
                    if not linkDict.has_key(to_uni(link[0], charset)):
                        linkDict[to_uni(link[0], charset)] = {}
                    if link[1] is not None:
                        linkDict[to_uni(link[0], charset)]['title'] = to_uni(link[1], charset).strip()
                    if link[2] is not None:
                        linkDict[to_uni(link[0], charset)]['thumbnail'] = to_uni(link[2], charset)
            if setTitle and handler.title is not None:
                self.ufeed.confirm_db_thread()
                self.title = to_uni(handler.title, charset)
                self.ufeed.signal_change()
            return ([x[0] for x in links if x[0].startswith('http://') or x[0].startswith('https://')], linkDict)
        except (xml.sax.SAXException, ValueError, IOError, xml.sax.SAXNotRecognizedException):
            (links, linkDict) = self.scrape_html_links(html, baseurl,
                                                       setTitle=setTitle,
                                                       charset=charset)
            return (links, linkDict)

    def scrape_html_links(self, html, baseurl, setTitle=False, charset=None):
        """Given a string containing an HTML file, return a dictionary of
        links to titles and thumbnails
        """
        lg = HTMLLinkGrabber()
        links = lg.get_links(html, baseurl)
        if setTitle and lg.title is not None:
            self.ufeed.confirm_db_thread()
            self.title = to_uni(lg.title, charset)
            self.ufeed.signal_change()

        linkDict = {}
        for link in links:
            if link[0].startswith('http://') or link[0].startswith('https://'):
                if not linkDict.has_key(to_uni(link[0], charset)):
                    linkDict[to_uni(link[0], charset)] = {}
                if link[1] is not None:
                    linkDict[to_uni(link[0], charset)]['title'] = to_uni(link[1], charset).strip()
                if link[2] is not None:
                    linkDict[to_uni(link[0], charset)]['thumbnail'] = to_uni(link[2], charset)
        return ([x[0] for x in links
                 if x[0].startswith('http://') or x[0].startswith('https://')],
                linkDict)

    def setup_restored(self):
        """Called by pickle during deserialization"""
        FeedImpl.setup_restored(self)
        self.downloads = set()
        self.tempHistory = {}

class DirectoryScannerImplBase(FeedImpl):
    """Base class for FeedImpls that scan directories for items."""

    def expire_items(self):
        """Directory items shouldn't automatically expire"""

    def set_update_frequency(self, frequency):
        newFreq = frequency * 60
        if newFreq != self.updateFreq:
            self.updateFreq = newFreq
            self.schedule_update_events(-1)

    # the following methods must be implemented by subclasses
    def _scan_dir(self):
        raise NotImplementedError()

    # the following methods may be implemented by subclasses if they need to
    def _before_update(self):
        pass

    def _after_update(self):
        pass

    def _add_known_files(self, known_files):
        pass

    def _make_child(self, file_):
        models.FileItem(file_, feed_id=self.ufeed.id)

    def update(self):
        self.ufeed.confirm_db_thread()

        self._before_update()

        # Calculate files known about by feeds other than the directory feed
        # Using a select statement is good here because we don't want to
        # construct all the Item objects if we don't need to.
        known_files = set(os.path.normcase(row[0]) for row in
                          models.Item.select(['filename'],
                                             'filename IS NOT NULL AND '
                                             '(feed_id is NULL or feed_id != ?)',
                                             (self.ufeed_id,)))
        self._add_known_files(known_files)

        # Remove items with deleted files or that are in feeds
        to_remove = []
        for item in self.items:
            filename = item.get_filename()
            if (filename is None or
                    not fileutil.isfile(filename) or
                    os.path.normcase(filename) in known_files):
                to_remove.append(item)
        app.bulk_sql_manager.start()
        try:
            for item in to_remove:
                item.remove()
        finally:
            app.bulk_sql_manager.finish()

        # now that we've checked for items that need to be removed, we
        # add our items to known_files so that they don't get added
        # multiple times to this feed.
        for x in self.items:
            known_files.add(os.path.normcase(x.get_filename()))

        # add any files on the filesystem that we don't know about
        to_add = []
        scan_dir = self._scan_dir()
        if fileutil.isdir(scan_dir):
            all_files = fileutil.miro_allfiles(scan_dir)
            for file_ in all_files:
                file_ = os.path.normcase(file_)
                ufile = filename_to_unicode(file_)
                if (file_ not in known_files and
                        filetypes.is_media_filename(ufile)):
                    to_add.append(file_)

        app.bulk_sql_manager.start()
        try:
            for file_ in to_add:
                self._make_child(file_)
        finally:
            app.bulk_sql_manager.finish()

        self._after_update()
        self.schedule_update_events(-1)

class DirectoryWatchFeedImpl(DirectoryScannerImplBase):
    def setup_new(self, ufeed, directory):
        # calculate url and title arguments to FeedImpl's constructor
        if directory is not None:
            url = u"dtv:directoryfeed:%s" % make_url_safe(directory)
        else:
            url = u"dtv:directoryfeed"
        title = directory
        if title[-1] == '/':
            title = title[:-1]
        title = filename_to_unicode(os.path.basename(title)) + "/"

        FeedImpl.setup_new(self, url=url, ufeed=ufeed, title=title)
        self.dir = directory
        self.firstUpdate = True
        self.set_update_frequency(5)
        self.schedule_update_events(0)

    def _scan_dir(self):
        return self.dir

    def _make_child(self, file_):
        models.FileItem(file_, feed_id=self.ufeed.id,
                        mark_seen=self.firstUpdate)

    def _after_update(self):
        if self.firstUpdate:
            self.firstUpdate = False
            self.signal_change()

class DirectoryFeedImpl(DirectoryScannerImplBase):
    """A feed of all of the Movies we find in the movie folder that don't
    belong to a "real" feed.  If the user changes her movies folder, this feed
    will continue to remember movies in the old folder.
    """
    def setup_new(self, ufeed):
        FeedImpl.setup_new(self, url=u"dtv:directoryfeed", ufeed=ufeed, title=None)
        self.set_update_frequency(5)
        self.schedule_update_events(0)

    def _before_update(self):
        # Make sure container items have created FileItems for their contents
        for container in models.Item.containers_view():
            container.find_new_children()

    def _calc_known_files(self):
        pass

    def _add_known_files(self, known_files):
        # prevents files in Incomplete Downloads and Conversions from being
        # turned into FileItems.
        movies_dir = config.get(prefs.MOVIES_DIRECTORY)

        incomplete_dir = os.path.join(movies_dir, "Incomplete Downloads")
        known_files.add(os.path.normcase(incomplete_dir))

        conversions_dir = os.path.join(movies_dir, "Conversions")
        known_files.add(os.path.normcase(conversions_dir))

    def _scan_dir(self):
        return config.get(prefs.MOVIES_DIRECTORY)

    def get_title(self):
        return _(u'Local Files')

class SearchFeedImpl(RSSMultiFeedBase):
    """Search and Search Results feeds
    """
    def setup_new(self, ufeed):
        self.engine = searchengines.get_search_engines()[0].name
        self.query = u''
        RSSMultiFeedBase.setup_new(self, url=u'dtv:search', ufeed=ufeed,
                                   title=None)
        self.initialUpdate = True
        self.searching = False
        self.set_update_frequency(-1)
        self.ufeed.autoDownloadable = False
        # keeps the items from being seen as 'newly available'
        self.ufeed.last_viewed = datetime.max
        self.ufeed.signal_change()

    def setup_restored(self):
        self.searching = False
        RSSMultiFeedBase.setup_restored(self)

    def calc_urls(self):
        if self.engine and self.query:
            return searchengines.get_request_urls(self.engine, self.query)
        else:
            return []
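
    # Illustrative note: with a hypothetical engine u'youtube' and query
    # u'miro', calc_urls() returns whatever searchengines.get_request_urls()
    # produces for that engine -- search-results feed URLs that update()
    # then fetches like any other multi-feed.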

    def reset(self, set_engine=None):
        self.ufeed.confirm_db_thread()
        was_searching = self.searching
        self._cancel_all_downloads()
        self.initialUpdate = True
        app.bulk_sql_manager.start()
        try:
            for item in self.items:
                item.remove()
        finally:
            app.bulk_sql_manager.finish()
        self.urls = []
        self.searching = False
        if set_engine is not None:
            self.engine = set_engine
        self.query = u''
        self.ufeed.icon_cache.reset()
        self.thumbURL = None
        self.ufeed.icon_cache.request_update(is_vital=True)
        if was_searching:
            self.ufeed.emit('update-finished')

    def preserve_downloads(self, downloads_feed):
        self.ufeed.confirm_db_thread()
        for item in self.items:
            if item.get_state() not in ('new', 'not-downloaded'):
                item.set_feed(downloads_feed.id)

    def set_engine(self, engine):
        self.engine = engine

    def lookup(self, engine, query):
        check_u(query)
        self.reset()
        self.searching = True
        self.engine = engine
        self.query = query
        self.urls = self.calc_urls()
        self.update()
        self.ufeed.signal_change()
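
    # Search flow, in terms of the methods above: lookup() validates and
    # stores the engine/query pair, recomputes self.urls via calc_urls(),
    # and kicks off an update; update_finished() (below) clears
    # self.searching once the result feeds have come back.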

    def _handle_new_entry(self, entry, fp_values, channelTitle):
        """Handle getting a new entry from a feed."""
        url = fp_values.data['url']
        if url is not None:
            dl = downloader.get_existing_downloader_by_url(url)
            if dl is not None:
                for item in dl.item_list:
                    if ((item.get_feed_url() == 'dtv:searchDownloads'
                         and item.get_url() == url)):
                        try:
                            if entry["id"] == item.get_rss_id():
                                item.set_feed(self.ufeed.id)
                                if not fp_values.compare_to_item(item):
                                    item.update_from_feed_parser_values(fp_values)
                                return
                        except KeyError:
                            pass
                        title = entry.get("title")
                        oldtitle = item.entry_title
                        if title == oldtitle:
                            item.set_feed(self.ufeed.id)
                            if not fp_values.compare_to_item(item):
                                item.update_from_feed_parser_values(fp_values)
                            return
        RSSMultiFeedBase._handle_new_entry(self, entry, fp_values,
                                           channelTitle)

    def update_finished(self):
        self.searching = False
        RSSMultiFeedBase.update_finished(self)

    def update(self):
        if self.urls:
            RSSMultiFeedBase.update(self)
        else:
            self.ufeed.emit('update-finished')

    @returns_unicode
    def get_title(self):
        return _(u'Search')

class SearchDownloadsFeedImpl(FeedImpl):
    def setup_new(self, ufeed):
        FeedImpl.setup_new(self, url=u'dtv:searchDownloads', ufeed=ufeed,
                           title=None)
        self.set_update_frequency(-1)

    @returns_unicode
    def get_title(self):
        return _(u'Search')

class ManualFeedImpl(FeedImpl):
    """Downloaded Videos/Torrents that have been added by the user
    opening them with democracy.
    """
    def setup_new(self, ufeed):
        FeedImpl.setup_new(self, url=u'dtv:manualFeed', ufeed=ufeed,
                           title=None)
        self.ufeed.expire = u'never'
        self.set_update_frequency(-1)
        self.ufeed.last_viewed = datetime.max

    @returns_unicode
    def get_title(self):
        return _(u'Local Files')

class SingleFeedImpl(FeedImpl):
    """Single Video that is playing that has been added by the user
    opening it with democracy.
    """
    def setup_new(self, ufeed):
        FeedImpl.setup_new(self, url=u'dtv:singleFeed', ufeed=ufeed,
                           title=None)
        self.ufeed.expire = u'never'
        self.set_update_frequency(-1)

    @returns_unicode
    def get_title(self):
        return _(u'Playing File')

LINK_PATTERN = re.compile("<(a|embed)\s[^>]*(href|src)\s*=\s*\"([^\"]*)\"[^>]*>(.*?)</a(.*)", re.S)
IMG_PATTERN = re.compile(".*<img\s.*?src\s*=\s*\"(.*?)\".*?>", re.S)
TAG_PATTERN = re.compile("<.*?>")
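
# Illustrative example of the capture groups: matched against
# '<a href="clip.mp4">My clip</a>...', LINK_PATTERN yields the tag name in
# group 1, the attribute name in group 2, the URL ('clip.mp4') in group 3,
# the link text ('My clip') in group 4, and the rest of the document in
# group 5, which get_links() below feeds back into another search.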

class HTMLLinkGrabber(HTMLParser):
    """Parse HTML document and grab all of the links and titles.
    """
    # FIXME: Grab link title from ALT tags in images
    # FIXME: Grab document title from TITLE tags
    def get_links(self, data, baseurl):
        self.links = []
        self.lastLink = None
        self.inLink = False
        self.inObject = False
        self.baseurl = baseurl
        self.inTitle = False
        self.title = None
        self.thumbnailUrl = None

        match = LINK_PATTERN.search(data)
        while match:
            try:
                link_url = match.group(3).encode('ascii')
            except UnicodeError:
                link_url = match.group(3)
                i = len(link_url) - 1
                while i >= 0:
                    if 127 < ord(link_url[i]) <= 255:
                        link_url = (link_url[:i] +
                                    "%%%02x" % (ord(link_url[i])) +
                                    link_url[i+1:])
                    i = i - 1
            link = urljoin(baseurl, link_url)
            desc = match.group(4)
            img_match = IMG_PATTERN.match(desc)
            if img_match:
                try:
                    thumb = urljoin(baseurl,
                                    img_match.group(1).encode('ascii'))
                except UnicodeError:
                    thumb = None
            else:
                thumb = None
            desc = TAG_PATTERN.sub(' ', desc)
            self.links.append((link, desc, thumb))
            match = LINK_PATTERN.search(match.group(5))
        return self.links
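
# Minimal usage sketch (hypothetical input, not part of this module):
#
#     grabber = HTMLLinkGrabber()
#     links = grabber.get_links('<a href="video.mp4">clip</a>',
#                               'http://example.com/')
#     # -> [('http://example.com/video.mp4', 'clip', None)]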

class RSSLinkGrabber(xml.sax.handler.ContentHandler,
                     xml.sax.handler.ErrorHandler):
    def __init__(self, baseurl, charset=None):
        self.baseurl = baseurl
        self.charset = charset

    def startDocument(self):
        #print "Got start document"
        self.enclosureCount = 0
        self.itemCount = 0
        self.links = []
        self.inLink = False
        self.inDescription = False
        self.inTitle = False
        self.inItem = False
        self.descHTML = ''
        self.theLink = ''
        self.title = None
        self.firstTag = True
        self.errors = 0
        self.fatal_errors = 0

    def startElementNS(self, name, qname, attrs):
        uri = name[0]
        tag = name[1]
        if self.firstTag:
            self.firstTag = False
            if tag not in ['rss', 'feed']:
                raise xml.sax.SAXNotRecognizedException, "Not an RSS file"
        if tag.lower() == 'enclosure' or tag.lower() == 'content':
            self.enclosureCount += 1
        elif tag.lower() == 'link':
            self.inLink = True
            self.theLink = ''
        elif tag.lower() == 'description':
            self.inDescription = True
            self.descHTML = ''
        elif tag.lower() == 'item':
            self.itemCount += 1
            self.inItem = True
        elif tag.lower() == 'title' and not self.inItem:
            self.inTitle = True

    def endElementNS(self, name, qname):
        uri = name[0]
        tag = name[1]
        if tag.lower() == 'description':
            lg = HTMLLinkGrabber()
            try:
                html = xhtmlify(unescape(self.descHTML), add_top_tags=True)
                if self.charset is not None:
                    html = fix_html_header(html, self.charset)
                self.links[:0] = lg.get_links(html, self.baseurl)
            except HTMLParseError: # Don't bother with bad HTML
                logging.info("bad HTML in description for %s", self.baseurl)
            self.inDescription = False
        elif tag.lower() == 'link':
            self.links.append((self.theLink, None, None))
            self.inLink = False
        elif tag.lower() == 'item':
            self.inItem = False
        elif tag.lower() == 'title' and not self.inItem:
            self.inTitle = False

    def characters(self, data):
        if self.inDescription:
            self.descHTML += data
        elif self.inLink:
            self.theLink += data
        elif self.inTitle:
            if self.title is None:
                self.title = data
            else:
                self.title += data

    def error(self, exception):
        self.errors += 1

    def fatalError(self, exception):
        self.fatal_errors += 1
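
# Sketch of how a SAX parser drives this handler (standard xml.sax API; the
# actual wiring lives elsewhere in this module):
#
#     parser = xml.sax.make_parser()
#     parser.setFeature(xml.sax.handler.feature_namespaces, 1)
#     handler = RSSLinkGrabber(u'http://example.com/feed')
#     parser.setContentHandler(handler)
#     parser.setErrorHandler(handler)
#     parser.parse(StringIO(xml_data))
#     # then inspect handler.links, handler.title, handler.enclosureCount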

class HTMLFeedURLParser(HTMLParser):
    """Grabs the feed link from the given webpage
    """
    def get_link(self, baseurl, data):
        self.baseurl = baseurl
        self.link = None
        try:
            self.feed(data)
        except HTMLParseError:
            logging.info("error parsing %s", baseurl)
        try:
            self.close()
        except HTMLParseError:
            logging.info("error closing %s", baseurl)
        return self.link

    def handle_starttag(self, tag, attrs):
        attrdict = {}
        for (key, value) in attrs:
            attrdict[key.lower()] = value
        if (tag.lower() == 'link' and attrdict.has_key('rel') and
                attrdict.has_key('type') and attrdict.has_key('href') and
                attrdict['rel'].lower() == 'alternate' and
                attrdict['type'].lower() in ['application/rss+xml',
                                             'application/podcast+xml',
                                             'application/rdf+xml',
                                             'application/atom+xml',
                                             'text/xml',
                                             'application/xml']):
            self.link = urljoin(self.baseurl, attrdict['href'])
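
# Usage sketch (hypothetical page content):
#
#     parser = HTMLFeedURLParser()
#     link = parser.get_link('http://example.com/',
#         '<link rel="alternate" type="application/rss+xml" href="/feed">')
#     # -> 'http://example.com/feed'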

def expire_items():
    try:
        for feed in Feed.make_view():
            feed.expire_items()
    finally:
        eventloop.add_timeout(300, expire_items, "Expire Items")

def lookup_feed(url, search_term=None):
    try:
        return Feed.get_by_url_and_search(url, search_term)
    except ObjectNotFoundError:
        return None
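
# e.g. lookup_feed(u'http://example.com/feed.rss') returns the existing Feed
# for that URL, or None if no such feed is in the database.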

def remove_orphaned_feed_impls():
    removed_impls = []
    for klass in (FeedImpl, RSSFeedImpl, SavedSearchFeedImpl,
                  ScraperFeedImpl, SearchFeedImpl, DirectoryFeedImpl,
                  DirectoryWatchFeedImpl, SearchDownloadsFeedImpl,):
        for feed_impl in klass.orphaned_view():
            logging.warn("No feed for FeedImpl: %s.  Discarding", feed_impl)
            feed_impl.remove()
            removed_impls.append(feed_impl.url)
    if removed_impls:
        databaselog.info("Removed FeedImpl objects without a feed: %s",
                         ','.join(removed_impls))

def start_updates():
    global restored_feeds
    if config.get(prefs.CHECK_CHANNELS_EVERY_X_MN) == -1:
        return
    for feed in restored_feeds:
        if feed.id_exists():
            feed.update_after_restore()
    restored_feeds = []