22
22
# Boston, MA 02111-1307, USA.
24
24
from spectlib.watch import Watch
25
import spectlib.gtkconfig
26
28
import StringIO, gzip
27
import os, md5, urllib2
29
import os, md5, urllib2, difflib, pprint
28
30
from httplib import HTTPMessage, BadStatusLine
29
31
from math import fabs
30
32
from re import compile #this is the regex compile module to parse some stuff such as <link> tags in feeds
31
33
from spectlib.i18n import _
35
cacheSubDir__ = os.environ['HOME'] + "/.specto/cache/"
36
if not os.path.exists(cacheSubDir__):
37
os.mkdir(cacheSubDir__)
39
class Web_watch(Watch):
37
type = "Watch_web_static"
38
type_desc = "Webpage/feed"
40
icon = 'applications-internet'
42
class Watch_web_static(Watch):
41
44
Watch class that will check if http or rss pages are changed.
53
actually_updated = False
56
def __init__(self, specto, name, refresh, url, id, error_margin):
57
Watch.__init__(self, specto) #init superclass
58
self.refresh = refresh
62
self.specto.logger.log(_("Watch: \"%s\" has error: empty url") % self.error, "error", self.__class__)
64
self.error_margin = error_margin#the amount in percent (as a float) of what the filesize must change to consider the page changed
67
def dict_values(self):
68
return { 'name': self.name, 'refresh': self.refresh, 'uri': self.url_, 'error_margin':self.error_margin, 'type':0 }
71
def start_watch(self):
72
""" Start the watch. """
75
def _real_update(self):
76
self.specto.notifier.connected_message(True)#hide the network error message
77
lock = thread.allocate_lock()
79
t=thread.start_new_thread(self.update,(lock,))
81
while gtk.events_pending():
84
while gtk.events_pending():
87
def thread_update(self):
88
if not self.specto.connection_manager.connected():
89
self.specto.logger.log(_("No network connection detected"),
90
"info", self.__class__)
91
self.specto.notifier.connected_message(False) #show the network error message
92
self.specto.connection_manager.add_callback(self._real_update)
93
self.specto.mark_watch_busy(False, self.id)
97
def update(self, lock):
57
def __init__(self, specto, id, values):
59
( "uri", spectlib.config.String(True) ),
60
( "error_margin", spectlib.config.Dec(True) ),
61
( "redirect", spectlib.config.Boolean(False) )
64
self.standard_open_command = spectlib.util.return_webpage(values['uri'])
66
Watch.__init__(self, specto, id, values, watch_values)
68
self.cacheSubDir__ = specto.CACHE_DIR
69
self.use_network = True
70
self.filesize_difference = 0.0
73
# self.error_margin = self.error_margin
74
self.open_command = self.open_command.replace("&","\&")
98
79
""" See if a http or rss page changed. """
100
self.specto.mark_watch_busy(True, self.id)
101
self.specto.logger.log(_("Updating watch: \"%s\"") % self.name, "info", self.__class__)
103
80
# Create a unique name for each url.
81
if self.uri[:7] != "http://" and self.uri[:8] != "https://" and self.uri[:6] != "ftp://":
82
self.uri = "http://" + self.uri
104
84
digest = md5.new(self.url_).digest()
105
85
cacheFileName = "".join(["%02x" % (ord(c),) for c in digest])
106
self.cacheFullPath_ = os.path.join(cacheSubDir__, cacheFileName)
107
request = urllib2.Request(self.url_, None, {"Accept-encoding" : "gzip"})
86
self.cacheFullPath_ = os.path.join(self.cacheSubDir__, cacheFileName)
87
self.cacheFullPath2_ = os.path.join(self.cacheSubDir__, cacheFileName + "size")
88
request = urllib2.Request(self.uri, None, {"Accept-encoding" : "gzip"})
109
90
if (self.cached == 1) or (os.path.exists(self.cacheFullPath_)):
111
f = file(self.cacheFullPath_, "r")# Load up the cached version
112
self.infoB_ = HTTPMessage(f)
113
if self.infoB_.has_key('last-modified'):
114
request.add_header("If-Modified-Since", self.infoB_['last-modified'])
115
if self.infoB_.has_key('ETag'):
116
request.add_header("If-None-Match", self.infoB_['ETag'])
93
f = file(self.cacheFullPath_, "r")# Load up the cached version
118
99
response = urllib2.urlopen(request)
119
100
except (urllib2.URLError, BadStatusLine), e:
164
152
# just in case there is annoying advertising on the page,
165
153
# rendering the md5sum a false indicator.
166
154
self.new_filesize = len(str(self.content_))#size in bytes?... will be used for the error_margin in case of annoying advertising in the page
167
#if self.specto.DEBUG: print "\tPerceived filesize is", self.new_filesize, "bytes ("+str(self.new_filesize/1024)+"KB)"#useful for adjusting your error_margin
155
#if self.specto.DEBUG: "\tPerceived filesize is", self.new_filesize, "bytes ("+str(self.new_filesize/1024)+"KB)"#useful for adjusting your error_margin
169
157
if int(self.new_filesize)==4:
170
158
#FIXME: temporary hack, not sure the etag is ALWAYS 4bytes
171
159
#4 bytes means it's actually an etag reply, so there is no change. We don't care about filesize checks then.
172
160
self.filesize_difference = 0
174
self.old_filesize = self.specto.watch_io.read_option(self.name, "filesize")
162
self.old_filesize = self.read_filesize()
175
163
if self.old_filesize!=0:#if 0, that would mean that read_option could not find the filesize in watches.list
176
164
#if there is a previous filesize
177
165
#calculate the % changed filesize
178
166
self.filesize_difference = (fabs(int(self.new_filesize) - int(self.old_filesize)) / int(self.old_filesize))*100
179
167
#if self.specto.DEBUG: print "\tCached filesize: ", self.old_filesize, "\tFilesize difference percentage:", str(self.filesize_difference)[:5], "%"
180
self.specto.logger.log(_("Difference percentage:%s (Watch: \"%s\")") % (str(self.filesize_difference)[:5], self.name), "info", self.__class__)
181
if (self.filesize_difference >= float(self.error_margin)*100) and (self.filesize_difference != 0.0):
182
#if the filesize differences exceed the error_margin
183
#if self.specto.DEBUG: print "\tMD5SUM and filesize exceeded the margin: the watch has been updated."
168
#self.specto.logger.log(_("Difference percentage:%s (Watch: \"%s\")") % (str(self.filesize_difference)[:5], self.name), "info", self.__class__)
169
if self.cached and self.diff and (self.filesize_difference >= float(self.error_margin)*100) and (self.filesize_difference != 0.0): #and (self.infoB_['md5sum'] == self.info_['md5sum']):
184
170
self.to_be_stored_filesize = self.new_filesize
185
#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize
187
171
self.actually_updated = True
188
#this means that no matter what, the webpage is updated
190
#if there is no important changes in filesize. Call the MD5Sum.
192
if self.cached and (self.infoB_['md5sum'] == self.info_['md5sum']):
193
self.to_be_stored_filesize = self.new_filesize
194
#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize
196
self.actually_updated = True
199
#we don't want to juggle with all the possible filesizes,
200
#we want to stay close to the original, because replacing the filesize each time
201
#if the watch is not updated would create a lot of fluctuations
202
self.to_be_stored_filesize = self.old_filesize
203
#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize
204
self.actually_updated = False
173
#we don't want to juggle with all the possible filesizes,
174
#we want to stay close to the original, because replacing the filesize each time
175
#if the watch is not updated would create a lot of fluctuations
176
self.to_be_stored_filesize = self.old_filesize
177
#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize
178
self.actually_updated = False
206
180
#if there is NO previously stored filesize
207
181
self.to_be_stored_filesize = self.new_filesize
208
182
#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize
210
if (self.url2_ != self.url_) and self.specto.specto_gconf.get_entry("follow_website_redirects") == True:
184
if (self.url2_ != self.url_) and self.redirect == True:
211
185
self.write_uri()#it's uri, not url.
212
186
self.write_filesize()
214
self.specto.mark_watch_busy(False, self.id)
215
Watch.update(self, lock)
188
Watch.timer_update(self)
217
190
def content(self):
218
191
"""Get the content as a single string."""
226
199
adding/changing header values permanently in the cache."""
227
200
return self.info_
229
def add_headers(self, headers):
230
"""Add/change header values in the cache.
232
Note that if the key/value pair you change is used
233
by HTTP then you risk the possibility that the value
234
will be over-written the next time content is retrieved
237
for key in headers.keys():
238
self.info_[key] = headers[key]
239
f = file(self.cacheFullPath_, "w")
240
f.write(str(self.info_))
243
def _writeHeaders(self):
244
""" Write the full header in the cache. """
245
f = file(self.cacheFullPath_, "w")
246
f.write(str(self.info_))
249
202
def write_filesize(self):
250
203
""" Write the filesize in the watch list. """
252
self.new_values['name'] = self.name
253
self.new_values['filesize'] = self.to_be_stored_filesize
254
self.specto.watch_io.write_options(self.new_values)
205
f = open(self.cacheFullPath2_, "w")
207
self.specto.logger.log(_("There was an error opening the file %s") % self.cacheFullPath2_, "critical", self.__class__)
209
f.write(str(self.to_be_stored_filesize))
214
def read_filesize(self):
215
if os.path.exists(self.cacheFullPath2_):
217
f = open(self.cacheFullPath2_, "r")
219
self.specto.logger.log(_("There was an error reader the file %s") % self.cacheFullPath2_, "critical", self.__class__)
256
232
def write_uri(self):
257
233
""" Write the uri in the watch list. """
259
self.new_values['name'] = self.name
260
self.new_values['uri'] = self.url2_
261
self.specto.watch_io.write_options(self.new_values)
234
self.specto.watch_io.write_option(self.name, 'uri', self.url2_)
262
235
self.url_ = self.url2_
264
def clearCache(self):
265
""" Clear the cache file. """
266
[os.unlink(os.path.join(cacheSubDir__, name)) for name in os.listdir(cacheSubDir__)]
237
def remove_cache_files(self):
238
os.unlink(self.cacheFullPath_)
239
os.unlink(self.cacheFullPath2_)
268
241
def _writeContent(self, response):
270
243
content = response.read()
273
def set_url(self, url):
274
""" Set the url for the watch. """
277
def set_error_margin(self, error_margin):
278
""" Set the error margin for the watch. """
279
self.error_margin = error_margin
246
def escape(self, text, quotes=True):
247
"""Create a Markup instance from a string and escape special characters
248
it may contain (<, >, & and ").
250
If the `quotes` parameter is set to `False`, the " character is left as
251
is. Escaping quotes is generally only required for strings that are to
252
be used in attribute values.
254
text = str(text).replace('&', '&') \
255
.replace('<', '<') \
256
.replace('>', '>')
258
text = text.replace('"', '"')
281
261
def get_balloon_text(self):
282
262
""" create the text for the balloon """
283
text = ("The website, <b>%s</b>, has been updated.\n%d\n%s") % (self.name, self.to_be_stored_filesize, str(self.filesize_difference)[:5])
263
text = ("The website, <b>%s</b>, has been updated.\nDifference percentage: %s percent") % (self.name, str(self.filesize_difference)[:5])
286
266
def get_extra_information(self):
288
## i = self.newMsg - self.oldMsg
291
## while i < len(self.mail_info) and y < 5:
292
## author_info += "<i>" + self.mail_info[i].split("|")[1] + "</i> From <b>" + self.mail_info[i].split("|")[0] + "</b>\n"
296
## author_info += "and others..."
297
## text = "<b>New messages:</b>\n" + author_info
272
def get_gui_info(self):
275
('Last updated', self.last_updated),
277
("Error margin", str(self.error_margin) + "%")
280
def get_add_gui_info():
282
("uri", spectlib.gtkconfig.Entry("Url")),
283
("error_margin", spectlib.gtkconfig.Scale("Error margin (%)",value=2.0,upper=50,step_incr=0.1,page_incr=1.0))
286
"""HTML Diff: http://www.aaronsw.com/2002/diff
287
Rough code, badly documented. Send me comments and patches."""
289
__author__ = 'Aaron Swartz <me@aaronsw.com>'
290
__copyright__ = '(C) 2003 Aaron Swartz. GNU GPL 2.'
293
import difflib, string
295
def isTag(x):
    """Return True when token *x* looks like a complete markup tag.

    A token counts as a tag when its first element is "<" and its last
    element is ">".  An empty token is never a tag; the emptiness guard
    avoids the IndexError that indexing x[0] would raise on "".
    """
    return bool(x) and x[0] == "<" and x[-1] == ">"
298
"""Takes in strings a and b and returns a human-readable HTML diff."""
301
a, b = html2list(a), html2list(b)
302
s = difflib.SequenceMatcher(None, a, b)
303
for e in s.get_opcodes():
304
if e[0] == "replace":
305
# @@ need to do something more complicated here
306
# call textDiff but not for html, but for some html... ugh
307
# gonna cop-out for now
308
out.append('<span foreground=\"red\">'+''.join(a[e[1]:e[2]]) + '</span><span foreground=\"green\">'+''.join(b[e[3]:e[4]])+"</span>\n")
309
elif e[0] == "delete":
310
out.append('<span foreground=\"red\">'+ ''.join(a[e[1]:e[2]]) + "</span>\n")
311
elif e[0] == "insert":
312
out.append('<span foreground=\"green\">'+''.join(b[e[3]:e[4]]) + "</span>\n")
315
def html2list(x, b=1):
324
out.append("");cur = ''; mode = 'char'
332
elif c in string.whitespace: out.append(cur+c); cur = ''
336
return filter(lambda x: x is not '', out)
b'\\ No newline at end of file'