~woutc/specto/specto-dbus-client

Viewing changes to spectlib/plugins/watch_web_static.py

Committer: Jean-François Fortin Tam
Author(s): Wout Clymans
Date: 2008-07-22 23:22:49 UTC
Revision ID: jeff@kiki-20080722232249-l64srclhp6u6qyrw

WARNING: this commit contains all the significant changes that happened in a specto-woutc branch over the past year. Large change log follows. Some commit log lines were intentionally left out.

- A dialog with debug information is shown when specto has a system/programming error.
- Disable renaming watches in the listview, make it a Jump To action instead
- All mandatory fields have to be filled in now (add and edit watch)
- The error log now shows the lines in color according to the severity
- Better file size cache name
- Added more error-handling
- The filesize is now saved in a different cache file (not in watches.list), may fix issue 37?
- Icons are now shown in the combobox when you add a new watch (buggy, patches welcome)
- Improved the pop3, imap and gmail watches
- The gmail watch now saves what unread mails there already were last time
- Convert HTML entities for the web diff
- Moved some code so the file dialog will show faster
- A watch will be marked updated when you didn't clear it on quit.
- Removed double call to refresh the watch info
- Made a general gtkutil file where you can define widgets used in the edit and add watch windows
- Removed the class name from the logger
- Clear the watch when you open it using the balloon
- Make some watch names clearer
- Error log tab in notifier window
- Added "clear" button in the edit menu
- Show simple diff from webpage difference
- Console mode (specto --console or specto --console --help)
- Watch menu when you right-click a watch entry in the notifier window
- Ability to run a command when a watch is updated
- Ability to run a command when a watch is cleared
- Fields in the add and edit windows are now dynamic; when creating a new watch plugin, you don't have to write all the gui code anymore
- More space for the extra information in the info panel
- code cleanup
- use plugin-system

- Fix issue 150: Gmail.com - that address is disabled in Germany - hence you can't go to messages directly
- Fix issue 93: Gmail library can support no more than 19 new mails
- Fix issue 131: bombs on special characters
- Fix issue 134: harmonized colors
- Fix issue 119: don't let the log file get huge
- Fix issue 143: Site address in "About" box is not clickable
- Fix issue 146: Per-watch option to prevent URL redirects; To use this option add "redirect = True" to the watch that is allowed to redirect
- Fix issue 145: abnormal behavior with ampersands in a web watch
- Fix issue 51: Specto stores passwords in plaintext (started keyring support)
- Fix issue 135: Proxy support (already proxy support for web watch)
- Fix issue 128: allow specifying a port for mail watches (add 'port = 323' to your watch config)
- Fix issue 132: removing a watch should remove its cache files
- Fix issue 136: Support specific folder monitor over IMAP (add 'folder = work' to your imap watch config)
- Fix issue 63: Google Reader Watch does not support more than 20 items
- Fix issue 39: POP3 & IMAP watches not on par with gmail watch's message counting logic
- Fix issue 100: gmail with google apps should point to the right domain when clicking Jump to
- Fix issue 95: statusbar should show something else when updates are done
- Fix issue 112: hide error log tabs when debug mode is deactivated
- Fix issue 101: show the import dialog after the file chooser
- Fix issue 114: removing a watch should show a confirmation dialog
- Fix issue 73: brackets in watch name lead to startup crash (brackets can now be used in the name!)
- Fix issue 69: startup fails due to wrong glade file path
- Fix issue 12: provide more information
- Fix issue 13: watch list importing and exporting
- Fix issue 20: Organise specto source into modules
- Fix issue 33: ability to run a command instead of notifying
- Fix issue 54: freedesktop-compliant user directories
- Fix issue 72: "show in window list" preference is not saved
- Fix issue 77: don't mess up if ekiga's sound files are not present
- Fix issue 118: add http:// automatically for web watches (also @gmail.com added for gmail accounts)

files added:
spectlib/config.py

spectlib/console.py

spectlib/gtkconfig.py

spectlib/import_watch.py

spectlib/plugins/__init__.py

spectlib/plugins/watch_system_file.py

spectlib/tools

spectlib/tools/__init__.py

spectlib/tools/keyringmanager.py

files removed:
spectlib/gmailatom.py

spectlib/process.py

spectlib/watch_collection.py

files renamed:
spectlib/import_export.py => spectlib/export_watch.py

spectlib/watch_mail_imap.py => spectlib/plugins/watch_mail_imap.py

spectlib/watch_mail_pop3.py => spectlib/plugins/watch_mail_pop3.py

spectlib/watch_file.py => spectlib/plugins/watch_system_folder.py

spectlib/watch_port.py => spectlib/plugins/watch_system_port.py

spectlib/watch_process.py => spectlib/plugins/watch_system_process.py

spectlib/watch_web_greader.py => spectlib/plugins/watch_web_greader.py

spectlib/watch_web_static.py => spectlib/plugins/watch_web_static.py

spectlib/iniparser.py => spectlib/tools/iniparser.py

spectlib/networkmanager.py => spectlib/tools/networkmanager.py

spectlib/specto_gconf.py => spectlib/tools/specto_gconf.py

files modified:
data/doc/VERSION

data/glade/add_watch.glade

data/glade/edit_watch.glade

data/glade/log_dialog.glade

data/glade/notifier.glade

setup.py *

spectlib/about.py

spectlib/add_watch.py

spectlib/balloons.py

spectlib/edit_watch.py

spectlib/logger.py

spectlib/main.py

spectlib/notifier.py

spectlib/plugins/watch_mail_gmail.py

spectlib/preferences.py

spectlib/trayicon.py

spectlib/util.py

spectlib/watch.py

specto

Show diffs side-by-side

added added

removed removed

spectlib/plugins/watch_web_static.py

# Boston, MA 02111-1307, USA.

from spectlib.watch import Watch

import spectlib.gtkconfig

import spectlib.util

import StringIO, gzip

import os, md5, urllib2

import os, md5, urllib2, difflib, pprint

from httplib import HTTPMessage, BadStatusLine

from math import fabs

from re import compile #this is the regex compile module to parse some stuff such as <link> tags in feeds

from spectlib.i18n import _

import thread

import gtk, time

cacheSubDir__ = os.environ['HOME'] + "/.specto/cache/"

if not os.path.exists(cacheSubDir__):

os.mkdir(cacheSubDir__)

class Web_watch(Watch):

import time

type = "Watch_web_static"

type_desc = "Webpage/feed"

open_command = ""

icon = 'applications-internet'

class Watch_web_static(Watch):

"""

Watch class that will check if http or rss pages are changed.

"""

type_desc = type_desc

url_ = ""

info_ = None

content_ = None

infoB_ = None

cached = 0

url2_ = ""

updated = False

actually_updated = False

type = 0

def __init__(self, specto, name, refresh, url, id, error_margin):

Watch.__init__(self, specto) #init superclass

self.refresh = refresh

self.id = id

self.url_ = url

if self.url_ == "":

self.specto.logger.log(_("Watch: \"%s\" has error: empty url") % self.error, "error", self.__class__)

self.name = name

self.error_margin = error_margin#the amount in percent (as a float) of what the filesize must change to consider the page changed

self.error = False

def dict_values(self):

return { 'name': self.name, 'refresh': self.refresh, 'uri': self.url_, 'error_margin':self.error_margin, 'type':0 }

def start_watch(self):

""" Start the watch. """

self.thread_update()

def _real_update(self):

self.specto.notifier.connected_message(True)#hide the network error message

lock = thread.allocate_lock()

lock.acquire()

t=thread.start_new_thread(self.update,(lock,))

while lock.locked():

while gtk.events_pending():

gtk.main_iteration()

time.sleep(0.05)

while gtk.events_pending():

gtk.main_iteration()

def thread_update(self):

if not self.specto.connection_manager.connected():

self.specto.logger.log(_("No network connection detected"),

"info", self.__class__)

self.specto.notifier.connected_message(False) #show the network error message

self.specto.connection_manager.add_callback(self._real_update)

self.specto.mark_watch_busy(False, self.id)

else :

self._real_update()

def update(self, lock):

def __init__(self, specto, id, values):

watch_values = [

( "uri", spectlib.config.String(True) ),

( "error_margin", spectlib.config.Dec(True) ),

( "redirect", spectlib.config.Boolean(False) )

]

self.standard_open_command = spectlib.util.return_webpage(values['uri'])

Watch.__init__(self, specto, id, values, watch_values)

self.cacheSubDir__ = specto.CACHE_DIR

self.use_network = True

self.filesize_difference = 0.0

self.icon = icon

# self.error_margin = self.error_margin

self.open_command = self.open_command.replace("&","\&")

self.url_ = self.uri

self.diff = ""

def update(self):

""" See if a http or rss page changed. """

self.error = False

100

self.specto.mark_watch_busy(True, self.id)

101

self.specto.logger.log(_("Updating watch: \"%s\"") % self.name, "info", self.__class__)

102

103

# Create a unique name for each url.

if self.uri[:7] != "http://" and self.uri[:8] != "https://" and self.uri[:6] != "ftp://":

self.uri = "http://" + self.uri

self.url_ = self.uri

104

digest = md5.new(self.url_).digest()

105

cacheFileName = "".join(["%02x" % (ord(c),) for c in digest])

106

self.cacheFullPath_ = os.path.join(cacheSubDir__, cacheFileName)

107

request = urllib2.Request(self.url_, None, {"Accept-encoding" : "gzip"})

108

self.cacheFullPath_ = os.path.join(self.cacheSubDir__, cacheFileName)

self.cacheFullPath2_ = os.path.join(self.cacheSubDir__, cacheFileName + "size")

request = urllib2.Request(self.uri, None, {"Accept-encoding" : "gzip"})

cache_res = ""

109

if (self.cached == 1) or (os.path.exists(self.cacheFullPath_)):

110

self.cached = 1

111

f = file(self.cacheFullPath_, "r")# Load up the cached version

112

self.infoB_ = HTTPMessage(f)

113

if self.infoB_.has_key('last-modified'):

114

request.add_header("If-Modified-Since", self.infoB_['last-modified'])

115

if self.infoB_.has_key('ETag'):

116

request.add_header("If-None-Match", self.infoB_['ETag'])

try:

f = file(self.cacheFullPath_, "r")# Load up the cached version

cache_res = f.read()

f.close()

except:

cache_res = ""

117

try:

118

response = urllib2.urlopen(request)

119

100

except (urllib2.URLError, BadStatusLine), e:

123

104

self.info_ = response.info()

124

105

self.url2_ = response.geturl()

125

106

self.content_ = self._writeContent(response)

126

self.info_['Url'] = self.url_

107

self.info_['Url'] = self.uri

127

108

self.digest_ = md5.new(self.content_).digest()

128

109

self.digest_ = "".join(["%02x" % (ord(c),) for c in self.digest_])

129

110

self.info_['md5sum'] = self.digest_

134

115

self.page_source = gzip.GzipFile(fileobj=self.compressedstream).read() #try uncompressing

135

116

except:

136

117

self.page_source = self.content_ #the page was not compressed

137

118

119

self.page_source = self.escape(self.page_source)

120

self.diff = textDiff(cache_res, self.page_source)

121

try:

122

out_file = file(self.cacheFullPath_, "w")

123

out_file.write(str(self.page_source))

124

out_file.close()

125

except:

126

pass

127

138

128

# This will check for the "real" website home URL when the website target is an xml feed.

139

129

# First, check if the web page is actually a known feed type.

140

130

# Here we look for three kinds of headers, where * is a wildcard:

149

139

if self.rss_links=="":

150

140

m=m.strip("<link>").strip("</link>")

151

141

self.rss_links = m

152

#Save the uri_real attribute to the watch list

153

self.new_values = {}

154

self.new_values['name'] = self.name

155

self.new_values['uri_real'] = self.rss_links

156

self.specto.watch_io.write_options(self.new_values)

157

#TODO: the uri_real is now correctly saved into watches.list. Now, what is missing is someone who would implement quite easily that notifier.py reads watches.list, gets that uri_real and uses it when someone clicks "go to" to open the website. That's all.

142

#change the uri_real attribute

143

if self.open_command == self.standard_open_command:

144

self.standard_open_command = spectlib.util.return_webpage(self.rss_links)

145

self.open_command = self.standard_open_command

158

146

else:

159

147

#the file is not a recognized feed. We will not parse it for the <link> tag.

160

148

pass

164

152

# just in case there is annoying advertising on the page,

165

153

# rendering the md5sum a false indicator.

166

154

self.new_filesize = len(str(self.content_))#size in bytes?... will be used for the error_margin in case of annoying advertising in the page

167

#if self.specto.DEBUG: print "\tPerceived filesize is", self.new_filesize, "bytes ("+str(self.new_filesize/1024)+"KB)"#useful for adjusting your error_margin

155

#if self.specto.DEBUG: "\tPerceived filesize is", self.new_filesize, "bytes ("+str(self.new_filesize/1024)+"KB)"#useful for adjusting your error_margin

168

156

169

157

if int(self.new_filesize)==4:

170

158

#FIXME: temporary hack, not sure the etag is ALWAYS 4bytes

171

159

#4 bytes means it's actually an etag reply, so there is no change. We don't care about filesize checks then.

172

160

self.filesize_difference = 0

173

161

else:

174

self.old_filesize = self.specto.watch_io.read_option(self.name, "filesize")

162

self.old_filesize = self.read_filesize()

175

163

if self.old_filesize!=0:#if 0, that would mean that read_option could not find the filesize in watches.list

176

164

#if there is a previous filesize

177

165

#calculate the % changed filesize

178

166

self.filesize_difference = (fabs(int(self.new_filesize) - int(self.old_filesize)) / int(self.old_filesize))*100

179

167

#if self.specto.DEBUG: print "\tCached filesize: ", self.old_filesize, "\tFilesize difference percentage:", str(self.filesize_difference)[:5], "%"

180

self.specto.logger.log(_("Difference percentage:%s (Watch: \"%s\")") % (str(self.filesize_difference)[:5], self.name), "info", self.__class__)

181

if (self.filesize_difference >= float(self.error_margin)*100) and (self.filesize_difference != 0.0):

182

#if the filesize differences exceed the error_margin

183

#if self.specto.DEBUG: print "\tMD5SUM and filesize exceeded the margin: the watch has been updated."

168

#self.specto.logger.log(_("Difference percentage:%s (Watch: \"%s\")") % (str(self.filesize_difference)[:5], self.name), "info", self.__class__)

169

if self.cached and self.diff and (self.filesize_difference >= float(self.error_margin)*100) and (self.filesize_difference != 0.0): #and (self.infoB_['md5sum'] == self.info_['md5sum']):

184

170

self.to_be_stored_filesize = self.new_filesize

185

#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize

186

self.updated = True

187

171

self.actually_updated = True

188

#this means that no matter what, the webpage is updated

189

172

else:

190

#if there is no important changes in filesize. Call the MD5Sum.

191

#MD5summing analysis

192

if self.cached and (self.infoB_['md5sum'] == self.info_['md5sum']):

193

self.to_be_stored_filesize = self.new_filesize

194

#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize

195

self.updated = True

196

self.actually_updated = True

197

self._writeHeaders()

198

else:

199

#we don't want to juggle with all the possible filesizes,

200

#we want to stay close to the original, because replacing the filesize each time

201

#if the watch is not updated would create a lot of fluctuations

202

self.to_be_stored_filesize = self.old_filesize

203

#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize

204

self.actually_updated = False

173

#we don't want to juggle with all the possible filesizes,

174

#we want to stay close to the original, because replacing the filesize each time

175

#if the watch is not updated would create a lot of fluctuations

176

self.to_be_stored_filesize = self.old_filesize

177

#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize

178

self.actually_updated = False

205

179

else:

206

180

#if there is NO previously stored filesize

207

181

self.to_be_stored_filesize = self.new_filesize

208

182

#if self.specto.DEBUG: print "\tSaved filesize: ", self.to_be_stored_filesize

209

183

210

if (self.url2_ != self.url_) and self.specto.specto_gconf.get_entry("follow_website_redirects") == True:

184

if (self.url2_ != self.url_) and self.redirect == True:

211

185

self.write_uri()#it's uri, not url.

212

186

self.write_filesize()

213

187

214

self.specto.mark_watch_busy(False, self.id)

215

Watch.update(self, lock)

188

Watch.timer_update(self)

216

189

217

190

def content(self):

218

191

"""Get the content as a single string."""

226

199

adding/changing header values permanently in the cache."""

227

200

return self.info_

228

201

229

def add_headers(self, headers):

230

"""Add/change header values in the cache.

231

232

Note that if the key/value pair you change is used

233

by HTTP then you risk the possibility that the value

234

will be over-written the next time content is retrieved

235

from that URL.

236

"""

237

for key in headers.keys():

238

self.info_[key] = headers[key]

239

f = file(self.cacheFullPath_, "w")

240

f.write(str(self.info_))

241

f.close()

242

243

def _writeHeaders(self):

244

""" Write the full header in the cache. """

245

f = file(self.cacheFullPath_, "w")

246

f.write(str(self.info_))

247

f.close()

248

249

202

def write_filesize(self):

250

203

""" Write the filesize in the watch list. """

251

self.new_values = {}

252

self.new_values['name'] = self.name

253

self.new_values['filesize'] = self.to_be_stored_filesize

254

self.specto.watch_io.write_options(self.new_values)

255

204

try:

205

f = open(self.cacheFullPath2_, "w")

206

except:

207

self.specto.logger.log(_("There was an error opening the file %s") % self.cacheFullPath2_, "critical", self.__class__)

208

else:

209

f.write(str(self.to_be_stored_filesize))

210

211

finally:

212

f.close()

213

214

def read_filesize(self):
    """Return the file size stored in the size cache file.

    Returns the raw text read from ``self.cacheFullPath2_`` when that file
    exists and is not empty. Returns 0 when the file is missing, empty or
    cannot be opened; an open failure is also logged as "critical".
    """
    if not os.path.exists(self.cacheFullPath2_):
        return 0

    try:
        f = open(self.cacheFullPath2_, "r")
    except (IOError, OSError):
        # Nothing was opened, so there is nothing to close: report the
        # problem and fall back to "no cached size".
        self.specto.logger.log(_("There was an error reader the file %s") % self.cacheFullPath2_, "critical", self.__class__)
        return 0

    # Only reached with a valid file object, so close() is always safe here.
    try:
        size = f.read()
    finally:
        f.close()

    if size != "":
        return size
    return 0

230

231

256

232

def write_uri(self):

257

233

""" Write the uri in the watch list. """

258

self.new_values = {}

259

self.new_values['name'] = self.name

260

self.new_values['uri'] = self.url2_

261

self.specto.watch_io.write_options(self.new_values)

234

self.specto.watch_io.write_option(self.name, 'uri', self.url2_)

262

235

self.url_ = self.url2_

263

236

264

def clearCache(self):

265

""" Clear the cache file. """

266

[os.unlink(os.path.join(cacheSubDir__, name)) for name in os.listdir(cacheSubDir__)]

237

def remove_cache_files(self):
    """Delete both cache files belonging to this watch.

    Removes ``self.cacheFullPath_`` and ``self.cacheFullPath2_``. A file
    that is already absent is skipped so the other one is still removed;
    any other failure to delete is re-raised.
    """
    for path in (self.cacheFullPath_, self.cacheFullPath2_):
        try:
            os.unlink(path)
        except OSError:
            # Only swallow the "file is already gone" case.
            if os.path.exists(path):
                raise

267

240

268

241

def _writeContent(self, response):
    """Read everything from `response` and return it."""
    return response.read()

272

273

def set_url(self, url):

274

""" Set the url for the watch. """

275

self.url_ = url

276

277

def set_error_margin(self, error_margin):

278

""" Set the error margin for the watch. """

279

self.error_margin = error_margin

280

245

246

def escape(self, text, quotes=True):
    """Return `text` as a string with markup special characters escaped.

    `text` is converted with str() and the characters &, < and > are
    replaced by the entities &amp;, &lt; and &gt;.

    If the `quotes` parameter is set to `False`, the " character is left as
    is; otherwise it is replaced by &#34;. Escaping quotes is generally only
    required for strings that are to be used in attribute values.
    """
    # "&" has to be handled first, otherwise the ampersands introduced by
    # the other entities would be escaped a second time.
    text = str(text).replace('&', '&amp;') \
                    .replace('<', '&lt;') \
                    .replace('>', '&gt;')
    if quotes:
        text = text.replace('"', '&#34;')
    return text

260

281

261

def get_balloon_text(self):

282

262

""" create the text for the balloon """

283

text = ("The website, %s, has been updated.\n%d\n%s") % (self.name, self.to_be_stored_filesize, str(self.filesize_difference)[:5])

263

text = ("The website, %s, has been updated.\nDifference percentage: %s percent") % (self.name, str(self.filesize_difference)[:5])

284

264

return text

285

265

286

266

def get_extra_information(self):

287

pass

288

## i = self.newMsg - self.oldMsg

289

## y = 0

290

## author_info = ""

291

## while i < len(self.mail_info) and y < 5:

292

## author_info += "" + self.mail_info[i].split("|")[1] + " From " + self.mail_info[i].split("|")[0] + "\n"

293

## i += 1

294

## y += 1

295

## if y == 5:

296

## author_info += "and others..."

297

## text = "New messages:\n" + author_info

298

## return text

267

text = ""

268

if self.diff:

269

text = self.diff

270

return text

271

272

def get_gui_info(self):
    """Return (label, value) pairs describing this watch.

    The pairs are, in order: name, last update, url and the error margin
    with a trailing percent sign.
    """
    rows = []
    rows.append(('Name', self.name))
    rows.append(('Last updated', self.last_updated))
    rows.append(('Url', self.url_))
    rows.append(("Error margin", "".join([str(self.error_margin), "%"])))
    return rows

279

280

def get_add_gui_info():
    """Return (option name, widget) pairs for the "uri" and "error_margin" options."""
    url_entry = spectlib.gtkconfig.Entry("Url")
    margin_scale = spectlib.gtkconfig.Scale(
        "Error margin (%)",
        value=2.0,
        upper=50,
        step_incr=0.1,
        page_incr=1.0)
    return [
        ("uri", url_entry),
        ("error_margin", margin_scale),
    ]

285

286

"""HTML Diff: http://www.aaronsw.com/2002/diff

287

Rough code, badly documented. Send me comments and patches."""

288

289

__author__ = 'Aaron Swartz <me@aaronsw.com>'

290

291

__version__ = '0.22'

292

293

import difflib, string

294

295

def isTag(x):
    """Return True when `x` starts with "<" and ends with ">".

    Slices are used instead of indexing so that an empty `x` yields False
    rather than raising IndexError.
    """
    return x[:1] == "<" and x[-1:] == ">"

296

297

def textDiff(a, b):
    """Takes in strings a and b and returns a human-readable HTML diff.

    Both strings are tokenised with html2list() and compared with
    difflib.SequenceMatcher. Each replaced, deleted or inserted run of
    tokens becomes one line of the result; unchanged runs are left out.
    """
    old_tokens, new_tokens = html2list(a), html2list(b)
    matcher = difflib.SequenceMatcher(None, old_tokens, new_tokens)

    pieces = []
    for opcode in matcher.get_opcodes():
        tag, old_start, old_end, new_start, new_end = opcode
        removed = ''.join(old_tokens[old_start:old_end])
        added = ''.join(new_tokens[new_start:new_end])
        # NOTE(review): the empty-string prefixes below look like markup
        # that was lost from this listing; confirm against the repository.
        if tag == "replace":
            # @@ need to do something more complicated here
            # call textDiff but not for html, but for some html... ugh
            # gonna cop-out for now
            pieces.append('' + removed + '' + added + "\n")
        elif tag == "delete":
            pieces.append('' + removed + "\n")
        elif tag == "insert":
            pieces.append('' + added + "\n")
    return ''.join(pieces)

314

315

def html2list(x, b=1):
    """Split the string `x` into a list of tokens for diffing.

    Outside of tags, text is cut after every whitespace character, and the
    whitespace stays attached to the token before it. A complete
    ``<...>`` tag is dropped from the output. A tag that is still open when
    the input ends is kept as a trailing token, starting with "[" when `b`
    is true and with "<" otherwise. Empty tokens are removed.

    Always returns a list.
    """
    mode = 'char'
    cur = ''
    out = []
    for c in x:
        if mode == 'tag':
            if c == '>':
                # End of the tag: the collected tag text is discarded.
                cur = ''
                mode = 'char'
            else:
                cur += c
        elif mode == 'char':
            if c == '<':
                out.append(cur)
                if b:
                    cur = '['
                else:
                    cur = c
                mode = 'tag'
            elif c in string.whitespace:
                out.append(cur + c)
                cur = ''
            else:
                cur += c
    out.append(cur)

    # Compare by value: "is not ''" tests object identity, which is not
    # guaranteed for equal strings.
    return [token for token in out if token != '']

337

\ No newline at end of file

Older »