1
# gozerplugs/plugs/snarf.py
5
__author__ = "Wijnand 'tehmaze' Modderman - http://tehmaze.com"
7
__gendoclast__ = ['snarf-disable', ]
10
from gozerbot.callbacks import callbacks, jcallbacks
11
from gozerbot.plughelp import plughelp
12
from gozerbot.aliases import aliases
13
from gozerbot.commands import cmnds
14
from gozerbot.examples import examples
15
from gozerbot.generic import decode_html_entities, get_encoding, geturl, \
16
geturl2, rlog, handle_exception
17
from gozerbot.persist.persist import Persist
18
from gozerbot.persist.persistconfig import PersistConfig
19
from gozerbot.plugins import plugins
27
# Register the help text for the 'snarf' plugin.
plughelp.add('snarf', 'the snarf plugin gets the title of the web page of \
28
the provided url or the last url in the log')
30
# Persistent plugin state; elsewhere in this file cfg.data is used as a
# two-level mapping: cfg.data[bot.name][ievent.printto] -> True/False.
cfg = Persist('snarf', {})
31
# Persistent configuration; 'allow' holds the Content-Type values that the
# URL check in this file accepts (it reads the list via pcfg.get('allow')).
pcfg = PersistConfig()
32
pcfg.define('allow', ['text/plain', 'text/html', 'application/xml'])
33
# First <title>...</title> of a page: non-greedy, case-insensitive, and
# re.S lets the title span several lines.
re_html_title = re.compile(u'<title>(.*?)</title>', re.I | re.M | re.S)
34
# An http:// or https:// URL running up to the next whitespace character.
re_url_match = re.compile(u'((?:http|https)://\S+)')
36
# One regex per field scraped from the validator's HTML report; each pattern
# has a single capture group holding the field's value.
# The keys match the field list used by handle_validate when it formats its
# reply ('result', 'modified', 'server', 'size', 'content', 'encoding',
# 'doctype').
# NOTE(review): these look like the entries of the `re_html_valid` dict used
# by geturl_validate, but the opening `re_html_valid = {` and the closing `}`
# are not visible in this view -- confirm against the full file.
'result': re.compile('(Failed validation, \
37
\d+ errors?|Passed validation)', re.I | re.M),
38
'modified': re.compile('<th>Modified:</th>\
39
<td colspan="2">([^<]+)</td>', re.I | re.M),
40
'server': re.compile('<th>Server:</th>\
41
<td colspan="2">([^<]+)</td>', re.I | re.M),
42
'size': re.compile('<th>Size:</th><td colspan="2">\
43
([^<]+)</td>', re.I | re.M),
44
'content': re.compile('<th>Content-Type:</th><td colspan="2">\
45
([^<]+)</td>', re.I | re.M),
46
'encoding': re.compile('<td>([^<]+)</td><td><select name="charset" \
47
id="charset">', re.I | re.M),
48
'doctype': re.compile('<td>([^<]+)</td><td><select id="doctype" \
49
name="doctype">', re.I | re.M)
51
# %-format template for the W3C validator request URL.
# The doubled '%%' become literal '%' after formatting (so '%%28'/'%%29' end
# up as the URL-escaped parentheses '%28'/'%29'); the single '%s' is filled by
# geturl_validate with urllib.urlencode({'uri': url}).
urlvalidate = 'http://validator.w3.org/check?charset=%%28\
52
detect+automatically%%29&doctype=Inline&verbose=1&%s'
54
# Plugin-specific error. In this file it is raised when a URL's Content-Type
# is not in the configured 'allow' list, and handle_snarf reports it to the
# user as 'unable to snarf: ...'.
# NOTE(review): the class body is not visible in this view.
class SnarfException(Exception):
57
# Fetch `url` and return the text of its HTML <title>, entity-decoded.
# NOTE(review): the bare integer lines look like line-number residue from an
# extraction step, and the gaps between them suggest source lines are missing
# here (the `try:`, the call that assigns `result`, the return values of the
# error paths, and the guard on `test_title`) -- confirm against the full file.
def geturl_title(url):
58
""" fetch title of url """
61
# HTTP-level failures are logged through rlog under the 'snarf' tag.
except urllib2.HTTPError, ex:
62
rlog(10, 'snarf', str(ex))
64
# Same logging for lower-level URL errors.
except urllib2.URLError, ex:
65
rlog(10, 'snarf', str(ex))
76
# Look for the first <title>...</title> in the fetched text.
# search() returns None when there is no match; the check for that is not
# visible in this view.
test_title = re_html_title.search(result)
78
# try to find an encoding and standardize it to utf-8
79
encoding = get_encoding(result)
80
# Decode the captured title with the detected encoding ('replace' substitutes
# undecodable bytes instead of raising) and flatten newlines to spaces.
title = test_title.group(1).decode(encoding, 'replace').replace('\n', ' ')
82
# decode_html_entities is an imported helper; presumably it turns entities
# such as &amp; back into characters -- verify against gozerbot.generic.
return decode_html_entities(title)
85
# Ask the W3C validator about `url` and collect one value per report field.
# NOTE(review): the lines that fetch `result`, create `results`, choose
# between the matched value and '(unknown)', and return are not visible in
# this view -- confirm against the full file.
def geturl_validate(url):
87
# Build the request URL: the target is passed url-encoded as the 'uri'
# query parameter of the validator template.
url = urlvalidate % urllib.urlencode({'uri': url})
100
# Run every field pattern over the validator's response.
for key in re_html_valid.keys():
101
results[key] = re_html_valid[key].search(result)
103
# Replace the match object by its captured text ...
results[key] = results[key].group(1)
105
# ... or by a placeholder (presumably when the pattern did not match; the
# branch condition is not visible here).
results[key] = '(unknown)'
109
# NOTE(review): the `def` line of this function is not visible in this view,
# nor are its return statements and the conditions guarding the `cleanurl`
# appends below -- confirm against the full file.
""" check if url is valid """
# match() anchors at the start of the string, so the URL must begin with
# http:// or https://.
if not re_url_match.match(url):
112
parts = urlparse.urlparse(url)
113
# do a HEAD request to get the content-type
114
request = urllib2.Request(url)
115
# Override the request method so that urlopen issues HEAD instead of GET.
request.get_method = lambda: "HEAD"
116
content = urllib2.urlopen(request)
117
if content.headers['content-type']:
118
# Keep only the media type, dropping parameters after ';' (e.g. a charset).
# Note: the local name `type` shadows the builtin inside this function.
type = content.headers['content-type'].split(';', 1)[0].strip()
119
# Reject anything that is not in the configured 'allow' list.
if type not in pcfg.get('allow'):
120
raise SnarfException, "Content-Type %s is not allowed" % type
121
# Rebuild the URL from its parsed parts: scheme://netloc first ...
cleanurl = '%s://%s' % (parts[0], parts[1])
123
# ... then the path (parts[2]) ...
cleanurl = '%s%s' % (cleanurl, parts[2])
125
# ... then ';params' (parts[3]) ...
cleanurl = '%s;%s' % (cleanurl, parts[3])
127
# ... then '?query' (parts[4]). The fragment (parts[5]) is not appended by
# any visible line.
cleanurl = '%s?%s' % (cleanurl, parts[4])
130
# Command handler: reply with "<host>: <title>" for a URL.
#   bot, ievent -- bot and event objects supplied by the command dispatcher.
#   direct      -- defaults to True; its use is not visible in this view.
# NOTE(review): the bare integer lines look like line-number residue, and the
# gaps between them suggest missing source lines (the `try:`/`if`/`else:`/
# `return` statements that tie the lines below together) -- confirm against
# the full file before relying on the control flow described here.
def handle_snarf(bot, ievent, direct=True):
131
""" snarf provided url or last url in log """
138
# Take the URL from the 'url' plugin's cache (per the docstring, this is the
# "last url in log" case).
url = plugins['url'].cache.fetch(bot, ievent)
141
# Tell the user that the <url> argument is missing.
ievent.missing('<url>')
147
ievent.reply("can't detect content type")
149
# Failures of the URL check are reported to the user rather than raised.
except SnarfException, e:
151
ievent.reply('unable to snarf: %s' % str(e))
153
except urllib2.HTTPError, e:
154
ievent.reply('unable to snarf: %s' % str(e))
156
except urllib2.URLError, ex:
157
ievent.reply('unable to snarf: %s' % str(ex))
160
ievent.reply('invalid url')
163
# Fetch the page title; timeouts and HTTP errors get their own replies.
title = geturl_title(url)
164
except socket.timeout:
165
ievent.reply('%s socket timeout' % url)
167
except urllib2.HTTPError, e:
168
ievent.reply('error: %s' % e)
171
# Element 1 of the urlparse result is the network location (host[:port]).
host = urlparse.urlparse(url)[1]
173
# Shorten the host to 20 characters plus '...' (presumably only when it is
# longer than that; the condition is not visible here).
host = host[0:20] + '...'
174
ievent.reply('%s: %s' % (host, title))
176
ievent.reply('no title found at %s' % urlparse.urlparse(url)[1])
178
# Register the 'snarf' command with handle_snarf as its handler; 'USER' is
# presumably the permission required to run it -- verify against
# gozerbot.commands.
cmnds.add('snarf', handle_snarf, 'USER')
179
# Usage example for the command.
examples.add('snarf', 'fetch the title from an URL', \
180
'snarf http://gozerbot.org')
181
# '@' and 'title' are aliases for 'snarf'.
aliases.data['@'] = 'snarf'
182
aliases.data['title'] = 'snarf'
184
# Command handler: mark snarfing as enabled for the event's target.
# NOTE(review): lines after the assignment (e.g. saving `cfg` or replying to
# the user) are not visible in this view.
def handle_snarf_enable(bot, ievent):
185
""" enable snarfing in channel """
186
# Create the per-bot mapping on first use (dict.has_key is Python 2 only).
if not cfg.data.has_key(bot.name):
187
cfg.data[bot.name] = {}
188
# Flag is stored per bot name and per ievent.printto.
cfg.data[bot.name][ievent.printto] = True
192
# Register 'snarf-enable' with handle_snarf_enable; 'OPER' is presumably the
# permission required -- verify against gozerbot.commands.
cmnds.add('snarf-enable', handle_snarf_enable, 'OPER')
193
examples.add('snarf-enable', 'enable snarfing in the channel', 'snarf-enable')
194
# 'snarf-on' is an alias for 'snarf-enable'.
aliases.data['snarf-on'] = 'snarf-enable'
196
# Command handler: mark snarfing as disabled for the event's target.
# NOTE(review): the body of the `if` below and any lines after the final
# assignment are not visible in this view.
def handle_snarf_disable(bot, ievent):
197
""" disable snarfing in channel """
198
# Checks whether a per-bot mapping exists yet (dict.has_key is Python 2 only).
if not cfg.data.has_key(bot.name):
201
# Flag is stored per bot name and per ievent.printto.
cfg.data[bot.name][ievent.printto] = False
205
# Register 'snarf-disable' with handle_snarf_disable; 'OPER' is presumably
# the permission required -- verify against gozerbot.commands.
# NOTE(review): the examples.add(...) call below ends in a line continuation
# whose following line is not visible in this view.
# The last statement makes 'snarf-off' an alias for 'snarf-disable'.
cmnds.add('snarf-disable', handle_snarf_disable, 'OPER')
206
examples.add('snarf-disable', 'disable snarfing in the channel', \
208
aliases.data['snarf-off'] = 'snarf-disable'
210
# Command handler: reply with the stored snarf targets, grouped per bot name.
# NOTE(review): the loop header over `names`, the initialisation of `snarfs`
# and any filtering on the stored True/False value are not visible in this
# view; as far as the visible lines show, every key of cfg.data[name] is
# listed -- confirm against the full file.
def handle_snarf_list(bot, ievent):
211
""" show channels in which snarfing is enabled """
213
# Top-level keys of cfg.data are bot names (see the enable/disable handlers).
names = cfg.data.keys()
216
targets = cfg.data[name].keys()
218
# One "<bot name>: <target> <target> ..." entry per bot.
snarfs.append('%s: %s' % (name, ' '.join(targets)))
222
ievent.reply('snarfers enable on: %s' % ', '.join(snarfs))
224
# Register 'snarf-list' with handle_snarf_list; 'OPER' is presumably the
# permission required -- verify against gozerbot.commands.
# NOTE(review): the examples.add(...) call below ends in a line continuation
# whose following line is not visible in this view.
cmnds.add('snarf-list', handle_snarf_list, 'OPER')
225
examples.add('snarf-list', 'show in which channels snarfing is enabled', \
228
# Command handler: run a URL through the W3C validator and reply with a
# one-line summary of the report fields.
# NOTE(review): the bare integer lines look like line-number residue, and the
# gaps between them suggest missing source lines (the `try:`/`if`/`else:`/
# `return` statements that tie the lines below together) -- confirm against
# the full file before relying on the control flow described here.
def handle_validate(bot, ievent):
229
""" validate provided url or last url in log """
236
# Take the URL from the 'url' plugin's cache (per the docstring, this is the
# "last url in log" case).
url = plugins['url'].cache.fetch(bot, ievent)
239
# Tell the user that the <url> argument is missing.
ievent.missing('<url>')
244
except urllib2.HTTPError, e:
245
ievent.reply('error: %s' % e)
248
ievent.reply('invalid or bad URL')
250
# `result` is indexed below by the field names 'result', 'modified',
# 'server', 'size', 'content', 'encoding' and 'doctype'.
result = geturl_validate(url)
252
# Element 1 of the urlparse result is the network location (host[:port]).
host = urlparse.urlparse(url)[1]
254
# Shorten the host to 20 characters plus '...' (presumably only when it is
# longer than that; the condition is not visible here).
host = host[0:20] + '...'
255
# The format string has eight %s slots: the host followed by the seven
# fields, in the order given by the list in the comprehension.
ievent.reply('%s: %s | modified: %s | server: %s | size: %s | \
256
content-type: %s | encoding: %s | doctype: %s' % \
257
tuple([host] + [result[x] for x in ['result', 'modified', \
258
'server', 'size', 'content', 'encoding', 'doctype']]))
260
# Register 'validate' with handle_validate; 'USER' is presumably the
# permission required -- verify against gozerbot.commands.
cmnds.add('validate', handle_validate, 'USER')
261
examples.add('validate', 'validate an URL', 'validate http://gozerbot.org')
262
# 'valid' is an alias for 'validate'.
aliases.data['valid'] = 'validate'