|
49
by mg
Rename. Use environment for log location |
1 |
#!/usr/bin/env python
|
2 |
"""
|
|
3 |
Search IRC logs (a CGI script).
|
|
4 |
||
|
112
by marius at gedmin
irclogsearch: pay attention to $IRCLOG_GLOB. |
5 |
Expects to find logs matching the IRCLOG_GLOB pattern (default: *.log)
|
6 |
in the directory specified by the IRCLOG_LOCATION environment variable.
|
|
7 |
Expects the filenames to contain an ISO 8601 date (YYYY-MM-DD).
|
|
|
53
by mg
Document installation. |
8 |
|
9 |
Apache configuration example:
|
|
10 |
||
11 |
ScriptAlias /irclogs/search /path/to/irclogsearch.py
|
|
12 |
<Location /irclogs/search>
|
|
13 |
SetEnv IRCLOG_LOCATION /path/to/irclog/files/
|
|
|
112
by marius at gedmin
irclogsearch: pay attention to $IRCLOG_GLOB. |
14 |
# Uncomment the following if your log files use a different format
|
15 |
#SetEnv IRCLOG_GLOB "*.log.????-??-??"
|
|
|
53
by mg
Document installation. |
16 |
</Location>
|
17 |
||
|
49
by mg
Rename. Use environment for log location |
18 |
"""
|
19 |
||
20 |
# Copyright (c) 2006, Marius Gedminas
|
|
21 |
#
|
|
22 |
# Released under the terms of the GNU GPL
|
|
23 |
# http://www.gnu.org/copyleft/gpl.html
|
|
24 |
||
25 |
import cgi |
|
26 |
import sys |
|
27 |
import os |
|
28 |
import re |
|
29 |
import glob |
|
|
52
by mg
Add urlquoting (stupid filenames that begin with #). |
30 |
import urllib |
|
49
by mg
Rename. Use environment for log location |
31 |
import datetime |
|
51
by mg
Newest matches first. Show time taken to search. |
32 |
import time |
|
49
by mg
Rename. Use environment for log location |
33 |
|
34 |
import cgitb; cgitb.enable() |
|
35 |
||
36 |
from irclog2html import LogParser, XHTMLTableStyle, NickColourizer |
|
|
99
by Marius Gedminas
Introduce one canonical source for the version number. |
37 |
from irclog2html import VERSION, RELEASE |
|
49
by mg
Rename. Use environment for log location |
38 |
|
39 |
# Directory that holds the log files: $IRCLOG_LOCATION, falling back to the
# directory of this script when the variable is unset or empty.
logfile_path = os.getenv('IRCLOG_LOCATION') or os.path.dirname(__file__)

# Glob pattern selecting the log files inside that directory: $IRCLOG_GLOB,
# falling back to '*.log' when the variable is unset or empty.
logfile_pattern = os.getenv('IRCLOG_GLOB') or '*.log'
|
|
49
by mg
Rename. Use environment for log location |
46 |
|
47 |
# Extracts (year, month, day) as strings from a log file name.  The leading
# greedy ".*" means that when a name contains several date-like substrings,
# the last one is the one captured.  Written as a raw string so that "\d"
# reaches the regex engine verbatim instead of relying on Python passing
# unrecognised string escapes through unchanged.
DATE_REGEXP = re.compile(r'^.*(\d\d\d\d)-(\d\d)-(\d\d)')
|
48 |
||
49 |
# Page prologue (DOCTYPE through the opening <body> tag), rendered once at
# import time.  The title is fixed and HTML-escaped; the charset is UTF-8.
HEADER = """\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=%(charset)s" />
<title>%(title)s</title>
<link rel="stylesheet" href="irclog.css" />
<meta name="generator" content="irclogsearch.py %(VERSION)s by Marius Gedminas" />
<meta name="version" content="%(VERSION)s - %(RELEASE)s" />
</head>
<body>""" % {
    'charset': 'UTF-8',
    'title': cgi.escape("Search IRC logs"),
    'VERSION': VERSION,
    'RELEASE': RELEASE,
}
|
62 |
||
63 |
# Page epilogue: the "generated by" credit followed by the closing </body>
# and </html> tags, rendered once at import time.
FOOTER = """
<div class="generatedby">
<p>Generated by irclogsearch.py %(VERSION)s by <a href="mailto:marius@pov.lt">Marius Gedminas</a>
- find it at <a href="http://mg.pov.lt/irclog2html/">mg.pov.lt</a>!</p>
</div>
</body>
</html>""" % {
    'VERSION': VERSION,
    'RELEASE': RELEASE,
}
|
70 |
||
71 |
||
72 |
class Error(Exception):
    """Application-level error raised by this script.

    Raised, for instance, when a log file name carries no YYYY-MM-DD date.
    """
|
|
74 |
||
75 |
||
76 |
class SearchStats(object):
    """Running totals gathered while a search is in progress."""

    # files   -- number of log files opened
    # lines   -- number of log events examined
    # matches -- number of events that contained the query
    #
    # The class-level zeros act as defaults; ``+=`` on an instance rebinds
    # the attribute on that instance only.
    files = lines = matches = 0
|
82 |
||
83 |
||
84 |
class SearchResult(object):
    """A single matching log event plus the place it was found."""

    def __init__(self, filename, link, date, time, event, info):
        # Where the hit lives: the log file, the link to its HTML page, and
        # the date taken from the file name.
        self.filename, self.link, self.date = filename, link, date
        # The event itself, exactly as the log parser produced it.
        self.time, self.event, self.info = time, event, info
|
94 |
||
95 |
||
96 |
class SearchResultFormatter(object):
    """Writes search results to standard output via XHTMLTableStyle."""

    def __init__(self):
        self.style = XHTMLTableStyle(sys.stdout)
        self.nick_colour = NickColourizer()

    def print_prefix(self):
        """Print whatever the style emits before a run of results."""
        print(self.style.prefix)

    def print_html(self, result):
        """Render one SearchResult, linking it to its log page."""
        href = urlescape(result.link)
        event = result.event

        if event == LogParser.COMMENT:
            # An utterance: info is (nick, text), coloured per nick.
            nick, text = result.info
            self.style.nicktext(result.time, nick, text,
                                self.nick_colour[nick], href)
            return

        if event == LogParser.NICKCHANGE:
            # Tell the colourizer about the rename before printing.
            text, oldnick, newnick = result.info
            self.nick_colour.change(oldnick, newnick)
        else:
            text = result.info
        self.style.servermsg(result.time, event, text, href)

    def print_suffix(self):
        """Print whatever the style emits after a run of results."""
        print(self.style.suffix)
|
122 |
||
123 |
||
|
52
by mg
Add urlquoting (stupid filenames that begin with #). |
124 |
def urlescape(link): |
125 |
return cgi.escape(urllib.quote(link), True) |
|
126 |
||
|
49
by mg
Rename. Use environment for log location |
127 |
def date_from_filename(filename):
    """Return the datetime.date embedded in the base name of *filename*.

    Only the last path component is inspected.  Raises Error when it does
    not match DATE_REGEXP.
    """
    match = DATE_REGEXP.match(os.path.basename(filename))
    if match is None:
        raise Error("File name does not contain a YYYY-MM-DD date: %s"
                    % filename)
    year, month, day = [int(part) for part in match.groups()]
    return datetime.date(year, month, day)
|
134 |
||
135 |
||
136 |
def link_from_filename(filename):
    """Return the relative link to the HTML page of a log file.

    This is the file's base name with '.html' appended.
    """
    return '%s.html' % os.path.basename(filename)
|
139 |
||
140 |
||
141 |
def search_irc_logs(query, stats=None):
    """Yield a SearchResult for every log event that contains *query*.

    query -- UTF-8 encoded byte string; matching is a case-insensitive
             substring test.
    stats -- optional SearchStats updated in place as files are opened,
             events examined and matches found.

    Files matching logfile_pattern inside logfile_path are visited in
    reverse name order.  date_from_filename() raises Error for a file whose
    name carries no date.  For comments the nick is searched together with
    the text.
    """
    if stats is None:
        stats = SearchStats()  # will be discarded, but, oh, well
    query = query.decode('UTF-8').lower()
    files = glob.glob(os.path.join(logfile_path, logfile_pattern))
    files.sort()     # ISO-8601 dates sort the right way
    files.reverse()  # newest first
    for filename in files:
        date = date_from_filename(filename)
        link = link_from_filename(filename)
        stats.files += 1
        logfile = open(filename)
        for time, event, info in LogParser(logfile):
            if event == LogParser.COMMENT:
                nick, text = info
                text = nick + ' ' + text
            elif event == LogParser.NICKCHANGE:
                text, oldnick, newnick = info
            else:
                text = unicode(info)
            stats.lines += 1
            if query in text.lower():
                stats.matches += 1
                yield SearchResult(filename, link, date, time, event, info)
        # Release the handle as soon as the file has been scanned rather than
        # leaving it to garbage collection.  Deliberately not a try/finally:
        # that form is not allowed around ``yield`` before Python 2.5.  If the
        # consumer abandons the generator mid-file, closing is still left to
        # garbage collection.
        logfile.close()
|
164 |
||
165 |
||
166 |
def print_search_form():
    """Print the full CGI response (headers and page) for an empty search form."""
    response = (
        "Content-Type: text/html; charset=UTF-8",
        "",  # blank line terminates the HTTP headers
        HEADER,
        "<h1>Search IRC logs</h1>",
        '<form action="" method="get">',
        '<input type="text" name="q" />',
        '<input type="submit" />',
        '</form>',
        FOOTER,
    )
    for chunk in response:
        print(chunk)
|
176 |
||
177 |
||
178 |
def print_search_results(query):
    """Print the full CGI response listing every match for *query*.

    Results are grouped per date: each date becomes one list item holding
    a link to that day's log page and a table of matching events.  A
    summary line with the search statistics and the elapsed time follows.
    """
    print("Content-Type: text/html; charset=UTF-8")
    print("")  # blank line terminates the HTTP headers
    print(HEADER)
    print("<h1>IRC log search results for %s</h1>" % cgi.escape(query))
    print('<form action="" method="get">')
    print('<input type="text" name="q" value="%s" />'
          % cgi.escape(query, True))
    print('<input type="submit" />')
    print('</form>')

    started = time.time()
    formatter = SearchResultFormatter()
    stats = SearchStats()
    current_date = None  # date of the list item currently open, if any
    last_result = None   # set while a result table is open
    for result in search_irc_logs(query, stats):
        if current_date != result.date:
            # Date boundary: finish the open table, then either close the
            # previous list item or, on the very first result, open the list.
            if last_result:
                formatter.print_suffix()
                last_result = None
            if current_date:
                print(" </li>")
            else:
                print('<ul class="searchresults">')
            href = urlescape(result.link)
            day_label = result.date.strftime('%Y-%m-%d (%A)')
            print(' <li><a href="%s">%s</a>:' % (href, day_label))
            current_date = result.date
        if not last_result:
            formatter.print_prefix()
        formatter.print_html(result)
        last_result = result

    # Close whatever is still open after the last result.
    if last_result:
        formatter.print_suffix()
    if current_date:
        print(" </li>")
        print("</ul>")

    total_time = time.time() - started
    print("<p>%d matches in %d log files with %d lines (%.1f seconds).</p>"
          % (stats.matches, stats.files, stats.lines, total_time))
    print(FOOTER)
217 |
||
218 |
||
219 |
def main():
    """CGI entry point.

    Shows the empty search form when no "q" parameter was submitted,
    otherwise runs the search and prints the results page.
    """
    form = cgi.FieldStorage()
    # getfirst() copes with a repeated parameter (?q=a&q=b): in that case
    # form["q"] is a list, which has no .value attribute.  It returns None
    # when the parameter is absent.
    search_text = form.getfirst("q")
    if search_text is None:
        print_search_form()
        return
    print_search_results(search_text)


if __name__ == '__main__':
    main()