~mgedmin/irclog2html/trunk

49 by mg
Rename. Use environment for log location
1
#!/usr/bin/env python
2
"""
3
Search IRC logs (a CGI script).
4
112 by marius at gedmin
irclogsearch: pay attention to $IRCLOG_GLOB.
5
Expects to find logs matching the IRCLOG_GLOB pattern (default: *.log)
6
in the directory specified by the IRCLOG_LOCATION environment variable.
7
Expects the filenames to contain an ISO 8601 date (YYYY-MM-DD).
53 by mg
Document installation.
8
9
Apache configuration example:
10
11
  ScriptAlias /irclogs/search /path/to/irclogsearch.py
12
  <Location /irclogs/search>
13
    SetEnv IRCLOG_LOCATION /path/to/irclog/files/
112 by marius at gedmin
irclogsearch: pay attention to $IRCLOG_GLOB.
14
    # Uncomment the following if your log files use a different format
15
    #SetEnv IRCLOG_GLOB "*.log.????-??-??"
53 by mg
Document installation.
16
  </Location>
17
49 by mg
Rename. Use environment for log location
18
"""
19
20
# Copyright (c) 2006, Marius Gedminas 
21
#
22
# Released under the terms of the GNU GPL
23
# http://www.gnu.org/copyleft/gpl.html
24
25
import cgi
26
import sys
27
import os
28
import re
29
import glob
52 by mg
Add urlquoting (stupid filenames that begin with #).
30
import urllib
49 by mg
Rename. Use environment for log location
31
import datetime
51 by mg
Newest matches first. Show time taken to search.
32
import time
49 by mg
Rename. Use environment for log location
33
34
import cgitb; cgitb.enable()
35
36
from irclog2html import LogParser, XHTMLTableStyle, NickColourizer
99 by Marius Gedminas
Introduce one canonical source for the version number.
37
from irclog2html import VERSION, RELEASE
49 by mg
Rename. Use environment for log location
38
39
# Directory that holds the log files: $IRCLOG_LOCATION, or the directory of
# this script when the variable is unset or empty.
logfile_path = os.getenv('IRCLOG_LOCATION') or os.path.dirname(__file__)

# Glob pattern that selects the log files inside logfile_path: $IRCLOG_GLOB,
# or '*.log' when the variable is unset or empty.
logfile_pattern = os.getenv('IRCLOG_GLOB') or '*.log'
49 by mg
Rename. Use environment for log location
46
47
# Captures year, month and day of a YYYY-MM-DD date in a file name.  The
# leading ".*" is greedy, so when a name contains several dates the last one
# is the one captured.  Raw string: "\d" is not a valid string escape.
DATE_REGEXP = re.compile(r'^.*(\d\d\d\d)-(\d\d)-(\d\d)')
48
49
# Opening part of every page this script emits (doctype through <body>).
# The placeholders are substituted once, when the module is loaded.
HEADER = """\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=%(charset)s" />
  <title>%(title)s</title>
  <link rel="stylesheet" href="irclog.css" />
  <meta name="generator" content="irclogsearch.py %(VERSION)s by Marius Gedminas" />
  <meta name="version" content="%(VERSION)s - %(RELEASE)s" />
</head>
<body>""" % dict(
    VERSION=VERSION,
    RELEASE=RELEASE,
    title=cgi.escape("Search IRC logs"),
    charset='UTF-8',
)
62
63
# Closing part of every page this script emits: the "generated by" credit
# followed by the tags that close <body> and <html>.
FOOTER = """
<div class="generatedby">
<p>Generated by irclogsearch.py %(VERSION)s by <a href="mailto:marius@pov.lt">Marius Gedminas</a>
 - find it at <a href="http://mg.pov.lt/irclog2html/">mg.pov.lt</a>!</p>
</div>
</body>
</html>""" % dict(VERSION=VERSION, RELEASE=RELEASE)
70
71
72
class Error(Exception):
    """Application-level error raised by this script."""
    pass
74
75
76
class SearchStats(object):
    """Counters collected while a search runs.

    The class attributes only provide the zero defaults: ``+=`` on an
    instance stores the new value on that instance and leaves the class
    values untouched.
    """

    # files  -- log files opened
    # lines  -- log entries examined
    # matches -- entries that contained the query
    files = lines = matches = 0
82
83
84
class SearchResult(object):
    """A single log entry that matched the query, plus where it was found."""

    def __init__(self, filename, link, date, time, event, info):
        # Where the entry lives: log file path and the link used to reach it.
        self.filename, self.link = filename, link
        # When it happened: date of the log file and time of the entry.
        self.date, self.time = date, time
        # What happened: event kind and its event-specific payload.
        self.event, self.info = event, info
94
95
96
class SearchResultFormatter(object):
    """Writes search results to standard output using XHTMLTableStyle."""

    def __init__(self):
        self.style = XHTMLTableStyle(sys.stdout)
        self.nick_colour = NickColourizer()

    def print_prefix(self):
        """Print the style's prefix markup (goes before the first row)."""
        print(self.style.prefix)

    def print_html(self, result):
        """Emit one SearchResult through the style, linked to its log page."""
        href = urlescape(result.link)
        if result.event == LogParser.COMMENT:
            # Something a person said: render it in that nick's colour.
            nick, text = result.info
            self.style.nicktext(result.time, nick, text,
                                self.nick_colour[nick], href)
            return
        if result.event == LogParser.NICKCHANGE:
            # Tell the colourizer about the rename before rendering it.
            text, oldnick, newnick = result.info
            self.nick_colour.change(oldnick, newnick)
        else:
            text = result.info
        self.style.servermsg(result.time, result.event, text, href)

    def print_suffix(self):
        """Print the style's suffix markup (goes after the last row)."""
        print(self.style.suffix)
122
123
52 by mg
Add urlquoting (stupid filenames that begin with #).
124
def urlescape(link):
    """Return *link* percent-quoted and HTML-escaped for use in an attribute.

    Log file names may contain characters such as '#' that have a special
    meaning in URLs, so the link is URL-quoted first and the result is then
    escaped (double quotes included) so it can be placed inside href="...".
    """
    # urllib.quote and cgi.escape exist only on Python 2 (cgi.escape was
    # removed in Python 3.8); pick whichever pair this interpreter provides.
    try:
        from html import escape
        from urllib.parse import quote
    except ImportError:
        from cgi import escape
        from urllib import quote
    return escape(quote(link), True)
126
49 by mg
Rename. Use environment for log location
127
def date_from_filename(filename):
    """Return the date embedded in a log file name as a datetime.date.

    Only the base name of *filename* is inspected; directory components are
    ignored.

    Raises Error when the base name contains no YYYY-MM-DD date, or when the
    digits found do not form a valid calendar date (e.g. 2006-13-45).
    """
    basename = os.path.basename(filename)
    m = DATE_REGEXP.match(basename)
    if not m:
        raise Error("File name does not contain a YYYY-MM-DD date: %s"
                    % filename)
    year, month, day = map(int, m.groups())
    try:
        return datetime.date(year, month, day)
    except ValueError:
        # The pattern only checks digit counts; report impossible dates as an
        # application error instead of leaking a bare ValueError.
        raise Error("File name contains an invalid date: %s" % filename)
134
135
136
def link_from_filename(filename):
    """Return the link for a log file: its base name with '.html' appended."""
    return '%s.html' % os.path.basename(filename)
139
140
141
def search_irc_logs(query, stats=None):
    """Yield a SearchResult for every log entry that contains *query*.

    *query* is a UTF-8 encoded byte string; matching is a case-insensitive
    substring test.  For comments the searched text is "nick text", for nick
    changes it is the message text, and for any other event it is
    unicode(info).

    Log files are those matching logfile_pattern inside logfile_path; they
    are visited in reverse name order, so the newest log comes first.

    *stats*, when given, is updated in place (files, lines, matches).  This
    is a generator, so the counters are complete only once it has been fully
    consumed.

    Raises Error (from date_from_filename) when a matching file name does
    not contain a date.
    """
    if stats is None:
        stats = SearchStats()  # will be discarded, but, oh, well
    query = query.decode('UTF-8').lower()
    files = glob.glob(os.path.join(logfile_path, logfile_pattern))
    # ISO-8601 dates sort the right way; reversed puts the newest first.
    files.sort(reverse=True)
    for filename in files:
        date = date_from_filename(filename)
        link = link_from_filename(filename)
        stats.files += 1
        logfile = open(filename)
        try:
            for timestamp, event, info in LogParser(logfile):
                if event == LogParser.COMMENT:
                    nick, text = info
                    text = nick + ' ' + text
                elif event == LogParser.NICKCHANGE:
                    text, oldnick, newnick = info
                else:
                    text = unicode(info)
                stats.lines += 1
                if query in text.lower():
                    stats.matches += 1
                    yield SearchResult(filename, link, date, timestamp,
                                       event, info)
        finally:
            # Close each log as soon as it has been scanned (or the consumer
            # abandons the generator) instead of leaving it to the garbage
            # collector.
            logfile.close()
164
165
166
def print_search_form():
    """Print the CGI response for the initial page: an empty search form."""
    response = (
        "Content-Type: text/html; charset=UTF-8",
        "",  # blank line that ends the CGI headers
        HEADER,
        "<h1>Search IRC logs</h1>",
        '<form action="" method="get">',
        '<input type="text" name="q" />',
        '<input type="submit" />',
        '</form>',
        FOOTER,
    )
    for chunk in response:
        print(chunk)
176
177
178
def print_search_results(query):
    """Print the CGI response listing every log entry that matches *query*.

    The page repeats the search form (pre-filled with the query), then shows
    one list item per log date containing a table of matching entries, and
    ends with a summary line of counts and elapsed time.
    """
    print("Content-Type: text/html; charset=UTF-8")
    print("")
    print(HEADER)
    print("<h1>IRC log search results for %s</h1>" % cgi.escape(query))
    print('<form action="" method="get">')
    print('<input type="text" name="q" value="%s" />'
          % cgi.escape(query, True))
    print('<input type="submit" />')
    print('</form>')

    started = time.time()
    current_date = None   # date of the list item currently open, if any
    table_open = False    # True while a results table awaits its suffix
    formatter = SearchResultFormatter()
    stats = SearchStats()
    for result in search_irc_logs(query, stats):
        if current_date != result.date:
            # A new day starts: finish the previous day's table and item.
            if table_open:
                formatter.print_suffix()
                table_open = False
            if current_date:
                print("  </li>")
            else:
                print('<ul class="searchresults">')
            heading = result.date.strftime('%Y-%m-%d (%A)')
            print('  <li><a href="%s">%s</a>:'
                  % (urlescape(result.link), heading))
            current_date = result.date
        if not table_open:
            formatter.print_prefix()
            table_open = True
        formatter.print_html(result)

    if table_open:
        formatter.print_suffix()
    if current_date:
        print("  </li>")
        print("</ul>")
    elapsed = time.time() - started
    print("<p>%d matches in %d log files with %d lines (%.1f seconds).</p>"
          % (stats.matches, stats.files, stats.lines, elapsed))
    print(FOOTER)
217
218
219
def main():
    """CGI entry point: show the search form, or the results for ?q=...

    The form is shown when the request carries no "q" parameter; otherwise
    the results page for that query is printed.
    """
    form = cgi.FieldStorage()
    # getfirst() returns None when "q" is absent and the first value when it
    # is repeated; form["q"].value would fail on a repeated parameter because
    # the item is then a list.
    search_text = form.getfirst("q")
    if search_text is None:
        print_search_form()
        return
    print_search_results(search_text)
226
227
228
# Handle the request only when run as a script, not when imported.
if __name__ == "__main__":
    main()