~jelmer/lp-dev-utils/loc-contributions-author

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/env python

import datetime, re, sys
from optparse import OptionParser

# Defaults to Apache 'combined' log format.
#
# All format templates are raw strings: they embed regex escapes
# (\" and \.) which would otherwise be invalid string-literal escapes
# (a DeprecationWarning, and eventually an error, on modern Python).
default_log_format = (r'%h %l %u %t \"%r\" %>s %O '
                      r'\"%{Referer}i\" \"%{User-Agent}i\"')

# Log format potentially including %v:%p (virtual host:port) as used
# in other_vhosts log files (i.e. vhosts_combined log format).
other_log_format = r'(%v:%p )?' + default_log_format

# Launchpad apache log format: like 'combined' but optionally carrying
# the request duration in microseconds (%D).
lp_log_format = (r'%h %l %u %t (%D )?\"%r\" %>s %O '
                 r'\"%{Referer}i\" \"%{User-Agent}i\"')
lp_vhosts_log_format = r'(%v:%p )?' + lp_log_format

# Launchpad appserver (zope) trace log format; defines extra named
# groups directly (fractional duration in seconds, account id and the
# traversed LP object) since they have no apache format specifier.
lpapp_log_format = (r'%l - \"(?P<ip>[^\"]*)\" \"%v\" %t \"%r\" %>s %O [0-9]+ '
                    r'(?P<float_duration>[0-9]+\.[0-9]+) '
                    r'[0-9]+ [0-9]+ \"(?P<account>[a-zA-Z0-9]*)\" '
                    r'\"(?P<lp_object>[^\"]*)\" \"%{Referer}i\" '
                    r'\"%{User-Agent}i\"')

# True when parsing apache-style logs (parse_line() then pre-replaces
# escaped quotes); reset from the -l option in parse_args().
use_apache_format = True

# Supported -l/--log-format values, each with a human-readable
# description (for --help) and the format template to compile.
log_formats = {
    'apache': {
        'description': 'Apache vhosts_combined log format with duration',
        'format': lp_vhosts_log_format,
        },
    'zope': {
        'description': 'Launchpad appserver log format',
        'format': lpapp_log_format,
        }
    }


# String to replace an escaped quote with in lines read from log files
# which allows us to have a more optimal regular expression for parsing
# (speed-up of 3x in parsing large log files).
escaped_quote = '<ESCAPED_QUOTE>'

# Month abbreviations in apache %t order; list index + 1 is the month
# number (see parse_date()).
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']


def re_for_log_format(log_format=default_log_format):
    """Provide a regular expression for parsing a certain apache log format.

    Supports only %h, %l, %u, %t, %r, %>s, %O, %D, %T, %v, %p, %{Referer}i
    and %{User-Agent}i format specifiers.

    Anything that can contain a space must be quoted with "".
    """
    log_re = log_format
    log_re = log_re.replace('%h', r'(?P<ip>[^ ]+)')
    log_re = log_re.replace('%l', r'(?P<host>[^ ]+)')
    log_re = log_re.replace('%v', r'(?P<vhost>[^ ]*)')
    log_re = log_re.replace('%p', r'(?P<port>[0-9]+)')
    log_re = log_re.replace('%u', r'(?P<user>[^ ]+)')
    log_re = log_re.replace('%t', r'\[(?P<date>[^\]]+)\]')
    log_re = log_re.replace('%D', r'(?P<duration>[0-9]+)')
    log_re = log_re.replace('%T', r'(?P<duration_seconds>[0-9]+)')
    log_re = log_re.replace('%r', r'(?P<request>.*HTTP[^\"]+)')
    log_re = log_re.replace('%>s', r'(?P<retcode>[0-9]+)')
    log_re = log_re.replace('%O', r'(?P<size>[0-9]+)')
    log_re = log_re.replace('%{Referer}i', r'(?P<referrer>[^\"]*)')
    log_re = log_re.replace('%{User-Agent}i', r'(?P<useragent>.*)')
    return re.compile(log_re)

# Compiled line regex; initialized in __main__ once the log format is
# known (from parse_args()).
log_line = None


def re_for_apache_date():
    """Return a compiled regex for apache %t timestamps,
    e.g. '15/Sep/2011:07:00:00 +0000'."""
    months_re = '|'.join(months)
    return re.compile(
        r"(?P<day>[0-3][0-9])/"
        r"(?P<month>%s)/"
        r"(?P<year>[0-9]{4}):"
        r"(?P<hour>[0-2][0-9]):"
        r"(?P<min>[0-5][0-9]):"
        r"(?P<sec>[0-5][0-9]) "
        r"(?P<tz>[\+\-][0-9]{4})" % months_re)

date_re = re_for_apache_date()


def parse_date(date):
    """Parses a string like '15/Sep/2011:07:00:00 +0000'.

    Returns a naive datetime.datetime object (the timezone offset is
    discarded), or None when the string does not look like an apache
    timestamp.
    """
    match = date_re.match(date)
    if match is None:
        return None
    parts = match.groupdict()
    return datetime.datetime(
        int(parts['year']),
        months.index(parts['month']) + 1,
        int(parts['day']),
        int(parts['hour']),
        int(parts['min']),
        int(parts['sec']))


def parse_line(line):
    """Parse a single log file line and return a dict with parsed information.

    It returns fields `ip', `host', `user', `date', `request', `retcode',
    `size', `referrer' and `useragent' as strings, except `date' which is
    a datetime.datetime object (or None when the date did not parse).

    Returns None (after printing a diagnostic) when the line does not
    match the configured log format.
    """
    # Remove escaped quotes (\") from strings so we can have a much better
    # performing regex which doesn't use ([^\"]|\\\") as the matcher for
    # characters inside a string.
    if use_apache_format:
        line = line.replace('\\"', escaped_quote)
    match = log_line.match(line)
    if match is None:
        # print() function instead of the Python 2 print statement.
        print("Not matched:\n%s" % line)
        return None
    data = match.groupdict()
    data['date'] = parse_date(data['date'])
    # dict.has_key() was removed in Python 3; use the `in` operator.
    # Zope logs report duration as fractional seconds; normalize to
    # microseconds to match apache's %D.
    if 'float_duration' in data:
        data['duration'] = int(float(data['float_duration']) * 1000000)
    return data


def parse_args():
    """Parse the command line.

    Returns a tuple (log_format, from_date, to_date, window, filters)
    and sets the module-global `use_apache_format` flag.  Prints the
    usage message and exits when the mandatory -f/-t options are
    missing.
    """
    global use_apache_format
    parser = OptionParser(
        usage=("Usage: %prog -f FROM_DATE -t TO_DATE [options]\n\n"
               "Parse and filter apache/zope log files passed in "
               "through stdin."))
    parser.add_option(
        "-f", "--from", dest="from_date", metavar="FROM",
        help=("Starting date/time of requests to include in "
              "the report (formatted as YYYY-MM-DD HH:MM:SS)."))
    parser.add_option(
        "-t", "--to", dest="to_date", metavar="TO",
        help=("Ending date/time of requests to include in "
              "the report (formatted as YYYY-MM-DD HH:MM:SS)."))
    # Build the --log-format help text from the registered formats.
    default_format = 'apache'
    descriptions = []
    for name in log_formats:
        description = log_formats[name]['description']
        if name == default_format:
            descriptions.append("'%s' (default, %s)" % (name, description))
        else:
            descriptions.append("'%s' (%s)" % (name, description))
    parser.add_option(
        "-l", "--log-format", dest="log_format", default=default_format,
        metavar="LOG_FORMAT",
        help="Log format, one of %s" % ", ".join(descriptions))
    parser.add_option(
        "-w", "--window", dest="window", default=300, metavar="WINDOW",
        help=("Window size (in seconds) to provide splits by. "
              "Default %default."))
    parser.add_option(
        "-u", "--user-agent", dest="user_agent", default=None,
        metavar="USER_AGENT",
        help="Only match passed-in user-agent.")
    parser.add_option(
        "-i", "--ip", dest="ip", default=None, metavar="IP",
        help="Only match passed-in client IP address.")
    options, args = parser.parse_args()

    # Both endpoints of the date range are mandatory.
    if options.from_date is None or options.to_date is None:
        parser.print_help()
        sys.exit(1)

    timestamp_format = "%Y-%m-%d %H:%M:%S"
    from_date = datetime.datetime.strptime(options.from_date,
                                           timestamp_format)
    to_date = datetime.datetime.strptime(options.to_date, timestamp_format)
    window = datetime.timedelta(0, int(options.window))

    # Optional substring filters on parsed fields.
    filters = {}
    if options.user_agent:
        filters['useragent'] = options.user_agent
    if options.ip:
        filters['ip'] = options.ip

    use_apache_format = (options.log_format == 'apache')
    return (log_formats[options.log_format]['format'],
            from_date, to_date, window, filters)


def matches_filters(data, filters):
    """Return True when `data` satisfies every filter.

    Each filter maps a field name to a substring that must occur in the
    corresponding `data` field.  An empty/missing filter dict matches
    everything.
    """
    if not filters:
        return True
    return all(value in data[field] for field, value in filters.items())


def parse_stdin(min_date, max_date, filters):
    """Read line-by-line from stdin and pass it through parse_line().

    Yields parsed-line dicts whose request date lies in
    [min_date, max_date) and which match `filters`.

    Lines that parse_line() rejects (it returns None) or whose date
    could not be parsed are skipped; the original crashed on both.
    """
    for line in sys.stdin:
        data = parse_line(line)
        if data is None or data['date'] is None:
            # Nothing usable on this line; skip instead of raising
            # TypeError on the date comparison below.
            continue
        if (matches_filters(data, filters) and
                min_date <= data['date'] < max_date):
            yield data


def group_by_field(results, field, value_field=None):
    """Groups and counts dicts in `results` by `field` member.

    If `value_field` is not None, instead of simply counting elements,
    it aggregates the `value_field` member of an element instead.

    Returns a dict with `field` values as the keys pointing to a dict
    with 'aggregate' as the aggregate/count as the value, and 'items'
    as the list of items that have matched.
    """
    grouped = {}
    for result in results:
        if value_field is None:
            # Count number of appearances.
            increment = 1
        else:
            # Aggregate result[value_field] values; a missing or None
            # value contributes 0.
            value = result.get(value_field)
            increment = int(value) if value is not None else 0
        key = result[field]
        # dict.has_key() was removed in Python 3; use `in` instead.
        if key not in grouped:
            grouped[key] = {
                'aggregate': increment,
                'items': [result],
                }
        else:
            grouped[key]['aggregate'] += increment
            grouped[key]['items'].append(result)
    return grouped


def get_top(results, field, value_field=None, count=20):
    """Gets top `count` elements from group_by_field(results, field,
    value_field).

    Sorted from those having the largest aggregate/count value to the
    smallest; ties break on item count and then on the field value.

    Returns a list of (field_value, {'aggregate': ..., 'items': ...})
    tuples.
    """
    grouped = group_by_field(results, field, value_field=value_field)
    # Python 3: dict.iteritems() is gone, and tuple-unpacking lambdas
    # (``lambda (a, b): ...``) are a syntax error -- index the pair.
    sorted_fields = sorted(
        grouped.items(), reverse=True,
        key=lambda item: (
            item[1]['aggregate'], len(item[1]['items']), item[0]))
    return sorted_fields[:count]


def print_top(results, field, value_field=None, count=20, sanitizer=None):
    """Prints elements as returned by get_top().

    `sanitizer`, when given, is applied to each field value before it is
    printed (e.g. normalize_useragent()).
    """
    if value_field is None:
        title = "by request count"
    else:
        title = "by aggregated " + value_field
    # print() calls keep this working under Python 3.
    print("\n"
          "Top %d %ss (%s)\n"
          "=====================================" % (
              count, field, title))
    # Loop variable renamed: the original rebound `field` and shadowed
    # its own parameter inside the loop.
    for field_name, grouped in get_top(results, field, value_field, count):
        if sanitizer is not None:
            field_title = sanitizer(field_name)
        else:
            field_title = field_name
        print("%s: %d" % (field_title, grouped['aggregate']))


def get_window_index_for_date(current_date, min_date, max_date, window):
    """Finds out what 'window' a particular date falls into.

    A window index is a zero-based value indicating in what
     (`min_date` + `window` * window_index,
      `min_date` + `window` * (window_index + 1))
    interval the date falls.
    """
    assert min_date <= current_date < max_date
    # Direct arithmetic replaces the original O(number-of-windows)
    # scan: timedelta floor division yields the number of whole
    # windows between min_date and current_date.
    return int((current_date - min_date) // window)


# Matches lazr.restfulclient (Launchpad API) user agents and captures
# the oauth consumer name.  Raw string avoids the invalid \. escapes;
# the dots in 'lazr.restfulclient' and the version are now escaped so
# they only match literal dots.
ua_re = re.compile(r'lazr\.restfulclient [0-9]+\.[0-9]+\.[0-9]+; '
                   r'oauth_consumer=\"([^\"]*)\"')

def normalize_useragent(value):
    """Collapse Launchpad API client user agents to their oauth
    consumer name; any other user agent is returned unchanged."""
    match = ua_re.match(value)
    if match:
        # match.group(1), not match.groups(1): the latter returns a
        # tuple and only produced the right string by accident of
        # %-formatting a one-element tuple.
        return "%s (API client)" % match.group(1)
    return value


def aggregate(items, values):
    """Sum the named `values` fields over `items`.

    `values` may be a single field name or a list of field names; a
    None field value contributes 0.  Returns a list of int totals, one
    per requested field, in the same order.
    """
    # isinstance() is the idiomatic type check (was type() != list).
    if not isinstance(values, list):
        values = [values]
    totals = []
    for value_field in values:
        totals.append(sum(
            int(item[value_field]) if item[value_field] is not None else 0
            for item in items))
    return totals

if __name__ == '__main__':
    log_format, min_date, max_date, window, filters = parse_args()
    log_line = re_for_log_format(log_format)

    # Parsed requests bucketed by window index (see
    # get_window_index_for_date()).  The original also declared
    # count_by_window/size_by_window but never used them.
    by_window = {}

    results = list(parse_stdin(min_date, max_date, filters))
    count = len(results)
    for data in results:
        # Restore escaped quotes (parse_line() replaced them with a
        # placeholder for faster regex matching).
        for key in data:
            if isinstance(data[key], str):
                data[key] = data[key].replace(escaped_quote, '"')
        window_index = get_window_index_for_date(
            data['date'], min_date, max_date, window)
        by_window.setdefault(window_index, []).append(data)

    print("Requests between %s and %s: %d" % (min_date, max_date, count))
    for key in sorted(by_window):
        window_start = min_date + key * window
        window_count = len(by_window[key])
        window_size = aggregate(by_window[key], ['size', 'duration'])
        print("%s: %d\t%s" % (window_start, window_count, window_size))

    print_top(results, 'useragent', sanitizer=normalize_useragent)
    print_top(results, 'useragent', 'size', sanitizer=normalize_useragent)
    print_top(results, 'useragent', 'duration', sanitizer=normalize_useragent)

    print_top(results, 'ip')
    print_top(results, 'ip', 'size')
    print_top(results, 'ip', 'duration')

    # lp_object is only captured by the zope/appserver log format.
    if not use_apache_format:
        print_top(results, 'lp_object')
        print_top(results, 'lp_object', 'size')
        print_top(results, 'lp_object', 'duration')

    print_top(results, 'request')
    print_top(results, 'request', 'size')
    print_top(results, 'request', 'duration')