~jelmer/lp-dev-utils/loc-contributions-author

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/env python

import datetime, re, sys
from optparse import OptionParser

# Defaults to Apache 'combined' log format.
#
# All format templates are raw strings: they embed regex escapes
# (\" and \.) which would otherwise be invalid string-literal escapes
# (a DeprecationWarning, and eventually an error, on modern Python).
default_log_format = (r'%h %l %u %t \"%r\" %>s %O '
                      r'\"%{Referer}i\" \"%{User-Agent}i\"')

# Log format potentially including %v:%p (virtual host:port) as used
# in other_vhosts log files (i.e. vhosts_combined log format).
other_log_format = r'(%v:%p )?' + default_log_format

# Launchpad apache log format: like 'combined' but optionally carrying
# the request duration in microseconds (%D).
lp_log_format = (r'%h %l %u %t (%D )?\"%r\" %>s %O '
                 r'\"%{Referer}i\" \"%{User-Agent}i\"')
lp_vhosts_log_format = r'(%v:%p )?' + lp_log_format

# Launchpad appserver (zope) trace log format; defines extra named
# groups directly (fractional duration in seconds, account id and the
# traversed LP object) since they have no apache format specifier.
lpapp_log_format = (r'%l - \"(?P<ip>[^\"]*)\" \"%v\" %t \"%r\" %>s %O [0-9]+ '
                    r'(?P<float_duration>[0-9]+\.[0-9]+) '
                    r'[0-9]+ [0-9]+ \"(?P<account>[a-zA-Z0-9]*)\" '
                    r'\"(?P<lp_object>[^\"]*)\" \"%{Referer}i\" '
                    r'\"%{User-Agent}i\"')

# True when parsing apache-style logs (parse_line() then pre-replaces
# escaped quotes); reset from the -l option in parse_args().
use_apache_format = True

# Supported -l/--log-format values, each with a human-readable
# description (for --help) and the format template to compile.
log_formats = {
    'apache': {
        'description': 'Apache vhosts_combined log format with duration',
        'format': lp_vhosts_log_format,
        },
    'zope': {
        'description': 'Launchpad appserver log format',
        'format': lpapp_log_format,
        }
    }


# String to replace an escaped quote with in lines read from log files
# which allows us to have a more optimal regular expression for parsing
# (speed-up of 3x in parsing large log files).
escaped_quote = '<ESCAPED_QUOTE>'

# Month abbreviations in apache %t order; list index + 1 is the month
# number (see parse_date()).
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']


def re_for_log_format(log_format=default_log_format):
    """Provide a regular expression for parsing a certain apache log format.

    Supports only %h, %l, %u, %t, %r, %>s, %O, %D, %T, %v, %p, %{Referer}i
    and %{User-Agent}i format specifiers.

    Anything that can contain a space must be quoted with "".
    """
    log_re = log_format
    log_re = log_re.replace('%h', r'(?P<ip>[^ ]+)')
    log_re = log_re.replace('%l', r'(?P<host>[^ ]+)')
    log_re = log_re.replace('%v', r'(?P<vhost>[^ ]*)')
    log_re = log_re.replace('%p', r'(?P<port>[0-9]+)')
    log_re = log_re.replace('%u', r'(?P<user>[^ ]+)')
    log_re = log_re.replace('%t', r'\[(?P<date>[^\]]+)\]')
    log_re = log_re.replace('%D', r'(?P<duration>[0-9]+)')
    log_re = log_re.replace('%T', r'(?P<duration_seconds>[0-9]+)')
    log_re = log_re.replace('%r', r'(?P<request>.*HTTP[^\"]+)')
    log_re = log_re.replace('%>s', r'(?P<retcode>[0-9]+)')
    log_re = log_re.replace('%O', r'(?P<size>[0-9]+)')
    log_re = log_re.replace('%{Referer}i', r'(?P<referrer>[^\"]*)')
    log_re = log_re.replace('%{User-Agent}i', r'(?P<useragent>.*)')
    return re.compile(log_re)

# Compiled line regex; initialized in __main__ once the log format is
# known (from parse_args()).
log_line = None


def re_for_apache_date():
    """Return a compiled regex for apache %t timestamps,
    e.g. '15/Sep/2011:07:00:00 +0000'."""
    months_re = '|'.join(months)
    return re.compile(
        r"(?P<day>[0-3][0-9])/"
        r"(?P<month>%s)/"
        r"(?P<year>[0-9]{4}):"
        r"(?P<hour>[0-2][0-9]):"
        r"(?P<min>[0-5][0-9]):"
        r"(?P<sec>[0-5][0-9]) "
        r"(?P<tz>[\+\-][0-9]{4})" % months_re)

date_re = re_for_apache_date()


def parse_date(date):
    """Parses a string like '15/Sep/2011:07:00:00 +0000'.

    Returns a naive datetime.datetime object (the timezone offset is
    discarded), or None when the string does not look like an apache
    timestamp.
    """
    match = date_re.match(date)
    if match is None:
        return None
    parts = match.groupdict()
    return datetime.datetime(
        int(parts['year']),
        months.index(parts['month']) + 1,
        int(parts['day']),
        int(parts['hour']),
        int(parts['min']),
        int(parts['sec']))


def parse_line(line):
    """Parse a single log file line and return a dict with parsed information.

    It returns fields `ip', `host', `user', `date', `request', `retcode',
    `size', `referrer' and `useragent' as strings, except `date' which is
    a datetime.datetime object (or None when the date did not parse).

    Returns None (after printing a diagnostic) when the line does not
    match the configured log format.
    """
    # Remove escaped quotes (\") from strings so we can have a much better
    # performing regex which doesn't use ([^\"]|\\\") as the matcher for
    # characters inside a string.
    if use_apache_format:
        line = line.replace('\\"', escaped_quote)
    match = log_line.match(line)
    if match is None:
        # print() function instead of the Python 2 print statement.
        print("Not matched:\n%s" % line)
        return None
    data = match.groupdict()
    data['date'] = parse_date(data['date'])
    # dict.has_key() was removed in Python 3; use the `in` operator.
    # Zope logs report duration as fractional seconds; normalize to
    # microseconds to match apache's %D.
    if 'float_duration' in data:
        data['duration'] = int(float(data['float_duration']) * 1000000)
    return data


def parse_args():
    """Parse the command line.

    Returns a tuple (log_format, from_date, to_date, window, filters)
    and sets the module-global `use_apache_format` flag.  Prints the
    usage message and exits when the mandatory -f/-t options are
    missing.
    """
    global use_apache_format
    parser = OptionParser(
        usage=("Usage: %prog -f FROM_DATE -t TO_DATE [options]\n\n"
               "Parse and filter apache/zope log files passed in "
               "through stdin."))
    parser.add_option(
        "-f", "--from", dest="from_date", metavar="FROM",
        help=("Starting date/time of requests to include in "
              "the report (formatted as YYYY-MM-DD HH:MM:SS)."))
    parser.add_option(
        "-t", "--to", dest="to_date", metavar="TO",
        help=("Ending date/time of requests to include in "
              "the report (formatted as YYYY-MM-DD HH:MM:SS)."))
    # Build the --log-format help text from the registered formats.
    default_format = 'apache'
    descriptions = []
    for name in log_formats:
        description = log_formats[name]['description']
        if name == default_format:
            descriptions.append("'%s' (default, %s)" % (name, description))
        else:
            descriptions.append("'%s' (%s)" % (name, description))
    parser.add_option(
        "-l", "--log-format", dest="log_format", default=default_format,
        metavar="LOG_FORMAT",
        help="Log format, one of %s" % ", ".join(descriptions))
    parser.add_option(
        "-w", "--window", dest="window", default=300, metavar="WINDOW",
        help=("Window size (in seconds) to provide splits by. "
              "Default %default."))
    parser.add_option(
        "-u", "--user-agent", dest="user_agent", default=None,
        metavar="USER_AGENT",
        help="Only match passed-in user-agent.")
    parser.add_option(
        "-i", "--ip", dest="ip", default=None, metavar="IP",
        help="Only match passed-in client IP address.")
    options, args = parser.parse_args()

    # Both endpoints of the date range are mandatory.
    if options.from_date is None or options.to_date is None:
        parser.print_help()
        sys.exit(1)

    timestamp_format = "%Y-%m-%d %H:%M:%S"
    from_date = datetime.datetime.strptime(options.from_date,
                                           timestamp_format)
    to_date = datetime.datetime.strptime(options.to_date, timestamp_format)
    window = datetime.timedelta(0, int(options.window))

    # Optional substring filters on parsed fields.
    filters = {}
    if options.user_agent:
        filters['useragent'] = options.user_agent
    if options.ip:
        filters['ip'] = options.ip

    use_apache_format = (options.log_format == 'apache')
    return (log_formats[options.log_format]['format'],
            from_date, to_date, window, filters)


def matches_filters(data, filters):
    """Return True when `data` satisfies every filter.

    Each filter maps a field name to a substring that must occur in the
    corresponding `data` field.  An empty/missing filter dict matches
    everything.
    """
    if not filters:
        return True
    return all(value in data[field] for field, value in filters.items())


def parse_stdin(min_date, max_date, filters):
    """Read line-by-line from stdin and pass it through parse_line().

    Yields parsed-line dicts whose request date lies in
    [min_date, max_date) and which match `filters`.

    Lines that parse_line() rejects (it returns None) or whose date
    could not be parsed are skipped; the original crashed on both.
    """
    for line in sys.stdin:
        data = parse_line(line)
        if data is None or data['date'] is None:
            # Nothing usable on this line; skip instead of raising
            # TypeError on the date comparison below.
            continue
        if (matches_filters(data, filters) and
                min_date <= data['date'] < max_date):
            yield data


def group_by_field(results, field, value_field=None):
    """Groups and counts dicts in `results` by `field` member.

    If `value_field` is not None, instead of simply counting elements,
    it aggregates the `value_field` member of an element instead.

    Returns a dict with `field` values as the keys pointing to a dict
    with 'aggregate' as the aggregate/count as the value, and 'items'
    as the list of items that have matched.
    """
    grouped = {}
    for result in results:
        if value_field is None:
            # Count number of appearances.
            increment = 1
        else:
            # Aggregate result[value_field] values; a missing or None
            # value contributes 0.
            value = result.get(value_field)
            increment = int(value) if value is not None else 0
        key = result[field]
        # dict.has_key() was removed in Python 3; use `in` instead.
        if key not in grouped:
            grouped[key] = {
                'aggregate': increment,
                'items': [result],
                }
        else:
            grouped[key]['aggregate'] += increment
            grouped[key]['items'].append(result)
    return grouped


def get_top(results, field, value_field=None, count=20):
    """Gets top `count` elements from group_by_field(results, field,
    value_field).

    Sorted from those having the largest aggregate/count value to the
    smallest; ties break on item count and then on the field value.

    Returns a list of (field_value, {'aggregate': ..., 'items': ...})
    tuples.
    """
    grouped = group_by_field(results, field, value_field=value_field)
    # Python 3: dict.iteritems() is gone, and tuple-unpacking lambdas
    # (``lambda (a, b): ...``) are a syntax error -- index the pair.
    sorted_fields = sorted(
        grouped.items(), reverse=True,
        key=lambda item: (
            item[1]['aggregate'], len(item[1]['items']), item[0]))
    return sorted_fields[:count]


def print_top(results, field, value_field=None, count=20, sanitizer=None):
    """Prints elements as returned by get_top().

    `sanitizer`, when given, is applied to each field value before it is
    printed (e.g. normalize_useragent()).
    """
    if value_field is None:
        title = "by request count"
    else:
        title = "by aggregated " + value_field
    # print() calls keep this working under Python 3.
    print("\n"
          "Top %d %ss (%s)\n"
          "=====================================" % (
              count, field, title))
    # Loop variable renamed: the original rebound `field` and shadowed
    # its own parameter inside the loop.
    for field_name, grouped in get_top(results, field, value_field, count):
        if sanitizer is not None:
            field_title = sanitizer(field_name)
        else:
            field_title = field_name
        print("%s: %d" % (field_title, grouped['aggregate']))


def get_window_index_for_date(current_date, min_date, max_date, window):
    """Finds out what 'window' a particular date falls into.

    A window index is a zero-based value indicating in what
     (`min_date` + `window` * window_index,
      `min_date` + `window` * (window_index + 1))
    interval the date falls.
    """
    assert min_date <= current_date < max_date
    # Direct arithmetic replaces the original O(number-of-windows)
    # scan: timedelta floor division yields the number of whole
    # windows between min_date and current_date.
    return int((current_date - min_date) // window)


# Matches lazr.restfulclient (Launchpad API) user agents and captures
# the oauth consumer name.  Raw string avoids the invalid \. escapes;
# the dots in 'lazr.restfulclient' and the version are now escaped so
# they only match literal dots.
ua_re = re.compile(r'lazr\.restfulclient [0-9]+\.[0-9]+\.[0-9]+; '
                   r'oauth_consumer=\"([^\"]*)\"')

def normalize_useragent(value):
    """Collapse Launchpad API client user agents to their oauth
    consumer name; any other user agent is returned unchanged."""
    match = ua_re.match(value)
    if match:
        # match.group(1), not match.groups(1): the latter returns a
        # tuple and only produced the right string by accident of
        # %-formatting a one-element tuple.
        return "%s (API client)" % match.group(1)
    return value


def aggregate(items, values):
    """Sum the named `values` fields over `items`.

    `values` may be a single field name or a list of field names; a
    None field value contributes 0.  Returns a list of int totals, one
    per requested field, in the same order.
    """
    # isinstance() is the idiomatic type check (was type() != list).
    if not isinstance(values, list):
        values = [values]
    totals = []
    for value_field in values:
        totals.append(sum(
            int(item[value_field]) if item[value_field] is not None else 0
            for item in items))
    return totals

if __name__ == '__main__':
    log_format, min_date, max_date, window, filters = parse_args()
    log_line = re_for_log_format(log_format)

    # Parsed requests bucketed by window index (see
    # get_window_index_for_date()).  The original also declared
    # count_by_window/size_by_window but never used them.
    by_window = {}

    results = list(parse_stdin(min_date, max_date, filters))
    count = len(results)
    for data in results:
        # Restore escaped quotes (parse_line() replaced them with a
        # placeholder for faster regex matching).
        for key in data:
            if isinstance(data[key], str):
                data[key] = data[key].replace(escaped_quote, '"')
        window_index = get_window_index_for_date(
            data['date'], min_date, max_date, window)
        by_window.setdefault(window_index, []).append(data)

    print("Requests between %s and %s: %d" % (min_date, max_date, count))
    for key in sorted(by_window):
        window_start = min_date + key * window
        window_count = len(by_window[key])
        window_size = aggregate(by_window[key], ['size', 'duration'])
        print("%s: %d\t%s" % (window_start, window_count, window_size))

    print_top(results, 'useragent', sanitizer=normalize_useragent)
    print_top(results, 'useragent', 'size', sanitizer=normalize_useragent)
    print_top(results, 'useragent', 'duration', sanitizer=normalize_useragent)

    print_top(results, 'ip')
    print_top(results, 'ip', 'size')
    print_top(results, 'ip', 'duration')

    # lp_object is only captured by the zope/appserver log format.
    if not use_apache_format:
        print_top(results, 'lp_object')
        print_top(results, 'lp_object', 'size')
        print_top(results, 'lp_object', 'duration')

    print_top(results, 'request')
    print_top(results, 'request', 'size')
    print_top(results, 'request', 'duration')