~ubuntu-reports-dev/ubuntu-reports/trunk : contents of udd-similar-bug-titles.py at revision 169

~ubuntu-reports-dev/ubuntu-reports/trunk : (revision 169)
#!/usr/bin/python
#
# Author: Brian Murray <brian@canonical.com>
# Copyright (C) 2011 Canonical, Ltd.
# License: GPLv3
#
# Using a local copy of the UDD (Ultimate Debian Database), find bug reports
# about Ubuntu and/or Debian that have similar titles

from launchpadlib.errors import HTTPError
from operator import itemgetter

import datetime
import lpl_common
import Levenshtein
import optparse
import psycopg2


def connect_to_db():
    ''' Connect to the 'udd' database on localhost and return a cursor.

    The connection itself is not returned; it is only reachable through
    the cursor that is handed back.
    '''
    settings = ('localhost', 'udd', 'udd')
    dsn = "host=%s dbname=%s user=%s password=password" % settings
    return psycopg2.connect(dsn).cursor()


def prepare_link(row):
    ''' Return a printable reference to the bug described by row.

    row is a (bug number, (title, tracker, created)) pair.  Launchpad bugs
    are rendered as "LP: #<number>", Debian bugs as the URL of the report
    on bugs.debian.org.  Any other tracker yields None.
    '''
    tracker = row[1][1]
    if tracker == 'launchpad':
        return "LP: #%s" % row[0]
    if tracker == 'debian':
        return "http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=%s" % row[0]
    return None


def check_lp_skippable(number):
    ''' Skip bugs that are duplicates, private or have debian bug watches.

    number is the Launchpad bug number; the bug is looked up through the
    module level Launchpad connection ``lp``.

    Returns True when the bug should be left out of the report and False
    otherwise (always a real boolean, never None).
    '''
    bug = lp.bugs[number]
    try:
        duplicate_of = bug.duplicate_of
    except (HTTPError, ValueError):
        # HTTPError: skip private bugs
        # ValueError: the bug could not be read either, skip it as well
        return True
    if duplicate_of:
        return True
    # walk the watches only once; stop at the first one pointing at Debian
    for bug_watch in bug.bug_watches:
        #if bug_watch.bug_tracker.name == 'debbugs':
        # if the bug has a debian bug watch skip it
        if 'bugs.debian.org' in (bug_watch.url or ''):
            return True
    return False


# Main script: for every requested source package, load the titles of its
# Launchpad and Debian bug reports from UDD, normalise them, sort them and
# print neighbouring titles whose Levenshtein ratio shows they are similar.

# Release names are removed from titles together with the space following
# them; if you don't remove the space there would be a double space
RELEASES = ('breezy ', 'dapper ', 'edgy ', 'feisty ', 'gutsy ',
    'hardy ', 'intrepid ', 'jaunty ', 'karmic ', 'lucid ',
    'maverick ', 'natty ', 'oneiric ')

# make contracted words uniform
CONTRACTIONS = (('''can't''', 'cannot'),
    ('''doesn't''', 'does not'),
    ('''isn't''', 'is not'))


def _clean_title(raw_title, pkg_name):
    ''' Normalise a bug title so that similar reports compare as similar.

    Returns the lower-cased, cleaned title.  Returns None when the bug
    should be left out of the comparison: the title has no lower() method
    or it cannot be compared with the package name.
    '''
    # lower case all of the titles
    try:
        title = raw_title.lower()
    except AttributeError:
        # presumably a missing (NULL) title
        return None
    # perhaps using regex would be better
    title = title.strip('"')
    title = title.strip("'")
    # brackets are frequently seen with release
    title = title.strip('[')
    title = title.replace(']', '')
    for contraction, expanded in CONTRACTIONS:
        title = title.replace(contraction, expanded)
    # release in title is not useful
    for release in RELEASES:
        title = title.replace(release, '')
    # package in start of title is redundant; only the leading occurrence
    # is removed, first in the "pkg " form and then in the "pkg: " form
    for separator in (' ', ': '):
        prefix = '%s%s' % (pkg_name, separator)
        try:
            if title.startswith(prefix):
                title = title[len(prefix):]
        except UnicodeDecodeError:
            # presumably a byte string title with non-ASCII data that can
            # not be compared with the package name; skip the bug
            return None
    return title.strip()


def _print_match(ratio, first, second):
    ''' Print the ratio followed by the two matching rows of sorted_titles.'''
    # single argument print(...) behaves the same under Python 2 and 3
    print("%s" % ratio)
    print("\t%s: %s" % (prepare_link(first), first[1][0]))
    print("\t%s: %s" % (prepare_link(second), second[1][0]))


# parse the command line first so that a bad invocation fails before any
# connection to Launchpad or the database is attempted
parser = optparse.OptionParser(
    usage="%prog (--package PACKAGE | --team TEAM) [options]")
parser.add_option('--package', help='source package name')
parser.add_option('--fixed', help='Search archived debian bugs',
    default=False, action="store_true")
parser.add_option('--team',
    help='Display bugs about packages TEAM is subscribed to')
parser.add_option('--within', type='int',
    help='Compare bugs created since X days ago')
parser.add_option('--different',
    help='Only show bugs from different bug trackers',
    default=False, action="store_true")
(opt, args) = parser.parse_args()
if not opt.package and not opt.team:
    parser.error('either --package or --team is required')

lp = lpl_common.connect(use_edge=False, version='devel')

cur = connect_to_db()

package = opt.package
ubuntu = lp.projects['ubuntu']

if opt.team:
    team = lp.people[opt.team]
    pkgs = team.getBugSubscriberPackages()
else:
    pkgs = [ubuntu.getSourcePackage(name=package)]

# the cut-off for --within does not depend on the package, compute it once
if opt.within is not None:
    target_date = datetime.datetime.now() - datetime.timedelta(opt.within)
else:
    target_date = None

for source_package in pkgs:
    pkg_name = source_package.name
    # bug_tracker, bug_number, title, date
    # 2011-09-30: need to filter on some date thing to make it ones changed
    # since a certain date
    # the package name is passed as a query parameter so that the database
    # driver takes care of the quoting
    cur.execute('''SELECT 'launchpad', ubuntu_bugs.bug, ubuntu_bugs.title, ubuntu_bugs.date_reported
                   FROM ubuntu_bugs INNER JOIN ubuntu_bugs_tasks
                   ON ubuntu_bugs.bug = ubuntu_bugs_tasks.bug
                   WHERE ubuntu_bugs.duplicate_of is NULL
                   AND ubuntu_bugs_tasks.package = %s AND
                   ubuntu_bugs.bug NOT IN
                   (SELECT ubuntu_bugs_tags.bug FROM ubuntu_bugs_tags
                    WHERE ubuntu_bugs_tags.tag = 'apport-package')''',
                (pkg_name,))
    results = cur.fetchall()
    if opt.fixed:
        # the date column is selected as well because every row is later
        # read up to index 3
        # NOTE(review): assumes archived_bugs has an arrival column like
        # bugs does - confirm against the UDD schema
        cur.execute('''SELECT 'debian', archived_bugs.id, archived_bugs.title,
                       archived_bugs.arrival
                       FROM archived_bugs WHERE source = %s''', (pkg_name,))
    else:
        # the merged bit was taken from an example query at the UDD page and
        # is suspect as I don't understand it exactly
        cur.execute('''SELECT 'debian', bugs.id, bugs.title, bugs.arrival
                    FROM bugs WHERE
                    source = %s
                    AND (NOT (bugs.id IN
                    (SELECT DISTINCT bugs_merged_with.id FROM bugs_merged_with
                     WHERE (bugs_merged_with.id > bugs_merged_with.merged_with))
                    ))''', (pkg_name,))
    deb_results = cur.fetchall()
    # do some pre-cleaning of debian titles
    for deb_result in deb_results:
        deb_title = (deb_result[2] or '').lower()
        if 'intl:' in deb_title or 'translation' in deb_title:
            continue
        results.append(deb_result)

    # keyed by (tracker, number): a row returned more than once collapses
    # into one entry, while a Launchpad and a Debian bug that happen to
    # share a number no longer overwrite each other
    bugs = {}

    for result in results:
        tracker = result[0]
        bug_number = result[1]
        title = _clean_title(result[2], pkg_name)
        if title is None:
            continue
        # lp date created is not a datetime so make it one
        if tracker == 'launchpad':
            created = datetime.datetime.strptime(
                result[3].replace(' +0000', ''), '%a, %d %b %Y %H:%M:%S')
        else:
            created = result[3]
        bugs[(tracker, bug_number)] = (title, tracker, created)

    # rows are (bug number, (title, tracker, created)), ordered by title so
    # that similar titles end up next to each other
    sorted_titles = sorted(
        [(key[1], info) for key, info in bugs.items()],
        key=itemgetter(1))

    total = len(sorted_titles)
    start = 0
    while start < total - 1:
        after = start + 1
        this_row = sorted_titles[start]
        next_row = sorted_titles[after]
        this_title, this_tracker, this_created = this_row[1]
        next_title, next_tracker, next_created = next_row[1]

        # --within: both bugs of the pair have to be newer than the cut-off
        if target_date is not None:
            if this_created < target_date or next_created < target_date:
                start += 1
                continue

        if opt.different and this_tracker == next_tracker:
            start += 1
            continue

        if this_title == '' or this_title == '(no subject)':
            start += 1
            continue
        if this_tracker == 'launchpad' and \
            check_lp_skippable(this_row[0]) is True:
            start += 1
            continue
        elif next_tracker == 'launchpad' and \
            check_lp_skippable(next_row[0]) is True:
            # actually shouldn't this be plus 2? because we are on the next
            # title already
            start += 1
            continue
        try:
            ratio = Levenshtein.ratio(this_title, next_title)
        except TypeError:
            start += 1
            continue

        is_sigsegv = 'sigsegv' in this_title
        if ratio != 1.00 and (is_sigsegv or 'assert failure' in this_title):
            # crash titles are only reported when they are identical
            start += 1
            continue
        if is_sigsegv:
            # identical sigsegv titles: report the pair and move past it
            _print_match(ratio, this_row, next_row)
            start = after + 1
            continue
        if ratio <= 0.75:
            start += 1
            continue

        _print_match(ratio, this_row, next_row)
        # keep listing the following rows for as long as each one compares
        # to next_title with exactly the same ratio; the bound on the index
        # stops the scan at the end of the list
        increment = 1
        while after + increment < total:
            extra_row = sorted_titles[after + increment]
            extra_title = extra_row[1][0]
            if extra_title == '' or extra_title == '(no subject)':
                break
            try:
                extra_ratio = Levenshtein.ratio(next_title, extra_title)
            except TypeError:
                break
            if extra_ratio != ratio:
                break
            print("\t%s: %s" % (prepare_link(extra_row), extra_title))
            increment += 1
        # carry on with the first row that has not been reported yet
        start = after + increment