~m4v/+junk/Urldb

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python

import os
import re
from fnmatch import fnmatch
from datetime import datetime

files = os.listdir('.')
files.sort()

octet = r'(?:2(?:[0-4]\d|5[0-5])|1\d\d|\d{1,2})'                      
ipAddr = r'%s(?:\.%s){3}' % (octet, octet)                            
label = r'[0-9a-z][-0-9a-z]*[0-9a-z]?'                                
domain = r'%s(?:\.%s)*\.[a-z][-0-9a-z]*[a-z]?' % (label, label)       
url = r'(\w+://(?:%s|%s)(?::\d+)?(?:/[^\])>\s]*)?)' % (domain, ipAddr)

urlre = re.compile(url, re.I)
channel = '#arg.urt'

from storm.locals import *
from storm.sqlobject import AutoUnicode

urls_schema = \
"""CREATE TABLE urls (\
    id INTEGER PRIMARY KEY AUTOINCREMENT,\
    channel TEXT COLLATE NOCASE,\
    nick TEXT COLLATE NOCASE,\
    url TEXT COLLATE NOCASE,\
    date DATETIME,
    UNIQUE (channel, url))
"""

others_schema = \
"""CREATE TABLE other_urls (\
    id INTEGER PRIMARY KEY AUTOINCREMENT,\
    channel TEXT COLLATE NOCASE,\
    nick TEXT COLLATE NOCASE,\
    url_id INTEGER references urls(id),\
    date DATETIME)
"""


class Url(object):
    __storm_table__ = 'urls'

    id = Int(primary=True)
    channel = AutoUnicode()
    nick = AutoUnicode()
    url = RawStr()
    date = DateTime()

    def __init__(self, url, nick, channel, date):
        self.url = url
        self.nick = nick
        self.channel = channel
        self.date = date


class OtherUrl(object):
    __storm_table__ = 'other_urls'

    id = Int(primary=True)
    channel = AutoUnicode()
    nick = AutoUnicode()
    url_id = Int()
    date = DateTime()

    url = Reference(url_id, Url.id)

    def __init__(self, nick, channel, date):
        self.nick = nick
        self.channel = channel
        self.date = date


db = create_database('sqlite:Urldb.sqlite.db')
store = Store(db)
try:
    store.execute(urls_schema)
    store.execute(others_schema)
    store.commit()
except:
    pass


for file in files:
    if not fnmatch(file, '*.log'):
        continue
    print file
    for line in open(file):
        line = line.strip()
        match = urlre.findall(line)
        if match:
            try:
                date, nick, _ = line.split(None, 2)
                if nick == '*':
                    nick = line.split()[2]
                nick = nick.strip('<>')
                if nick.lower() in ('***', 'ircbot', 'ircbot_', 'ircbot`', 'noirc', 'botfly', 'manuk', 'botgay', 'coyobot'):
                    continue
                date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S')
                for url in match:
                    _url = store.find(Url, Url.url.like(url)).one()
                    if _url:
                        other = OtherUrl(nick, channel, date)
                        other.url = _url
                        store.add(other)
                    else:
                        url = Url(url, nick, channel, date)
                        store.add(url)

                store.commit()
            except:
                print line
                raise