~usn-tool/usn-tool/trunk

« back to all changes in this revision

Viewing changes to scrape-mbox

  • Committer: Steve Beattie
  • Date: 2019-02-19 07:48:48 UTC
  • Revision ID: sbeattie@ubuntu.com-20190219074848-2hmbpko59tlrzeav
The usn-tool repository has been converted to git.

To get the converted repository, please use:
  git clone https://git.launchpad.net/usn-tool

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
#!/usr/bin/env python
2
 
# This expects to read an mbox file containing the email posts from
3
 
# ubuntu-security-announce and generates a YAML database per USN, which
4
 
# can be merged into one big database with "cat"  :)
5
 
import mailbox, rfc822, quopri
6
 
from datetime import datetime
7
 
from time import strptime
8
 
import sys, os
9
 
import subprocess, tempfile
10
 
 
11
 
def usn_flush(cmd,option=None,text=None):
    """Run the command ``cmd`` and abort the whole script if it fails.

    cmd    -- argument list to execute; it is copied, never modified.
    option -- optional long-option name.  When given, ``--<option> -`` is
              appended to the command and ``text`` is supplied to the
              child on its standard input.
    text   -- payload fed to the child when ``option`` is given; ``None``
              is treated as an empty payload.

    Returns None when the command exits with status 0.  When it exits
    with any other status, its combined stdout/stderr is written to
    sys.stderr and the script terminates via ``sys.exit(1)``.
    """
    # Work on a copy so the caller's list is never extended behind its back.
    argv = list(cmd)
    handle = None
    try:
        if option:
            payload = text
            if payload is None:
                payload = ''
            if not isinstance(payload, bytes):
                # The temporary file is opened in binary mode.
                payload = payload.encode('utf-8')
            handle = tempfile.TemporaryFile()
            handle.write(payload)
            handle.flush()
            # Rewind so the child reads the payload from the beginning.
            handle.seek(0)

            argv += ['--%s'%option,'-']

        # stdout must be a pipe, otherwise communicate() returns None and
        # there is nothing to report when the command fails.
        sp = subprocess.Popen(argv, stdin=handle,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT)
        out = sp.communicate(None)[0]
    finally:
        if handle is not None:
            handle.close()

    if not isinstance(out, str):
        out = out.decode('utf-8', 'replace')

    if sp.returncode:
        sys.stderr.write(out + '\n')
        # A failed command must not be reported to the shell as success.
        sys.exit(1)

    # Keep the child's output visible on success, as it was when the
    # child wrote straight to the inherited stdout.
    if out:
        sys.stdout.write(out)
30
 
 
31
 
 
32
 
# Parser states for the line-by-line walk over a USN announcement body.
INIT = 0
AT_DATE = 1
AT_SUMMARY = 2
AT_CVES = 3
AT_RELEASES_LIST = 4
AT_RELEASES_COLLECT = 5
AT_ACTION = 6
AT_BIN_LIST = 7
AT_OLD_BIN_LIST = 8
AT_NEW_BIN_LIST = 9
AT_SKIP_TO_DETAILS = 10
AT_DETAILS = 11
AT_PATH_LIST = 12


def _warn(message):
    """Write ``message`` plus a trailing newline to sys.stderr."""
    sys.stderr.write(message + "\n")


def process_message(msg):
    """Parse one mailbox message and store it through ./usn.py.

    Messages whose Subject does not contain ``[USN-`` are ignored.  For
    the others, the USN id, title and date come from the headers and the
    body is walked line by line with a small state machine (the AT_*
    constants) collecting the summary, CVEs, affected releases, binary
    package versions, the update instructions, the details text and the
    download URLs.  The collected values are handed to ``./usn.py`` as
    command-line options via usn_flush().
    """
    subject = msg['Subject']
    subject = subject.replace('\n','')
    if '[USN-' not in subject:
        return

    cmd = ['./usn.py']
    args = []
    urls = []

    # Extract USN
    usn = subject[subject.find('[USN-')+5:]
    usn = usn[:usn.find(']')]

    # Extract title
    title = subject[subject.rfind(']')+2:]
    # Extract date
    date = datetime(*msg.getdate('Date')[0:6])

    cmd += ['--db','/scratch/ubuntu/usn/database-%s.pickle'%usn]
    cmd += [usn]
    # NOTE(review): "%s" is not a standard strftime directive; presumably
    # it yields seconds since the epoch on this platform -- confirm.
    args += ['--title',title,'--timestamp',date.strftime("%s")]

    body = msg.fp.read()
    # Decode transfer encoding
    if '\nContent-Transfer-Encoding: quoted-printable' in body:
        body = quopri.decodestring(body)
    body = body.splitlines()

    cves = []
    details = []
    action = []
    releases = []
    bins = []

    release = ""
    prevline = ""
    arch = ""
    url = ""

    old_bin_list = 0
    position = INIT
    for line in body:
        # Lines starting with "====" delimit the header block.
        if line.startswith('===='):
            if position == INIT:
                position = AT_DATE
            elif position == AT_CVES:
                position = AT_RELEASES_LIST
                for cve in cves:
                    args += ['--cve',cve]
            continue

        if position == AT_DATE:
            # The first header line must mention the USN from the Subject.
            if line.find(usn) < 0:
                _warn("USN does not match between Subject(%s) and Summary(%s)\n" % (usn, line))
                break
            position = AT_SUMMARY
            continue

        if position == AT_SUMMARY:
            summary = line
            position = AT_CVES
            args += ['--summary',summary]
            continue

        if position == AT_CVES:
            for cve in line.split(','):
                cve = cve.strip()
                if cve == "":
                    continue
                # Rewrite candidate ids (CAN-...) to the CVE- prefix.
                if 'CAN-' in cve:
                    cve = cve.replace('CAN-','CVE-')
                cves += [cve]
            continue

        if position == AT_RELEASES_LIST:
            if 'affects the following Ubuntu releases' in line:
                position = AT_RELEASES_COLLECT
            continue

        if position == AT_RELEASES_COLLECT:
            # Skip blank lines before the first release.
            if len(releases)==0 and line=="":
                continue
            if line.startswith('Ubuntu 4.10'):
                releases += ['warty']
            elif line.startswith('Ubuntu 5.04'):
                releases += ['hoary']
            elif line.startswith('Ubuntu 5.10'):
                releases += ['breezy']
            elif line.startswith('Ubuntu 6.06'):
                releases += ['dapper']
            elif line.startswith('Ubuntu 6.10'):
                releases += ['edgy']
            elif line.startswith('Ubuntu 7.04'):
                releases += ['feisty']
            else:
                # Any other line ends the release list.
                position = AT_BIN_LIST
            continue

        if position == AT_BIN_LIST:
            if line.startswith('following package version'):
                position = AT_NEW_BIN_LIST
            elif line.startswith('The following packages are affected:'):
                position = AT_OLD_BIN_LIST
            continue

        if position == AT_OLD_BIN_LIST:
            old_bin_list = 1
            if line == "":
                if len(bins)>0:
                    position = AT_SKIP_TO_DETAILS
                    # The loop variable is deliberately the outer
                    # "release": its final value is read by later states.
                    for release in releases:
                        for pkg in bins:
                            args += ['--release',release,'--package',pkg,'--binary-version','(needed)']
            else:
                bins += [line]
            continue

        if position == AT_SKIP_TO_DETAILS:
            if line.startswith('following package version'):
                position = AT_NEW_BIN_LIST
            if line.startswith('Details follow:'):
                position = AT_DETAILS
            continue

        if position == AT_NEW_BIN_LIST:
            old_bin_list = 0
            if line.startswith('Ubuntu 4.10:'):
                release = 'warty'
                continue
            elif line.startswith('Ubuntu 5.04:'):
                release = 'hoary'
                continue
            elif line.startswith('Ubuntu 5.10:'):
                release = 'breezy'
                continue
            elif line.startswith('Ubuntu 6.06 LTS:'):
                release = 'dapper'
                continue
            elif line.startswith('Ubuntu 6.10:'):
                release = 'edgy'
                continue
            elif line.startswith('Ubuntu 7.04:'):
                release = 'feisty'
                continue
            elif release != "" and line.startswith(" "):
                # Indented line: first field is the binary package name,
                # last field is its version.
                line = line.replace("\t"," ")
                items = line.strip().split(" ")
                if len(items)>=2:
                    binary = items.pop(0)
                    version = items.pop()
                    args += ['--release',release,'--package',binary,'--binary-version',version]
                else:
                    _warn("Bad binary line: %s" % line)
                continue
            elif release != "" and line!="" and not line.startswith('Ubuntu'):
                position = AT_ACTION
                # fall through: this line is the first line of the action text
            else:
                continue

        if position == AT_ACTION:
            if line.startswith('Details follow:'):
                if action:
                    # Pass a copy so the per-message base command is
                    # never extended by the callee.
                    usn_flush(list(cmd),'action',"\n".join(action).strip())
                    position = AT_DETAILS
            elif not line.startswith(' '):
                action += [line]
            continue

        if position == AT_DETAILS:
            if len(details)>0 and line == "" and prevline == "":
                # Two consecutive blank lines end the details text.
                position = AT_PATH_LIST
                continue
            elif line.startswith("  Source archives:") or line.startswith("Updated packages for"):
                position = AT_PATH_LIST
                # Fall through to get line re-processed in AT_PATH_LIST!
            elif len(details)>0 or line != "":
                details += [line]
                prevline = line
                continue
            else:
                continue

        if position == AT_PATH_LIST:
            # We can't reliably process paths for old bin lists with
            # multiple releases.
            if old_bin_list:
                if len(releases)>1:
                    print("*Skipping URLs for old-style multi-release USN")
                    break
                else:
                    release = releases[0]

            if line.startswith('Updated packages for Ubuntu 4.10'):
                release = 'warty'
                continue
            if line.startswith('Updated packages for Ubuntu 5.04'):
                release = 'hoary'
                continue
            if line.startswith('Updated packages for Ubuntu 5.10'):
                release = 'breezy'
                continue
            if line.startswith('Updated packages for Ubuntu 6.06'):
                release = 'dapper'
                continue
            if line.startswith('Updated packages for Ubuntu 6.10'):
                release = 'edgy'
                continue
            if line.startswith('Updated packages for Ubuntu 7.04'):
                release = 'feisty'
                continue

            if line.startswith("  Source"):
                arch = 'source'
                continue
            if line.startswith("  Architecture"):
                arch = 'all'
                continue
            if line.startswith("  amd64"):
                arch = 'amd64'
                continue
            if line.startswith("  i386"):
                arch = 'i386'
                continue
            if line.startswith("  powerpc"):
                arch = 'powerpc'
                continue
            if line.startswith("  sparc"):
                arch = 'sparc'
                continue

            urlpos = line.find('http')
            if urlpos>=0:
                url = line[urlpos:].strip()
                if release == "" or arch == "":
                    _warn("USN-%s: release(%s) or arch(%s) missing!?" % (usn,release,arch))
                continue

            if 'Size/MD5' in line:
                # The last two space-separated fields are size and MD5;
                # they belong to the most recently seen URL.
                items = line.split(" ")
                md5 = items.pop()
                size = items.pop()
                urls += ['--release',release,'--arch',arch,'--url',url,'--url-size',size,'--url-md5',md5]
                # Flush in batches once the pending URL argument list
                # exceeds 1000 entries.
                if len(urls)>1000:
                    urlcmd = cmd + urls
                    print('Saving %s long URLs ...' % usn)
                    usn_flush(urlcmd)
                    urls = []
                continue

    # Oops, no matches!
    if position == INIT:
        _warn("Misparse of body:\n%s" % body)
    else:
        print('Saving %s ...' % usn)
        usn_flush(cmd + args + urls,'description',"\n".join(details).strip())


def main():
    """Feed every message of the mbox file named by argv[1] to process_message()."""
    if len(sys.argv) < 2:
        _warn("Usage: %s MBOX-FILE" % sys.argv[0])
        sys.exit(1)

    mbox_file = open(sys.argv[1],'r')
    try:
        # mailbox.UnixMailbox is the Python 2 mailbox reader.
        mb = mailbox.UnixMailbox(mbox_file)

        msg = mb.next()
        while msg:
            process_message(msg)
            msg = mb.next()
    finally:
        mbox_file.close()


if __name__ == '__main__':
    main()
324