1
# Copyright (C) 2001-2007 by the Free Software Foundation, Inc.
1
# Copyright (C) 2001-2009 by the Free Software Foundation, Inc.
3
3
# This program is free software; you can redistribute it and/or
4
4
# modify it under the terms of the GNU General Public License
41
40
from Mailman.Errors import DiscardMessage
42
41
from Mailman.i18n import _
43
42
from Mailman.Logging.Syslog import syslog
43
from Mailman.Utils import sha_new
45
45
# Path characters for common platforms
46
46
pre = re.compile(r'[/\\:]')
47
47
# All other characters to strip out of Content-Disposition: filenames
48
# (essentially anything that isn't an alphanum, dot, slash, or underscore.
48
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
49
49
sre = re.compile(r'[^-\w.]')
50
50
# Regexp to strip out leading dots
51
51
dre = re.compile(r'^\.*')
158
158
if msgid is None:
159
159
msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
160
160
# We assume that the message id actually /is/ unique!
161
digest = sha.new(msgid).hexdigest()
161
digest = sha_new(msgid).hexdigest()
162
162
return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
167
167
# message by a text (scrubbing).
168
168
del msg['content-type']
169
169
del msg['content-transfer-encoding']
170
if isinstance(charset, unicode):
171
# email 3.0.1 (python 2.4) doesn't like unicode
172
charset = charset.encode('us-ascii')
170
173
msg.set_payload(text, charset)
189
192
# Now walk over all subparts of this message and scrub out various types
190
193
format = delsp = None
191
194
for part in msg.walk():
192
ctype = part.get_type(part.get_default_type())
195
ctype = part.get_content_type()
193
196
# If the part is text/plain, we leave it alone
194
197
if ctype == 'text/plain':
195
198
# We need to choose a charset for the scrubbed message, so we'll
293
296
Subject: %(subject)s
298
301
# If the message isn't a multipart, then we'll strip it out as an
299
302
# attachment that would have to be separately downloaded. Pipermail
300
303
# will transform the url into a hyperlink.
301
elif part and not part.is_multipart():
304
elif part.get_payload() and not part.is_multipart():
302
305
payload = part.get_payload(decode=True)
303
ctype = part.get_type()
306
ctype = part.get_content_type()
304
307
# XXX Under email 2.5, it is possible that payload will be None.
305
308
# This can happen when you have a Content-Type: multipart/* with
306
309
# only one part and that part has two blank lines between the
318
321
desc = part.get('content-description', _('not available'))
322
desc = Utils.oneline(desc, lcset)
319
323
filename = part.get_filename(_('not available'))
320
324
filename = Utils.oneline(filename, lcset)
321
325
replace_payload_by_text(part, _("""\
349
353
for part in msg.walk():
350
354
# TK: bug-id 1099138 and multipart
351
if not part or part.is_multipart():
355
# MAS: test the payload - a bare `if part` may fail if there are no headers.
356
if not part.get_payload() or part.is_multipart():
353
358
# All parts should be scrubbed to text/plain by now.
354
359
partctype = part.get_content_type()
356
361
text.append(_('Skipped content of type %(partctype)s\n'))
359
t = part.get_payload(decode=True)
364
t = part.get_payload(decode=True) or ''
360
365
# MAS: TypeError exception can occur if payload is None. This
361
366
# was observed with a message that contained an attached
362
367
# message/delivery-status part. Because of the special parsing
363
368
# of this type, this resulted in a text/plain sub-part with a
364
369
# null body. See bug 1430236.
365
370
except (binascii.Error, TypeError):
366
t = part.get_payload()
371
t = part.get_payload() or ''
367
372
# TK: get_content_charset() returns 'iso-2022-jp' for internally
368
373
# crafted (scrubbed) 'euc-jp' text part. So, first try
369
374
# get_charset(), then get_content_charset() for the parts
376
381
if partcharset and partcharset <> charset:
378
383
t = unicode(t, partcharset, 'replace')
379
except (UnicodeError, LookupError, ValueError, AssertionError):
380
# Replace funny characters. We use errors='replace' for
381
# both calls since the first replace will leave U+FFFD,
382
# which isn't ASCII encodeable.
383
u = unicode(t, 'ascii', 'replace')
384
t = u.encode('ascii', 'replace')
384
except (UnicodeError, LookupError, ValueError,
386
# We can get here if partcharset is bogus in some way.
387
# Replace funny characters. We use errors='replace'
388
t = unicode(t, 'ascii', 'replace')
386
390
# Should use HTML-Escape, or try generalizing to UTF-8
387
391
t = t.encode(charset, 'replace')
388
except (UnicodeError, LookupError, ValueError):
392
except (UnicodeError, LookupError, ValueError,
394
# if the message charset is bogus, use the list's.
389
395
t = t.encode(lcset, 'replace')
390
396
# Separation is useful
391
397
if isinstance(t, StringType):
400
406
s = unicode(sep, lcset, 'replace')
401
407
sep = s.encode(charset, 'replace')
402
except (UnicodeError, LookupError, ValueError):
408
except (UnicodeError, LookupError, ValueError,
404
411
replace_payload_by_text(msg, sep.join(text), charset)
436
443
# i18n file name is encoded
437
444
lcset = Utils.GetCharSet(mlist.preferred_language)
438
445
filename = Utils.oneline(msg.get_filename(''), lcset)
439
fnext = os.path.splitext(filename)[1]
446
filename, fnext = os.path.splitext(filename)
440
447
# For safety, we should confirm this is valid ext for content-type
441
448
# but we can use fnext if we introduce fnext filtering
442
449
if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
463
# Allow only alphanumerics, dash, underscore, and dot
464
ext = sre.sub('', ext)
457
466
# We need a lock to calculate the next attachment number
458
467
lockfile = os.path.join(fsdir, 'attachments.lock')
476
484
# which one should we go with? For now, let's go with the one we
477
485
# guessed so attachments can't lie about their type. Also, if the
478
486
# filename /has/ no extension, then tack on the one we guessed.
479
filebase, ignore = os.path.splitext(filename)
487
# The extension was removed from the name above.
480
489
# Now we're looking for a unique name for this file on the file
481
490
# system. If msgdir/filebase.ext isn't unique, we'll add a counter
482
491
# after filebase, e.g. msgdir/filebase-cnt.ext