13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
17
"""Cook a message's Subject header.
18
"""Cook a message's Subject header."""
20
20
from __future__ import nested_scopes
22
22
from types import UnicodeType
24
24
from email.Charset import Charset
25
from email.Header import Header, decode_header
25
from email.Header import Header, decode_header, make_header
26
26
from email.Utils import parseaddr, formataddr, getaddresses
27
from email.Errors import HeaderParseError
28
29
from Mailman import mm_cfg
29
30
from Mailman import Utils
40
48
return isinstance(s, UnicodeType)
50
nonascii = re.compile('[^\s!-~]')
42
52
def uheader(mlist, s, header_name=None, continuation_ws='\t', maxlinelen=None):
43
# Get the charset to encode the string in. If this is us-ascii, we'll use
44
# iso-8859-1 instead, just to get a little extra coverage, and because the
45
# Header class tries us-ascii first anyway.
53
# Get the charset to encode the string in. Then search if there is any
54
# non-ascii character in the string. If there is and the charset is
55
# us-ascii then we use iso-8859-1 instead. If the string is ascii only
56
# we use 'us-ascii' if another charset is specified.
46
57
charset = Utils.GetCharSet(mlist.preferred_language)
47
if charset == 'us-ascii':
48
charset = 'iso-8859-1'
49
charset = Charset(charset)
50
# Convert the string to unicode so Header will do the 3-charset encoding.
51
# If s is a byte string and there are funky characters in it that don't
52
# match the charset, we might as well replace them now.
54
codec = charset.input_codec or 'ascii'
55
s = unicode(s, codec, 'replace')
56
# We purposefully leave no space b/w prefix and subject!
58
if nonascii.search(s):
59
# use list charset but ...
60
if charset == 'us-ascii':
61
charset = 'iso-8859-1'
63
# there is no nonascii so ...
57
65
return Header(s, charset, maxlinelen, header_name, continuation_ws)
143
156
# was munged into the Reply-To header, but if not, we'll add it to a
144
157
# Cc header. BAW: should we force it into a Reply-To header in the
146
if mlist.personalize == 2 and mlist.reply_goes_to_list <> 1:
159
# Also skip Cc if this is an anonymous list as list posting address
160
# is already in From and Reply-To in this case.
161
if mlist.personalize == 2 and mlist.reply_goes_to_list <> 1 \
162
and not mlist.anonymous_list:
147
163
# Watch out for existing Cc headers, merge, and remove dups. Note
148
164
# that RFC 2822 says only zero or one Cc header is allowed.
151
167
for pair in getaddresses(msg.get_all('cc', [])):
153
i18ndesc = uheader(mlist, mlist.description)
169
i18ndesc = uheader(mlist, mlist.description, 'Cc')
154
170
add((str(i18ndesc), mlist.GetListEmail()))
156
172
msg['Cc'] = COMMASPACE.join([formataddr(pair) for pair in new])
166
182
# This will act like an email address for purposes of formataddr()
167
183
listid = '%s.%s' % (mlist.internal_name(), mlist.host_name)
184
cset = Utils.GetCharSet(mlist.preferred_language)
168
185
if mlist.description:
169
186
# Don't wrap the header since here we just want to get it properly RFC
171
h = uheader(mlist, mlist.description, 'List-Id', maxlinelen=10000)
188
i18ndesc = uheader(mlist, mlist.description, 'List-Id', maxlinelen=998)
189
listid_h = formataddr((str(i18ndesc), listid))
175
listid_h = formataddr((desc, listid))
176
# BAW: I think the message object should handle any necessary wrapping.
191
# without desc we need to ensure the MUST brackets
192
listid_h = '<%s>' % listid
193
# We always add a List-ID: header.
177
194
del msg['list-id']
178
195
msg['List-Id'] = listid_h
179
# For internally crafted messages, we
180
# also add a (nonstandard), "X-List-Administrivia: yes" header. For all
181
# others (i.e. those coming from list posts), we adda a bunch of other RFC
196
# For internally crafted messages, we also add a (nonstandard),
197
# "X-List-Administrivia: yes" header. For all others (i.e. those coming
198
# from list posts), we add a bunch of other RFC 2369 headers.
183
199
requestaddr = mlist.GetRequestEmail()
184
200
subfieldfmt = '<%s>, <mailto:%s?subject=%ssubscribe>'
185
201
listinfo = mlist.GetScriptURL('listinfo', absolute=1)
229
247
if len(lines) > 1 and lines[1] and lines[1][0] in ' \t':
231
249
msgdata['origsubj'] = subject
250
# The subject may be multilingual but we take the first charset as major
251
# one and try to decode. If it is decodable, returned subject is in one
252
# line and cset is properly set. If fail, subject is mime-encoded and
253
# cset is set as us-ascii. See detail for ch_oneline() (CookHeaders one
255
subject, cset = ch_oneline(subject)
256
# TK: Python interpreter has evolved to be strict on ascii charset code
257
# range. It is safe to use unicode string when manipulating header
258
# contents with re module. It would be best to return unicode in
259
# ch_oneline() but here is temporary solution.
260
subject = unicode(subject, cset)
261
# If the subject_prefix contains '%d', it is replaced with the
262
# mailing list sequential number. Sequential number format allows
263
# '%d' or '%05d' like pattern.
264
prefix_pattern = re.escape(prefix)
266
prefix_pattern = '%'.join(prefix_pattern.split(r'\%'))
267
p = re.compile('%\d*d')
268
if p.search(prefix, 1):
269
# prefix has a number, so we should search prefix w/number in subject.
270
# Also, force new style.
271
prefix_pattern = p.sub(r'\s*\d+\s*', prefix_pattern)
274
old_style = mm_cfg.OLD_STYLE_PREFIXING
275
subject = re.sub(prefix_pattern, '', subject)
276
rematch = re.match('((RE|AW|SV|VS)(\[\d+\])?:\s*)+', subject, re.I)
278
subject = subject[rematch.end():]
282
# At this point, subject may become null if someone posts mail with
283
# subject: [subject prefix]
284
if subject.strip() == '':
233
285
subject = _('(no subject)')
234
# The header may be multilingual; decode it from base64/quopri and search
235
# each chunk for the prefix. BAW: Note that if the prefix contains spaces
236
# and each word of the prefix is encoded in a different chunk in the
237
# header, we won't find it. I think in practice that's unlikely though.
238
headerbits = decode_header(subject)
239
if prefix and subject:
240
pattern = re.escape(prefix.strip())
241
for decodedsubj, charset in headerbits:
242
if re.search(pattern, decodedsubj, re.IGNORECASE):
243
# The subject's already got the prefix, so don't change it
286
cset = Utils.GetCharSet(mlist.preferred_language)
287
subject = unicode(subject, cset)
288
# and substitute %d in prefix with post_id
290
prefix = prefix % mlist.post_id
293
# If charset is 'us-ascii', try to concatenate as string because there
294
# is some weirdness in Header module (TK)
295
if cset == 'us-ascii':
298
h = u' '.join([recolon, prefix, subject])
300
h = u' '.join([prefix, recolon, subject])
301
h = h.encode('us-ascii')
302
h = uheader(mlist, h, 'Subject', continuation_ws=ws)
305
ss = u' '.join([recolon, subject])
306
ss = ss.encode('us-ascii')
307
ss = uheader(mlist, ss, 'Subject', continuation_ws=ws)
308
msgdata['stripped_subject'] = ss
312
# Get the header as a Header instance, with proper unicode conversion
314
h = uheader(mlist, recolon, 'Subject', continuation_ws=ws)
317
h = uheader(mlist, prefix, 'Subject', continuation_ws=ws)
319
# TK: Subject is concatenated and unicode string.
320
subject = subject.encode(cset, 'replace')
321
h.append(subject, cset)
245
322
del msg['subject']
246
# Get the header as a Header instance, with proper unicode conversion
247
h = uheader(mlist, prefix, 'Subject', continuation_ws=ws)
248
for s, c in headerbits:
249
# Once again, convert the string to unicode.
251
c = Charset('iso-8859-1')
252
if not isinstance(c, Charset):
254
if not _isunicode(s):
255
codec = c.input_codec or 'ascii'
257
s = unicode(s, codec, 'replace')
259
# Unknown codec, is this default reasonable?
260
s = unicode(s, Utils.GetCharSet(mlist.preferred_language),
263
323
msg['Subject'] = h
324
ss = uheader(mlist, recolon, 'Subject', continuation_ws=ws)
325
ss.append(subject, cset)
326
msgdata['stripped_subject'] = ss
330
def ch_oneline(headerstr):
331
# Decode header string in one line and convert into single charset
332
# copied and modified from ToDigest.py and Utils.py
333
# return (string, cset) tuple as check for failure
335
d = decode_header(headerstr)
336
# at this point, we should rstrip() every string because some
337
# MUA deliberately add trailing spaces when composing return
339
d = [(s.rstrip(), c) for (s,c) in d]
342
# search for non-None charset
347
ustr = h.__unicode__()
348
oneline = u''.join(ustr.splitlines())
349
return oneline.encode(cset, 'replace'), cset
350
except (LookupError, UnicodeError, ValueError, HeaderParseError):
351
# possibly charset problem. return with undecoded string in one line.
352
return ''.join(headerstr.splitlines()), 'us-ascii'