# Copyright (C) 1998-2011 by the Free Software Foundation, Inc.
#
# This file is part of GNU Mailman.
#
# GNU Mailman is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Mailman.  If not, see <http://www.gnu.org/licenses/>.
18
"""HyperArch: Pipermail archiving for Mailman
20
- The Dragon De Monsyne <dragondm@integral.org>
35
import binascii
import errno
import logging
import mailbox
import os
import re
import sys
import time
import urllib

from cStringIO import StringIO
from email.Charset import Charset
from email.Errors import HeaderParseError
from email.Header import decode_header, make_header
from email.utils import parseaddr, parsedate_tz, mktime_tz, formatdate
from shutil import move
from string import Template
from string import lowercase
from tempfile import mkstemp

from flufl.lock import Lock, TimeOutError
from lazr.config import as_boolean
from storm.locals import *
from zope.component import getUtility

from archiver.Indexer import Indexer
from archiver.core.i18n import _
from archiver.model.article import StormArticle
from archiver.model.conversation import Conversation
from archiver.model.mlist import Mlist
#from mailman.config import config
#from mailman.core.i18n import _, ctime
#from mailman.interfaces.listmanager import IListManager
#from mailman.utilities.i18n import find
#from mailman.utilities.string import uncanonstr, websafe
62
# Module loggers: `log` for archiver errors, `log1` for HTTP-related output.
log = logging.getLogger('mailman.error')
log1 = logging.getLogger('mailman.http')
68
# MacOSX has a default stack size that is too small for deeply recursive
# regular expressions.  We see this as crashes in the Python test suite when
# running test_re.py and test_sre.py.  The fix is to set the stack limit to
# 2048; the general recommendation is to do in the shell before running the
# test suite.  But that's inconvenient for a daemon like the qrunner.
#
# AFAIK, this problem only affects the archiver, so we're adding this work
# around to this file (it'll get imported by the bundled pipermail or by the
# bin/arch script.  We also only do this on darwin, a.k.a. MacOSX.
if sys.platform == 'darwin':
    # `resource` is POSIX-only; guard the import so exotic builds survive.
    # NOTE(review): the try/import scaffold was lost in this copy; restored
    # from upstream Pipermail.
    try:
        import resource
    except ImportError:
        pass
    else:
        soft, hard = resource.getrlimit(resource.RLIMIT_STACK)
        newsoft = min(hard, max(soft, 1024*2048))
        resource.setrlimit(resource.RLIMIT_STACK, (newsoft, hard))
87
def prepare_url(subject):
    """Turn *subject* into a URL-friendly slug.

    Drops every character outside [-A-Za-z0-9 ], collapses runs of spaces
    and hyphens into a single '-', lowercases, and caps the result at 80
    characters.
    """
    slug = re.sub("[^-A-Za-z0-9 ]","",subject)
    slug = re.sub("[ -]+","-",slug.strip().lower())
    if len(slug) < 80:
        return slug
    return slug[:80]
92
def safe_unicode(obj, *args):
    """ return the unicode representation of obj """
    # NOTE(review): the `try:` line was lost in this copy; restored so the
    # except clause is valid again.
    try:
        return unicode(obj, *args)
    except UnicodeDecodeError:
        # obj contains bytes that don't decode; escape them instead of dying.
        ascii_text = str(obj).encode('string_escape')
        return unicode(ascii_text)
101
def html_quote(s, langcode=None):
    """HTML-escape &, <, > and " in *s*, then re-encode for *langcode*.

    NOTE(review): the replacement entities were corrupted to literal
    characters in this copy, and the tuple's tail lines were lost; restored
    from upstream Pipermail.
    """
    repls = ( ('&', '&amp;'),
              ('<', '&lt;'),
              ('>', '&gt;'),
              ('"', '&quot;'))
    for thing, repl in repls:
        s = s.replace(thing, repl)
    return uncanonstr(s, langcode)
112
def url_quote(s):
    """Percent-encode *s* for safe inclusion in a URL.

    NOTE(review): the `def` line was lost in this copy; restored from
    upstream Pipermail's url_quote.
    """
    return urllib.quote(s)
115
def null_to_space(s):
    """Return a copy of *s* with every NUL character turned into a space."""
    cleaned = s.replace('\000', ' ')
    return cleaned
119
def sizeof(filename, lang):
    """Return a human-readable size string for *filename*, localized to *lang*.

    NOTE(review): the try/except scaffold and size-branch lines were lost in
    this copy; reconstructed from upstream Pipermail.  The py2-only `<>`
    operator was replaced with `!=` (same behaviour).
    """
    try:
        size = os.path.getsize(filename)
    except OSError as e:
        # ENOENT can happen if the .mbox file was moved away or deleted, and
        # an explicit mbox file name was given to bin/arch.
        if e.errno != errno.ENOENT:
            raise
        return _('size not available')
    if size < 1000:
        # Avoid i18n side-effects: translate in this list's language only.
        with _.using(lang.code):
            out = _(' %(size)i bytes ')
        return out
    elif size < 1000000:
        return ' %d KB ' % (size / 1000)
    return ' %d MB ' % (size / 1000000)
137
# <META> tag announcing the archive page's character set; the %s slot takes
# the output charset name.
html_charset = ('<META http-equiv="Content-Type" '
                'content="text/html; charset=%s">')
140
def CGIescape(arg, lang=None):
    """HTML-escape *arg* (including double quotes) and re-encode for *lang*.

    NOTE(review): the '&quot;' entity was corrupted to a bare '"' in this
    copy and the unicode branch was lost; both restored from upstream
    Pipermail.  Also note *lang* defaults to None while lang.code is
    dereferenced unconditionally — confirm callers always pass a language.
    """
    if isinstance(arg, unicode):
        s = websafe(arg)
    else:
        s = websafe(str(arg))
    return uncanonstr(s.replace('"', '&quot;'), lang.code)
147
# Parenthesized human name
paren_name_pat = re.compile(r'([(].*[)])')

# Subject lines preceded with 'Re:'
REpat = re.compile( r"\s*RE\s*(\[\d+\]\s*)?:\s*", re.IGNORECASE)

# E-mail addresses and URLs in text
emailpat = re.compile(r'([-+,.\w]+@[-+.\w]+)')

# Argh!  This pattern is buggy, and will choke on URLs with GET parameters.
urlpat = re.compile(r'(\w+://[^>)\s]+)') # URLs in text

# Blank lines
blankpat = re.compile(r'^\s*$')

# Starting <html> directive
htmlpat = re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE)
# Ending </html> directive
nohtmlpat = re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE)
# Match quoted text.  NOTE(review): the '&gt;' alternative was
# entity-decoded to a bare '>' in this copy; restored from upstream
# Pipermail so HTML-escaped quoting is still recognized.
quotedpat = re.compile(r'^([>|:]|&gt;)+')

# Common string constants.  NOTE(review): reconstructed — they are used by
# fixAuthor() and _get_body() below but their definitions were lost in this
# copy.
SPACE = ' '
EMPTYSTRING = ''

# Cache of resolved template file paths.
_templatefilepathcache = {}

# Text of a Message-ID, angle brackets included.
msgid_pat = re.compile(r'(<.*>)')
175
def strip_separators(s):
    "Remove quotes or parenthesization from a Message-ID string"
    # NOTE(review): the empty-string guard and the tail lines were lost in
    # this copy; restored from upstream Pipermail.
    if not s:
        return ""
    if s[0] in '"<([' and s[-1] in '">)]':
        s = s[1:-1]
    return s
183
# Lowercased name particles treated as part of the surname when reordering.
smallNameParts = ['van', 'von', 'der', 'de']

def fixAuthor(author):
    "Canonicalize a name into Last, First format"
    # NOTE(review): several interior lines were lost in this copy; restored
    # from upstream Pipermail's fixAuthor.
    # If there's a comma, guess that it's already in "Last, First" format
    if ',' in author:
        return author
    L = author.split()
    i = len(L)
    if i == 1:
        return author # The string's one word--forget it
    if author.upper() == author or author.lower() == author:
        # Damn, the name is all upper- or lower-case.
        while i > 0 and L[i-1].lower() in smallNameParts:
            i = i - 1
    else:
        # Mixed case; assume that small parts of the last name will be
        # in lowercase, and check them against the list.
        while i>0 and (L[i-1][0] in lowercase or
                       L[i-1].lower() in smallNameParts):
            i = i - 1
    author = SPACE.join(L[-1:] + L[i:-1]) + ', ' + SPACE.join(L[:i])
    return author
207
# The Article class encapsulates a single posting.  The attributes are:
#
#  sequence : Sequence number, unique for each article in a set of archives
#  subject : The subject of the article
#  datestr : The posting date, in human-readable format
#  date : The posting date, in purely numeric format
#  fromdate : The posting date, in `unixfrom' format
#  headers : Any other headers of interest
#  author : The author's name (and possibly organization)
#  email : The author's e-mail address
#  msgid : A unique message ID
#  in_reply_to : If !="", this is the msgid of the article being replied to
#  references : A (possibly empty) list of msgid's of earlier articles in
#               the thread
#  body : A string making up the message body
224
# Timestamp of the most recently archived article.  _set_date() falls back
# to this value + 1 when a message has no parseable Date header, so the
# synthesized dates keep increasing.
_last_article_time = time.time()
226
def __init__(self, message=None, sequence=0, keepHeaders=[],
227
lang=None, mlist=None):
230
self.sequence = sequence
233
self.threadKey = None
234
# otherwise the current sequence number is used.
235
id = strip_separators(message['Message-Id'])
237
self.msgid = str(self.sequence)
238
else: self.msgid = id
240
if message.has_key('Subject'):
241
self.subject = str(message['Subject'])
243
self.subject = _('No subject')
244
if self.subject == "": self.subject = _('No subject')
246
self._set_date(message)
248
# Figure out the e-mail address and poster's name. Use the From:
249
# field first, followed by Reply-To:
250
self.author, self.email = parseaddr(message.get('From', ''))
251
e = message['Reply-To']
252
if not self.email and e is not None:
253
ignoreauthor, self.email = parseaddr(e)
254
self.email = strip_separators(self.email)
255
self.author = strip_separators(self.author)
257
if self.author == "":
258
self.author = self.email
260
# Save the In-Reply-To:, References:, and Message-ID: lines
262
# TBD: The original code does some munging on these fields, which
263
# shouldn't be necessary, but changing this may break code. For
264
# safety, I save the original headers on different attributes for use
265
# in writing the plain text periodic flat files.
266
self._in_reply_to = message['in-reply-to']
267
self._references = message['references']
268
self._message_id = message['message-id']
270
i_r_t = message['In-Reply-To']
272
self.in_reply_to = ''
275
match = msgid_pat.search(i_r_t)
277
self.in_reply_to = ''
279
self.in_reply_to = strip_separators(match.group(1))
280
if not self.in_reply_to=='':
281
self._in_reply_to = self.in_reply_to
283
references = message['References']
284
if references is None:
287
self.references = map(strip_separators, references.split())
289
# Save any other interesting headers
291
for i in keepHeaders:
292
if message.has_key(i):
293
self.headers[i] = message[i]
295
# Read the message body
296
s = StringIO(message.get_payload(decode=True)\
297
or message.as_string().split('\n\n',1)[1])
298
# self.body = s.readlines()
303
# Trim Re: from the subject line
306
result = REpat.match(self.subject)
309
self.subject = self.subject[i:]
312
# Useful to keep around
316
# thread_addr is for dlists
317
self.thread_addr = ''
318
addrs_to_try = ['To', 'X-Original-To', 'Delivered-To', 'Cc', 'Bcc']
320
while (self.thread_addr == None or self.thread_addr.find('+') == -1 or self.thread_addr.find(',') != -1 or self.thread_addr.find(' ') != -1) and i < len(addrs_to_try):
321
self.thread_addr = message[addrs_to_try[i]]
323
if self.thread_addr != None:
324
self.thread_addr = self.thread_addr.replace('>', '').replace('<', '')
326
self.thread_addr = self.strip_subject(self.subject)
328
# if as_boolean(config.archiver.pipermail.obscure_email_addresses):
329
# # Avoid i18n side-effects. Note that the language for this
330
# # article (for this list) could be different from the site-wide
331
# # preferred language, so we need to ensure no side-effects will
332
# # occur. Think what happens when executing bin/arch.
333
# with _.using(lang.code):
334
# if self.author == self.email:
335
# self.author = self.email = re.sub('@', _(' at '),
338
# self.email = re.sub('@', _(' at '), self.email)
339
# Snag the content-* headers. RFC 1521 states that their values are
341
ctype = message.get('Content-Type', 'text/plain')
342
cenc = message.get('Content-Transfer-Encoding', '')
343
self.ctype = ctype.lower()
344
self.cenc = cenc.lower()
346
cset = 'us-ascii' #mlist.preferred_language.charset
347
cset_out = Charset(cset).output_charset or cset
348
charset = message.get_content_charset(cset_out)
350
charset = charset.lower().strip()
351
if charset[0]=='"' and charset[-1]=='"':
352
charset = charset[1:-1]
353
if charset[0]=="'" and charset[-1]=="'":
354
charset = charset[1:-1]
356
body = message.get_payload(decode=True)
357
except binascii.Error:
359
if body and charset != 'us-ascii':
362
body = unicode(body, charset)
363
except (UnicodeError, LookupError):
366
# self.body = [l + "\n" for l in body.splitlines()]
369
self.decode_headers()
371
def quote(self, buf):
    """Return *buf* HTML-escaped for this article's list language."""
    lang_code = self._lang.code
    return html_quote(buf, lang_code)
374
def decode_headers(self):
    """MIME-decode headers.

    If the email, subject, or author attributes contain non-ASCII
    characters using the encoded-word syntax of RFC 2047, decoded versions
    of those attributes are placed in the self.decoded (a dictionary).

    If the list's charset differs from the header charset, an attempt is
    made to decode the headers as Unicode.  If that fails, they are left
    undecoded.

    NOTE(review): the guard lines around the decoded author/email were lost
    in this copy; restored from upstream Pipermail.
    """
    author = self.decode_charset(self.author)
    subject = self.decode_charset(self.subject)
    if author:
        self.decoded['author'] = author
        email = self.decode_charset(self.email)
        if email:
            self.decoded['email'] = email
    # if as_boolean(config.archiver.pipermail.obscure_email_addresses):
    #     with _.using(self._lang.code):
    #         subject = re.sub(r'([-+,.\w]+)@([-+.\w]+)',
    #                          '\g<1>' + atmark + '\g<2>', subject)
    self.decoded['subject'] = subject
    self.decoded['stripped'] = self.strip_subject(subject or self.subject)
401
def strip_subject(self, subject):
    """Return *subject* with the list's subject prefix and leading
    Re:/AW:/SV:/VS: markers removed, for subject-based sorting."""
    # Strip subject_prefix and Re: for subject sorting
    # This part was taken from CookHeaders.py (TK)
    prefix = None #self._mlist.subject_prefix.strip()
    # NOTE(review): the `if prefix:` guard was lost in this copy; it is
    # required because prefix is used below (currently always falsy).
    if prefix:
        prefix_pat = re.escape(prefix)
        # A literal '%' in the configured prefix stands for itself...
        prefix_pat = '%'.join(prefix_pat.split(r'\%'))
        # ...and a %d-style sequence slot matches any digits.
        prefix_pat = re.sub(r'%\d*d', r'\s*\d+\s*', prefix_pat)
        subject = re.sub(prefix_pat, '', subject)
    subject = subject.lstrip()
    # Raw string added: the original non-raw pattern relied on Python
    # passing unknown escapes through unchanged (deprecated behaviour).
    strip_pat = re.compile(r'^((RE|AW|SV|VS)(\[\d+\])?:\s*)+', re.I)
    stripped = strip_pat.sub('', subject)
    return stripped
415
def decode_charset(self, field):
    """Decode RFC 2047 header *field* into a one-line unicode string."""
    # TK: This function was rewritten for unifying to Unicode.
    # Convert 'field' into Unicode one line string.
    # NOTE(review): the `try:` line was lost in this copy; restored so the
    # except clause is valid again.
    try:
        pairs = decode_header(field)
        ustr = make_header(pairs).__unicode__()
    except (LookupError, UnicodeError, ValueError, HeaderParseError):
        # assume list's language
        cset = 'us-ascii' #self._mlist.preferred_language.charset
        if cset == 'us-ascii':
            cset = 'iso-8859-1' # assume this for English list
        ustr = unicode(field, cset, 'replace')
    # Collapse any embedded newlines into a single line.
    return u''.join(ustr.splitlines())
429
def _get_subject_enc(self, art):
430
"""Return the subject of art, decoded if possible.
432
If the charset of the current message and art match and the
433
article's subject is encoded, decode it.
435
return art.decoded.get('subject', art.subject)
437
_rx_quote = re.compile('=([A-F0-9][A-F0-9])')
438
_rx_softline = re.compile('=[ \t]*$')
441
"""Return the message body ready for HTML, decoded if necessary"""
443
return null_to_space(EMPTYSTRING.join(body))
445
def _set_date(self, message):
446
def floatdate(header):
448
datestr = message.get(header, missing)
449
if datestr is missing:
451
date = parsedate_tz(datestr)
453
return mktime_tz(date)
454
except (TypeError, ValueError, OverflowError):
456
date = floatdate('date')
458
date = floatdate('x-list-received-date')
460
# What's left to try?
461
date = self._last_article_time + 1
462
self._last_article_time = date
464
# self.date = '%011i' % date
465
print type(self.date)
466
self.datestr = message.get('date') \
467
or message.get('x-list-received-date') \
470
self.fromdate = time.ctime(int(self.date))
473
# Permission bits for directories this archiver creates (Python 2 octal
# literal: rwxr-xr-x).
DIRMODE = 0755 # Mode to give to created directories
475
def __init__(self, basedir = None, mlist = None):
476
# If basedir isn't provided, assume the current directory
479
basedir = os.getcwd()
481
basedir = os.path.expanduser(basedir)
482
self.listdir = os.path.join(basedir,mlist)
483
# If the directory doesn't exist, create it.
484
for dir in (basedir, self.listdir):
487
except os.error, errdata:
488
errno, errmsg = errdata
490
raise os.error, errdata
494
os.mkdir(dir, self.DIRMODE)
498
db_path = os.path.join(self.listdir,'archives.db')
499
self.store = Store(create_database('sqlite:///'+db_path))
501
#insert mlist into mlist db.
502
mlist_db= os.path.join(basedir,'mlists.db')
503
store = Store(create_database('sqlite:///'+mlist_db))
505
store.execute("CREATE TABLE mlist " "(list_name VARCHAR(255), db_path VARCHAR(255), id INTEGER PRIMARY KEY)")
508
if store.find(Mlist, Mlist.list_name==unicode(self.mlist)).is_empty():
509
store.add(Mlist(self.mlist,self.listdir))
513
index_path = os.path.join(self.listdir,'archives.index')
514
self.indexer = Indexer(index_path)
518
self.store.execute("CREATE TABLE article " "(subject VARCHAR(255), date FLOAT, datestr VARCHAR(255), author VARCHAR(255), email VARCHAR(255), msgid VARCHAR(255), id INTEGER PRIMARY KEY, in_reply_to VARCHAR(255), body TEXT, thread_addr VARCHAR(255), threadkey VARCHAR(255), pdate_msg INTEGER, ndate_msg INTEGER, pthread_msg INTEGER, nthread_msg INTEGER )")
522
self.store.execute("CREATE TABLE conversation" "(subject VARCHAR(255), date FLOAT, datestr VARCHAR(255), thread_addr VARCHAR(255), author VARCHAR(255), count INTEGER, pdate_con VARCHAR(255), ndate_con VARCHAR(255))")
526
def _makeArticle(self, msg):
529
def mboxToDB(self, path, start=None, end=None):
530
mbox = iter(mailbox.mbox(path))
534
while counter < start:
537
except errors.DiscardMessage:
545
except StopIteration:
547
except errors.DiscardMessage:
550
log.error('uncaught archiver exception')
553
# It was an unparseable message
555
# self.message(_('#%(counter)05d %(msgid)s'))
556
stormarticle = self.archiveMsg(m, commit = False)
557
self.indexer.index_article(stormarticle,created= True, commit = False)
558
if end is not None and counter >= end:
561
#update navigation fields
562
self.update_cons_date()
563
self.update_msgs_thread()
565
#commit database and index
567
self.indexer.commit()
569
def archiveMsg(self, msg, commit=True):
    """Archive one message: build its StormArticle row, attach it to (or
    create) its Conversation, and maintain the by-date navigation links.

    Returns the new StormArticle.  When *commit* is false the caller is
    responsible for committing the store (bulk-import path, see mboxToDB).

    NOTE(review): the `else:` branches, the flush and the commit/return
    tail were lost in this copy; reconstructed — confirm against project
    history.  The builtin-shadowing local `id` was renamed.
    """
    article = self._makeArticle(msg)
    # Prefer the MIME-decoded author/subject when decode_headers stored one.
    if 'author' in article.decoded:
        author = fixAuthor(article.decoded['author'])
    else:
        author = fixAuthor(article.author)
    if 'stripped' in article.decoded:
        subject = article.decoded['stripped'].lower()
    else:
        subject = article.subject.lower()

    parent, in_reply = self.get_parent(article)
    # if article's parent exists
    if parent is not None:
        article.thread_addr = parent.thread_addr
        article.threadkey = parent.threadkey + str(article.date) + '.'
        # update corresponding conversation
        conversation = self.store.get(Conversation, parent.thread_addr)
        if conversation is not None:
            conversation.count = conversation.count + 1
            conversation.date = article.date
    else:
        article.thread_addr = unicode(prepare_url(article.thread_addr))
        article.threadkey = str(article.date) + '.'
        # create new conversation object
        conversation = self.store.add(Conversation(article.thread_addr, float(article.date), unicode(article.subject), unicode(author), unicode(article.datestr), 1))

    in_reply_id = None if in_reply is None else in_reply.id
    # update previous and next article sorted by date
    pdate_article = self.store.find(StormArticle).order_by(Desc(StormArticle.date)).first()
    stormarticle = StormArticle(unicode(subject), float(article.date), unicode(article.datestr), unicode(author), unicode(article.email), unicode(article.msgid), in_reply_id, safe_unicode(article.body), article.thread_addr, unicode(article.threadkey))
    self.store.add(stormarticle)
    # Flush so the new row gets its primary key before linking to it.
    self.store.flush()
    if pdate_article is not None:
        pdate_article.ndate_msg = stormarticle.id
        stormarticle.pdate_msg = pdate_article.id
    if commit:
        self.store.commit()
    return stormarticle
614
def get_parent(self, article):
    """Return (parent_article, in_reply_article) for *article*.

    The parent is resolved from In-Reply-To: first, then from References:.

    NOTE(review): the initializers and the loop exit were lost in this
    copy; reconstructed.  Also note Storm's find() returns a (possibly
    empty) result set, never None, so the `is not None` checks look
    vestigial — confirm intent.
    """
    parentID = None
    in_reply = None
    if article.in_reply_to:
        parent = self.store.find(StormArticle, StormArticle.msgid==unicode(article.in_reply_to))
        if parent is not None:
            parentID = parent.any()
            in_reply = parentID
    elif article.references:
        for ref in article.references:
            parent = self.store.find(StormArticle, StormArticle.msgid==unicode(ref))
            if parent is not None:
                parentID = parent.any()
                if parentID is not None:
                    break
    return parentID, in_reply
631
def update_msgs(self):
    """Rebuild the by-date doubly-linked navigation (pdate_msg/ndate_msg)
    across all archived articles.

    NOTE(review): the boundary-guard lines were lost in this copy; without
    guards, index i-1 at i=0 would silently wrap to the last article.
    Reconstructed — confirm against project history.
    """
    # for now, assume id contains the thread_addr
    result = self.store.find(StormArticle)
    if result.is_empty():
        raise EmptyTable("StormArticle table is empty")
    messages = result.order_by(Asc(StormArticle.date))
    count = messages.count()
    for i in range(count):
        if i > 0:
            messages[i].pdate_msg = messages[i-1].id
        if i < count - 1:
            messages[i].ndate_msg = messages[i+1].id
    return 'Finished updating msgs'
646
def update_cons_date(self):
    """Rebuild the by-date navigation links (pdate_con/ndate_con) across
    all conversations.

    NOTE(review): the boundary-guard lines were lost in this copy; without
    guards, index i-1 at i=0 would silently wrap to the last conversation.
    Reconstructed — confirm against project history.
    """
    result = self.store.find(Conversation)
    if result.is_empty():
        raise EmptyTable("Conversation table is empty")
    conversations = result.order_by(Asc(Conversation.date))
    count = conversations.count()
    for i in range(count):
        if i > 0:
            conversations[i].pdate_con = conversations[i-1].thread_addr
        if i < count - 1:
            conversations[i].ndate_con = conversations[i+1].thread_addr
659
def update_msgs_thread(self):
    """Rebuild the thread-order navigation links (pthread_msg/nthread_msg)
    across all articles, ordered by threadkey.

    NOTE(review): the boundary-guard lines were lost in this copy;
    reconstructed to mirror update_msgs — confirm against project history.
    """
    result = self.store.find(StormArticle)
    if result.is_empty():
        raise EmptyTable("StormArticle table is empty")
    messages = result.order_by(Asc(StormArticle.threadkey))
    count = messages.count()
    for i in range(count):
        if i > 0:
            messages[i].pthread_msg = messages[i-1].id
        if i < count - 1:
            messages[i].nthread_msg = messages[i+1].id
672
def update_cons(self, con):
    """Move conversation *con* to the tail of the by-date linked list,
    splicing its old neighbours together first.

    NOTE(review): guard lines were lost in this copy; the None checks are a
    reconstruction — confirm against project history.
    """
    p_con = self.store.get(Conversation, con.pdate_con)
    n_con = self.store.get(Conversation, con.ndate_con)
    # Unlink con by joining its previous and next neighbours.
    if p_con is not None and n_con is not None:
        p_con.ndate_con = n_con.thread_addr
        n_con.pdate_con = p_con.thread_addr
    # Append con after the newest conversation.
    last_con = self.store.find(Conversation).order_by(Desc(Conversation.date)).first()
    if last_con is not None:
        last_con.ndate_con = con.thread_addr
        con.pdate_con = last_con.thread_addr
b'\\ No newline at end of file'