~dushyant37/+junk/Archiver : revision 2

1

2

#

3

# This file is part of GNU Mailman.

4

#

5

# GNU Mailman is free software: you can redistribute it and/or modify it under

6

# the terms of the GNU General Public License as published by the Free

7

# Software Foundation, either version 3 of the License, or (at your option)

8

# any later version.

9

#

10

# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT

11

# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or

12

# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for

13

# more details.

14

#

15

# You should have received a copy of the GNU General Public License along with

16

# GNU Mailman. If not, see <http://www.gnu.org/licenses/>.

17

18

"""HyperArch: Pipermail archiving for Mailman

19

20

- The Dragon De Monsyne <dragondm@integral.org>

21

22

"""

23

24

import os

25

import re

26

import sys

27

import gzip

28

import time

29

import errno

30

import urllib

31

import logging

32

import binascii

33

import mailbox

34

35

from tempfile import mkstemp

36

from shutil import move

37

from email.Charset import Charset

38

from email.Errors import HeaderParseError

39

from email.Header import decode_header, make_header

40

from flufl.lock import Lock, TimeOutError

41

from lazr.config import as_boolean

42

from string import Template

43

from zope.component import getUtility

44

from cStringIO import StringIO

45

from email.utils import parseaddr, parsedate_tz, mktime_tz, formatdate

46

from string import lowercase

47

from storm.locals import *

48

49

from archiver.model.article import StormArticle

50

from archiver.model.conversation import Conversation

51

from archiver.model.mlist import Mlist

52

from archiver.Indexer import Indexer

53

54

from archiver.core.i18n import _

55

#from mailman.config import config

56

#from mailman.core.i18n import _, ctime

57

#from mailman.interfaces.listmanager import IListManager

58

#from mailman.utilities.i18n import find

59

#from mailman.utilities.string import uncanonstr, websafe

60

SPACE = ' '

61

mbox_archive = 1

62

log = logging.getLogger('mailman.error')

63

log1= logging.getLogger('mailman.http')

64

EMPTYSTRING = ''

65

NL = '\n'

66

67

68

# MacOSX has a default stack size that is too small for deeply recursive

69

# regular expressions. We see this as crashes in the Python test suite when

70

# running test_re.py and test_sre.py. The fix is to set the stack limit to

71

# 2048; the general recommendation is to do in the shell before running the

72

# test suite. But that's inconvenient for a daemon like the qrunner.

73

#

74

# AFAIK, this problem only affects the archiver, so we're adding this work

75

# around to this file (it'll get imported by the bundled pipermail or by the

76

# bin/arch script. We also only do this on darwin, a.k.a. MacOSX.

77

if sys.platform == 'darwin':

78

try:

79

import resource

80

except ImportError:

81

pass

82

else:

83

soft, hard = resource.getrlimit(resource.RLIMIT_STACK)

84

newsoft = min(hard, max(soft, 1024*2048))

85

resource.setrlimit(resource.RLIMIT_STACK, (newsoft, hard))

86

87

def prepare_url(subject):

88

subject = re.sub("[^-A-Za-z0-9 ]","",subject)

89

subject = re.sub("[ -]+","-",subject.strip().lower())

90

return subject if len(subject) < 80 else subject[:80]

91

92

def safe_unicode(obj, *args):

93

""" return the unicode representation of obj """

94

try:

95

return unicode(obj, *args)

96

except UnicodeDecodeError:

97

# obj is byte string

98

ascii_text = str(obj).encode('string_escape')

99

return unicode(ascii_text)

100

101

def html_quote(s, langcode=None):

102

repls = ( ('&', '&'),

103

("<", '<'),

104

(">", '>'),

105

('"', '"'))

106

for thing, repl in repls:

107

s = s.replace(thing, repl)

108

return uncanonstr(s, langcode)

109

110

111

def url_quote(s):

112

return urllib.quote(s)

113

114

115

def null_to_space(s):

116

return s.replace('\000', ' ')

117

118

119

def sizeof(filename, lang):

120

try:

121

size = os.path.getsize(filename)

122

except OSError, e:

123

# ENOENT can happen if the .mbox file was moved away or deleted, and

124

# an explicit mbox file name was given to bin/arch.

125

if e.errno <> errno.ENOENT: raise

126

return _('size not available')

127

if size < 1000:

128

with _.using(lang.code):

129

out = _(' %(size)i bytes ')

130

return out

131

elif size < 1000000:

132

return ' %d KB ' % (size / 1000)

133

# GB?? :-)

134

return ' %d MB ' % (size / 1000000)

135

136

137

html_charset = '<META http-equiv="Content-Type" ' \

138

'content="text/html; charset=%s">'

139

140

def CGIescape(arg, lang=None):

141

if isinstance(arg, unicode):

142

s = websafe(arg)

143

else:

144

s = websafe(str(arg))

145

return uncanonstr(s.replace('"', '"'), lang.code)

146

147

# Parenthesized human name

148

paren_name_pat = re.compile(r'([(].*[)])')

149

150

# Subject lines preceded with 'Re:'

151

REpat = re.compile( r"\s*RE\s*(\[\d+\]\s*)?:\s*", re.IGNORECASE)

152

153

# E-mail addresses and URLs in text

154

emailpat = re.compile(r'([-+,.\w]+@[-+.\w]+)')

155

156

# Argh! This pattern is buggy, and will choke on URLs with GET parameters.

157

urlpat = re.compile(r'(\w+://[^>)\s]+)') # URLs in text

158

159

# Blank lines

160

blankpat = re.compile(r'^\s*$')

161

162

# Starting <html> directive

163

htmlpat = re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE)

164

# Ending </html> directive

165

nohtmlpat = re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE)

166

# Match quoted text

167

quotedpat = re.compile(r'^([>|:]|>)+')

168

169

170

171

_templatefilepathcache = {}

172

173

174

msgid_pat = re.compile(r'(<.*>)')

175

def strip_separators(s):

176

"Remove quotes or parenthesization from a Message-ID string"

177

if not s:

178

return ""

179

if s[0] in '"<([' and s[-1] in '">)]':

180

s = s[1:-1]

181

return s

182

183

smallNameParts = ['van', 'von', 'der', 'de']

184

185

def fixAuthor(author):

186

"Canonicalize a name into Last, First format"

187

# If there's a comma, guess that it's already in "Last, First" format

188

if ',' in author:

189

return author

190

L = author.split()

191

i = len(L) - 1

192

if i == 0:

193

return author # The string's one word--forget it

194

if author.upper() == author or author.lower() == author:

195

# Damn, the name is all upper- or lower-case.

196

while i > 0 and L[i-1].lower() in smallNameParts:

197

i = i - 1

198

else:

199

# Mixed case; assume that small parts of the last name will be

200

# in lowercase, and check them against the list.

201

while i>0 and (L[i-1][0] in lowercase or

202

L[i-1].lower() in smallNameParts):

203

i = i - 1

204

author = SPACE.join(L[-1:] + L[i:-1]) + ', ' + SPACE.join(L[:i])

205

return author

206

207

# The Article class encapsulates a single posting. The attributes are:

208

#

209

# sequence : Sequence number, unique for each article in a set of archives

210

# subject : Subject

211

# datestr : The posting date, in human-readable format

212

# date : The posting date, in purely numeric format

213

# fromdate : The posting date, in `unixfrom' format

214

# headers : Any other headers of interest

215

# author : The author's name (and possibly organization)

216

# email : The author's e-mail address

217

# msgid : A unique message ID

218

# in_reply_to : If !="", this is the msgid of the article being replied to

219

# references: A (possibly empty) list of msgid's of earlier articles in

220

# the thread

221

# body : A string making up the message body

222

223

class Article:

224

_last_article_time = time.time()

225

226

def __init__(self, message=None, sequence=0, keepHeaders=[],

227

lang=None, mlist=None):

228

if message is None:

229

return

230

self.sequence = sequence

231

232

self.parentID = None

233

self.threadKey = None

234

# otherwise the current sequence number is used.

235

id = strip_separators(message['Message-Id'])

236

if id == "":

237

self.msgid = str(self.sequence)

238

else: self.msgid = id

239

240

if message.has_key('Subject'):

241

self.subject = str(message['Subject'])

242

else:

243

self.subject = _('No subject')

244

if self.subject == "": self.subject = _('No subject')

245

246

self._set_date(message)

247

248

# Figure out the e-mail address and poster's name. Use the From:

249

# field first, followed by Reply-To:

250

self.author, self.email = parseaddr(message.get('From', ''))

251

e = message['Reply-To']

252

if not self.email and e is not None:

253

ignoreauthor, self.email = parseaddr(e)

254

self.email = strip_separators(self.email)

255

self.author = strip_separators(self.author)

256

257

if self.author == "":

258

self.author = self.email

259

260

# Save the In-Reply-To:, References:, and Message-ID: lines

261

#

262

# TBD: The original code does some munging on these fields, which

263

# shouldn't be necessary, but changing this may break code. For

264

# safety, I save the original headers on different attributes for use

265

# in writing the plain text periodic flat files.

266

self._in_reply_to = message['in-reply-to']

267

self._references = message['references']

268

self._message_id = message['message-id']

269

270

i_r_t = message['In-Reply-To']

271

if i_r_t is None:

272

self.in_reply_to = ''

273

274

else:

275

match = msgid_pat.search(i_r_t)

276

if match is None:

277

self.in_reply_to = ''

278

else:

279

self.in_reply_to = strip_separators(match.group(1))

280

if not self.in_reply_to=='':

281

self._in_reply_to = self.in_reply_to

282

283

references = message['References']

284

if references is None:

285

self.references = []

286

else:

287

self.references = map(strip_separators, references.split())

288

289

# Save any other interesting headers

290

self.headers = {}

291

for i in keepHeaders:

292

if message.has_key(i):

293

self.headers[i] = message[i]

294

295

# Read the message body

296

s = StringIO(message.get_payload(decode=True)\

297

or message.as_string().split('\n\n',1)[1])

298

# self.body = s.readlines()

299

self.body = s.read()

300

301

self.prev = None

302

self.next = None

303

# Trim Re: from the subject line

304

i = 0

305

while i != -1:

306

result = REpat.match(self.subject)

307

if result:

308

i = result.end(0)

309

self.subject = self.subject[i:]

310

else:

311

i = -1

312

# Useful to keep around

313

self._lang = lang

314

self._mlist = mlist

315

316

# thread_addr is for dlists

317

self.thread_addr = ''

318

addrs_to_try = ['To', 'X-Original-To', 'Delivered-To', 'Cc', 'Bcc']

319

i = 0

320

while (self.thread_addr == None or self.thread_addr.find('+') == -1 or self.thread_addr.find(',') != -1 or self.thread_addr.find(' ') != -1) and i < len(addrs_to_try):

321

self.thread_addr = message[addrs_to_try[i]]

322

i += 1

323

if self.thread_addr != None:

324

self.thread_addr = self.thread_addr.replace('>', '').replace('<', '')

325

else:

326

self.thread_addr = self.strip_subject(self.subject)

327

328

# if as_boolean(config.archiver.pipermail.obscure_email_addresses):

329

# # Avoid i18n side-effects. Note that the language for this

330

# # article (for this list) could be different from the site-wide

331

# # preferred language, so we need to ensure no side-effects will

332

# # occur. Think what happens when executing bin/arch.

333

# with _.using(lang.code):

334

# if self.author == self.email:

335

# self.author = self.email = re.sub('@', _(' at '),

336

# self.email)

337

# else:

338

# self.email = re.sub('@', _(' at '), self.email)

339

# Snag the content-* headers. RFC 1521 states that their values are

340

# case insensitive.

341

ctype = message.get('Content-Type', 'text/plain')

342

cenc = message.get('Content-Transfer-Encoding', '')

343

self.ctype = ctype.lower()

344

self.cenc = cenc.lower()

345

self.decoded = {}

346

cset = 'us-ascii' #mlist.preferred_language.charset

347

cset_out = Charset(cset).output_charset or cset

348

charset = message.get_content_charset(cset_out)

349

if charset:

350

charset = charset.lower().strip()

351

if charset[0]=='"' and charset[-1]=='"':

352

charset = charset[1:-1]

353

if charset[0]=="'" and charset[-1]=="'":

354

charset = charset[1:-1]

355

try:

356

body = message.get_payload(decode=True)

357

except binascii.Error:

358

body = None

359

if body and charset != 'us-ascii':

360

# decode body

361

try:

362

body = unicode(body, charset)

363

except (UnicodeError, LookupError):

364

body = None

365

if body:

366

# self.body = [l + "\n" for l in body.splitlines()]

367

self.body = body

368

369

self.decode_headers()

370

371

def quote(self, buf):

372

return html_quote(buf, self._lang.code)

373

374

def decode_headers(self):

375

"""MIME-decode headers.

376

377

If the email, subject, or author attributes contain non-ASCII

378

characters using the encoded-word syntax of RFC 2047, decoded versions

379

of those attributes are placed in the self.decoded (a dictionary).

380

381

If the list's charset differs from the header charset, an attempt is

382

made to decode the headers as Unicode. If that fails, they are left

383

undecoded.

384

"""

385

author = self.decode_charset(self.author)

386

subject = self.decode_charset(self.subject)

387

if author:

388

self.decoded['author'] = author

389

email = self.decode_charset(self.email)

390

if email:

391

self.decoded['email'] = email

392

if subject:

393

# if as_boolean(config.archiver.pipermail.obscure_email_addresses):

394

# with _.using(self._lang.code):

395

# atmark = _(' at ')

396

# subject = re.sub(r'([-+,.\w]+)@([-+.\w]+)',

397

# '\g<1>' + atmark + '\g<2>', subject)

398

self.decoded['subject'] = subject

399

self.decoded['stripped'] = self.strip_subject(subject or self.subject)

400

401

def strip_subject(self, subject):

402

# Strip subject_prefix and Re: for subject sorting

403

# This part was taken from CookHeaders.py (TK)

404

prefix = None #self._mlist.subject_prefix.strip()

405

if prefix:

406

prefix_pat = re.escape(prefix)

407

prefix_pat = '%'.join(prefix_pat.split(r'\%'))

408

prefix_pat = re.sub(r'%\d*d', r'\s*\d+\s*', prefix_pat)

409

subject = re.sub(prefix_pat, '', subject)

410

subject = subject.lstrip()

411

strip_pat = re.compile('^((RE|AW|SV|VS)(\[\d+\])?:\s*)+', re.I)

412

stripped = strip_pat.sub('', subject)

413

return stripped

414

415

def decode_charset(self, field):

416

# TK: This function was rewritten for unifying to Unicode.

417

# Convert 'field' into Unicode one line string.

418

try:

419

pairs = decode_header(field)

420

ustr = make_header(pairs).__unicode__()

421

except (LookupError, UnicodeError, ValueError, HeaderParseError):

422

# assume list's language

423

cset = 'us-ascii' #self._mlist.preferred_language.charset

424

if cset == 'us-ascii':

425

cset = 'iso-8859-1' # assume this for English list

426

ustr = unicode(field, cset, 'replace')

427

return u''.join(ustr.splitlines())

428

429

def _get_subject_enc(self, art):

430

"""Return the subject of art, decoded if possible.

431

432

If the charset of the current message and art match and the

433

article's subject is encoded, decode it.

434

"""

435

return art.decoded.get('subject', art.subject)

436

437

_rx_quote = re.compile('=([A-F0-9][A-F0-9])')

438

_rx_softline = re.compile('=[ \t]*$')

439

440

def _get_body(self):

441

"""Return the message body ready for HTML, decoded if necessary"""

442

body = self.body

443

return null_to_space(EMPTYSTRING.join(body))

444

445

def _set_date(self, message):

446

def floatdate(header):

447

missing = []

448

datestr = message.get(header, missing)

449

if datestr is missing:

450

return None

451

date = parsedate_tz(datestr)

452

try:

453

return mktime_tz(date)

454

except (TypeError, ValueError, OverflowError):

455

return None

456

date = floatdate('date')

457

if date is None:

458

date = floatdate('x-list-received-date')

459

if date is None:

460

# What's left to try?

461

date = self._last_article_time + 1

462

self._last_article_time = date

463

self.date = date

464

# self.date = '%011i' % date

465

print type(self.date)

466

self.datestr = message.get('date') \

467

or message.get('x-list-received-date') \

468

or formatdate(date)

469

470

self.fromdate = time.ctime(int(self.date))

471

472

class T:

473

DIRMODE = 0755 # Mode to give to created directories

474

475

def __init__(self, basedir = None, mlist = None):

476

# If basedir isn't provided, assume the current directory

477

self.mlist = mlist

478

if basedir is None:

479

basedir = os.getcwd()

480

else:

481

basedir = os.path.expanduser(basedir)

482

self.listdir = os.path.join(basedir,mlist)

483

# If the directory doesn't exist, create it.

484

for dir in (basedir, self.listdir):

485

try:

486

os.stat(dir)

487

except os.error, errdata:

488

errno, errmsg = errdata

489

if errno != 2:

490

raise os.error, errdata

491

else:

492

omask = os.umask(0)

493

try:

494

os.mkdir(dir, self.DIRMODE)

495

finally:

496

os.umask(omask)

497

#find db_path

498

db_path = os.path.join(self.listdir,'archives.db')

499

self.store = Store(create_database('sqlite:///'+db_path))

500

501

#insert mlist into mlist db.

502

mlist_db= os.path.join(basedir,'mlists.db')

503

store = Store(create_database('sqlite:///'+mlist_db))

504

try:

505

store.execute("CREATE TABLE mlist " "(list_name VARCHAR(255), db_path VARCHAR(255), id INTEGER PRIMARY KEY)")

506

except:

507

pass

508

if store.find(Mlist, Mlist.list_name==unicode(self.mlist)).is_empty():

509

store.add(Mlist(self.mlist,self.listdir))

510

store.commit()

511

512

#find index_path

513

index_path = os.path.join(self.listdir,'archives.index')

514

self.indexer = Indexer(index_path)

515

516

def db_init(self):

517

try:

518

self.store.execute("CREATE TABLE article " "(subject VARCHAR(255), date FLOAT, datestr VARCHAR(255), author VARCHAR(255), email VARCHAR(255), msgid VARCHAR(255), id INTEGER PRIMARY KEY, in_reply_to VARCHAR(255), body TEXT, thread_addr VARCHAR(255), threadkey VARCHAR(255), pdate_msg INTEGER, ndate_msg INTEGER, pthread_msg INTEGER, nthread_msg INTEGER )")

519

except:

520

pass

521

try:

522

self.store.execute("CREATE TABLE conversation" "(subject VARCHAR(255), date FLOAT, datestr VARCHAR(255), thread_addr VARCHAR(255), author VARCHAR(255), count INTEGER, pdate_con VARCHAR(255), ndate_con VARCHAR(255))")

523

except:

524

pass

525

526

def _makeArticle(self, msg):

527

return Article(msg)

528

529

def mboxToDB(self, path, start=None, end=None):

530

mbox = iter(mailbox.mbox(path))

531

if start is None:

532

start = 0

533

counter = 0

534

while counter < start:

535

try:

536

m = next(mbox)

537

except errors.DiscardMessage:

538

continue

539

if m is None:

540

return

541

counter += 1

542

while True:

543

try:

544

m = next(mbox)

545

except StopIteration:

546

break

547

except errors.DiscardMessage:

548

continue

549

except Exception:

550

log.error('uncaught archiver exception')

551

raise

552

if m == '':

553

# It was an unparseable message

554

continue

555

# self.message(_('#%(counter)05d %(msgid)s'))

556

stormarticle = self.archiveMsg(m, commit = False)

557

self.indexer.index_article(stormarticle,created= True, commit = False)

558

if end is not None and counter >= end:

559

break

560

counter += 1

561

#update navigation fields

562

self.update_cons_date()

563

self.update_msgs_thread()

564

565

#commit database and index

566

self.store.commit()

567

self.indexer.commit()

568

569

def archiveMsg(self, msg, commit=True):

570

article = self._makeArticle(msg)

571

if article.decoded.has_key('author'):

572

author = fixAuthor(article.decoded['author'])

573

else:

574

author = fixAuthor(article.author)

575

if article.decoded.has_key('stripped'):

576

subject = article.decoded['stripped'].lower()

577

else:

578

subject = article.subject.lower()

579

580

parent, in_reply = self.get_parent(article)

581

582

#if article's parent exists

583

if parent is not None:

584

article.thread_addr = parent.thread_addr

585

article.threadkey = parent.threadkey + str(article.date) + '.'

586

#update corresponding conversation

587

conversation = self.store.get(Conversation, parent.thread_addr)

588

if conversation is not None:

589

conversation.count = conversation.count +1

590

conversation.date = article.date

591

else:

592

article.thread_addr = unicode(prepare_url(article.thread_addr))

593

article.threadkey = str(article.date) + '.'

594

#create new conversation object

595

conversation = self.store.add(Conversation(article.thread_addr, float(article.date), unicode(article.subject), unicode(author), unicode(article.datestr), 1))

596

597

id = None if in_reply is None else in_reply.id

598

599

#update previous and next article sorted by date

600

pdate_article = self.store.find(StormArticle).order_by(Desc(StormArticle.date)).first()

601

602

stormarticle = StormArticle(unicode(subject), float(article.date), unicode (article.datestr), unicode(author), unicode(article.email), unicode(article.msgid), id, safe_unicode(article.body), article.thread_addr, unicode(article.threadkey))

603

self.store.add(stormarticle)

604

self.store.flush()

605

606

if pdate_article is not None:

607

pdate_article.ndate_msg = stormarticle.id

608

stormarticle.pdate_msg = pdate_article.id

609

610

if commit:

611

self.store.commit()

612

return stormarticle

613

614

def get_parent(self, article):

615

parentID = None

616

in_reply = None

617

if article.in_reply_to:

618

parent = self.store.find(StormArticle, StormArticle.msgid==unicode(article.in_reply_to))

619

if parent is not None:

620

parentID = parent.any()

621

in_reply = parentID

622

elif article.references:

623

for ref in article.references:

624

parent = self.store.find(StormArticle, StormArticle.msgid==unicode(ref))

625

if parent is not None:

626

parentID = parent.any()

627

if parentID:

628

break

629

return parentID, in_reply

630

631

def update_msgs(self):

632

# for now, assume id contains the thread_addr

633

result = self.store.find(StormArticle)

634

if result.is_empty():

635

return

636

messages = result.order_by(Asc(StormArticle.date))

637

for i in range(messages.count()):

638

messages[i].pdate_msg = messages[i-1].id

639

try:

640

messages[i].ndate_msg = messages[i+1].id

641

except IndexError:

642

pass

643

self.store.commit()

644

return 'Finished updating msgs'

645

646

def update_cons_date(self):

647

result = self.store.find(Conversation)

648

if result.is_empty():

649

raise EmptyTable("Conversation table is empty")

650

conversations = result.order_by(Asc(Conversation.date))

651

for i in range(conversations.count()):

652

conversations[i].pdate_con = conversations[i-1].thread_addr

653

try:

654

conversations[i].ndate_con = conversations[i+1].thread_addr

655

except IndexError:

656

pass

657

self.store.commit()

658

659

def update_msgs_thread(self):

660

result = self.store.find(StormArticle)

661

if result.is_empty():

662

raise EmptyTable("StormArticle table is empty")

663

messages = result.order_by(Asc(StormArticle.threadkey))

664

for i in range(messages.count()):

665

messages[i].pthread_msg = messages[i-1].id

666

try:

667

messages[i].nthread_msg = messages[i+1].id

668

except IndexError:

669

pass

670

self.store.commit()

671

672

def update_cons(self, con):

673

p_con = self.store.get(Conversation, con.pdate_con)

674

n_con = self.store.get(Conversation, con.ndate_con)

675

676

p_con.ndate_con = n_con.thread_addr

677

n_con.pdate_con = p_con.thread_addr

678

679

last_con = self.store.find(Conversation).order_by(Desc(Conversation.date)).first()

680

last_con.ndate_con = con.thread_addr

681

con.pdate_con = last_con.thread_addr

b'\\ No newline at end of file'