1
/* $Id: Index.cpp,v 1.31 2004/08/27 17:53:44 terpstra Exp $
3
* index.cpp - Insert all the keywords from the given email
5
* Copyright (C) 2002 - Wesley W. Terpstra
9
* Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca>
11
* This program is free software; you can redistribute it and/or modify
12
* it under the terms of the GNU General Public License as published by
13
* the Free Software Foundation; version 2.
15
* This program is distributed in the hope that it will be useful,
16
* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
* GNU General Public License for more details.
20
* You should have received a copy of the GNU General Public License
21
* along with this program; if not, write to the Free Software
22
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
#define _XOPEN_SOURCE 500
26
#define _FILE_OFFSET_BITS 64
28
#include <mimelib/headers.h>
29
#include <mimelib/datetime.h>
30
#include <mimelib/addrlist.h>
31
#include <mimelib/address.h>
32
#include <mimelib/group.h>
33
#include <mimelib/mboxlist.h>
34
#include <mimelib/mailbox.h>
35
#include <mimelib/text.h>
36
#include <mimelib/param.h>
37
#include <mimelib/enum.h>
38
#include <mimelib/body.h>
39
#include <mimelib/bodypart.h>
40
#include <mimelib/utility.h>
42
#include <CharsetEscape.h>
58
#define MAX_MESSAGE_ID 80
60
// Shorten `str` in place to at most `len` bytes without leaving a partial
// UTF-8 sequence at the end.
//
//   str - UTF-8 encoded text; modified in place.
//   len - maximum number of bytes to keep.
//
// A string that already fits in `len` bytes is left untouched. When the cut
// lands on a character boundary, exactly `len` bytes are kept; only when it
// lands inside a multi-byte sequence is that whole sequence removed.
void utf8Truncate(std::string& str, std::string::size_type len)
{
	// Nothing to cut (this includes length == len, which must not be cropped).
	if (str.length() <= len) return;

	// str[len] is the first byte that would be dropped. If it is a
	// continuation byte (10xxxxxx), the cut is inside a multi-byte sequence:
	// back up until the cut sits in front of that sequence's start byte.
	while (len && (((unsigned char)str[len]) & 0xC0) == 0x80)
		--len;

	// Malformed input only: a start byte that is not followed by a
	// continuation byte would be left dangling at the end, so rewind it too.
	while (len && ((unsigned char)str[len-1]) >= 0xC0)
		--len;

	// len is now at the end of a complete multi-byte element or ascii
	str.resize(len);
}
77
// Scan the address chain starting at `a` (advanced with Next()) for a
// mailbox to use as the author.
//   a       - first address of the chain; the loop stops at a null pointer.
//   charset - passed to decode_header() when decoding the display name.
// Returns a pair of strings; the fall-through result is a pair of empty
// strings.
// NOTE(review): this listing is missing lines (braces and some statements),
// so the exact nesting of the branches below cannot be confirmed from here.
// first = address, second = name
78
pair<string, string> pickAddress(DwAddress* a, const char* charset)
80
for (; a != 0; a = a->Next())
84
// Group entry: the visible fragment hands the first mailbox of the group on
// (presumably to a recursive pickAddress call - the call line is not visible).
DwGroup* g = dynamic_cast<DwGroup*>(a);
87
pair<string, string> out =
89
g->MailboxList().FirstMailbox(),
91
// A non-empty address from the group ends the search.
if (out.first != "") return out;
96
// Plain mailbox entry: decode the display name and rebuild local@domain.
DwMailbox* m = dynamic_cast<DwMailbox*>(a);
99
string name = m->FullName().c_str();
100
name = decode_header(name, charset);
101
DwString addr = m->LocalPart() + "@" + m->Domain();
103
// malformed address? (one cannot safely cut this)
104
if (addr.length() > 128 ||
105
m->LocalPart() == "" || m->Domain() == "")
110
// Bytes at or below 0x20 and at or above 0x7f mark the address as malformed.
// NOTE(review): the upper test only catches high bytes if addr[i] yields an
// unsigned value - confirm the element type of DwString.
for (size_t i = 0; i < addr.length(); ++i)
112
if (addr[i] <= 0x20 || addr[i] >= 0x7f)
113
{ // malformed address
119
// prune any optional quotes
// NOTE(review): only the first character is tested for a double quote, yet
// the last character is dropped unconditionally - confirm a closing quote
// is always present.
120
if (name.length() >= 2 && name[0] == '"')
121
name = name.substr(1, name.length()-2);
124
return pair<string, string>(addr.c_str(), name);
129
return pair<string, string>("", "");
132
// Work out the message charset and the author email/name from the headers.
// Writes charset, author_email and author_name (not declared in this
// function; presumably Index members). Candidates come from Reply-To, then
// From, then Sender: Reply-To assigns both fields directly, the later
// headers only fill fields that are still empty.
// NOTE(review): this listing is missing lines (braces, the trailing
// pickAddress arguments, some branch bodies and the return statement).
int Index::index_author()
134
// one always has headers, but not always this function:
135
// if (message.hasHeaders())
137
charset = "ISO-8859-1"; // a good default as any
139
if (message.Headers().HasContentType())
141
DwParameter* p = message.Headers().ContentType().FirstParameter();
144
// NOTE(review): the attribute is compared case-sensitively here, whereas
// index_keywords lower-cases it before comparing - confirm this is intended.
if (p->Attribute() == "charset")
145
charset = p->Value().c_str();
150
// pickAddress only gives an author_name if it gave an author_email
152
if (message.Headers().HasReplyTo())
154
pair<string, string> addr = pickAddress(
155
message.Headers().ReplyTo().FirstAddress(),
158
author_email = addr.first;
159
author_name = addr.second;
161
// Some evil mailing lists set reply-to the list.
// (The body of this test is not visible; presumably the list address is
// discarded as an author - confirm.)
162
if (author_email == list.address)
169
// Given a reply-to that is not the list, we allow the from to
170
// provide a fullname under the assumption it is the same person.
172
if (message.Headers().HasFrom())
174
pair<string, string> addr = pickAddress(
175
message.Headers().From().FirstMailbox(),
178
if (!author_email.length()) author_email = addr.first;
179
if (!author_name .length()) author_name = addr.second;
184
// Sender is the last fallback and again only fills empty fields.
if (message.Headers().HasSender())
186
pair<string, string> addr = pickAddress(
187
&message.Headers().Sender(),
190
if (!author_email.length()) author_email = addr.first;
191
if (!author_name .length()) author_name = addr.second;
194
// Cap the display name at 100 bytes.
utf8Truncate(author_name, 100);
195
// - nothing longer than 128 could get here (from above)
196
// - one can never safely truncate an email address
197
// utf8Truncate(author_email, 100);
202
// Whitespace test that does not vary with charset: true only for space,
// newline, carriage return and horizontal tab (form feed and vertical tab
// are not treated as whitespace).
inline bool lu_isspace(char x)
{
	return x == ' ' || x == '\n' || x == '\r' || x == '\t';
}
208
// Fold a hash of the NUL-terminated string `str` down to 4 bytes.
//   str  - input text; its length is taken with strlen().
//   hash - output, at least 4 bytes; hash[k] receives the XOR of
//          buf[k], buf[k+4], buf[k+8] and buf[k+12].
// NOTE(review): the lines that declare and initialise `ctx` and that fill
// `buf` are not visible in this listing; presumably buf holds the 16-byte
// MD5 digest (MD5Init / MD5Final) - confirm.
void build_message_hash(const char* str, unsigned char* hash)
213
MD5Update(&ctx, (const unsigned char*)str, strlen(str));
215
unsigned char buf[16];
218
hash[0] = buf[0] ^ buf[4] ^ buf[ 8] ^ buf[12];
219
hash[1] = buf[1] ^ buf[5] ^ buf[ 9] ^ buf[13];
220
hash[2] = buf[2] ^ buf[6] ^ buf[10] ^ buf[14];
221
hash[3] = buf[3] ^ buf[7] ^ buf[11] ^ buf[15];
224
// Callback handed to my_keyword_digest_string() together with an Index
// pointer as its argument.
//   keyword - the keyword reported by the digester.
//   arg     - the Index that owns the writer, passed as void*.
// Builds a key that starts with LU_KEYWORD and returns the result of
// inserting it through the writer of that Index.
// NOTE(review): the lines that append to `x` (presumably the keyword and
// the message id) are not visible in this listing - confirm.
int feed_writer(const char* keyword, void* arg)
226
Index* i = (Index*)arg;
228
string x(LU_KEYWORD);
233
return i->writer->insert(x);
236
// Choose the timestamp and the 4-byte hash for this message, build `id`
// from them, and insert the message-id related keywords.
//   userdate - when true, the Date header passes the plausibility test
//              below regardless of how it compares to `server`.
//   server   - timestamp used by default (stamp starts out as this value).
// Hash source, in order of preference: the Message-Id, else the author
// email, else four random bytes.
// NOTE(review): this listing is missing lines (braces, branch bodies, the
// tail of both insert calls and the return statements), and the block
// comment in the Date branch has no visible terminator.
int Index::index_id(bool userdate, time_t server)
238
time_t stamp = server;
240
unsigned char hash[4];
242
// if (message.hasHeaders())
244
if (message.Headers().HasDate())
246
time_t user = message.Headers().Date().AsUnixTime();
248
/* User time must be earlier; there is delivery delay!
249
* However, more than 7 day delivery time is unlikely.
251
if ((user <= server && server < user+7*60*60*24) ||
252
userdate || // trusting the userdate?
253
server <= 0) // server is on crack?
258
{ // this is crazy; I don't care if they agree: it's wrong
259
stamp = 1; // liars all have timestamp 1970-01-01 00:00:01
262
if (message.Headers().HasMessageId())
264
vector<string> ids = extract_message_ids(
265
message.Headers().MessageId().AsString().c_str());
268
// NOTE(review): front() needs a non-empty vector; the guard is presumably
// on a line missing from this listing - confirm.
messageId = ids.front();
271
if (messageId.length())
273
// Constant message-id across import, and threadable
274
build_message_hash(messageId.c_str(), hash);
276
else if (author_email.length())
278
// This means no proper threading.
279
// At least the message-id is constant across import.
280
build_message_hash(author_email.c_str(), hash);
284
// Can't make any guarantees; just import it.
285
hash[0] = random() % 256;
286
hash[1] = random() % 256;
287
hash[2] = random() % 256;
288
hash[3] = random() % 256;
291
id = MessageId(stamp, hash);
293
// The message-id keyword is only written when a Message-Id was found.
if (messageId.length() && writer->insert(
295
string(LU_KEYWORD_MESSAGE_ID) +
300
cerr << "Failed to insert message id keyword!" << endl;
306
string(LU_KEYWORD_EVERYTHING) +
310
// NOTE(review): the wording of the message below looks garbled.
cerr << "Failed to the any keyword!" << endl;
317
// Write the summary records of the message: author email, author name,
// subject, and an mbox record carrying 12 raw bytes. All keys start with
// LU_SUMMARY + id.raw().
//   check - NOTE(review): the line that reads it is not visible; presumably
//           it enables the existence lookup below - confirm.
//   exist - NOTE(review): the line that writes it is not visible; presumably
//           set when the mbox record is already present - confirm.
// Returns -1 when the lookup fails with errno set; other return paths are
// on lines missing from this listing.
int Index::index_summary(bool check, bool& exist)
319
string prefix = LU_SUMMARY + id.raw();
321
if (message.Headers().HasSubject())
323
subject = message.Headers().Subject().AsString().c_str();
324
// Decode the subject using the charset stored earlier for this message.
subject = decode_header(subject, charset.c_str());
330
string mbox = prefix + LU_MESSAGE_MBOX + list.mbox + '\0';
334
// Check for existence
// NOTE(review): auto_ptr was removed in C++17; unique_ptr is the replacement
// should the toolchain be upgraded.
335
auto_ptr<ESort::Walker> w(writer->seek(mbox, "", ESort::Forward));
337
if (w->advance() == -1)
338
{ // was it just eof?
339
if (errno != 0) return -1;
342
{ // if it succeeded, then ... it is already in there
348
// 12-byte payload appended to the mbox key below. The two loops fill bytes
// 7..0 and 11..8; their bodies are not visible (presumably two integers
// stored most significant byte first - confirm).
unsigned char buf[12];
353
for (i = 7; i >= 0; --i)
358
for (i = 11; i >= 8; --i)
364
// Don't let crazy stuff in there.
365
utf8Truncate(subject, 200);
367
if (writer->insert(prefix + LU_MESSAGE_AUTHOR_EMAIL + author_email) != 0 ||
368
writer->insert(prefix + LU_MESSAGE_AUTHOR_NAME + author_name) != 0 ||
369
writer->insert(prefix + LU_MESSAGE_SUBJECT + subject) != 0 ||
370
writer->insert(mbox + string((char*)buf, 12)) != 0)
372
cerr << "Failed to insert summary keys" << endl;
379
// Insert the keys used for threading. A subject hash is computed, and the
// 4-byte hashes of the ids named in In-Reply-To (in header order) and in
// References (last to first) are appended to `suffix`.
// NOTE(review): the declaration of `suffix`, every insert call and the
// return statements are on lines missing from this listing; only their
// error messages are visible.
int Index::index_threading()
381
string shash = subject_hash(subject.c_str());
384
unsigned char hash[4];
393
cerr << "Failed to insert threading keyword" << endl;
397
// if (message.hasHeaders())
399
if (message.Headers().HasInReplyTo())
401
vector<string> ids = extract_message_ids(
402
message.Headers().InReplyTo().AsString().c_str());
404
// first in-reply-to is most relevant
405
for (vector<string>::iterator i = ids.begin(); i != ids.end(); ++i)
407
build_message_hash(i->c_str(), hash);
409
// keep it reasonable; too many reply-tos is bad
// (hashes are 4 bytes each and appending stops once suffix reaches 200 bytes)
410
if (suffix.length() < 200)
411
suffix.append((const char*)hash, 4);
415
if (message.Headers().HasReferences())
417
vector<string> ids = extract_message_ids(
418
message.Headers().References().AsString().c_str());
420
// last references is most recently added (most likely irt)
421
for (vector<string>::reverse_iterator i = ids.rbegin();
422
i != ids.rend(); ++i)
424
build_message_hash(i->c_str(), hash);
425
// keep it reasonable; too many reply-tos is bad
426
if (suffix.length() < 200)
427
suffix.append((const char*)hash, 4);
437
cerr << "Failed to insert threading keys" << endl;
444
// Only the first 4 bytes of the raw id are used in this key fragment.
+ id.raw().substr(0, 4)
447
cerr << "Failed to insert new topics keys" << endl;
454
// Insert the control keywords of the message: several keys ending in
// id.raw() (among them a language key and an import-time key), the digested
// keywords of author email, author name and subject, and one reply-to key
// per id in In-Reply-To. Failures clear the `ok` flag.
//   import - import time; wrapped in a MessageId whose first 4 raw bytes
//            go into the import key.
// NOTE(review): the declaration of `ok`, the heads of the insert calls and
// the return statements are on lines missing from this listing, and the
// block comment below has no visible terminator.
int Index::index_control(time_t import)
462
id.raw()) != 0) ok = false;
464
/* emulated group and language searches are impossibly slow.
465
* these keywords are a must for large archives.
466
* see the regroupable option in the stock lurker.conf
473
id.raw()) != 0) ok = false;
477
LU_KEYWORD_LANGUAGE +
480
id.raw()) != 0) ok = false;
482
MessageId importStamp(import);
485
importStamp.raw().substr(0, 4) +
486
id.raw()) != 0) ok = false;
488
// Author email and author name are both digested under LU_KEYWORD_AUTHOR;
// feed_writer receives each resulting keyword with this Index as argument.
if (author_email.length())
490
if (my_keyword_digest_string(
491
author_email.c_str(), author_email.length(),
492
LU_KEYWORD_AUTHOR, &feed_writer, this, 1) != 0)
496
if (author_name.length())
498
if (my_keyword_digest_string(
499
author_name.c_str(), author_name.length(),
500
LU_KEYWORD_AUTHOR, &feed_writer, this, 1) != 0)
504
if (subject.length())
506
if (my_keyword_digest_string(
507
subject.c_str(), subject.length(),
508
LU_KEYWORD_SUBJECT, &feed_writer, this, 1) != 0)
512
// One key per referenced id: LU_KEYWORD_REPLY_TO, the id, a NUL byte, then
// the raw id of this message.
if (message.Headers().HasInReplyTo())
514
vector<string> ids = extract_message_ids(
515
message.Headers().InReplyTo().AsString().c_str());
516
for (vector<string>::iterator i = ids.begin(); i != ids.end(); ++i)
519
LU_KEYWORD_REPLY_TO +
520
*i + '\0' + id.raw()) != 0)
524
// Disabled variant that would do the same for the References header.
#if 0 // this is questionable...
525
if (message.Headers().HasReferences())
527
vector<string> ids = extract_message_ids(
528
message.Headers().References().AsString().c_str());
529
for (vector<string>::iterator i = ids.begin(); i != ids.end(); ++i)
532
LU_KEYWORD_REPLY_TO +
533
*i + '\0' + id.raw()) != 0)
540
cerr << "Failed to insert control keys" << endl;
547
// Index the words of one MIME entity body.
//   e       - entity whose body is decoded and indexed.
//   charset - charset name given to CharsetEscape for the conversion.
// The body is decoded according to its Content-Transfer-Encoding
// (quoted-printable and base64 are decoded, the other listed encodings are
// taken as is), converted through CharsetEscape, and digested into
// LU_KEYWORD_WORD keywords via feed_writer.
// NOTE(review): the declaration of `text`, the break statements and the
// return statements are on lines missing from this listing.
int Index::index_entity(DwEntity& e, const string& charset)
550
if (e.Headers().HasContentTransferEncoding())
552
switch (e.Headers().ContentTransferEncoding().AsEnum())
554
case DwMime::kCteQuotedPrintable:
555
DwDecodeQuotedPrintable(e.Body().AsString(), text);
558
case DwMime::kCteBase64:
559
DwDecodeBase64(e.Body().AsString(), text);
562
case DwMime::kCteNull:
563
case DwMime::kCteUnknown:
564
case DwMime::kCte7bit:
565
case DwMime::kCte8bit:
566
case DwMime::kCteBinary:
567
text = e.Body().AsString();
573
// Second raw copy: presumably the branch taken when the entity has no
// Content-Transfer-Encoding header - confirm against the full source.
text = e.Body().AsString();
576
CharsetEscape decode(charset.c_str());
577
string utf8 = decode.write(text.c_str(), text.length());
579
if (my_keyword_digest_string(
580
utf8.c_str(), utf8.length(),
581
LU_KEYWORD_WORD, &feed_writer, this, 1) != 0)
583
cerr << "Failed to index un-typed segment" << endl;
590
// Recursively index the keywords of a MIME entity.
//   e             - entity to index.
//   parentCharset - charset inherited from the enclosing entity; replaced
//                   by the charset parameter of the Content-Type of this
//                   entity when one is present.
// Embedded messages and every part of a multipart body are visited
// recursively; index_entity() does the actual indexing and its failure is
// returned as -1.
// NOTE(review): this listing is missing lines (braces, the switch head,
// break statements and the final return).
int Index::index_keywords(DwEntity& e, const string& parentCharset)
592
string charset = parentCharset;
594
if (e.Headers().HasContentType())
596
DwMediaType& mt = e.Headers().ContentType();
598
for (DwParameter* p = mt.FirstParameter(); p; p = p->Next())
600
DwString attr = p->Attribute();
601
attr.ConvertToLowerCase(); // case insens
602
if (attr == "charset") charset = p->Value().c_str();
606
// if (e.hasHeaders() &&
607
if (e.Headers().HasContentType())
609
DwMediaType& t = e.Headers().ContentType();
612
case DwMime::kTypeMessage:
613
// NOTE(review): the result of the recursive calls below is discarded,
// while a failing index_entity() is reported - confirm this is intended.
if (e.Body().Message())
614
index_keywords(*e.Body().Message(), charset);
617
case DwMime::kTypeMultipart:
618
// index all alternatives in multipart
619
for (DwBodyPart* p = e.Body().FirstBodyPart(); p != 0; p = p->Next())
620
index_keywords(*p, charset);
623
case DwMime::kTypeText:
624
if (t.Subtype() == DwMime::kSubtypePlain)
626
if (index_entity(e, charset) != 0) return -1;
633
// Second index_entity call: presumably the path for entities without a
// Content-Type header - confirm against the full source.
if (index_entity(e, charset) != 0) return -1;
639
// Entry point: run the indexing stages in order - author, id, summary,
// threading, control keywords, body keywords - and return -1 as soon as
// one of them reports a negative result.
//   userdate - forwarded to index_id().
//   envelope - forwarded to index_id() as its server timestamp.
//   import   - forwarded to index_control().
//   check    - forwarded to index_summary().
//   exist    - forwarded to index_summary() by reference.
// Body keywords are indexed starting from ISO-8859-1 as the parent charset.
// NOTE(review): lines are missing from this listing between the summary and
// threading stages and after the last stage (including the final return).
int Index::index(bool userdate, time_t envelope, time_t import, bool check, bool& exist)
643
// cout << message.Headers().Subject().AsString().c_str() << endl;
645
if (index_author() < 0) return -1;
646
if (index_id(userdate, envelope) < 0) return -1;
647
if (index_summary(check, exist) < 0) return -1;
651
if (index_threading( ) < 0) return -1;
652
if (index_control (import) < 0) return -1;
653
if (index_keywords (message, "ISO-8859-1") < 0) return -1;