1
/*************************************************************************************************
2
* Implementation of wavermod
3
* Copyright (C) 2004-2006 Mikio Hirabayashi
4
* This file is part of Hyper Estraier.
5
* Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6
* the GNU Lesser General Public License as published by the Free Software Foundation; either
7
* version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8
* that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10
* License for more details.
11
* You should have received a copy of the GNU Lesser General Public License along with Hyper
12
* Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13
* Boston, MA 02111-1307 USA.
14
*************************************************************************************************/
20
/* private function prototypes */
21
/* close the log file if it is open */
static void log_close(void);
22
/* forward the message of a DB event to the log at the LL_INFO level */
static void db_informer(const char *message, void *opaque);
23
/* skip the label of a configuration line; yields "" when the line has no ':' */
static const char *skiplabel(const char *str);
24
/* build a path from `rootdir' and `path' and return the result of est_realpath */
static char *makeabspath(const char *rootdir, const char *path);
25
/* comparator of queue keys handled as double values; keys of different sizes are ordered by size */
static int queue_compare(const char *aptr, int asiz, const char *bptr, int bsiz);
26
/* qsort comparator of KEYSC elements ordering them by `pt' in descending order */
static int keysc_compare(const void *ap, const void *bp);
27
/* create a document object from document draft */
static void make_doc_from_draft(const char *buf, int size, ESTDOC *doc, CBLIST *links);
28
/* create a document object from plain text */
static void make_doc_from_text(const char *buf, int size, const char *penc, int plang,
29
ESTDOC *doc, CBLIST *links);
30
/* a true result makes make_doc_from_text give up on the buffer */
static int check_binary(const char *buf, int size);
31
/* used by fetch_document when the selected filter command is HTMLCMD */
static void make_doc_from_html(const char *buf, int size, const char *penc, int plang,
32
ESTDOC *doc, CBLIST *links);
33
/* NOTE(review): definition not visible here; presumably a helper of the HTML handling -- confirm */
static char *html_enc(const char *str);
34
/* NOTE(review): definition not visible here; presumably a helper of the HTML handling -- confirm */
static char *html_raw_text(const char *html);
35
/* used by fetch_document when the selected filter command is MIMECMD */
static void make_doc_from_mime(const char *buf, int size, const char *penc, int plang,
36
ESTDOC *doc, CBLIST *links);
37
/* NOTE(review): definition not visible here; presumably a helper of the MIME handling -- confirm */
static void doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value);
38
/* used by fetch_document for any other non-empty filter command */
static void make_doc_with_xcmd(const char *xcmd, const char *url, const char *buf, int size,
39
const char *penc, int plang, ESTDOC *doc, CBLIST *links);
43
/*************************************************************************************************
45
*************************************************************************************************/
48
/* The handles of the log file. */
52
/* Level of logging. */
53
int log_level = LL_INFO;
56
/* Open the log file. */
57
int log_open(const char *rootdir, const char *path, int level, int trunc){
58
char mypath[URIBUFSIZ];
59
assert(rootdir && path);
61
if(log_fp) return TRUE;
62
if((ESTPATHCHR == '/' && path[0] == ESTPATHCHR) ||
63
(ESTPATHCHR == '\\' && ((path[0] >= 'A' && path[0] <= 'Z') ||
64
(path[0] >= 'a' && path[0] <= 'z')) && path[1] == ':' &&
66
sprintf(mypath, "%s", path);
68
sprintf(mypath, "%s%c%s", rootdir, ESTPATHCHR, path);
70
if(!(log_fp = fopen(mypath, trunc ? "wb" : "ab"))) return FALSE;
71
if(level == LL_CHECK){
81
/* Print formatted string into the log file. */
82
void log_print(int level, const char *format, ...){
83
static pthread_mutex_t mymutex = PTHREAD_MUTEX_INITIALIZER;
87
if(level < log_level) return;
88
if(pthread_mutex_lock(&mymutex) != 0) return;
92
case LL_DEBUG: lvstr = "DEBUG"; break;
93
case LL_INFO: lvstr = "INFO"; break;
94
case LL_WARN: lvstr = "WARN"; break;
95
default: lvstr = "ERROR"; break;
97
date = cbdatestrwww(time(NULL), 0);
98
printf("%s\t%s\t", date, lvstr);
103
fprintf(log_fp, "%s\t%s\t", date, lvstr);
104
vfprintf(log_fp, format, aq);
111
pthread_mutex_unlock(&mymutex);
115
/* Rotate the log file. */
116
int log_rotate(const char *rootdir, const char *path){
118
char mypath[URIBUFSIZ], *wp, iobuf[IOBUFSIZ];
119
int err, year, month, day, hour, minute, second, len;
120
assert(rootdir && path);
121
if(!log_fp || fflush(log_fp) == -1) return FALSE;
124
if((ESTPATHCHR == '/' && path[0] == ESTPATHCHR) ||
125
(ESTPATHCHR == '\\' && ((path[0] >= 'A' && path[0] <= 'Z') ||
126
(path[0] >= 'a' && path[0] <= 'z')) && path[1] == ':' &&
128
wp += sprintf(wp, "%s", path);
130
wp += sprintf(wp, "%s%c%s", rootdir, ESTPATHCHR, path);
132
if(!(ifp = fopen(mypath, "rb"))) return FALSE;
133
cbcalendar(-1, 0, &year, &month, &day, &hour, &minute, &second);
134
sprintf(wp, "-%04d%02d%02d%02d%02d%02d", year, month, day, hour, minute, second);
135
if(!(ofp = fopen(mypath, "wb"))){
139
while((len = fread(iobuf, 1, IOBUFSIZ, ifp)) > 0){
140
fwrite(iobuf, 1, len, ofp);
142
if(fclose(ofp) == -1) err = TRUE;
143
if(fclose(ifp) == -1) err = TRUE;
144
if(fseek(log_fp, 0, SEEK_SET) == -1 || fflush(log_fp) == -1) err = TRUE;
145
if(ftruncate(fileno(log_fp), 0) == -1) err = TRUE;
146
return err ? FALSE : TRUE;
150
/* Initialize the root directory. */
151
int waver_init(const char *rootdir, int options){
157
char path[URIBUFSIZ];
158
int err, tracebnum, estopts, ecode;
160
if(est_mkdir(rootdir) == -1 && errno != EEXIST) return FALSE;
162
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, METAFILE);
163
if((metadb = dpopen(path, DP_OWRITER | DP_OCREAT | DP_OTRUNC, MINIBNUM))){
164
if(!dpput(metadb, MMKMAGIC, -1, MMKMAGVAL, -1, DP_DKEEP)) err = TRUE;
165
if(!dpclose(metadb)) err = TRUE;
169
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, CONFFILE);
170
if((ofp = fopen(path, "wb")) != NULL){
171
fprintf(ofp, "# seed documents (weight and URL)\n");
172
fprintf(ofp, "seed: 1.5|http://hyperestraier.sourceforge.net/uguide-en.html\n");
173
fprintf(ofp, "seed: 1.0|http://hyperestraier.sourceforge.net/pguide-en.html\n");
174
fprintf(ofp, "seed: 1.0|http://hyperestraier.sourceforge.net/nguide-en.html\n");
175
fprintf(ofp, "seed: 0.0|http://qdbm.sourceforge.net/\n");
177
fprintf(ofp, "# host name of the proxy\n");
178
fprintf(ofp, "proxyhost:\n");
180
fprintf(ofp, "# port number of the proxy\n");
181
fprintf(ofp, "proxyport:\n");
183
fprintf(ofp, "# waiting interval of each request (in milliseconds)\n");
184
fprintf(ofp, "interval: 500\n");
186
fprintf(ofp, "# timeout of each request (in seconds)\n");
187
fprintf(ofp, "timeout: 30\n");
189
fprintf(ofp, "# strategy of crawling path"
190
" (0:balanced, 1:similarity, 2:depth, 3:width, 4:random)\n");
191
fprintf(ofp, "strategy: %d\n", CS_BALANCED);
193
fprintf(ofp, "# inheritance ratio of similarity from the parent\n");
194
fprintf(ofp, "inherit: 0.4\n");
196
fprintf(ofp, "# maximum depth of seed documents\n");
197
fprintf(ofp, "seeddepth: 0\n");
199
fprintf(ofp, "# maximum depth of recursion\n");
200
fprintf(ofp, "maxdepth: 20\n");
202
fprintf(ofp, "# standard value for checking mass sites\n");
203
fprintf(ofp, "masscheck: 500\n");
205
fprintf(ofp, "# maximum number of records of the priority queue\n");
206
fprintf(ofp, "queuesize: 50000\n");
208
fprintf(ofp, "# regular expressions and replacement strings to normalize URLs\n");
209
fprintf(ofp, "replace: ^http://127.0.0.1/{{!}}http://localhost/\n");
211
fprintf(ofp, "# allowing regular expressions of URLs to be visited\n");
212
fprintf(ofp, "allowrx: ^http://\n");
214
fprintf(ofp, "# denying regular expressions of URLs to be visited\n");
215
fprintf(ofp, "denyrx: \\.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$\n");
216
fprintf(ofp, "denyrx: \\.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$\n");
217
fprintf(ofp, "denyrx: \\.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$\n");
218
fprintf(ofp, "denyrx: \\.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$\n");
219
fprintf(ofp, "denyrx: \\.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)(\\?.*)?$\n");
220
fprintf(ofp, "denyrx: \\.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)(\\?.*)?$\n");
221
fprintf(ofp, "denyrx: \\.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)(\\?.*)?$\n");
222
fprintf(ofp, "denyrx: \\.(rpm|deb|qdb|qdb|dbx|dbf|dat|msi|bat|com|iso)(\\?.*)?$\n");
223
fprintf(ofp, "denyrx: \\.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$\n");
224
fprintf(ofp, "denyrx: \\.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$\n");
225
fprintf(ofp, "denyrx: \\.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$\n");
226
fprintf(ofp, "denyrx: (/core$|/core\\.[0-9]*$|/casket/)\n");
227
fprintf(ofp, "denyrx: ://(localhost|[a-z]*\\.localdomain|127\\.0\\.0\\.1)/\n");
229
fprintf(ofp, "# denying regular expressions of URLs to be indexed\n");
230
fprintf(ofp, "noidxrx: /\\?[a-z]=[a-z](;|$)\n");
232
fprintf(ofp, "# URL rules (regular expressions and media types)\n");
233
fprintf(ofp, "urlrule: \\.est${{!}}text/x-estraier-draft\n");
234
fprintf(ofp, "urlrule: \\.(eml|mime|mht|mhtml)${{!}}message/rfc822\n");
236
fprintf(ofp, "# media type rules (regular expressions and filter commands)\n");
237
fprintf(ofp, "typerule: ^text/x-estraier-draft${{!}}%s\n", DRAFTCMD);
238
fprintf(ofp, "typerule: ^text/plain${{!}}%s\n", TEXTCMD);
239
fprintf(ofp, "typerule: ^(text/html|application/xhtml+xml)${{!}}%s\n", HTMLCMD);
240
fprintf(ofp, "typerule: ^message/rfc822${{!}}%s\n", MIMECMD);
242
fprintf(ofp, "# preferred language (0:English, 1:Japanese, 2:Chinese, 3:Korean, 4:misc)\n");
243
fprintf(ofp, "language: 0\n");
245
fprintf(ofp, "# text size limitation (in kilobytes)\n");
246
fprintf(ofp, "textlimit: 128\n");
248
fprintf(ofp, "# total number of keywords for seed documents\n");
249
fprintf(ofp, "seedkeynum: 256\n");
251
fprintf(ofp, "# number of keywords saved for each document\n");
252
fprintf(ofp, "savekeynum: 32\n");
254
fprintf(ofp, "# number of threads running in parallel\n");
255
fprintf(ofp, "threadnum: 10\n");
257
fprintf(ofp, "# number of documents to collect\n");
258
fprintf(ofp, "docnum: 10000\n");
260
fprintf(ofp, "# running time period (in s:seconds, m:minutes, h:hours, d:days)\n");
261
fprintf(ofp, "period: 10000s\n");
263
fprintf(ofp, "# revisit span (in s:seconds, m:minutes, h:hours, d:days)\n");
264
fprintf(ofp, "revisit: 7d\n");
266
fprintf(ofp, "# maximum size of the index cache (in megabytes)\n");
267
fprintf(ofp, "cachesize: 256\n");
269
fprintf(ofp, "# remote nodes for alternative indexes (ID number and URL)\n");
270
fprintf(ofp, "#nodeserv: 1|http://admin:admin@localhost:1978/node/node1\n");
271
fprintf(ofp, "#nodeserv: 2|http://admin:admin@localhost:1978/node/node2\n");
272
fprintf(ofp, "#nodeserv: 3|http://admin:admin@localhost:1978/node/node3\n");
274
fprintf(ofp, "# path of the log file (relative path or absolute path)\n");
275
fprintf(ofp, "logfile: %s\n", LOGFILE);
277
fprintf(ofp, "# logging level (1:debug, 2:information, 3:warning, 4:error, 5:none)\n");
278
fprintf(ofp, "loglevel: 2\n");
280
fprintf(ofp, "# path of the draft directory (relative path or absolute path)\n");
281
fprintf(ofp, "draftdir:\n");
283
fprintf(ofp, "# path of the entity directory (relative path or absolute path)\n");
284
fprintf(ofp, "entitydir:\n");
286
fprintf(ofp, "# postprocessor for retrieved files\n");
287
fprintf(ofp, "postproc:\n");
289
if(fclose(ofp) == EOF) err = TRUE;
293
tracebnum = TRACEBNUM;
295
if(options & WI_SMALL){
296
tracebnum = TRACEBNUM / 2;
297
estopts = ESTDBSMALL;
298
} else if(options & WI_LARGE){
299
tracebnum = TRACEBNUM * 2;
300
estopts = ESTDBLARGE;
301
} else if(options & WI_HUGE){
302
tracebnum = TRACEBNUM * 4;
305
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, QUEUEFILE);
307
if((queue = queue_open(path)) != NULL){
308
if(!queue_close(queue)) err = TRUE;
312
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, TRACEFILE);
313
if((trace = cropen(path, CR_OWRITER | CR_OCREAT | CR_OTRUNC, tracebnum, TRACEDNUM)) != NULL){
314
if(!crclose(trace)) err = TRUE;
318
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, INDEXDIR);
319
if((index = est_mtdb_open(path, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC | estopts,
321
if(!est_mtdb_add_attr_index(index, ESTDATTRURI, ESTIDXATTRSTR)) err = TRUE;
322
if(!est_mtdb_add_attr_index(index, ESTDATTRTITLE, ESTIDXATTRSTR)) err = TRUE;
323
if(!est_mtdb_add_attr_index(index, ESTDATTRMDATE, ESTIDXATTRSEQ)) err = TRUE;
324
if(!est_mtdb_close(index, &ecode)) err = TRUE;
328
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, MYTMPDIR);
330
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, LOGFILE);
331
if((ofp = fopen(path, "wb")) != NULL){
332
if(fclose(ofp) == EOF) err = TRUE;
336
return err ? FALSE : TRUE;
340
/* Open a waver handle. */
341
WAVER *waver_open(const char *rootdir){
353
const char *rp, *pv, *logfile;
354
char path[URIBUFSIZ], *tmp;
355
int i, ecode, loglevel, num;
358
if(stat(rootdir, &sbuf) == -1) return NULL;
359
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, CONFFILE);
360
lines = cbreadlines(path);
361
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, METAFILE);
362
metadb = dpopen(path, DP_OWRITER, -1);
363
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, QUEUEFILE);
364
queue = queue_open(path);
365
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, TRACEFILE);
366
trace = cropen(path, CR_OWRITER, -1, -1);
367
sprintf(path, "%s%c%s", rootdir, ESTPATHCHR, INDEXDIR);
368
index = est_mtdb_open(path, ESTDBWRITER, &ecode);
369
if(!lines || !metadb || !queue || !trace || !index){
370
if(index) est_mtdb_close(index, &ecode);
371
if(trace) crclose(trace);
372
if(queue) queue_close(queue);
373
if(metadb) dpclose(metadb);
374
if(lines) cblistclose(lines);
377
waver = cbmalloc(sizeof(WAVER));
378
waver->rootdir = cbmemdup(rootdir, -1);
379
waver->metadb = metadb;
380
waver->queue = queue;
381
waver->trace = trace;
382
waver->index = index;
383
waver->seeds = cbmapopen();
384
waver->kwords = cbmapopen();
385
waver->sites = cbmapopen();
386
waver->pxhost = NULL;
390
waver->strategy = CS_BALANCED;
391
waver->inherit = 0.0;
392
waver->seeddepth = 0;
393
waver->maxdepth = INT_MAX;
394
waver->masscheck = INT_MAX;
395
waver->queuesize = INT_MAX;
396
waver->unrules = cblistopen();
397
waver->pmrules = cblistopen();
398
waver->urlrules = cblistopen();
399
waver->mtrules = cblistopen();
400
waver->language = ESTLANGEN;
401
waver->textlimit = 0;
402
waver->seedkeynum = 0;
403
waver->savekeynum = 0;
408
waver->cachesize = 0;
409
waver->nodes = cbmapopenex(MINIBNUM);
410
waver->draftdir = NULL;
411
waver->entitydir = NULL;
412
waver->postproc = NULL;
413
waver->stime = time(NULL);
416
waver->minload = 1.0;
419
for(i = 0; i < cblistnum(lines); i++){
420
rp = cblistval(lines, i, NULL);
421
if(cbstrfwimatch(rp, "seed:")){
423
if((pv = strchr(rp, '|')) != NULL){
424
tmp = cburlresolve(pv + 1, "");
425
cbmapput(waver->seeds, tmp, -1, rp, pv - rp, FALSE);
428
} else if(cbstrfwimatch(rp, "proxyhost:")){
430
if(rp[0] != '\0') waver->pxhost = cbmemdup(rp, -1);
431
} else if(cbstrfwimatch(rp, "proxyport:")){
433
if(rp[0] != '\0') waver->pxport = atoi(rp);
434
} else if(cbstrfwimatch(rp, "interval:")){
435
waver->interval = atoi(skiplabel(rp));
436
} else if(cbstrfwimatch(rp, "timeout:")){
437
waver->timeout = atoi(skiplabel(rp));
438
} else if(cbstrfwimatch(rp, "strategy:")){
439
waver->strategy = atoi(skiplabel(rp));
440
} else if(cbstrfwimatch(rp, "inherit:")){
441
waver->inherit = strtod(skiplabel(rp), NULL);
442
} else if(cbstrfwimatch(rp, "seeddepth:")){
443
waver->seeddepth = atoi(skiplabel(rp));
444
} else if(cbstrfwimatch(rp, "maxdepth:")){
445
waver->maxdepth = atoi(skiplabel(rp));
446
} else if(cbstrfwimatch(rp, "masscheck:")){
447
waver->masscheck = atoi(skiplabel(rp));
448
} else if(cbstrfwimatch(rp, "queuesize:")){
449
waver->queuesize = atoi(skiplabel(rp));
450
} else if(cbstrfwimatch(rp, "replace:")){
452
if((pv = strstr(rp, "{{!}}")) != NULL){
453
tmp = cbmemdup(rp, pv - rp);
454
if((unrule.regex = est_regex_new(tmp)) != NULL){
456
unrule.after = cbmemdup(pv + 5, -1);
457
cblistpush(waver->unrules, (char *)&unrule, sizeof(UNRULE));
462
} else if(cbstrfwimatch(rp, "allowrx:")){
463
tmp = cbsprintf("*I:%s", skiplabel(rp));
464
if((pmrule.regex = est_regex_new(tmp)) != NULL){
467
cblistpush(waver->pmrules, (char *)&pmrule, sizeof(PMRULE));
470
} else if(cbstrfwimatch(rp, "denyrx:")){
471
tmp = cbsprintf("*I:%s", skiplabel(rp));
472
if((pmrule.regex = est_regex_new(tmp)) != NULL){
475
cblistpush(waver->pmrules, (char *)&pmrule, sizeof(PMRULE));
478
} else if(cbstrfwimatch(rp, "noidxrx:")){
479
tmp = cbsprintf("*I:%s", skiplabel(rp));
480
if((pmrule.regex = est_regex_new(tmp)) != NULL){
483
cblistpush(waver->pmrules, (char *)&pmrule, sizeof(PMRULE));
486
} else if(cbstrfwimatch(rp, "urlrule:")){
488
if((pv = strstr(rp, "{{!}}")) != NULL){
489
tmp = cbmemdup(rp, pv - rp);
490
if((urlrule.regex = est_regex_new(tmp)) != NULL){
491
urlrule.type = cbmemdup(pv + 5, -1);
492
cblistpush(waver->urlrules, (char *)&urlrule, sizeof(URLRULE));
496
} else if(cbstrfwimatch(rp, "typerule:")){
498
if((pv = strstr(rp, "{{!}}")) != NULL){
499
tmp = cbmemdup(rp, pv - rp);
500
if((mtrule.regex = est_regex_new(tmp)) != NULL){
501
mtrule.filter = cbmemdup(pv + 5, -1);
502
cblistpush(waver->mtrules, (char *)&mtrule, sizeof(MTRULE));
506
} else if(cbstrfwimatch(rp, "language:")){
507
waver->language = atoi(skiplabel(rp));
508
} else if(cbstrfwimatch(rp, "textlimit:")){
509
waver->textlimit = atoi(skiplabel(rp)) * 1024;
510
} else if(cbstrfwimatch(rp, "seedkeynum:")){
511
waver->seedkeynum = atoi(skiplabel(rp));
512
} else if(cbstrfwimatch(rp, "savekeynum:")){
513
waver->savekeynum = atoi(skiplabel(rp));
514
} else if(cbstrfwimatch(rp, "threadnum:")){
515
waver->thnum = atoi(skiplabel(rp));
516
} else if(cbstrfwimatch(rp, "docnum:")){
517
waver->docnum = atoi(skiplabel(rp));
518
} else if(cbstrfwimatch(rp, "period:")){
520
waver->period = atoi(rp);
521
if(cbstrbwimatch(rp, "m")){
523
} else if(cbstrbwimatch(rp, "h")){
524
waver->period *= 3600;
525
} else if(cbstrbwimatch(rp, "d")){
526
waver->period *= 86400;
528
} else if(cbstrfwimatch(rp, "revisit:")){
530
waver->revisit = atoi(rp);
531
if(cbstrbwimatch(rp, "m")){
532
waver->revisit *= 60;
533
} else if(cbstrbwimatch(rp, "h")){
534
waver->revisit *= 3600;
535
} else if(cbstrbwimatch(rp, "d")){
536
waver->revisit *= 86400;
538
} else if(cbstrfwimatch(rp, "cachesize:")){
539
waver->cachesize = atoi(skiplabel(rp)) * 1024 * 1024;
540
} else if(cbstrfwimatch(rp, "nodeserv:")){
542
if((pv = strchr(rp, '|')) != NULL){
545
if(num > 0 && !cbmapget(waver->nodes, (char *)&num, sizeof(int), NULL)){
546
node = est_node_new(pv);
547
cbmapput(waver->nodes, (char *)&num, sizeof(int),
548
(char *)&node, sizeof(ESTNODE *), FALSE);
551
} else if(cbstrfwimatch(rp, "logfile:")){
552
logfile = skiplabel(rp);
553
} else if(cbstrfwimatch(rp, "loglevel:")){
554
loglevel = atoi(skiplabel(rp));
555
} else if(cbstrfwimatch(rp, "draftdir:")){
557
if(rp[0] != '\0' && !waver->draftdir) waver->draftdir = makeabspath(rootdir, rp);
558
} else if(cbstrfwimatch(rp, "entitydir:")){
560
if(rp[0] != '\0' && !waver->entitydir) waver->entitydir = makeabspath(rootdir, rp);
561
} else if(cbstrfwimatch(rp, "postproc:")){
563
if(rp[0] != '\0' && !waver->postproc) waver->postproc = cbmemdup(rp, -1);
566
if(!log_open(rootdir, logfile, loglevel, FALSE)){
571
if(waver->pxport < 1) waver->pxport = 80;
572
if(waver->seeddepth < 0) waver->seeddepth = 0;
573
if(waver->maxdepth < 0) waver->maxdepth = 0;
574
if(waver->masscheck < 1) waver->masscheck = 1;
575
if(waver->queuesize < 1) waver->queuesize = 1;
576
if(waver->textlimit < 0) waver->textlimit = 0;
577
if(waver->seedkeynum < 0) waver->seedkeynum = 0;
578
if(waver->savekeynum < 0) waver->savekeynum = 0;
579
if(waver->thnum < 1) waver->thnum = 1;
580
if(waver->period < 1) waver->period = 1;
581
if(waver->revisit < 1) waver->revisit = 1;
582
if(waver->cachesize < 1) waver->cachesize = 1;
584
est_mtdb_set_informer(index, db_informer, NULL);
585
est_mtdb_set_cache_size(index, waver->cachesize, -1, -1, -1);
590
/* Close a waver handle. */
591
int waver_close(WAVER *waver){
601
free(waver->postproc);
602
free(waver->entitydir);
603
free(waver->draftdir);
604
cbmapiterinit(waver->nodes);
605
while((kbuf = cbmapiternext(waver->nodes, NULL)) != NULL){
606
node = *(ESTNODE **)cbmapget(waver->nodes, kbuf, sizeof(int), NULL);
607
est_node_delete(node);
609
cbmapclose(waver->nodes);
610
for(i = 0; i < cblistnum(waver->mtrules); i++){
611
mtrule = (MTRULE *)cblistval(waver->mtrules, i, NULL);
612
est_regex_delete(mtrule->regex);
613
free(mtrule->filter);
615
cblistclose(waver->mtrules);
616
for(i = 0; i < cblistnum(waver->urlrules); i++){
617
urlrule = (URLRULE *)cblistval(waver->urlrules, i, NULL);
618
est_regex_delete(urlrule->regex);
621
cblistclose(waver->urlrules);
622
for(i = 0; i < cblistnum(waver->pmrules); i++){
623
pmrule = (PMRULE *)cblistval(waver->pmrules, i, NULL);
624
est_regex_delete(pmrule->regex);
626
cblistclose(waver->pmrules);
627
for(i = 0; i < cblistnum(waver->unrules); i++){
628
unrule = (UNRULE *)cblistval(waver->unrules, i, NULL);
629
est_regex_delete(unrule->regex);
630
free(unrule->before);
633
cblistclose(waver->unrules);
635
cbmapclose(waver->sites);
636
cbmapclose(waver->kwords);
637
cbmapclose(waver->seeds);
638
if(!est_mtdb_close(waver->index, &ecode)) err = TRUE;
639
if(!crclose(waver->trace)) err = TRUE;
640
if(!queue_close(waver->queue)) err = TRUE;
641
if(!dpclose(waver->metadb)) err = TRUE;
642
free(waver->rootdir);
644
return err ? FALSE : TRUE;
648
/* Set the current node. */
649
void waver_set_current_node(WAVER *waver){
656
cbmapiterinit(waver->nodes);
657
waver->minload = 1.0;
658
for(i = 0; i < 600; i++){
659
while((kbuf = cbmapiternext(waver->nodes, NULL)) != NULL){
660
node = *(ESTNODE **)cbmapget(waver->nodes, kbuf, sizeof(int), NULL);
661
if((ratio = est_node_cache_usage(node)) >= 0.0 && ratio <= waver->minload){
662
waver->curnode = *(int *)kbuf;
663
waver->minload = ratio;
666
if(waver->minload < 1.0){
667
cbmapmove(waver->nodes, (char *)&(waver->curnode), sizeof(int), TRUE);
670
est_usleep(1000 * 1000);
675
/* Get the load of the current node. */
676
double waver_current_node_load(WAVER *waver){
679
if(!(node = *(ESTNODE **)cbmapget(waver->nodes, (char *)&(waver->curnode), sizeof(int), NULL)))
681
ratio = est_node_cache_usage(node);
682
return ratio > 0.0 ? ratio : 1.0;
686
/* Add a document to a node. */
687
int waver_node_put_doc(WAVER *waver, ESTDOC *doc, int *codep){
690
assert(waver && doc);
691
if(!(node = *(ESTNODE **)cbmapget(waver->nodes, (char *)&(waver->curnode), sizeof(int), NULL))){
692
if(codep) *codep = -1;
695
rv = est_node_put_doc(node, doc);
696
if(codep) *codep = est_node_status(node);
701
/* Remove a document from a node. */
702
int waver_node_out_doc(WAVER *waver, const char *url, int *codep){
707
assert(waver && url);
709
if((vbuf = crget(waver->trace, url, -1, 0, -1, NULL)) != NULL){
710
if((rp = strchr(vbuf, '#')) != NULL) nid = atoi(rp + 1);
713
if(nid < 1 || !(node = *(ESTNODE **)cbmapget(waver->nodes, (char *)&nid, sizeof(int), NULL))){
714
if(codep) *codep = -1;
717
rv = est_node_out_doc_by_uri(node, url);
718
if(codep) *codep = est_node_status(node);
723
/* Open a priority queue. */
724
QUEUE *queue_open(const char *name){
731
vomode = VL_OWRITER | VL_OCREAT;
734
} else if(ESTUSELZO){
736
} else if(ESTUSEZLIB){
739
if(!(db = vlopen(name, vomode, queue_compare))) return NULL;
740
vlsettuning(db, QUEUELRM, QUEUENIM, QUEUELCN, QUEUENCN);
743
if((vbuf = vlcurkeycache(db, &vsiz)) != NULL && vsiz == sizeof(double))
744
max = *(double *)vbuf;
745
if(max < 1.0) max = 1.0;
746
queue = cbmalloc(sizeof(QUEUE));
753
/* Close a priority queue. */
754
int queue_close(QUEUE *queue){
758
if(!vlclose(queue->db)) err = TRUE;
760
return err ? FALSE : TRUE;
764
/* Set the range of the priority space of a priority queue. */
765
void queue_set_range(QUEUE *queue, double range){
768
if(queue->max < 1.0) queue->max = 1.0;
772
/* Enqueue a record into a priority queue. */
773
int queue_enqueue(QUEUE *queue, const char *str, double priority){
775
assert(queue && str);
776
if(priority < 0.0) priority = 0.0;
777
if(priority > 1.0) priority = 1.0;
778
priority *= queue->max;
780
if(!vlput(queue->db, (char *)&priority, sizeof(double), str, -1, VL_DDUP)) err = TRUE;
781
return err ? FALSE : TRUE;
785
/* Dequeue a record from a priority queue. */
786
char *queue_dequeue(QUEUE *queue){
789
vlcurfirst(queue->db);
790
if(!(vbuf = vlcurval(queue->db, NULL))) return NULL;
796
/* Get the number of records in a priority queue. */
797
int queue_rnum(QUEUE *queue){
799
return vlrnum(queue->db);
803
/* Discard inferior records in a priority queue. */
804
int queue_slim(QUEUE *queue, int num){
806
assert(queue && num >= 0);
807
if((diff = vlrnum(queue->db) - num) < 1) return TRUE;
808
vlcurlast(queue->db);
809
for(i = 1; i < diff; i++){
810
vlcurprev(queue->db);
813
if(!vlcurout(queue->db)) break;
815
return vlrnum(queue->db) == num && !vlfatalerror(queue->db);
819
/* Add a word to a keyword map. */
820
void kwords_add(CBMAP *kwords, const char *word, int frequency){
822
char numbuf[NUMBUFSIZ];
824
assert(kwords && word && frequency >= 0);
826
if((vbuf = cbmapget(kwords, word, wlen, NULL)) != NULL) frequency += atoi(vbuf);
827
nlen = sprintf(numbuf, "%d", frequency);
828
cbmapput(kwords, word, wlen, numbuf, nlen, TRUE);
832
/* Reduce elements of a keyword map. */
833
void kwords_reduce(CBMAP *kwords, int num, int fadeout){
836
char numbuf[NUMBUFSIZ];
839
assert(kwords && num >= 0);
841
snum = cbmaprnum(kwords);
842
scores = cbmalloc(snum * sizeof(KEYSC) + 1);
843
cbmapiterinit(kwords);
844
for(i = 0; i < snum; i++){
845
vbuf = cbmapiternext(kwords, &vsiz);
846
scores[i].word = vbuf;
847
scores[i].wsiz = vsiz;
848
scores[i].pt = atoi(cbmapget(kwords, vbuf, vsiz, NULL));
850
qsort(scores, snum, sizeof(KEYSC), keysc_compare);
851
basis = num * 1.1 + 1.0;
852
for(i = 0; i < snum; i++){
854
vsiz = sprintf(numbuf, "%d",
855
fadeout ? (int)(scores[i].pt * (basis - i) / basis) : scores[i].pt);
856
cbmapput(kwords, scores[i].word, scores[i].wsiz, numbuf, vsiz, TRUE);
857
cbmapmove(kwords, scores[i].word, scores[i].wsiz, FALSE);
859
cbmapout(kwords, scores[i].word, scores[i].wsiz);
866
/* Fetch a document of a URL. */
867
int fetch_document(const char *url, const char *pxhost, int pxport, int outsec, time_t mdate,
868
const CBLIST *urlrules, const CBLIST *mtrules,
869
int *codep, CBDATUM *raw, CBMAP *heads,
870
CBLIST *links, const CBLIST *unrules, ESTDOC *doc, int lang){
878
const char *vbuf, *cmd;
879
char *dstr, *tbuf, *type, *enc, *pv;
886
rbuf = cbdatumopen(NULL, -1);
892
rheads = cbmapopenex(MINIBNUM);
898
rdoc = est_doc_new();
901
reqheads = cblistopen();
903
dstr = cbdatestrhttp(mdate, 0);
904
tbuf = cbsprintf("If-Modified-Since: %s", dstr);
905
cblistpush(reqheads, tbuf, -1);
911
cblistpush(reqheads, "Accept-Language: en,ja", -1);
914
cblistpush(reqheads, "Accept-Language: ja,en", -1);
917
cblistpush(reqheads, "Accept-Language: zh,en", -1);
920
cblistpush(reqheads, "Accept-Language: ko,en", -1);
923
if(!est_url_shuttle(url, pxhost, pxport, outsec, RESLIMSIZE, NULL, reqheads, NULL, 0,
924
&rescode, heads, raw) || rescode != 200){
925
if(links && (rescode == 301 || rescode == 302) &&
926
(vbuf = cbmapget(heads, "location", -1, NULL)) != NULL){
927
tbuf = cburlresolve(url, vbuf);
928
cblistpush(links, tbuf, -1);
931
cblistclose(reqheads);
932
if(rdoc) est_doc_delete(rdoc);
933
if(rheads) cbmapclose(rheads);
934
if(rbuf) cbdatumclose(rbuf);
935
if(codep) *codep = rescode;
939
for(i = 0; i < cblistnum(urlrules); i++){
940
urlrule = (URLRULE *)cblistval(urlrules, i, NULL);
941
if(est_regex_match(urlrule->regex, url)){
942
cbmapput(heads, "content-type", -1, urlrule->type, -1, TRUE);
947
if(!(vbuf = cbmapget(heads, "content-type", -1, NULL))) vbuf = "text/plain";
948
type = cbmemdup(vbuf, -1);
949
if((pv = strchr(type, ';')) != NULL) *pv = '\0';
952
if((pv = strstr(vbuf, "charset=")) != NULL || (pv = strstr(vbuf, "CHARSET=")) != NULL){
953
pv = strchr(pv, '=') + 1;
955
enc = cbmemdup(pv, -1);
956
if((pv = strchr(enc, '"')) != NULL) *pv = '\0';
960
for(i = 0; i < cblistnum(mtrules); i++){
961
mtrule = (MTRULE *)cblistval(mtrules, i, NULL);
962
if(est_regex_match(mtrule->regex, type)){
963
cmd = mtrule->filter;
967
} else if(!strcmp(type, "text/plain")){
969
} else if(!strcmp(type, "text/html") || !strcmp(vbuf, "application/xhtml+xml")){
971
} else if(!strcmp(type, "message/rfc822")){
974
if(!strcmp(cmd, DRAFTCMD)){
975
make_doc_from_draft(cbdatumptr(raw), cbdatumsize(raw), doc, links);
976
if(!est_doc_attr(doc, ESTDATTRURI)) est_doc_add_attr(doc, ESTDATTRURI, url);
977
} else if(!strcmp(cmd, TEXTCMD)){
978
make_doc_from_text(cbdatumptr(raw), cbdatumsize(raw), enc, lang, doc, links);
979
est_doc_add_attr(doc, ESTDATTRURI, url);
980
} else if(!strcmp(cmd, HTMLCMD)){
981
make_doc_from_html(cbdatumptr(raw), cbdatumsize(raw), enc, lang, doc, links);
982
est_doc_add_attr(doc, ESTDATTRURI, url);
983
} else if(!strcmp(cmd, MIMECMD)){
984
make_doc_from_mime(cbdatumptr(raw), cbdatumsize(raw), enc, lang, doc, links);
985
est_doc_add_attr(doc, ESTDATTRURI, url);
986
} else if(cmd[0] != '\0'){
987
make_doc_with_xcmd(cmd, url, cbdatumptr(raw), cbdatumsize(raw), enc, lang, doc, links);
988
if(!est_doc_attr(doc, ESTDATTRURI)) est_doc_add_attr(doc, ESTDATTRURI, url);
992
if((vbuf = cbmapget(heads, "last-modified", -1, NULL)) != NULL)
993
est_doc_add_attr(doc, ESTDATTRMDATE, vbuf);
995
for(i = 0; i < cblistnum(links); i++){
996
tbuf = cburlresolve(url, cblistval(links, i, NULL));
998
for(j = 0; j < cblistnum(unrules); j++){
999
unrule = (UNRULE *)cblistval(unrules, j, NULL);
1000
if(!est_regex_match(unrule->regex, tbuf)) continue;
1001
dstr = est_regex_replace(tbuf, unrule->before, unrule->after);
1006
if(cbstrfwmatch(tbuf, "http://")){
1008
if((pv = strchr(pv, '/')) != NULL && cbstrfwmatch(pv, "/%E2%80%BE")){
1010
memmove(pv + 2, pv + 10, strlen(pv + 10) + 1);
1012
cblistover(links, i, tbuf, -1);
1017
cblistclose(reqheads);
1018
if(rdoc) est_doc_delete(rdoc);
1019
if(rheads) cbmapclose(rheads);
1020
if(rbuf) cbdatumclose(rbuf);
1021
if(codep) *codep = rescode;
1027
/*************************************************************************************************
1029
*************************************************************************************************/
1032
/* Close the log file. */
1033
static void log_close(void){
1034
if(log_fp) fclose(log_fp);
1038
/* Output the log message of a DB event.
1039
`message' specifies the log message of a DB event.
1040
`opaque' is simply ignored. */
1041
static void db_informer(const char *message, void *opaque){
1043
log_print(LL_INFO, "DB-EVENT: %s", message);
1047
/* Skip the label of a line.
1048
`str' specifies a string of a line.
1049
The return value is the pointer to the value after the label, or "" if there is no label. */
1050
static const char *skiplabel(const char *str){
1052
if(!(str = strchr(str, ':'))) return "";
1054
while(*str != '\0' && (*str == ' ' || *str == '\t')){
1061
/* Make the absolute path of a relative path.
1062
`rootdir' specifies the path of the root directory.
1063
`path' specifies a relative path of a file. */
1064
static char *makeabspath(const char *rootdir, const char *path){
1065
char mypath[URIBUFSIZ];
1066
assert(rootdir && path);
1067
if((ESTPATHCHR == '/' && path[0] == ESTPATHCHR) ||
1068
(ESTPATHCHR == '\\' && ((path[0] >= 'A' && path[0] <= 'Z') ||
1069
(path[0] >= 'a' && path[0] <= 'z')) && path[1] == ':' &&
1071
sprintf(mypath, "%s", path);
1073
sprintf(mypath, "%s%c%s", rootdir, ESTPATHCHR, path);
1075
return est_realpath(mypath);
1079
/* Compare keys of two records as double type objects.
1080
`aptr' specifies the pointer to the region of one key.
1081
`asiz' specifies the size of the region of one key.
1082
`bptr' specifies the pointer to the region of the other key.
1083
`bsiz' specifies the size of the region of the other key.
1084
The return value is positive if the former is big, negative if the latter is big, 0 if both
1086
static int queue_compare(const char *aptr, int asiz, const char *bptr, int bsiz){
1088
assert(aptr && asiz >= 0 && bptr && bsiz >= 0);
1089
if(asiz != bsiz) return asiz - bsiz;
1090
anum = (asiz == sizeof(double) ? *(double *)aptr : INT_MIN);
1091
bnum = (bsiz == sizeof(double) ? *(double *)bptr : INT_MIN);
1092
if(anum > bnum) return 1;
1093
if(anum < bnum) return -1;
1098
/* Compare two keywords by scores in descending order.
1099
`ap' specifies the pointer to one keyword.
1100
`bp' specifies the pointer to the other keyword.
1101
The return value is negative if the former scores higher, positive if lower, 0 if equal. */
1102
static int keysc_compare(const void *ap, const void *bp){
1104
return ((KEYSC *)bp)->pt - ((KEYSC *)ap)->pt;
1108
/* Create a document object from docuemnt draft.
1109
`buf' specifies the pointer to a data buffer. It should be trailed by zero code.
1110
`size' specifies the size of the buffer.
1111
`doc' specifies a document handle to store attributes and texts.
1112
`link' specifies a list handle to store links. If it is `NULL', it is not used. */
1113
static void make_doc_from_draft(const char *buf, int size, ESTDOC *doc, CBLIST *links){
1118
assert(buf && size >= 0 && doc);
1119
lines = cbsplit(buf, -1, "\n");
1120
for(i = 0; i < CB_LISTNUM(lines); i++){
1121
line = CB_LISTVAL(lines, i);
1122
while(*line > '\0' && *line <= ' '){
1129
if(*line != '%' && (pv = strchr(line, '=')) != NULL){
1131
est_doc_add_attr(doc, line, pv);
1134
for(; i < CB_LISTNUM(lines); i++){
1135
line = CB_LISTVAL(lines, i);
1137
est_doc_add_hidden_text(doc, line + 1);
1139
est_doc_add_text(doc, line);
1146
/* Create a document object from plain text.
1147
`buf' specifies the pointer to a data buffer. It should be trailed by zero code.
1148
`size' specifies the size of the buffer.
1149
`penc' specifies the name of preferred encoding. If it is `NULL', it is not used.
1150
`plang' specifies the code of preferred language.
1151
`doc' specifies a document handle to store attributes and texts.
1152
`link' specifies a list handle to store links. If it is `NULL', it is not used. */
1153
static void make_doc_from_text(const char *buf, int size, const char *penc, int plang,
1154
ESTDOC *doc, CBLIST *links){
1157
const char *enc, *text, *line;
1158
char *nbuf, numbuf[NUMBUFSIZ];
1160
assert(buf && size >= 0 && doc);
1161
if(check_binary(buf, size)) return;
1162
enc = penc ? penc : est_enc_name(buf, size, plang);
1163
if(!strcmp(enc, "UTF-8")){
1168
nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
1169
if(nbuf) text = nbuf;
1171
lines = cbsplit(text, -1, "\n");
1172
CB_DATUMOPEN(datum);
1173
for(i = 0; i < CB_LISTNUM(lines); i++){
1174
line = CB_LISTVAL(lines, i);
1175
while(*line == ' ' || *line == '\t' || *line == '\r'){
1178
if(line[0] == '\0'){
1179
est_doc_add_text(doc, CB_DATUMPTR(datum));
1180
CB_DATUMSETSIZE(datum, 0);
1182
CB_DATUMCAT(datum, " ", 1);
1183
CB_DATUMCAT(datum, line, strlen(line));
1186
est_doc_add_text(doc, CB_DATUMPTR(datum));
1187
CB_DATUMCLOSE(datum);
1188
CB_LISTCLOSE(lines);
1189
est_doc_add_attr(doc, ESTDATTRTYPE, "text/plain");
1190
sprintf(numbuf, "%d", size);
1191
est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
1192
if(nbuf) free(nbuf);
1196
/* Check whether a buffer is binary.
1197
`buf' specifies the pointer to a data buffer. It should be trailed by zero code.
1198
`size' specifies the size of the buffer. */
1199
static int check_binary(const char *buf, int size){
1201
assert(buf && size >= 0);
1202
if(size < 32) return FALSE;
1204
if(!memcmp(buf, "%PDF-", 5)) return TRUE;
1206
if(!memcmp(buf, "%!PS-Adobe", 10)) return TRUE;
1207
/* generic binary */
1209
if(size >= 256) size = 256;
1211
for(i = 0; i < size; i++){
1213
if(buf[i+1] == 0x0 && buf[i+2] == 0x0 && buf[i+3] == 0x0 && buf[i+4] == 0x0) return TRUE;
1217
if(!bin) return FALSE;
1219
if(!memcmp(buf, "\x89PNG", 4)) return TRUE;
1221
if(!memcmp(buf, "GIF87a", 6)) return TRUE;
1223
if(!memcmp(buf, "GIF89a", 6)) return TRUE;
1225
if(!memcmp(buf, "\xff\xd8JFIF", 6)) return TRUE;
1227
if(!memcmp(buf, "MM\x00\x2a", 4)) return TRUE;
1228
/* TIFF(Motorola) */
1229
if(!memcmp(buf, "II\x2a\x00", 4)) return TRUE;
1231
if(!memcmp(buf, "BM", 2)) return TRUE;
1233
if(!memcmp(buf, "\x1f\x8b\x08", 3)) return TRUE;
1235
if(!memcmp(buf, "BZh", 3)) return TRUE;
1237
if(!memcmp(buf, "PK\x03\x04", 4)) return TRUE;
1239
if(!memcmp(buf, "ID3", 3)) return TRUE;
1241
if(((buf[0] * 0x100 + buf[1]) & 0xfffe) == 0xfffa) return TRUE;
1243
if(!memcmp(buf, "MThd", 4)) return TRUE;
1245
if(!memcmp(buf, "0xed0xab", 2)) return TRUE;
1246
/* Debian package */
1247
if(!memcmp(buf, "!<arch>\ndebian", 14)) return TRUE;
1249
if(!memcmp(buf, "\x7f\x45\x4c\x46", 4)) return TRUE;
1250
/* MS-DOS executable */
1251
if(!memcmp(buf, "MZ", 2)) return TRUE;
1253
if(!memcmp(buf, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) return TRUE;
1254
if(!memcmp(buf, "\xfe\x37\x00\x23", 4)) return TRUE;
1255
if(!memcmp(buf, "\xdb\xa5-\x00\x00\x00", 6)) return TRUE;
1260
/* Create a document object from HTML.
1261
`buf' specifies the pointer to a data buffer. It should be trailed by zero code.
1262
`size' specifies the size of the buffer.
1263
`penc' specifies the name of preferred encoding. If it is `NULL', it is not used.
1264
`plang' specifies the code of preferred language.
1265
`doc' specifies a document handle to store attributes and texts.
1266
`link' specifies a list handle to store links. If it is `NULL', it is not used. */
1267
static void make_doc_from_html(const char *buf, int size, const char *penc, int plang,
1268
ESTDOC *doc, CBLIST *links){
1272
const char *enc, *html, *elem, *next, *value, *name, *content, *rp;
1273
char *nbuf, *nenc, *rbuf, *lbuf, numbuf[NUMBUFSIZ];
1275
assert(buf && size >= 0 && doc);
1276
enc = est_enc_name(buf, size, plang);
1279
if(!strcmp(enc, "UTF-16") || !strcmp(enc, "UTF-16BE") || !strcmp(enc, "UTF-16LE")){
1280
nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
1281
} else if(!strcmp(enc, "US-ASCII")){
1284
if((nenc = penc ? cbmemdup(penc, -1) : html_enc(buf)) != NULL){
1285
if(cbstricmp(nenc, "UTF-8")){
1286
nbuf = est_iconv(buf, size, nenc, "UTF-8", NULL, NULL);
1287
if(!nbuf) nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
1291
nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
1294
if(nbuf) html = nbuf;
1295
if(!html) html = buf;
1296
CB_DATUMOPEN(datum);
1297
elems = cbxmlbreak(html, TRUE);
1298
for(i = 0; i < CB_LISTNUM(elems); i++){
1299
elem = CB_LISTVAL2(elems, i, esiz);
1300
if(!(next = cblistval(elems, i + 1, NULL))) next = "";
1302
if(cbstrfwimatch(elem, "<html")){
1303
attrs = cbxmlattrs(elem);
1304
value = cbmapget(attrs, "lang", -1, NULL);
1305
if(!value) value = cbmapget(attrs, "Lang", -1, NULL);
1306
if(!value) value = cbmapget(attrs, "LANG", -1, NULL);
1307
if(!value) value = cbmapget(attrs, "xml:lang", -1, NULL);
1308
if(value && value[0] != '\0') est_doc_add_attr(doc, ESTDATTRLANG, value);
1310
} else if(cbstrfwimatch(elem, "<meta")){
1311
attrs = cbxmlattrs(elem);
1312
name = cbmapget(attrs, "name", -1, NULL);
1313
if(!name) name = cbmapget(attrs, "Name", -1, NULL);
1314
if(!name) name = cbmapget(attrs, "NAME", -1, NULL);
1315
if(!name) name = cbmapget(attrs, "http-equiv", -1, NULL);
1316
if(!name) name = cbmapget(attrs, "Http-equiv", -1, NULL);
1317
if(!name) name = cbmapget(attrs, "Http-Equiv", -1, NULL);
1318
if(!name) name = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
1319
content = cbmapget(attrs, "content", -1, NULL);
1320
if(!content) content = cbmapget(attrs, "Content", -1, NULL);
1321
if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
1322
if(name && content){
1323
lbuf = cbmemdup(name, -1);
1326
if(!strcmp(lbuf, "author")){
1327
if(strchr(content, '&')){
1328
rbuf = html_raw_text(content);
1329
est_doc_add_attr(doc, ESTDATTRAUTHOR, rbuf);
1332
est_doc_add_attr(doc, ESTDATTRAUTHOR, content);
1334
} else if(!strcmp(lbuf, "refresh")){
1335
if(strchr(content, '&')){
1336
rbuf = html_raw_text(content);
1338
while((*rp >= '0' && *rp <= '9') || *rp == ' ' || *rp == '\t' || *rp == ';'){
1341
if(cbstrfwmatch(rp, "url=")) rp += 4;
1342
if(*rp != '\0') cblistpush(links, rp, -1);
1346
while((*rp >= '0' && *rp <= '9') || *rp == ' ' || *rp == '\t' || *rp == ';'){
1349
if(cbstrfwmatch(rp, "url=")) rp += 4;
1350
if(*rp != '\0') cblistpush(links, rp, -1);
1353
if(name[0] != '@' && name[0] != '_'){
1354
if(strchr(content, '&')){
1355
rbuf = html_raw_text(content);
1356
est_doc_add_attr(doc, lbuf, rbuf);
1359
est_doc_add_attr(doc, lbuf, content);
1365
} else if(cbstrfwimatch(elem, "<title") && next[0] != '\0' && next[0] != '<'){
1366
if(strchr(next, '&')){
1367
rbuf = html_raw_text(next);
1368
est_doc_add_attr(doc, ESTDATTRTITLE, rbuf);
1369
est_doc_add_hidden_text(doc, rbuf);
1372
est_doc_add_attr(doc, ESTDATTRTITLE, next);
1373
est_doc_add_hidden_text(doc, next);
1376
} else if(cbstrfwimatch(elem, "<style") || cbstrfwimatch(elem, "<script")){
1377
while((next = cblistval(elems, i + 1, NULL)) != NULL &&
1378
!(next[0] == '<' && next[1] != '!' && next[1] != ' ' && next[1] != '=')){
1381
} else if(cbstrfwimatch(elem, "<h1") || cbstrfwimatch(elem, "<h2") ||
1382
cbstrfwimatch(elem, "<h3") || cbstrfwimatch(elem, "<h4") ||
1383
cbstrfwimatch(elem, "<h5") || cbstrfwimatch(elem, "<h6") ||
1384
cbstrfwimatch(elem, "<p>") || cbstrfwimatch(elem, "<p ") ||
1385
cbstrfwimatch(elem, "<div") || cbstrfwimatch(elem, "<hr") ||
1386
cbstrfwimatch(elem, "<ul") || cbstrfwimatch(elem, "<ol") ||
1387
cbstrfwimatch(elem, "<dl") || cbstrfwimatch(elem, "<li") ||
1388
cbstrfwimatch(elem, "<dt") || cbstrfwimatch(elem, "<dd") ||
1389
cbstrfwimatch(elem, "<th") || cbstrfwimatch(elem, "<td") ||
1390
cbstrfwimatch(elem, "<pre")){
1391
if(strchr(CB_DATUMPTR(datum), '&')){
1392
rbuf = html_raw_text(CB_DATUMPTR(datum));
1393
est_doc_add_text(doc, rbuf);
1396
est_doc_add_text(doc, CB_DATUMPTR(datum));
1398
CB_DATUMSETSIZE(datum, 0);
1399
} else if(links && (cbstrfwimatch(elem, "<a") || cbstrfwimatch(elem, "<link"))){
1400
attrs = cbxmlattrs(elem);
1401
value = cbmapget(attrs, "href", -1, NULL);
1402
if(!value) value = cbmapget(attrs, "HREF", -1, NULL);
1403
if(value && !cbstrfwimatch(value, "https:") && !cbstrfwimatch(value, "ftp:") &&
1404
!cbstrfwimatch(value, "mailto:") && !cbstrfwimatch(value, "javascript:"))
1405
cblistpush(links, value, -1);
1407
} else if(links && cbstrfwimatch(elem, "<frame")){
1408
attrs = cbxmlattrs(elem);
1409
value = cbmapget(attrs, "src", -1, NULL);
1410
if(!value) value = cbmapget(attrs, "SRC", -1, NULL);
1411
if(value && !cbstrfwimatch(value, "https:") && !cbstrfwimatch(value, "ftp:") &&
1412
!cbstrfwimatch(value, "mailto:") && !cbstrfwimatch(value, "javascript:"))
1413
cblistpush(links, value, -1);
1415
} else if(links && cbstrfwimatch(elem, "<object")){
1416
attrs = cbxmlattrs(elem);
1417
value = cbmapget(attrs, "data", -1, NULL);
1418
if(!value) value = cbmapget(attrs, "DATA", -1, NULL);
1419
if(value && !cbstrfwimatch(value, "https:") && !cbstrfwimatch(value, "ftp:") &&
1420
!cbstrfwimatch(value, "mailto:") && !cbstrfwimatch(value, "javascript:"))
1421
cblistpush(links, value, -1);
1423
} else if(links && (cbstrfwimatch(elem, "<embed") || cbstrfwimatch(elem, "<iframe"))){
1424
attrs = cbxmlattrs(elem);
1425
value = cbmapget(attrs, "src", -1, NULL);
1426
if(!value) value = cbmapget(attrs, "SRC", -1, NULL);
1427
if(value && !cbstrfwimatch(value, "https:") && !cbstrfwimatch(value, "ftp:") &&
1428
!cbstrfwimatch(value, "mailto:") && !cbstrfwimatch(value, "javascript:"))
1429
cblistpush(links, value, -1);
1433
CB_DATUMCAT(datum, " ", 1);
1434
CB_DATUMCAT(datum, elem, esiz);
1437
CB_LISTCLOSE(elems);
1438
if(strchr(CB_DATUMPTR(datum), '&')){
1439
rbuf = html_raw_text(CB_DATUMPTR(datum));
1440
est_doc_add_text(doc, rbuf);
1443
est_doc_add_text(doc, CB_DATUMPTR(datum));
1445
CB_DATUMCLOSE(datum);
1446
if(nbuf) free(nbuf);
1447
est_doc_add_attr(doc, ESTDATTRTYPE, "text/html");
1448
sprintf(numbuf, "%d", size);
1449
est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
1453
/* Get the encoding of an HTML string.
1454
`str' specifies string of HTML.
1455
The return value is the name of the character encoding.
1456
Because the region of the return value is allocated with the `malloc' call, it should be
1457
released with the `free' call if it is no longer in use. */
1458
static char *html_enc(const char *str){
1461
const char *elem, *equiv, *content;
1465
elems = cbxmlbreak(str, TRUE);
1466
for(i = 0; i < CB_LISTNUM(elems); i++){
1467
elem = CB_LISTVAL(elems, i);
1468
if(elem[0] != '<' || !cbstrfwimatch(elem, "<meta")) continue;
1470
attrs = cbxmlattrs(elem);
1471
equiv = cbmapget(attrs, "http-equiv", -1, NULL);
1472
if(!equiv) equiv = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
1473
if(!equiv) equiv = cbmapget(attrs, "Http-Equiv", -1, NULL);
1474
if(!equiv) equiv = cbmapget(attrs, "Http-equiv", -1, NULL);
1475
if(equiv && !cbstricmp(equiv, "Content-Type")){
1476
content = cbmapget(attrs, "content", -1, NULL);
1477
if(!content) content = cbmapget(attrs, "Content", -1, NULL);
1478
if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
1479
if(content && ((pv = strstr(content, "charset")) != NULL ||
1480
(pv = strstr(content, "Charset")) != NULL ||
1481
(pv = strstr(content, "CHARSET")) != NULL)){
1482
enc = cbmemdup(pv + 8, -1);
1483
if((pv = strchr(enc, ';')) != NULL || (pv = strchr(enc, '\r')) != NULL ||
1484
(pv = strchr(enc, '\n')) != NULL || (pv = strchr(enc, ' ')) != NULL) *pv = '\0';
1489
CB_LISTCLOSE(elems);
1493
CB_LISTCLOSE(elems);
1498
/* Unescape entity references of HTML.
1499
`str' specifies string of HTML.
1500
The return value is the result string.
1501
Because the region of the return value is allocated with the `malloc' call, it should be
1502
released with the `free' call if it is no longer in use. */
1503
static char *html_raw_text(const char *html){
1504
static const char *pairs[] = {
1506
"&", "&", "<", "<", ">", ">", """, "\"", "'", "'",
1508
" ", "\xc2\xa0", "¡", "\xc2\xa1", "¢", "\xc2\xa2",
1509
"£", "\xc2\xa3", "¤", "\xc2\xa4", "¥", "\xc2\xa5",
1510
"¦", "\xc2\xa6", "§", "\xc2\xa7", "¨", "\xc2\xa8",
1511
"©", "\xc2\xa9", "ª", "\xc2\xaa", "«", "\xc2\xab",
1512
"¬", "\xc2\xac", "­", "\xc2\xad", "®", "\xc2\xae",
1513
"¯", "\xc2\xaf", "°", "\xc2\xb0", "±", "\xc2\xb1",
1514
"²", "\xc2\xb2", "³", "\xc2\xb3", "´", "\xc2\xb4",
1515
"µ", "\xc2\xb5", "¶", "\xc2\xb6", "·", "\xc2\xb7",
1516
"¸", "\xc2\xb8", "¹", "\xc2\xb9", "º", "\xc2\xba",
1517
"»", "\xc2\xbb", "¼", "\xc2\xbc", "½", "\xc2\xbd",
1518
"¾", "\xc2\xbe", "¿", "\xc2\xbf", "À", "\xc3\x80",
1519
"Á", "\xc3\x81", "Â", "\xc3\x82", "Ã", "\xc3\x83",
1520
"Ä", "\xc3\x84", "Å", "\xc3\x85", "Æ", "\xc3\x86",
1521
"Ç", "\xc3\x87", "È", "\xc3\x88", "É", "\xc3\x89",
1522
"Ê", "\xc3\x8a", "Ë", "\xc3\x8b", "Ì", "\xc3\x8c",
1523
"Í", "\xc3\x8d", "Î", "\xc3\x8e", "Ï", "\xc3\x8f",
1524
"Ð", "\xc3\x90", "Ñ", "\xc3\x91", "Ò", "\xc3\x92",
1525
"Ó", "\xc3\x93", "Ô", "\xc3\x94", "Õ", "\xc3\x95",
1526
"Ö", "\xc3\x96", "×", "\xc3\x97", "Ø", "\xc3\x98",
1527
"Ù", "\xc3\x99", "Ú", "\xc3\x9a", "Û", "\xc3\x9b",
1528
"Ü", "\xc3\x9c", "Ý", "\xc3\x9d", "Þ", "\xc3\x9e",
1529
"ß", "\xc3\x9f", "à", "\xc3\xa0", "á", "\xc3\xa1",
1530
"â", "\xc3\xa2", "ã", "\xc3\xa3", "ä", "\xc3\xa4",
1531
"å", "\xc3\xa5", "æ", "\xc3\xa6", "ç", "\xc3\xa7",
1532
"è", "\xc3\xa8", "é", "\xc3\xa9", "ê", "\xc3\xaa",
1533
"ë", "\xc3\xab", "ì", "\xc3\xac", "í", "\xc3\xad",
1534
"î", "\xc3\xae", "ï", "\xc3\xaf", "ð", "\xc3\xb0",
1535
"ñ", "\xc3\xb1", "ò", "\xc3\xb2", "ó", "\xc3\xb3",
1536
"ô", "\xc3\xb4", "õ", "\xc3\xb5", "ö", "\xc3\xb6",
1537
"÷", "\xc3\xb7", "ø", "\xc3\xb8", "ù", "\xc3\xb9",
1538
"ú", "\xc3\xba", "û", "\xc3\xbb", "ü", "\xc3\xbc",
1539
"ý", "\xc3\xbd", "þ", "\xc3\xbe", "ÿ", "\xc3\xbf",
1541
"ƒ", "\xc6\x92", "Α", "\xce\x91", "Β", "\xce\x92",
1542
"Γ", "\xce\x93", "Δ", "\xce\x94", "Ε", "\xce\x95",
1543
"Ζ", "\xce\x96", "Η", "\xce\x97", "Θ", "\xce\x98",
1544
"Ι", "\xce\x99", "Κ", "\xce\x9a", "Λ", "\xce\x9b",
1545
"Μ", "\xce\x9c", "Ν", "\xce\x9d", "Ξ", "\xce\x9e",
1546
"Ο", "\xce\x9f", "Π", "\xce\xa0", "Ρ", "\xce\xa1",
1547
"Σ", "\xce\xa3", "Τ", "\xce\xa4", "Υ", "\xce\xa5",
1548
"Φ", "\xce\xa6", "Χ", "\xce\xa7", "Ψ", "\xce\xa8",
1549
"Ω", "\xce\xa9", "α", "\xce\xb1", "β", "\xce\xb2",
1550
"γ", "\xce\xb3", "δ", "\xce\xb4", "ε", "\xce\xb5",
1551
"ζ", "\xce\xb6", "η", "\xce\xb7", "θ", "\xce\xb8",
1552
"ι", "\xce\xb9", "κ", "\xce\xba", "λ", "\xce\xbb",
1553
"μ", "\xce\xbc", "ν", "\xce\xbd", "ξ", "\xce\xbe",
1554
"ο", "\xce\xbf", "π", "\xcf\x80", "ρ", "\xcf\x81",
1555
"ς", "\xcf\x82", "σ", "\xcf\x83", "τ", "\xcf\x84",
1556
"υ", "\xcf\x85", "φ", "\xcf\x86", "χ", "\xcf\x87",
1557
"ψ", "\xcf\x88", "ω", "\xcf\x89", "ϑ", "\xcf\x91",
1558
"ϒ", "\xcf\x92", "ϖ", "\xcf\x96", "•", "\xe2\x80\xa2",
1559
"…", "\xe2\x80\xa6", "′", "\xe2\x80\xb2", "″", "\xe2\x80\xb3",
1560
"‾", "\xe2\x80\xbe", "⁄", "\xe2\x81\x84", "℘", "\xe2\x84\x98",
1561
"ℑ", "\xe2\x84\x91", "ℜ", "\xe2\x84\x9c", "™", "\xe2\x84\xa2",
1562
"ℵ", "\xe2\x84\xb5", "←", "\xe2\x86\x90", "↑", "\xe2\x86\x91",
1563
"→", "\xe2\x86\x92", "↓", "\xe2\x86\x93", "↔", "\xe2\x86\x94",
1564
"↵", "\xe2\x86\xb5", "⇐", "\xe2\x87\x90", "⇑", "\xe2\x87\x91",
1565
"⇒", "\xe2\x87\x92", "⇓", "\xe2\x87\x93", "⇔", "\xe2\x87\x94",
1566
"∀", "\xe2\x88\x80", "∂", "\xe2\x88\x82", "∃", "\xe2\x88\x83",
1567
"∅", "\xe2\x88\x85", "∇", "\xe2\x88\x87", "∈", "\xe2\x88\x88",
1568
"∉", "\xe2\x88\x89", "∋", "\xe2\x88\x8b", "∏", "\xe2\x88\x8f",
1569
"∑", "\xe2\x88\x91", "−", "\xe2\x88\x92", "∗", "\xe2\x88\x97",
1570
"√", "\xe2\x88\x9a", "∝", "\xe2\x88\x9d", "∞", "\xe2\x88\x9e",
1571
"∠", "\xe2\x88\xa0", "∧", "\xe2\x88\xa7", "∨", "\xe2\x88\xa8",
1572
"∩", "\xe2\x88\xa9", "∪", "\xe2\x88\xaa", "∫", "\xe2\x88\xab",
1573
"∴", "\xe2\x88\xb4", "∼", "\xe2\x88\xbc", "≅", "\xe2\x89\x85",
1574
"≈", "\xe2\x89\x88", "≠", "\xe2\x89\xa0", "≡", "\xe2\x89\xa1",
1575
"≤", "\xe2\x89\xa4", "≥", "\xe2\x89\xa5", "⊂", "\xe2\x8a\x82",
1576
"⊃", "\xe2\x8a\x83", "⊄", "\xe2\x8a\x84", "⊆", "\xe2\x8a\x86",
1577
"⊇", "\xe2\x8a\x87", "⊕", "\xe2\x8a\x95", "⊗", "\xe2\x8a\x97",
1578
"⊥", "\xe2\x8a\xa5", "⋅", "\xe2\x8b\x85", "⌈", "\xe2\x8c\x88",
1579
"⌉", "\xe2\x8c\x89", "⌊", "\xe2\x8c\x8a", "⌋", "\xe2\x8c\x8b",
1580
"⟨", "\xe2\x8c\xa9", "⟩", "\xe2\x8c\xaa", "◊", "\xe2\x97\x8a",
1581
"♠", "\xe2\x99\xa0", "♣", "\xe2\x99\xa3", "♥", "\xe2\x99\xa5",
1582
"♦", "\xe2\x99\xa6", "Œ", "\xc5\x92", "œ", "\xc5\x93",
1583
"Š", "\xc5\xa0", "š", "\xc5\xa1", "Ÿ", "\xc5\xb8",
1584
"ˆ", "\xcb\x86", "˜", "\xcb\x9c", " ", "\xe2\x80\x82",
1585
" ", "\xe2\x80\x83", " ", "\xe2\x80\x89", "‌", "\xe2\x80\x8c",
1586
"‍", "\xe2\x80\x8d", "‎", "\xe2\x80\x8e", "‏", "\xe2\x80\x8f",
1587
"–", "\xe2\x80\x93", "—", "\xe2\x80\x94", "‘", "\xe2\x80\x98",
1588
"’", "\xe2\x80\x99", "‚", "\xe2\x80\x9a", "“", "\xe2\x80\x9c",
1589
"”", "\xe2\x80\x9d", "„", "\xe2\x80\x9e", "†", "\xe2\x80\xa0",
1590
"‡", "\xe2\x80\xa1", "‰", "\xe2\x80\xb0", "‹", "\xe2\x80\xb9",
1591
"›", "\xe2\x80\xba", "€", "\xe2\x82\xac",
1594
char *raw, *wp, buf[2], *tmp;
1595
int i, j, hit, num, tsiz;
1597
CB_MALLOC(raw, strlen(html) * 3 + 1);
1599
while(*html != '\0'){
1601
if(*(html + 1) == '#'){
1602
if(*(html + 2) == 'x' || *(html + 2) == 'X'){
1603
num = strtol(html + 3, NULL, 16);
1605
num = atoi(html + 2);
1609
if((tmp = est_uconv_out(buf, 2, &tsiz)) != NULL){
1610
for(j = 0; j < tsiz; j++){
1611
*wp = ((unsigned char *)tmp)[j];
1616
while(*html != ';' && *html != ' ' && *html != '\n' && *html != '\0'){
1619
if(*html == ';') html++;
1622
for(i = 0; pairs[i] != NULL; i += 2){
1623
if(cbstrfwmatch(html, pairs[i])){
1624
wp += sprintf(wp, "%s", pairs[i+1]);
1625
html += strlen(pairs[i]);
1647
/* Create a document object from MIME.
1648
`buf' specifies the pointer to a data buffer. It should be trailed by zero code.
1649
`size' specifies the size of the buffer.
1650
`penc' specifies the name of preferred encoding. If it is `NULL', it is not used.
1651
`plang' specifies the code of preferred language.
1652
`doc' specifies a document handle to store attributes and texts.
1653
`link' specifies a list handle to store links. If it is `NULL', it is not used. */
1654
static void make_doc_from_mime(const char *buf, int size, const char *penc, int plang,
1655
ESTDOC *doc, CBLIST *links){
1658
const CBLIST *texts;
1659
CBLIST *parts, *lines;
1661
const char *key, *val, *bound, *part, *text, *line;
1662
char *body, *swap, numbuf[NUMBUFSIZ];
1663
int i, j, bsiz, psiz, ssiz, mht;
1664
assert(buf && size >= 0 && doc);
1665
attrs = cbmapopenex(MINIBNUM);
1666
body = cbmimebreak(buf, size, attrs, &bsiz);
1667
if((val = cbmapget(attrs, "subject", -1, NULL)) != NULL){
1668
doc_add_attr_mime(doc, ESTDATTRTITLE, val);
1669
if((val = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) est_doc_add_hidden_text(doc, val);
1671
if((val = cbmapget(attrs, "from", -1, NULL)) != NULL)
1672
doc_add_attr_mime(doc, ESTDATTRAUTHOR, val);
1673
if((val = cbmapget(attrs, "date", -1, NULL)) != NULL){
1674
doc_add_attr_mime(doc, ESTDATTRCDATE, val);
1675
doc_add_attr_mime(doc, ESTDATTRMDATE, val);
1677
est_doc_add_attr(doc, ESTDATTRTYPE, "message/rfc822");
1678
sprintf(numbuf, "%d", size);
1679
est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
1680
cbmapiterinit(attrs);
1681
while((key = cbmapiternext(attrs, NULL)) != NULL){
1682
if((key[0] >= 'A' && key[0] <= 'Z') || key[0] == '@' || key[0] == '_') continue;
1683
val = cbmapget(attrs, key, -1, NULL);
1684
doc_add_attr_mime(doc, key, val);
1686
if((key = cbmapget(attrs, "TYPE", -1, NULL)) != NULL && cbstrfwimatch(key, "multipart/")){
1687
mht = cbstrfwimatch(key, "multipart/related");
1688
if((bound = cbmapget(attrs, "BOUNDARY", -1, NULL)) != NULL){
1689
parts = cbmimeparts(body, bsiz, bound);
1690
for(i = 0; i < CB_LISTNUM(parts) && i < 8; i++){
1691
part = CB_LISTVAL2(parts, i, psiz);
1692
tdoc = est_doc_new();
1693
make_doc_from_mime(part, psiz, penc, plang, tdoc, links);
1695
if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL)
1696
est_doc_add_attr(doc, ESTDATTRTITLE, text);
1697
if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL)
1698
est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
1700
texts = est_doc_texts(tdoc);
1701
for(j = 0; j < CB_LISTNUM(texts); j++){
1702
text = CB_LISTVAL(texts, j);
1703
est_doc_add_text(doc, text);
1705
est_doc_delete(tdoc);
1707
CB_LISTCLOSE(parts);
1710
key = cbmapget(attrs, "content-transfer-encoding", -1, NULL);
1711
if(key && cbstrfwimatch(key, "base64")){
1712
swap = cbbasedecode(body, &ssiz);
1716
} else if(key && cbstrfwimatch(key, "quoted-printable")){
1717
swap = cbquotedecode(body, &ssiz);
1722
key = cbmapget(attrs, "content-encoding", -1, NULL);
1723
if(key && (cbstrfwimatch(key, "x-gzip") || cbstrfwimatch(key, "gzip")) &&
1724
(swap = cbgzdecode(body, bsiz, &ssiz)) != NULL){
1728
} else if(key && (cbstrfwimatch(key, "x-deflate") || cbstrfwimatch(key, "deflate")) &&
1729
(swap = cbinflate(body, bsiz, &ssiz)) != NULL){
1734
if(!(key = cbmapget(attrs, "TYPE", -1, NULL)) || cbstrfwimatch(key, "text/plain")){
1735
if(!check_binary(body, bsiz)){
1736
if(penc && (swap = est_iconv(body, bsiz, penc, "UTF-8", &ssiz, NULL)) != NULL){
1740
} else if((key = cbmapget(attrs, "CHARSET", -1, NULL)) != NULL &&
1741
(swap = est_iconv(body, bsiz, key, "UTF-8", &ssiz, NULL)) != NULL){
1746
lines = cbsplit(body, bsiz, "\n");
1747
CB_DATUMOPEN(datum);
1748
for(i = 0; i < CB_LISTNUM(lines); i++){
1749
line = CB_LISTVAL(lines, i);
1750
while(*line == ' ' || *line == '>' || *line == '|' || *line == '\t' || *line == '\r'){
1753
if(line[0] == '\0'){
1754
est_doc_add_text(doc, CB_DATUMPTR(datum));
1755
CB_DATUMSETSIZE(datum, 0);
1757
CB_DATUMCAT(datum, " ", 1);
1758
CB_DATUMCAT(datum, line, strlen(line));
1761
est_doc_add_text(doc, CB_DATUMPTR(datum));
1762
CB_DATUMCLOSE(datum);
1763
CB_LISTCLOSE(lines);
1765
} else if(cbstrfwimatch(key, "text/html") || cbstrfwimatch(key, "application/xhtml+xml")){
1766
tdoc = est_doc_new();
1767
make_doc_from_html(body, bsiz, penc, plang, tdoc, links);
1768
if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
1769
if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
1770
est_doc_add_text(doc, text);
1772
if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
1773
if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
1774
est_doc_add_text(doc, text);
1776
texts = est_doc_texts(tdoc);
1777
for(i = 0; i < CB_LISTNUM(texts); i++){
1778
text = CB_LISTVAL(texts, i);
1779
est_doc_add_text(doc, text);
1781
est_doc_delete(tdoc);
1782
} else if(cbstrfwimatch(key, "message/rfc822")){
1783
tdoc = est_doc_new();
1784
make_doc_from_mime(body, bsiz, penc, plang, tdoc, links);
1785
if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
1786
if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
1787
est_doc_add_text(doc, text);
1789
if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
1790
if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
1791
est_doc_add_text(doc, text);
1793
texts = est_doc_texts(tdoc);
1794
for(i = 0; i < CB_LISTNUM(texts); i++){
1795
text = CB_LISTVAL(texts, i);
1796
est_doc_add_text(doc, text);
1798
est_doc_delete(tdoc);
1799
} else if(cbstrfwimatch(key, "text/")){
1800
tdoc = est_doc_new();
1801
make_doc_from_text(body, bsiz, penc, plang, tdoc, links);
1802
texts = est_doc_texts(tdoc);
1803
for(i = 0; i < CB_LISTNUM(texts); i++){
1804
text = CB_LISTVAL(texts, i);
1805
est_doc_add_text(doc, text);
1807
est_doc_delete(tdoc);
1815
/* set mime value as an attribute of a document */
1816
static void doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value){
1817
char enc[64], *ebuf, *rbuf;
1818
assert(doc && name && value);
1819
ebuf = cbmimedecode(value, enc);
1820
if((rbuf = est_iconv(ebuf, -1, enc, "UTF-8", NULL, NULL)) != NULL){
1821
est_doc_add_attr(doc, name, rbuf);
/* Create a document object with an outer command.
   `xcmd' specifies an outer command line.
   `url' specifies the URL of the target document.
   `buf' specifies the pointer to a data buffer.  It should be trailed by zero code.
   `size' specifies the size of the buffer.
   `penc' specifies the name of preferred encoding.  If it is `NULL', it is not used.
   `plang' specifies the code of preferred language.
   `doc' specifies a document handle to store attributes and texts.
   `links' specifies a list handle to store links.  If it is `NULL', it is not used. */
1837
static void make_doc_with_xcmd(const char *xcmd, const char *url, const char *buf, int size,
1838
const char *penc, int plang, ESTDOC *doc, CBLIST *links){
1839
const char *tmpdir, *pv, *ext, *fmt;
1840
char iname[URIBUFSIZ], oname[URIBUFSIZ], cmd[URIBUFSIZ];
1841
char *rbuf, numbuf[NUMBUFSIZ];
1844
assert(buf && size >= 0 && url && xcmd);
1845
if(ESTPATHCHR == '/' && stat("/tmp", &sbuf) == 0){
1847
} else if(ESTPATHCHR == '\\' &&
1848
((pv = getenv("TMP")) != NULL || (pv = getenv("TEMP")) != NULL) &&
1849
stat(pv, &sbuf) == 0){
1852
tmpdir = ESTCDIRSTR;
1855
if((pv = strrchr(url, ESTPATHCHR)) != NULL) url = pv;
1856
if((pv = strrchr(url, ESTEXTCHR)) != NULL) ext = pv;
1857
if(!ext || strlen(ext) >= 32 || strchr(ext, '"') || strchr(ext, '\\')) ext = "";
1858
rnd = dpouterhash(url, -1) & 0xffff;
1859
pid = (int)getpid() & 0xffff;
1860
sprintf(iname, "%s%cxcmd-in-%04X%04X%s", tmpdir, ESTPATHCHR, pid, rnd, ext);
1861
sprintf(oname, "%s%cxcmd-out-%04X%04X%cest", tmpdir, ESTPATHCHR, pid, rnd, ESTEXTCHR);
1863
if(cbstrfwmatch(xcmd, "T@")){
1866
} else if(cbstrfwmatch(xcmd, "H@")){
1869
} else if(cbstrfwmatch(xcmd, "M@")){
1873
cbwritefile(iname, buf, size);
1874
sprintf(cmd, "%s \"%s\" \"%s\"", xcmd, iname, oname);
1876
if((rbuf = cbreadfile(oname, &rsiz)) != NULL){
1877
if(fmt == DRAFTCMD){
1878
make_doc_from_draft(rbuf, rsiz, doc, links);
1879
} else if(fmt == TEXTCMD){
1880
make_doc_from_text(rbuf, rsiz, penc, plang, doc, links);
1881
} else if(fmt == HTMLCMD){
1882
make_doc_from_html(rbuf, rsiz, penc, plang, doc, links);
1883
} else if(fmt == MIMECMD){
1884
make_doc_from_mime(rbuf, rsiz, penc, plang, doc, links);
1888
if(doc && fmt != NULL){
1889
sprintf(numbuf, "%d", size);
1890
est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
1891
est_doc_add_attr(doc, ESTDATTRTYPE, est_ext_type(ext));