30
34
#define ESTKEYIDXNUM "_idxnum" /* key for the number of inverted indexes */
31
35
#define ESTKEYDSEQ "_dseq" /* key for the sequence for document IDs */
32
36
#define ESTKEYDNUM "_dnum" /* key for the number of documents */
33
#define ESTKEYAMODE "_amode" /* key for the mode of text analyzer */
34
37
#define ESTKEYMETA "_meta" /* key for meta data */
36
39
#define ESTIDXDBNAME "_idx" /* name of the inverted index */
37
#define ESTIDXDBLRM 55 /* records in a leaf node of the inverted index */
40
#define ESTIDXDBLRM 109 /* records in a leaf node of the inverted index */
41
#define ESTIDXDBLRMA 17 /* records in a leaf node of the index in APN mode */
38
42
#define ESTIDXDBNIM 160 /* records in a non-leaf node of the inverted index */
39
43
#define ESTIDXDBLCN 16 /* number of leaf cache of the inverted index */
40
44
#define ESTIDXDBNCN 16 /* number of non-leaf cache of the inverted index */
41
45
#define ESTIDXDBRLCN 128 /* number of leaf cache of the index reader */
42
#define ESTIDXDBRNCN 64 /* number of non-leaf cache of the index reader */
43
#define ESTIDXDBMIN (1048576*768) /* minimum size of a database file */
46
#define ESTIDXDBRLCNA 32 /* number of leaf cache of the reader in APN mode */
47
#define ESTIDXDBRNCN 256 /* number of non-leaf cache of the index reader */
48
#define ESTIDXDBFBP 512 /* size of free block pool of the inverted index */
49
#define ESTIDXDBMIN (1048576*512) /* minimum size of a database file */
44
50
#define ESTIDXDBMAX (1048576*1536) /* maximum size of a database file */
46
52
#define ESTFWMDBNAME "_fwm" /* name of the database for forward matching */
47
#define ESTFWMDBLRM 111 /* records in a leaf node of forward matching DB */
53
#define ESTFWMDBLRM 251 /* records in a leaf node of forward matching DB */
48
54
#define ESTFWMDBNIM 110 /* records in a non-leaf node of forward matching DB */
49
55
#define ESTFWMDBLCN 32 /* number of leaf cache of forward matching DB */
50
56
#define ESTFWMDBNCN 16 /* number of non-leaf cache of forward matching DB */
52
#define ESTATTRDBNAME "_attr" /* name of the database for attributes */
53
#define ESTATTRDBBNUM 245759 /* bucket number of the database for attributes */
54
#define ESTATTRDBDNUM 3 /* division number of the database for attributes */
55
#define ESTATTRDBALN -5 /* alignment of the database for attributes */
57
#define ESTFWMDBFBP 128 /* size of free block pool of forward matching DB */
59
#define ESTAUXDBNAME "_aux" /* name of the auxiliary index */
60
#define ESTAUXDBLRM 23 /* records in a leaf node of the auxiliary index */
61
#define ESTAUXDBNIM 160 /* records in a non-leaf node of the auxiliary index */
62
#define ESTAUXDBLCN 16 /* number of leaf cache of the auxiliary index */
63
#define ESTAUXDBNCN 16 /* number of non-leaf cache of the auxiliary index */
64
#define ESTAUXDBRLCN 256 /* number of leaf cache of the auxiliary reader */
65
#define ESTAUXDBRNCN 64 /* number of non-leaf cache of the auxiliary reader */
66
#define ESTAUXDBFBP 256 /* size of free block pool of the auxiliary index */
68
#define ESTXFMDBNAME "_xfm" /* name of the database for auxiliary forward matching */
69
#define ESTXFMDBLRM 111 /* records in a leaf node of xfm DB */
70
#define ESTXFMDBNIM 110 /* records in a non-leaf node of xfm DB */
71
#define ESTXFMDBLCN 32 /* number of leaf cache of xfm DB */
72
#define ESTXFMDBNCN 16 /* number of non-leaf cache of xfm DB */
73
#define ESTXFMDBFBP 128 /* size of free block pool of xfm DB */
75
#define ESTATTRDBNAME "_attr" /* name of the database for attributes */
76
#define ESTATTRDBBNUM 212987 /* bucket number of the database for attributes */
77
#define ESTATTRDBDNUM 3 /* division number of the database for attributes */
78
#define ESTATTRDBALN -5 /* alignment of the database for attributes */
79
#define ESTATTRDBFBP 64 /* size of free block pool of the attribute DB */
57
81
#define ESTTEXTDBNAME "_text" /* name of the database of texts */
58
82
#define ESTTEXTDBBNUM 61417 /* bucket number of the database for texts */
59
83
#define ESTTEXTDBDNUM 7 /* division number of the database for texts */
60
84
#define ESTTEXTDBALN -5 /* alignment of the database for texts */
85
#define ESTTEXTDBFBP 128 /* size of free block pool of the text DB */
62
87
#define ESTKWDDBNAME "_kwd" /* name of the database of keywords */
63
88
#define ESTKWDDBBNUM 163819 /* bucket number of the database for keywords */
64
89
#define ESTKWDDBDNUM 3 /* division number of the database for keywords */
65
90
#define ESTKWDDBALN -5 /* alignment of the database for keywords */
91
#define ESTKWDDBFBP 64 /* size of free block pool of the keyword DB */
67
93
#define ESTLISTDBNAME "_list" /* name of the database of document list */
68
94
#define ESTLISTDBLRM 99 /* records in a leaf node of document list DB */
69
95
#define ESTLISTDBNIM 200 /* records in a non-leaf node of document list DB */
70
#define ESTLISTDBLCN 32 /* number of leaf cache of document list DB */
96
#define ESTLISTDBLCN 64 /* number of leaf cache of document list DB */
71
97
#define ESTLISTDBNCN 16 /* number of non-leaf cache of document list DB */
98
#define ESTLISTDBFBP 128 /* size of free block pool of document list DB */
100
#define ESTAISEQPREF "__seq_" /* prefix of the database for sequential access */
101
#define ESTAISTRPREF "__str_" /* prefix of the database for string narrowing */
102
#define ESTAINUMPREF "__num_" /* prefix of the database for number narrowing */
103
#define ESTAIBDIAM 0.8 /* diameter of the bucket number */
104
#define ESTAIDXLRM 99 /* records in a leaf node of narrowing index */
105
#define ESTAIDXNIM 120 /* records in a non-leaf node of narrowing index */
106
#define ESTAIDXLCN 1024 /* number of leaf cache of narrowing index */
107
#define ESTAIDXNCN 256 /* number of non-leaf cache of narrowing index */
108
#define ESTAIDXDPFBP 32 /* size of free block pool of sequential DB */
109
#define ESTAIDXVLFBP 128 /* size of free block pool of narrowing DB */
110
#define ESTAIKBUFSIZ 8192 /* size of a buffer for a key */
111
#define ESTOPDUMMY "[DUMMY]" /* dummy operator */
113
#define ESTDBSBRAT 0.3 /* ratio of bucket numbers of small mode */
114
#define ESTDBSDRAT 0.4 /* ratio of the division number of small mode */
115
#define ESTDBLBRAT 3.0 /* ratio of bucket numbers of large mode */
116
#define ESTDBLDRAT 1.0 /* ratio of the division number of large mode */
117
#define ESTDBHBRAT 5.0 /* ratio of bucket numbers of huge mode */
118
#define ESTDBHDRAT 2.0 /* ratio of the division number of huge mode */
119
#define ESTDBH2RAT 1.4 /* ratio of huge mode second */
120
#define ESTDBH3RAT 2.0 /* ratio of huge mode third */
122
#define ESTVLCRDNUM 2 /* division number of usual Villa databases */
123
#define ESTVLCRDNAUX 7 /* division number of the auxiliary index */
73
125
#define ESTIDXCCBNUM 524288 /* bucket number of cache for the inverted index */
126
#define ESTAUXCCBNUM 65521 /* bucket number of cache for the auxiliary index */
74
127
#define ESTIDXCCMAX (1048576*64) /* max size of the cache */
75
128
#define ESTOUTCCBNUM 131072 /* bucket number of cache for deleted documents */
76
129
#define ESTKEYCCMNUM 65536 /* bucket number of cache for keys for TF-IDF */
135
257
int pt; /* score tuned by TF-IDF */
260
typedef struct { /* type of structure for a meta hitting object */
261
int db; /* index of a container database */
262
int id; /* ID of a document */
263
int score; /* score tuned by TF-IDF */
264
char *value; /* value of an attribute for sorting */
139
268
/* private function prototypes */
269
static char *est_hex_encode(const char *str);
270
static char *est_hex_decode(const char *str);
140
271
static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode);
141
272
static void est_normalize_text(unsigned char *utext, int size, int *sp);
142
273
static void est_canonicalize_text(unsigned char *utext, int size, int funcspc);
143
274
static int est_char_category(int c);
144
275
static int est_char_category_perfng(int c);
145
static char *est_phrase_from_thumb(const char *sphrase);
276
static int est_char_category_chrcat(int c);
277
static char *est_make_snippet(const char *str, int len, const CBLIST *words,
278
int wwidth, int hwidth, int awidth);
279
static int est_check_cjk_only(const char *str);
280
static char *est_phrase_from_simple(const char *sphrase);
281
static char *est_phrase_from_rough(const char *rphrase);
282
static char *est_phrase_from_union(const char *uphrase);
283
static char *est_phrase_from_isect(const char *iphrase);
146
284
static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
147
285
int size, int awsiz, CBDATUM *res, const CBLIST *rwords);
148
286
static int est_str_fwmatch_wide(const unsigned char *haystack, int hsiz,
149
287
const unsigned char *needle, int nsiz);
288
static char *est_strstr_sparse(const char *haystack, const char *needle);
289
static int est_idx_rec_last_id(const char *vbuf, int vsiz, int smode);
290
static void est_encode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int lid, int smode);
291
static void est_decode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int smode);
150
292
static ESTIDX *est_idx_open(const char *name, int omode, int dnum);
151
293
static int est_idx_close(ESTIDX *idx);
152
static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum);
294
static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum,
153
296
static void est_idx_increment(ESTIDX *idx);
154
297
static int est_idx_dnum(ESTIDX *idx);
155
static int est_idx_add(ESTIDX *idx, const char *word, int wsiz, const char *vbuf, int vsiz);
298
static int est_idx_add(ESTIDX *idx, const char *word, int wsiz,
299
const char *vbuf, int vsiz, int smode);
156
300
static int est_idx_put_one(ESTIDX *idx, int inum, const char *word, int wsiz,
157
301
const char *vbuf, int vsiz);
158
302
static int est_idx_out(ESTIDX *idx, const char *word, int wsiz);
159
static char *est_idx_get(ESTIDX *idx, const char *word, int wsiz, int *sp);
160
static char *est_idx_get_one(ESTIDX *idx, int inum, const char *word, int wsiz, int *sp);
303
static char *est_idx_scan(ESTIDX *idx, const char *word, int wsiz, int *sp, int smode);
304
static const char *est_idx_get_one(ESTIDX *idx, int inum, const char *word, int wsiz, int *sp);
161
305
static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz);
162
306
static int est_idx_num(ESTIDX *idx);
163
307
static double est_idx_size(ESTIDX *idx);
164
308
static int est_idx_size_current(ESTIDX *idx);
309
static int est_idx_memflush(ESTIDX *idx);
165
310
static int est_idx_sync(ESTIDX *idx);
166
311
static int est_idx_optimize(ESTIDX *idx);
167
312
static void est_idx_set_current(ESTIDX *idx);
313
static int est_crput(CURIA *curia, int zmode, int id, const char *vbuf, int vsiz, int dmode);
314
static int est_crout(CURIA *curia, int id);
315
static char *est_crget(CURIA *curia, int flags, int id, int *sp);
316
static int est_aidx_seq_put(DEPOT *db, int id, const char *vbuf, int vsiz);
317
static int est_aidx_seq_out(DEPOT *db, int id);
318
static char *est_aidx_seq_get(DEPOT *db, int id, int *sp);
319
static int est_aidx_seq_narrow(DEPOT *db, const CBLIST *pdocs, const char *cop, int sign,
320
const char *oval, int osiz, const char *sval, int ssiz,
321
const void *regex, int onum, ESTSCORE *scores, int snum,
322
int limit, int *restp);
323
static int est_aidx_numcmp(const char *aptr, int asiz, const char *bptr, int bsiz);
324
static int est_aidx_attr_put(VILLA *db, int id, const char *vbuf, int vsiz);
325
static int est_aidx_attr_out(VILLA *db, int id, const char *vbuf, int vsiz);
326
static int est_aidx_attr_narrow(VILLA *db, const CBLIST *pdocs, const char *cop, int sign,
327
const char *oval, int osiz, const char *sval, int ssiz,
328
const void *regex, int onum, ESTSCORE *scores, int snum);
329
static int est_int_compare(const void *ap, const void *bp);
330
static int est_short_compare(const void *ap, const void *bp);
331
static void est_inodes_delete(void *arg);
332
static void est_inodes_delete_informer(const char *msg, void *opaque);
168
333
static int est_db_write_meta(ESTDB *db);
169
334
static void est_db_inform(ESTDB *db, const char *info);
170
335
static void est_db_prepare_meta(ESTDB *db);
336
static int est_db_score_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond, int *scp);
337
static int est_pidx_uri_to_id(ESTDB *db, const char *uri);
171
338
static CBLIST *est_phrase_terms(const char *phrase);
172
static int est_score_compare_by_id(const void *ap, const void *bp);
173
static int est_score_compare_by_score(const void *ap, const void *bp);
339
static int est_score_compare_by_id_asc(const void *ap, const void *bp);
340
static int est_score_compare_by_id_desc(const void *ap, const void *bp);
341
static int est_score_compare_by_score_asc(const void *ap, const void *bp);
342
static int est_score_compare_by_score_desc(const void *ap, const void *bp);
174
343
static int est_score_compare_by_str_asc(const void *ap, const void *bp);
175
344
static int est_score_compare_by_str_desc(const void *ap, const void *bp);
176
345
static int est_score_compare_by_num_asc(const void *ap, const void *bp);
177
346
static int est_score_compare_by_num_desc(const void *ap, const void *bp);
347
static int est_metascore_compare_by_id_asc(const void *ap, const void *bp);
348
static int est_metascore_compare_by_id_desc(const void *ap, const void *bp);
349
static int est_metascore_compare_by_score_asc(const void *ap, const void *bp);
350
static int est_metascore_compare_by_score_desc(const void *ap, const void *bp);
351
static int est_metascore_compare_by_str_asc(const void *ap, const void *bp);
352
static int est_metascore_compare_by_str_desc(const void *ap, const void *bp);
353
static int est_metascore_compare_by_num_asc(const void *ap, const void *bp);
354
static int est_metascore_compare_by_num_desc(const void *ap, const void *bp);
178
355
static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add);
179
356
static void est_expand_word_bw(ESTDB *db, const char *word, CBLIST *list);
180
357
static void est_expand_word_ew(ESTDB *db, const char *word, CBLIST *list);
181
358
static void est_expand_word_rx(ESTDB *db, const char *word, CBLIST *list);
359
static void est_expand_keyword_bw(ESTDB *db, const char *word, CBLIST *list);
360
static void est_expand_keyword_ew(ESTDB *db, const char *word, CBLIST *list);
361
static void est_expand_keyword_rx(ESTDB *db, const char *word, CBLIST *list);
182
362
static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
183
int *nump, CBMAP *hints, int add);
363
void (*xpn)(const char *, CBLIST *),
364
int *nump, CBMAP *hints, int add, int auxmin, CBMAP *auxwords);
184
365
static const ESTSCORE *est_rescc_get(ESTDB *db, const char *word, int size, int *nump);
185
366
static void est_rescc_put(ESTDB *db, const char *word, int size, ESTSCORE *scores, int num);
186
static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, const char *order,
187
ESTSCORE *scores, int snum, int limit, int *restp);
367
static ESTSCORE *est_search_keywords(ESTDB *db, const char *word, int min, int *nump);
368
static void est_weight_keywords(ESTDB *db, const char *word, ESTSCORE *scores, int snum);
369
static ESTSCORE *est_search_rank(ESTDB *db, const char *name, int top, int *nump);
370
static ESTSCORE *est_search_aidx_attr(ESTDB *db, const char *expr, int *nump);
371
static ESTSCORE *est_search_pidxs(ESTDB *db, ESTCOND *cond, ESTSCORE *scores, int *nump,
373
static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, int ign,
374
const char *order, const char *distinct, ESTSCORE *scores, int snum,
375
int limit, int *restp, CBMAP *ordattrs);
376
static ESTCATTR *est_make_cattr_list(const CBLIST *attrs, int *nump);
377
static void est_free_cattr_list(ESTCATTR *list, int anum);
188
378
static int est_eclipse_scores(ESTDB *db, ESTSCORE *scores, int snum, int num,
189
379
int vnum, int tfidf, double limit, CBMAP *shadows);
190
380
static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
412
665
const char *kbuf, *vbuf;
413
666
int i, ksiz, vsiz;
415
datum = cbdatumopen("", 0);
417
670
list = est_doc_attr_names(doc);
418
671
for(i = 0; i < CB_LISTNUM(list); i++){
419
kbuf = CB_LISTVAL2(list, i, &ksiz);
672
kbuf = CB_LISTVAL2(list, i, ksiz);
420
673
vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz);
421
cbdatumcat(datum, kbuf, ksiz);
422
cbdatumcat(datum, "=", 1);
423
cbdatumcat(datum, vbuf, vsiz);
424
cbdatumcat(datum, "\n", 1);
428
cbdatumcat(datum, "\n", 1);
674
CB_DATUMCAT(datum, kbuf, ksiz);
675
CB_DATUMCAT(datum, "=", 1);
676
CB_DATUMCAT(datum, vbuf, vsiz);
677
CB_DATUMCAT(datum, "\n", 1);
681
if(doc->kwords && cbmaprnum(doc->kwords) > 0){
682
CB_DATUMCAT(datum, ESTDCNTLVECTOR, strlen(ESTDCNTLVECTOR));
683
cbmapiterinit(doc->kwords);
684
while((kbuf = cbmapiternext(doc->kwords, &ksiz)) != NULL){
685
CB_MAPITERVAL(vbuf, kbuf, vsiz);
686
CB_DATUMCAT(datum, "\t", 1);
687
CB_DATUMCAT(datum, kbuf, ksiz);
688
CB_DATUMCAT(datum, "\t", 1);
689
CB_DATUMCAT(datum, vbuf, vsiz);
691
CB_DATUMCAT(datum, "\n", 1);
693
if(doc->attrs && (vbuf = cbmapget(doc->attrs, "\t", 1, &vsiz)) != NULL){
694
CB_DATUMCAT(datum, ESTDCNTLSCORE, strlen(ESTDCNTLSCORE));
695
CB_DATUMCAT(datum, "\t", 1);
696
CB_DATUMCAT(datum, vbuf, vsiz);
697
CB_DATUMCAT(datum, "\n", 1);
699
CB_DATUMCAT(datum, "\n", 1);
430
701
for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
431
kbuf = CB_LISTVAL2(doc->dtexts, i, &ksiz);
432
cbdatumcat(datum, kbuf, ksiz);
433
cbdatumcat(datum, "\n", 1);
702
kbuf = CB_LISTVAL2(doc->dtexts, i, ksiz);
703
CB_DATUMCAT(datum, kbuf, ksiz);
704
CB_DATUMCAT(datum, "\n", 1);
436
707
if(doc->attrs && (vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
437
cbdatumcat(datum, "\t", 1);
438
cbdatumcat(datum, vbuf, vsiz);
439
cbdatumcat(datum, "\n", 1);
708
CB_DATUMCAT(datum, "\t", 1);
709
CB_DATUMCAT(datum, vbuf, vsiz);
710
CB_DATUMCAT(datum, "\n", 1);
441
712
return cbdatumtomalloc(datum, NULL);
445
716
/* Make a snippet of the body text of a document object. */
446
717
char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth){
450
const char *text, *word, *cval;
451
const unsigned char *rword;
452
unsigned char *rtext, *ctext;
453
int i, j, k, bi, size, wsiz, rwsiz, mywidth, awsiz, csiz;
454
722
assert(doc && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
455
if(!doc->dtexts) doc->dtexts = cblistopen();
456
res = cbdatumopen("", 0);
457
rwords = cblistopen();
458
for(i = 0; i < CB_LISTNUM(words); i++){
459
word = CB_LISTVAL2(words, i, &wsiz);
460
if(wsiz < 1 || !strcmp(word, ESTOPUVSET)) continue;
461
rtext = (unsigned char *)est_uconv_in(word, wsiz, &size);
462
est_canonicalize_text(rtext, size, TRUE);
463
cblistpushbuf(rwords, (char *)rtext, size);
465
sbuf = cbdatumopen("", 0);
723
if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
466
725
for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
467
text = CB_LISTVAL2(doc->dtexts, i, &size);
468
if(i > 0) cbdatumcat(sbuf, " ", 1);
469
cbdatumcat(sbuf, text, size);
471
rtext = (unsigned char *)est_uconv_in(CB_DATUMPTR(sbuf), CB_DATUMSIZE(sbuf), &size);
472
ctext = (unsigned char *)cbmemdup((char *)rtext, size);
473
est_canonicalize_text(ctext, size, FALSE);
475
if(CB_LISTNUM(rwords) < 1) mywidth *= 3;
476
if(mywidth > wwidth) mywidth = wwidth;
477
for(i = 0; i < size && mywidth > 0; i += 2){
478
mywidth -= est_char_category(rtext[i] * 0x100 + rtext[i+1]) == ESTEASTALPH ? 2 : 1;
481
if(awsiz > ESTWORDMAXLEN) awsiz = ESTWORDMAXLEN;
482
est_snippet_add_text(rtext, ctext, i, awsiz, res, rwords);
485
cbdatumcat(res, "\n", 1);
487
counts = cbmapopenex(ESTMINIBNUM);
488
for(i = bi; i < size && wwidth >= 0; i += 2){
489
for(j = 0; j < CB_LISTNUM(rwords); j++){
490
rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
491
if(est_str_fwmatch_wide(ctext + i, size - i, rword, rwsiz) > 0 &&
492
(!(cval = cbmapget(counts, (char *)rword, rwsiz, &csiz)) ||
493
csiz < (wwidth > awidth * 1.2 ? 2 : 1))){
494
cbmapputcat(counts, (char *)rword, rwsiz, "*", 1);
495
if(cbmaprnum(counts) >= CB_LISTNUM(rwords)){
497
counts = cbmapopenex(ESTMINIBNUM);
499
mywidth = awidth / 2 + 1;
500
for(k = i - 2; k >= bi && mywidth >= 0; k -= 2){
501
mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
504
mywidth = awidth / 2 + 1;
505
for(k = i + rwsiz + 2; k < size && mywidth >= 0; k += 2){
506
mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
508
if(k > size) k = size;
509
est_snippet_add_text(rtext + bi, ctext + bi, k - bi, 0, res, rwords);
510
wwidth -= awidth + rwsiz / 2;
513
cbdatumcat(res, "\n", 1);
524
return cbdatumtomalloc(res, NULL);
726
text = CB_LISTVAL2(doc->dtexts, i, size);
727
if(i > 0) CB_DATUMCAT(sbuf, " ", 1);
728
CB_DATUMCAT(sbuf, text, size);
730
snippet = est_make_snippet(CB_DATUMPTR(sbuf), CB_DATUMSIZE(sbuf),
731
words, wwidth, hwidth, awidth);
758
1110
if(omode & ESTDBWRITER){
1111
est_idx_set_tuning(idxdb, amode == ESTDFPERFNG ? ESTIDXDBLRMA : ESTIDXDBLRM, ESTIDXDBNIM,
1112
ESTIDXDBLCN, ESTIDXDBNCN, ESTIDXDBFBP);
1113
est_idx_set_current(idxdb);
1114
vlsettuning(fwmdb, ESTFWMDBLRM, ESTFWMDBNIM, ESTFWMDBLCN, ESTFWMDBNCN);
1115
vlsetfbpsiz(fwmdb, ESTFWMDBFBP);
1116
vlsettuning(auxdb, ESTAUXDBLRM, ESTAUXDBNIM, ESTAUXDBLCN, ESTAUXDBNCN);
1117
vlsetfbpsiz(auxdb, ESTAUXDBFBP);
1118
vlsettuning(xfmdb, ESTXFMDBLRM, ESTXFMDBNIM, ESTXFMDBLCN, ESTXFMDBNCN);
1119
vlsetfbpsiz(xfmdb, ESTXFMDBFBP);
759
1120
crsetalign(attrdb, ESTATTRDBALN);
1121
crsetfbpsiz(attrdb, ESTATTRDBFBP);
760
1122
crsetalign(textdb, ESTTEXTDBALN);
1123
crsetfbpsiz(textdb, ESTTEXTDBFBP);
761
1124
crsetalign(kwddb, ESTKWDDBALN);
762
est_idx_set_tuning(idxdb, ESTIDXDBLRM, ESTIDXDBNIM, ESTIDXDBLCN, ESTIDXDBNCN);
763
est_idx_set_current(idxdb);
764
vlsettuning(fwmdb, ESTFWMDBLRM, ESTFWMDBNIM, ESTFWMDBLCN, ESTFWMDBNCN);
1125
crsetfbpsiz(kwddb, ESTKWDDBFBP);
765
1126
vlsettuning(listdb, ESTLISTDBLRM, ESTLISTDBNIM, ESTLISTDBLCN, ESTLISTDBNCN);
1127
vlsetfbpsiz(listdb, ESTLISTDBFBP);
767
est_idx_set_tuning(idxdb, -1, -1, ESTIDXDBRLCN, ESTIDXDBRNCN);
1129
est_idx_set_tuning(idxdb, -1, -1,
1130
amode == ESTDFPERFNG ? ESTIDXDBRLCNA : ESTIDXDBRLCN, ESTIDXDBRNCN, -1);
768
1131
vlsettuning(fwmdb, -1, -1, ESTFWMDBLCN, ESTFWMDBNCN);
1132
vlsettuning(auxdb, -1, -1, ESTAUXDBRLCN, ESTAUXDBRNCN);
1133
vlsettuning(xfmdb, -1, -1, ESTXFMDBLCN, ESTXFMDBNCN);
769
1134
vlsettuning(listdb, -1, -1, ESTLISTDBLCN, ESTLISTDBNCN);
1136
if((omode & ESTDBWRITER) && (omode & ESTDBTRUNC) && (list = cbdirlist(name)) != NULL){
1137
for(i = 0; i < CB_LISTNUM(list); i++){
1138
elem = CB_LISTVAL(list, i);
1139
if(cbstrfwmatch(elem, ESTAISEQPREF) || cbstrfwmatch(elem, ESTAISTRPREF) ||
1140
cbstrfwmatch(elem, ESTAINUMPREF)){
1141
sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
1142
if(unlink(path) == -1) est_rmdir_rec(path);
1147
aidxs = cbmapopenex(ESTMINIBNUM);
1148
if((list = cbdirlist(name)) != NULL){
1149
for(i = 0; i < CB_LISTNUM(list); i++){
1150
elem = CB_LISTVAL(list, i);
1153
if(cbstrfwmatch(elem, ESTAISEQPREF)){
1154
dec = est_hex_decode(elem + strlen(ESTAISEQPREF));
1155
type = ESTIDXATTRSEQ;
1156
} else if(cbstrfwmatch(elem, ESTAISTRPREF)){
1157
dec = est_hex_decode(elem + strlen(ESTAISTRPREF));
1158
type = ESTIDXATTRSTR;
1159
} else if(cbstrfwmatch(elem, ESTAINUMPREF)){
1160
dec = est_hex_decode(elem + strlen(ESTAINUMPREF));
1161
type = ESTIDXATTRNUM;
1164
sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
1167
if((aidxdb = vlopen(path, vomode, VL_CMPLEX)) != NULL){
1168
vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1169
vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1170
attridx.db = aidxdb;
1171
attridx.type = type;
1172
cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1176
if((aidxdb = vlopen(path, vomode, est_aidx_numcmp)) != NULL){
1177
vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1178
vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1179
attridx.db = aidxdb;
1180
attridx.type = type;
1181
cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1185
if((aidxdb = dpopen(path, domode, crbnum(attrdb) / ESTAIBDIAM)) != NULL){
1186
dpsetfbpsiz(aidxdb, ESTAIDXDPFBP);
1187
attridx.db = aidxdb;
1188
attridx.type = type;
1189
cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
771
1198
CB_MALLOC(db, sizeof(ESTDB));
772
1199
db->name = cbmemdup(name, -1);
773
1201
db->metadb = metadb;
774
1202
db->idxdb = idxdb;
775
1203
db->fwmdb = fwmdb;
776
1206
db->attrdb = attrdb;
777
1207
db->textdb = textdb;
778
1208
db->kwddb = kwddb;
779
1209
db->listdb = listdb;
1211
CB_LISTOPEN(db->pdocs);
780
1213
db->ecode = ESTENOERR;
781
1214
db->fatal = FALSE;
782
1215
db->dseq = dseq;
783
1216
db->dnum = dnum;
784
1217
db->amode = amode;
785
1220
if(omode & ESTDBWRITER){
786
1221
db->idxcc = cbmapopenex(ESTIDXCCBNUM);
1222
db->auxcc = cbmapopenex(ESTAUXCCBNUM);
788
1224
db->icmax = ESTIDXCCMAX;
789
1225
db->outcc = cbmapopenex(ESTOUTCCBNUM);
791
1227
db->idxcc = cbmapopenex(1);
1228
db->auxcc = cbmapopenex(1);
794
1231
db->outcc = cbmapopenex(1);
1336
/* Add an index for narrowing or sorting with document attributes. */
1337
int est_db_add_attr_index(ESTDB *db, const char *name, int type){
1341
char path[ESTPATHBUFSIZ], *enc, *vbuf;
1342
int i, domode, vomode, crdnum, err, snum;
1344
if(!dpwritable(db->metadb)){
1345
db->ecode = ESTEACCES;
1348
if(cbmapget(db->aidxs, name, -1, NULL)){
1349
db->ecode = ESTEMISC;
1352
enc = est_hex_encode(name);
1355
sprintf(path, "%s%c%s%s", db->name, ESTPATHCHR, ESTAISEQPREF, enc);
1358
sprintf(path, "%s%c%s%s", db->name, ESTPATHCHR, ESTAISTRPREF, enc);
1361
sprintf(path, "%s%c%s%s", db->name, ESTPATHCHR, ESTAINUMPREF, enc);
1365
db->ecode = ESTEINVAL;
1369
domode = DP_OWRITER | DP_OCREAT | DP_OTRUNC;
1370
vomode = VL_OWRITER | VL_OCREAT | VL_OTRUNC;
1372
vomode |= VL_OXCOMP;
1373
} else if(ESTUSELZO){
1374
vomode |= VL_OYCOMP;
1375
} else if(ESTUSEZLIB){
1376
vomode |= VL_OZCOMP;
1382
vlcrdnum = ESTVLCRDNUM;
1383
if(!(aidxdb = vlopen(path, vomode, VL_CMPLEX))){
1388
vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1389
vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1390
if(est_db_doc_num(db) > 0){
1391
scores = est_search_uvset(db, &snum, NULL, TRUE);
1392
for(i = 0; i < snum; i++){
1393
if((vbuf = est_db_get_doc_attr(db, scores[i].id, name)) != NULL){
1394
if(!est_aidx_attr_put(aidxdb, scores[i].id, vbuf, strlen(vbuf))){
1401
if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "entering existing attributes");
1407
vlcrdnum = ESTVLCRDNUM;
1408
if(!(aidxdb = vlopen(path, vomode, est_aidx_numcmp))){
1413
vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1414
vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1415
if(est_db_doc_num(db) > 0){
1416
scores = est_search_uvset(db, &snum, NULL, TRUE);
1417
for(i = 0; i < snum; i++){
1418
if((vbuf = est_db_get_doc_attr(db, scores[i].id, name)) != NULL){
1419
if(!est_aidx_attr_put(aidxdb, scores[i].id, vbuf, strlen(vbuf))){
1426
if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "entering existing attributes");
1432
if(!(aidxdb = dpopen(path, domode, crbnum(db->attrdb) * ESTAIBDIAM))){
1437
dpsetfbpsiz(aidxdb, ESTAIDXDPFBP);
1438
if(est_db_doc_num(db) > 0){
1439
scores = est_search_uvset(db, &snum, NULL, TRUE);
1440
for(i = 0; i < snum; i++){
1441
if((vbuf = est_db_get_doc_attr(db, scores[i].id, name)) != NULL){
1442
if(!est_aidx_seq_put(aidxdb, scores[i].id, vbuf, strlen(vbuf))){
1449
if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "entering existing attributes");
1456
attridx.db = aidxdb;
1457
attridx.type = type;
1458
cbmapput(db->aidxs, name, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1459
return err ? FALSE : TRUE;
872
1463
/* Flush index words in the cache of a database. */
873
1464
int est_db_flush(ESTDB *db, int max){
1465
ESTATTRIDX *attridx;
877
const char *kbuf, *vbuf, *rp, *pv;
879
int i, j, inc, err, ksiz, vsiz, rnum, id, dnum, tsiz;
1469
const char *kbuf, *vbuf, *rp, *pv, *ep;
1470
char *tbuf, *wp, numbuf[ESTNUMBUFSIZ];
1471
int i, j, inc, err, ksiz, vsiz, rnum, len, id, sum, cid, vnum, lid, dnum, tsiz, vstep;
881
1473
if(!dpwritable(db->metadb)){
882
1474
db->ecode = ESTEACCES;
885
if(cbmaprnum(db->idxcc) < 1 && cbmaprnum(db->outcc) < 1) return TRUE;
1477
if(max < 1 || max >= INT_MAX){
1478
if(!est_db_write_meta(db)) err = TRUE;
1479
if(!dpmemflush(db->metadb)) err = TRUE;
1480
if(!crmemflush(db->attrdb)) err = TRUE;
1481
if(!crmemflush(db->textdb)) err = TRUE;
1482
if(!crmemflush(db->kwddb)) err = TRUE;
1483
if(!vlmemflush(db->listdb)) err = TRUE;
1484
cbmapiterinit(db->aidxs);
1485
while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
1486
attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1487
switch(attridx->type){
1490
if(!vlmemflush(attridx->db)) err = TRUE;
1493
if(!dpmemflush(attridx->db)) err = TRUE;
1498
if(cbmaprnum(db->idxcc) < 1 && cbmaprnum(db->auxcc) < 1 && cbmaprnum(db->outcc) < 1)
886
1501
db->intflag = FALSE;
887
1502
inc = est_db_used_cache_size(db) > db->icmax;
890
1505
cbmapiterinit(db->idxcc);
891
1506
while((kbuf = cbmapiternext(db->idxcc, &ksiz)) != NULL){
892
cblistpush(keys, kbuf, ksiz);
1507
CB_LISTPUSH(keys, kbuf, ksiz);
894
1509
rnum = CB_LISTNUM(keys);
895
1510
cblistsort(keys);
897
1512
while(CB_LISTNUM(keys) > max){
898
free(cblistpop(keys, NULL));
901
1516
for(i = 0; i < CB_LISTNUM(keys); i++){
902
kbuf = CB_LISTVAL2(keys, i, &ksiz);
1517
kbuf = CB_LISTVAL2(keys, i, ksiz);
903
1518
vbuf = cbmapget(db->idxcc, kbuf, ksiz, &vsiz);
904
if(!est_idx_add(db->idxdb, kbuf, ksiz, vbuf, vsiz) ||
1519
if(!est_idx_add(db->idxdb, kbuf, ksiz, vbuf, vsiz, db->smode) ||
905
1520
(!vlput(db->fwmdb, kbuf, ksiz, "", 0, VL_DKEEP) && dpecode != DP_EKEEP)){
919
1534
if(max > 0 && db->intflag && i > 0 && i % ESTCCIRSLOT == 0) break;
922
1537
if(cbmaprnum(db->idxcc) < 1){
923
1538
cbmapclose(db->idxcc);
924
1539
db->idxcc = cbmapopenex(rnum > ESTIDXCCBNUM ? rnum * 1.5 : ESTIDXCCBNUM);
1540
if(cbmaprnum(db->auxcc) > 0){
1542
cbmapiterinit(db->auxcc);
1543
while((kbuf = cbmapiternext(db->auxcc, &ksiz)) != NULL){
1544
CB_LISTPUSH(keys, kbuf, ksiz);
1547
for(i = 0; i < CB_LISTNUM(keys); i++){
1548
kbuf = CB_LISTVAL2(keys, i, ksiz);
1549
vbuf = cbmapget(db->auxcc, kbuf, ksiz, &vsiz);
1550
if(!vlput(db->auxdb, kbuf, ksiz, vbuf, vsiz, VL_DCAT)){
1554
len = sprintf(numbuf, "%d", vlvsiz(db->auxdb, kbuf, ksiz) / (int)(sizeof(int) * 2));
1555
if(!vlput(db->xfmdb, kbuf, ksiz, numbuf, len, VL_DOVER)){
1559
cbmapout(db->auxcc, kbuf, ksiz);
1561
if(i % ESTCCCBFREQ == 0) est_db_inform(db, "flushing auxiliary keywords");
1562
if(max > 0 && db->intflag && i > 0 && i % ESTCCIRSLOT == 0) break;
1565
if(cbmaprnum(db->auxcc) < 1){
1566
cbmapclose(db->auxcc);
1567
db->auxcc = cbmapopenex(ESTAUXCCBNUM);
926
1571
if(max < 1 && cbmaprnum(db->outcc) > 0){
927
1572
ids = cbmapopen();
929
1574
cbmapiterinit(db->outcc);
930
1575
while((kbuf = cbmapiternext(db->outcc, &ksiz)) != NULL){
931
1576
if(*kbuf == '\t'){
932
1577
id = atoi(kbuf + 1);
933
1578
cbmapput(ids, (char *)&id, sizeof(int), "", 0, FALSE);
935
cblistpush(keys, kbuf, ksiz);
1580
CB_LISTPUSH(keys, kbuf, ksiz);
938
1583
cblistsort(keys);
939
1584
dnum = est_idx_dnum(db->idxdb);
940
1585
for(i = 0; i < CB_LISTNUM(keys); i++){
941
kbuf = CB_LISTVAL2(keys, i, &ksiz);
942
for(j = 0; j < dnum; j++){
943
if((tbuf = est_idx_get_one(db->idxdb, j, kbuf, ksiz, &tsiz)) != NULL){
944
nval = cbdatumopen("", 0);
1586
kbuf = CB_LISTVAL2(keys, i, ksiz);
1588
if((tbuf = vlget(db->auxdb, kbuf + 1, ksiz - 1, &tsiz)) != NULL){
946
while(rp < tbuf + tsiz){
1593
if(!cbmapget(ids, rp, sizeof(int), NULL)){
1594
memmove(wp, rp, sizeof(int) * 2);
1595
wp += sizeof(int) * 2;
953
if(!cbmapget(ids, pv, sizeof(int), NULL)) cbdatumcat(nval, pv, rp - pv);
955
if(!est_idx_put_one(db->idxdb, j, kbuf, ksiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
1597
rp += sizeof(int) * 2;
1600
if(!vlput(db->auxdb, kbuf + 1, ksiz - 1, tbuf, wp - tbuf, VL_DOVER)) err = TRUE;
1601
len = sprintf(numbuf, "%d", (int)((wp - tbuf) / (sizeof(int) * 2)));
1602
if(!vlput(db->xfmdb, kbuf + 1, ksiz - 1, numbuf, len, VL_DOVER)) err = TRUE;
1604
if(!vlout(db->auxdb, kbuf + 1, ksiz - 1)) err = TRUE;
1605
if(!vlout(db->xfmdb, kbuf + 1, ksiz - 1) && dpecode != DP_ENOITEM) err = TRUE;
1611
for(j = 0; j < dnum; j++){
1612
if((vbuf = est_idx_get_one(db->idxdb, j, kbuf, ksiz, &tsiz)) != NULL){
1619
EST_READ_VNUMBUF(rp, vnum, vstep);
1638
if(!cbmapget(ids, (char *)&cid, sizeof(int), NULL)){
1639
EST_SET_VNUMBUF(vstep, numbuf, cid - lid - 1);
1640
CB_DATUMCAT(nval, numbuf, vstep);
1641
CB_DATUMCAT(nval, pv, rp - pv);
1645
if(!est_idx_put_one(db->idxdb, j, kbuf, ksiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
1647
sum += CB_DATUMSIZE(nval);
1648
CB_DATUMCLOSE(nval);
1651
if(sum < 1 && !vlout(db->fwmdb, kbuf, ksiz) && dpecode != DP_ENOITEM) err = TRUE;
961
1653
cbmapout(db->outcc, kbuf, ksiz);
962
1654
if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
963
1655
if(max > 0 && db->intflag && i > 0 && i % ESTCCIRSLOT == 0) break;
967
if(!(max > 0 && db->intflag)){
1657
if(cbmaprnum(db->outcc) <= cbmaprnum(ids)){
968
1658
cbmapclose(db->outcc);
969
1659
db->outcc = cbmapopenex(ESTOUTCCBNUM);
972
1664
cbmapclose(db->keycc);
973
1665
db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
1886
/* Merge another database. */
1887
int est_db_merge(ESTDB *db, const char *name, int options){
1889
ESTATTRIDX *attridx;
1890
CBMAP *idmap, *seqmap, *attrs;
1893
const char *kbuf, *vbuf, *rp, *ep, *sp;
1894
char *tbuf, numbuf[ESTNUMBUFSIZ];
1895
int i, j, ecode, err, ksiz, vsiz, tsiz, oid, nid, len, vstep, anum, *ary;
1897
if(!dpwritable(db->metadb)){
1898
db->ecode = ESTEACCES;
1901
est_db_inform(db, "opening the target database");
1902
if(!(tgdb = est_db_open(name, ESTDBREADER, &ecode))){
1906
if(dpgetflags(db->metadb) != dpgetflags(tgdb->metadb)){
1907
est_db_close(tgdb, &ecode);
1908
db->ecode = ESTEMISC;
1912
idmap = cbmapopenex(est_db_doc_num(tgdb) + 1);
1913
vlcurfirst(tgdb->listdb);
1914
for(i = 0; (kbuf = vlcurkeycache(tgdb->listdb, &ksiz)) != NULL; i++){
1915
if((vbuf = vlgetcache(db->listdb, kbuf, ksiz, NULL)) != NULL &&
1916
!est_db_out_doc(db, atoi(vbuf), options & ESTMGCLEAN ? ESTODCLEAN : 0)) err = TRUE;
1917
oid = atoi(vlcurvalcache(tgdb->listdb, NULL));
1920
cbmapput(idmap, (char *)&oid, sizeof(int), (char *)&(db->dseq), sizeof(int), FALSE);
1921
vlcurnext(tgdb->listdb);
1922
if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "calculating ID mapping");
1924
if(!est_db_flush(db, -1)){
1926
est_db_close(tgdb, &ecode);
1929
cbmapiterinit(idmap);
1930
for(i = 0; (kbuf = cbmapiternext(idmap, &ksiz)) != NULL; i++){
1931
CB_MAPITERVAL(vbuf, kbuf, vsiz);
1934
if((tbuf = est_crget(tgdb->attrdb, tgdb->zmode, oid, &tsiz)) != NULL){
1935
attrs = cbmapload(tbuf, tsiz);
1936
len = sprintf(numbuf, "%d", nid);
1937
cbmapput(attrs, ESTDATTRID, -1, numbuf, len, TRUE);
1939
tbuf = cbmapdump(attrs, &tsiz);
1940
if((vbuf = cbmapget(attrs, ESTDATTRURI, -1, &vsiz)) != NULL){
1941
if(!vlput(db->listdb, vbuf, vsiz, numbuf, len, VL_DKEEP)){
1951
if(!est_crput(db->attrdb, db->zmode, nid, tbuf, tsiz, CR_DKEEP)){
1956
if(cbmaprnum(db->aidxs) > 0){
1957
cbmapiterinit(db->aidxs);
1958
while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
1959
if(!(vbuf = cbmapget(attrs, kbuf, ksiz, &vsiz))) continue;
1960
attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1961
switch(attridx->type){
1964
if(!est_aidx_attr_put(attridx->db, nid, vbuf, vsiz)){
1971
if(!est_aidx_seq_put(attridx->db, nid, vbuf, vsiz)){
1986
if((tbuf = est_crget(tgdb->textdb, tgdb->zmode, oid, &tsiz)) != NULL){
1987
if(!est_crput(db->textdb, db->zmode, nid, tbuf, tsiz, CR_DKEEP)){
1997
if((tbuf = est_crget(tgdb->kwddb, tgdb->zmode, oid, &tsiz)) != NULL){
1998
if(!est_crput(db->kwddb, db->zmode, nid, tbuf, tsiz, CR_DKEEP)){
2004
} else if(dpecode != DP_ENOITEM){
2009
if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "importing documents");
2012
vlcurfirst(tgdb->fwmdb);
2013
while((kbuf = vlcurkeycache(tgdb->fwmdb, &ksiz)) != NULL){
2014
CB_LISTPUSH(words, kbuf, ksiz);
2015
vlcurnext(tgdb->fwmdb);
2017
for(i = 0; i < CB_LISTNUM(words); i++){
2018
kbuf = CB_LISTVAL2(words, i, ksiz);
2019
seqmap = cbmapopenex(tsiz / sizeof(int) + 1);
2020
tbuf = est_idx_scan(tgdb->idxdb, kbuf, ksiz, &tsiz, tgdb->smode);
2024
EST_READ_VNUMBUF(rp, oid, vstep);
2026
vbuf = cbmapget(idmap, (char *)&oid, sizeof(int), NULL);
2027
nid = vbuf ? *(int *)vbuf : -1;
2029
switch(tgdb->smode){
2044
if(nid > 0) cbmapputcat(seqmap, (char *)&nid, sizeof(int), sp, rp - sp);
2046
anum = cbmaprnum(seqmap);
2047
CB_MALLOC(ary, anum * sizeof(int) + 1);
2048
cbmapiterinit(seqmap);
2049
for(j = 0; (rp = cbmapiternext(seqmap, NULL)) != NULL; j++){
2050
ary[j] = *(int *)rp;
2052
qsort(ary, anum, sizeof(int), est_int_compare);
2054
for(j = 0; j < anum; j++){
2055
EST_SET_VNUMBUF(vstep, numbuf, ary[j]);
2056
CB_DATUMCAT(rbuf, numbuf, vstep);
2057
vbuf = cbmapget(seqmap, (char *)(ary + j), sizeof(int), &vsiz);
2058
CB_DATUMCAT(rbuf, vbuf, vsiz);
2060
if(!est_idx_add(db->idxdb, kbuf, ksiz, CB_DATUMPTR(rbuf), CB_DATUMSIZE(rbuf), db->smode)){
2065
CB_DATUMCLOSE(rbuf);
2069
vlput(db->fwmdb, kbuf, ksiz, "", 0, VL_DKEEP);
2070
if(i % ESTCCCBFREQ == 0){
2071
est_db_inform(db, "importing words");
2072
if(est_idx_size_current(db->idxdb) >= ESTIDXDBMAX){
2073
est_db_inform(db, "adding a new database file");
2074
est_idx_increment(db->idxdb);
2078
CB_LISTCLOSE(words);
2080
vlcurfirst(tgdb->auxdb);
2081
while((kbuf = vlcurkeycache(tgdb->auxdb, &ksiz)) != NULL){
2082
CB_LISTPUSH(words, kbuf, ksiz);
2083
vlcurnext(tgdb->auxdb);
2085
for(i = 0; i < CB_LISTNUM(words); i++){
2086
kbuf = CB_LISTVAL2(words, i, ksiz);
2087
vbuf = vlgetcache(tgdb->auxdb, kbuf, ksiz, &vsiz);
2093
vbuf = cbmapget(idmap, rp, sizeof(int), NULL);
2094
nid = vbuf ? *(int *)vbuf : -1;
2096
CB_DATUMCAT(rbuf, (char *)&nid, sizeof(int));
2097
CB_DATUMCAT(rbuf, rp + sizeof(int), sizeof(int));
2099
rp += sizeof(int) * 2;
2101
if(!vlput(db->auxdb, kbuf, ksiz, CB_DATUMPTR(rbuf), CB_DATUMSIZE(rbuf), VL_DCAT)){
2106
CB_DATUMCLOSE(rbuf);
2108
if((vbuf = vlgetcache(tgdb->xfmdb, kbuf, ksiz, NULL)) != NULL) anum += atoi(vbuf);
2109
len = sprintf(numbuf, "%d", anum);
2110
vlput(db->xfmdb, kbuf, ksiz, numbuf, len, VL_DOVER);
2111
if(i % ESTCCCBFREQ == 0) est_db_inform(db, "importing auxiliary words");
2113
CB_LISTCLOSE(words);
2115
est_db_inform(db, "closing the target database");
2116
if(!est_db_close(tgdb, &ecode)){
2120
if(!est_db_flush(db, -1)) err = TRUE;
2121
return err ? FALSE : TRUE;
1109
2125
/* Add a document to a database. */
1110
2126
int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options){
1111
2127
CBMAP *ocmap, *fmap, *qmap;
1113
2129
CBDATUM *ocbuf;
2130
ESTATTRIDX *attridx;
1114
2131
md5_state_t ms;
1115
2132
const char *uri, *ndig, *text, *word, *fnext, *snext, *kbuf, *vbuf;
1116
2133
unsigned char junc[2], c;
1117
char dobuf[32], dsbuf[64], *wp, *odig, wbuf[ESTWORDMAXLEN+3], *sbuf, *zbuf, nbuf[ESTNUMBUFSIZ];
1118
int i, j, id, err, wnum, wsiz, fnsiz, snsiz, *np, num, ksiz, vsiz, ssiz, zsiz;
2134
char dobuf[32], dsbuf[64], *wp, *odig, wbuf[ESTWORDMAXLEN+3], *sbuf, nbuf[ESTNUMBUFSIZ];
2135
int i, j, id, err, wnum, wsiz, fnsiz, snsiz, *np, score, num, ksiz, vsiz, ssiz;
2136
double tune, weight;
1120
2137
assert(db && doc);
1121
2138
if(!dpwritable(db->metadb)){
1122
2139
db->ecode = ESTEACCES;
1198
2218
num += ESTOCPOINT;
1199
2219
cbmapput(fmap, word, wsiz, (char *)&num, sizeof(int), TRUE);
1200
2220
if(cbmapput(qmap, wbuf, wsiz + 3, "", 0, FALSE))
1201
cbmapputcat(ocmap, word, wsiz, (char *)junc, 2);
2221
cbmapputcat(ocmap, word, wsiz, (char *)junc, fnext ? 2 : 0);
1205
tune = log(wnum + 32);
1206
tune = (tune * tune) / 12.0;
2223
CB_LISTCLOSE(words);
2225
score = (vbuf = cbmapget(doc->attrs, "\t", 1, NULL)) ? atoi(vbuf) : -1;
2227
if(score < 0 && (options & ESTPDWEIGHT) &&
2228
(vbuf = cbmapget(doc->attrs, ESTDATTRWEIGHT, -1, NULL)) != NULL){
2229
weight = strtod(vbuf, NULL);
2230
weight = weight >= 0.01 ? weight : 0.01;
2232
tune = sqrt(wnum + 128) / 16.0 / weight;
1207
2233
cbmapiterinit(ocmap);
1208
2234
while((kbuf = cbmapiternext(ocmap, &ksiz)) != NULL){
1209
vbuf = cbmapget(ocmap, kbuf, ksiz, &vsiz);
1210
ocbuf = cbdatumopen("", 0);
1211
cbdatumcat(ocbuf, (char *)&(doc->id), sizeof(int));
1212
num = *(int *)cbmapget(fmap, kbuf, ksiz, NULL) / tune;
1213
if(num >= 0x80) num += (0x80 - num) * 0.75;
1214
if(num >= 0xc0) num += (0xc0 - num) * 0.75;
1215
c = num < 0xff ? num : 0xff;
1216
cbdatumcat(ocbuf, (char *)&c, 1);
1217
cbdatumcat(ocbuf, vbuf, vsiz);
2235
CB_MAPITERVAL(vbuf, kbuf, vsiz);
2236
if(vsiz > 2) qsort((void *)vbuf, vsiz / 2, 2, est_short_compare);
2237
CB_DATUMOPEN(ocbuf);
2238
EST_SET_VNUMBUF(wsiz, wbuf, doc->id);
2239
CB_DATUMCAT(ocbuf, wbuf, wsiz);
2244
num = score < 0 ? *(int *)cbmapget(fmap, kbuf, ksiz, NULL) / tune : score;
2245
if(num >= 0x80) num += (0x80 - num) * 0.75;
2246
if(num >= 0xc0) num += (0xc0 - num) * 0.75;
2247
c = num < 0xff ? num : 0xff;
2248
CB_DATUMCAT(ocbuf, (char *)&c, 1);
2252
num = score < 0 ? *(int *)cbmapget(fmap, kbuf, ksiz, NULL) * 10 / tune : score;
2253
CB_DATUMCAT(ocbuf, (char *)&num, sizeof(int));
2256
CB_DATUMCAT(ocbuf, vbuf, vsiz);
1219
cbdatumcat(ocbuf, (char *)&c, 1);
2258
CB_DATUMCAT(ocbuf, (char *)&c, 1);
1220
2259
cbmapputcat(db->idxcc, kbuf, ksiz, CB_DATUMPTR(ocbuf), CB_DATUMSIZE(ocbuf));
1221
2260
db->icsiz += CB_DATUMSIZE(ocbuf);
1222
cbdatumclose(ocbuf);
2261
CB_DATUMCLOSE(ocbuf);
1224
2263
cbmapclose(qmap);
1225
2264
cbmapclose(fmap);
1226
2265
cbmapclose(ocmap);
1228
2267
sbuf = cbmapdump(doc->attrs, &ssiz);
1229
if(!crput(db->attrdb, (char *)&(doc->id), sizeof(int), sbuf, ssiz, CR_DKEEP)){
2268
if(!est_crput(db->attrdb, db->zmode, doc->id, sbuf, ssiz, CR_DKEEP)){
1230
2269
db->ecode = ESTEDB;
1231
2270
db->fatal = TRUE;
1235
2274
sbuf = cblistdump(doc->dtexts, &ssiz);
1236
if(!(zbuf = est_deflate(sbuf, ssiz, &zsiz))){
1239
db->ecode = ESTEMISC;
1243
if(!crput(db->textdb, (char *)&(doc->id), sizeof(int), zbuf, zsiz, CR_DKEEP)){
2275
if(!est_crput(db->textdb, db->zmode, doc->id, sbuf, ssiz, CR_DKEEP)){
1244
2276
db->ecode = ESTEDB;
1245
2277
db->fatal = TRUE;
2281
if(doc->kwords && !est_db_put_keywords(db, doc->id, doc->kwords, weight)) err = TRUE;
1250
2282
sprintf(nbuf, "%d", doc->id);
1251
2283
if(!vlput(db->listdb, uri, -1, nbuf, -1, VL_DKEEP)){
1252
2284
db->ecode = ESTEDB;
1253
2285
db->fatal = TRUE;
2288
if(cbmaprnum(db->aidxs) > 0){
2289
cbmapiterinit(db->aidxs);
2290
while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
2291
if(!(vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz))) continue;
2292
attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2293
switch(attridx->type){
2296
if(!est_aidx_attr_put(attridx->db, doc->id, vbuf, vsiz)){
2303
if(!est_aidx_seq_put(attridx->db, doc->id, vbuf, vsiz)){
1257
2313
if(est_db_used_cache_size(db) > db->icmax && !est_db_flush(db, INT_MAX)) err = TRUE;
1258
2314
return err ? FALSE : TRUE;
1698
2996
cond->shadows = NULL;
1700
if(cond->max >= 0 && cond->max < snum) snum = cond->max;
1701
CB_MALLOC(rval, snum * sizeof(int) + 1);
1702
for(i = 0; i < snum; i++){
1703
rval[i] = scores[i].id;
2998
rnum = snum - cond->skip;
2999
if(rnum < 0) rnum = 0;
3000
if(cond->max >= 0 && cond->max < rnum) rnum = cond->max;
3001
CB_MALLOC(rval, rnum * sizeof(int) + 1);
3002
tscores = scores + cond->skip;
3003
for(i = 0; i < rnum; i++){
3004
rval[i] = tscores[i].id;
1705
3006
if(cond->scfb){
1706
CB_REALLOC(cond->scores, snum * sizeof(int) + 1);
1707
for(i = 0; i < snum; i++){
1708
cond->scores[i] = scores[i].score;
3007
CB_REALLOC(cond->scores, rnum * sizeof(int) + 1);
3008
for(i = 0; i < rnum; i++){
3009
cond->scores[i] = tscores[i].score;
1713
3014
if(*nump < 1) db->ecode = ESTENOITEM;
3015
cbmapclose(ordattrs);
3021
/* Search documents of plural databases. */
3022
int *est_db_search_meta(ESTDB **dbs, int dbnum, ESTCOND *cond, int *nump, CBMAP *hints){
3023
ESTMETASCORE *scores, *tscores;
3025
CBMAP *thints, *umap;
3026
const char *kbuf, *otype, *rp;
3027
char *distinct, numbuf[ESTNUMBUFSIZ], *oname, *wp, *vbuf;
3028
int i, j, max, skip, smax, snum, *res, rnum, ksiz, num;
3030
assert(dbs && dbnum >= 0 && cond && nump);
3032
if(cond->distinct) cond->max = -1;
3035
distinct = cond->distinct;
3036
cond->distinct = NULL;
3037
smax = ESTALLOCUNIT;
3038
CB_MALLOC(scores, smax * sizeof(ESTMETASCORE));
3040
for(i = 0; i < dbnum; i++){
3041
if(cond->mask & (1 << i)) continue;
3042
tcond = est_cond_dup(cond);
3043
est_cond_set_options(tcond, ESTCONDSCFB);
3044
thints = cbmapopenex(ESTMINIBNUM);
3045
res = est_db_search(dbs[i], tcond, &rnum, thints);
3046
for(j = 0; j < rnum; j++){
3049
CB_REALLOC(scores, smax * sizeof(ESTMETASCORE));
3051
scores[snum].db = i;
3052
scores[snum].id = res[j];
3053
scores[snum].score = est_cond_score(tcond, j);
3054
scores[snum].value = NULL;
3058
cbmapiterinit(thints);
3059
while((kbuf = cbmapiternext(thints, &ksiz)) != NULL){
3060
num = atoi(cbmapiterval(kbuf, NULL));
3061
if((rp = cbmapget(hints, kbuf, ksiz, NULL)) != NULL) num += atoi(rp);
3062
sprintf(numbuf, "%d", num);
3063
cbmapput(hints, kbuf, ksiz, numbuf, -1, TRUE);
3068
est_cond_delete(tcond);
3073
oname = cbmemdup(cond->order, -1);
3076
if((wp = strchr(oname, ' ')) != NULL){
3086
if(!cbstricmp(oname, ESTORDIDA)){
3087
qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_id_asc);
3088
} else if(!cbstricmp(oname, ESTORDIDD)){
3089
qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_id_desc);
3090
} else if(!cbstricmp(oname, ESTORDSCA)){
3091
qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_score_asc);
3092
} else if(!cbstricmp(oname, ESTORDSCD)){
3093
qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_score_desc);
3095
for(i = 0; i < snum; i++){
3096
scores[i].value = est_db_get_doc_attr(dbs[scores[i].db], scores[i].id, oname);
3097
if(!scores[i].value) scores[i].value = cbmemdup("", 0);
3099
if(!cbstricmp(otype, ESTORDSTRA)){
3100
qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_str_asc);
3101
} else if(!cbstricmp(otype, ESTORDSTRD)){
3102
qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_str_desc);
3103
} else if(!cbstricmp(otype, ESTORDNUMA)){
3104
for(i = 0; i < snum; i++){
3105
tval = cbstrmktime(scores[i].value);
3106
free(scores[i].value);
3107
scores[i].value = (void *)tval;
3109
qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_num_asc);
3110
for(i = 0; i < snum; i++){
3111
scores[i].value = NULL;
3113
} else if(!cbstricmp(otype, ESTORDNUMD)){
3114
for(i = 0; i < snum; i++){
3115
tval = cbstrmktime(scores[i].value);
3116
free(scores[i].value);
3117
scores[i].value = (void *)tval;
3119
qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_num_desc);
3120
for(i = 0; i < snum; i++){
3121
scores[i].value = NULL;
3124
for(i = 0; i < snum; i++){
3125
free(scores[i].value);
3130
qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_score_desc);
3133
umap = cbmapopenex(snum + 1);
3135
for(i = 0; i < snum; i++){
3136
vbuf = est_db_get_doc_attr(dbs[scores[i].db], scores[i].id, distinct);
3137
if(!vbuf) vbuf = cbmemdup("", 0);
3138
if(cbmapput(umap, vbuf, -1, "", 0, FALSE)) scores[rnum++] = scores[i];
3145
if(rnum < 0) rnum = 0;
3146
if(cond->max >= 0 && cond->max < rnum) rnum = cond->max;
3147
CB_MALLOC(res, rnum * sizeof(int) * 2 + 1);
3148
tscores = scores + skip;
3149
for(i = 0; i < rnum; i++){
3150
res[i*2] = tscores[i].db;
3151
res[i*2+1] = tscores[i].id;
3154
CB_REALLOC(cond->scores, rnum * sizeof(int) + 1);
3155
for(i = 0; i < rnum; i++){
3156
cond->scores[i] = tscores[i].score;
3164
cond->distinct = distinct;
1719
3169
/* Check whether a document object matches the phrase of a search condition object definitely. */
1720
3170
int est_db_scan_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond){
1721
3171
struct { char *word; int num; } wsets[ESTSCANWNUM];
5042
/* Repair a broken database directory. */
5043
int est_db_repair(const char *name, int options, int *ecp){
5045
DEPOT *depot, *metadb;
5046
CURIA *curia, *attrdb, *textdb, *kwddb;
5047
VILLA *villa, *listdb;
5049
CBMAP *aidxs, *attrs;
5050
ESTATTRIDX attridx, *attridxp;
5052
const char *elem, *abuf;
5053
char path[ESTPATHBUFSIZ], *kbuf, vbuf[ESTNUMBUFSIZ], *dec, *mbuf;
5054
int i, err, idmax, flags, zmode, dnum, dseq, ksiz, vsiz, type, id, msiz, esiz, asiz;
5055
assert(name && ecp);
5056
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
5057
if(est_inode(path) == -1){
5061
if(!(options & ESTRPSTRICT) && (depot= dpopen(path, DP_OWRITER, -1)) != NULL){
5066
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME);
5067
if(est_inode(path) == -1){
5071
if((list = cbdirlist(path)) != NULL){
5072
for(i = 1; i < CB_LISTNUM(list); i++){
5073
elem = CB_LISTVAL(list, i);
5074
if(elem[0] < '0' || elem[0] > '9') continue;
5075
sprintf(path, "%s%c%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME, ESTPATHCHR, elem);
5076
if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5079
vlrepair(path, VL_CMPLEX);
5084
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTFWMDBNAME);
5085
if(est_inode(path) == -1){
5089
if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5092
vlrepair(path, VL_CMPLEX);
5094
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTAUXDBNAME);
5095
if(est_inode(path) == -1){
5099
if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5102
vlrepair(path, VL_CMPLEX);
5104
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTXFMDBNAME);
5105
if(est_inode(path) == -1){
5109
if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5112
vlrepair(path, VL_CMPLEX);
5114
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
5115
if(est_inode(path) == -1){
5119
if(!(options & ESTRPSTRICT) && (curia = cropen(path, CR_OWRITER, -1, -1)) != NULL){
5124
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
5125
if(est_inode(path) == -1){
5129
if(!(options & ESTRPSTRICT) && (curia = cropen(path, CR_OWRITER, -1, -1)) != NULL){
5134
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTKWDDBNAME);
5135
if(est_inode(path) == -1){
5139
if(!(options & ESTRPSTRICT) && (curia = cropen(path, CR_OWRITER, -1, -1)) != NULL){
5144
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
5145
if(est_inode(path) == -1){
5149
if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5152
vlrepair(path, VL_CMPLEX);
5154
if((list = cbdirlist(name)) != NULL){
5155
for(i = 0; i < CB_LISTNUM(list); i++){
5156
elem = CB_LISTVAL(list, i);
5157
if(cbstrfwmatch(elem, ESTAISEQPREF)){
5158
sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5159
if(!(options & ESTRPSTRICT) && (depot = dpopen(path, DP_OWRITER, -1)) != NULL){
5164
} else if(cbstrfwmatch(elem, ESTAISTRPREF)){
5165
sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5166
if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5169
vlrepair(path, VL_CMPLEX);
5171
} else if(cbstrfwmatch(elem, ESTAINUMPREF)){
5172
sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5173
if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5176
vlrepair(path, VL_CMPLEX);
5182
if((options & ESTRPSHODDY) && (db = est_db_open(name, ESTDBWRITER, ecp)) != NULL){
5183
if(!est_db_close(db, ecp)) return FALSE;
5186
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
5187
metadb = dpopen(path, DP_OWRITER, -1);
5188
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
5189
attrdb = cropen(path, CR_OWRITER, -1, -1);
5190
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
5191
textdb = cropen(path, CR_OWRITER, -1, -1);
5192
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTKWDDBNAME);
5193
kwddb = cropen(path, CR_OWRITER, -1, -1);
5194
sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
5195
listdb = vlopen(path, VL_OWRITER, VL_CMPLEX);
5196
if(!attrdb || !textdb || !kwddb || !listdb){
5197
if(listdb) vlclose(listdb);
5198
if(kwddb) crclose(kwddb);
5199
if(textdb) crclose(textdb);
5200
if(attrdb) crclose(attrdb);
5201
if(metadb) dpclose(metadb);
5205
aidxs = cbmapopenex(ESTMINIBNUM);
5206
if((list = cbdirlist(name)) != NULL){
5207
for(i = 0; i < CB_LISTNUM(list); i++){
5208
elem = CB_LISTVAL(list, i);
5211
if(cbstrfwmatch(elem, ESTAISEQPREF)){
5212
dec = est_hex_decode(elem + strlen(ESTAISEQPREF));
5213
type = ESTIDXATTRSEQ;
5214
} else if(cbstrfwmatch(elem, ESTAISTRPREF)){
5215
dec = est_hex_decode(elem + strlen(ESTAISTRPREF));
5216
type = ESTIDXATTRSTR;
5217
} else if(cbstrfwmatch(elem, ESTAINUMPREF)){
5218
dec = est_hex_decode(elem + strlen(ESTAINUMPREF));
5219
type = ESTIDXATTRNUM;
5222
sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5225
if((aidxdb = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5226
vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
5227
vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
5228
attridx.db = aidxdb;
5229
attridx.type = type;
5230
cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
5234
if((aidxdb = vlopen(path, VL_OWRITER, est_aidx_numcmp)) != NULL){
5235
vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
5236
vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
5237
attridx.db = aidxdb;
5238
attridx.type = type;
5239
cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
5243
if((aidxdb = dpopen(path, DP_OWRITER, crbnum(attrdb) / ESTAIBDIAM)) != NULL){
5244
dpsetfbpsiz(aidxdb, ESTAIDXDPFBP);
5245
attridx.db = aidxdb;
5246
attridx.type = type;
5247
cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
5258
if((vsiz = dpgetwb(metadb, ESTKEYDNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
5262
flags = dpgetflags(metadb);
5264
if(flags & ESTDFZLIB){
5266
} else if(flags & ESTDFLZO){
5268
} else if(flags & ESTDFBZIP){
5274
if(!criterinit(attrdb)) err = TRUE;
5275
while((kbuf = criternext(attrdb, &ksiz)) != NULL){
5276
if(ksiz == sizeof(int) && (id = *(int *)kbuf) > 0 && id <= idmax &&
5277
crvsiz(attrdb, kbuf, ksiz) > 0 && crvsiz(textdb, kbuf, ksiz) > 0){
5279
if(dseq < id) dseq = id;
5280
if(options & ESTRPSTRICT){
5281
if((mbuf = est_crget(attrdb, zmode, id, &msiz)) != NULL){
5282
attrs = cbmapload(mbuf, msiz);
5283
if((elem = cbmapget(attrs, ESTDATTRURI, -1, &esiz)) != NULL){
5284
vsiz = sprintf(vbuf, "%d", id);
5285
vlput(listdb, elem, esiz, vbuf, vsiz, VL_DKEEP);
5287
if(cbmaprnum(aidxs) > 0){
5288
cbmapiterinit(aidxs);
5289
while((abuf = cbmapiternext(aidxs, &asiz)) != NULL){
5290
if(!(elem = cbmapget(attrs, abuf, asiz, &esiz))) continue;
5291
attridxp = (ESTATTRIDX *)cbmapiterval(abuf, NULL);
5292
switch(attridxp->type){
5295
est_aidx_attr_put(attridxp->db, id, elem, esiz);
5298
est_aidx_seq_put(attridxp->db, id, elem, esiz);
5308
CB_LISTPUSH(list, kbuf, ksiz);
5312
if(dpecode != DP_ENOITEM) err = TRUE;
5313
for(i = 0; i < CB_LISTNUM(list); i++){
5314
elem = CB_LISTVAL2(list, i, esiz);
5315
crout(attrdb, elem, esiz);
5316
crout(textdb, elem, esiz);
5317
crout(kwddb, elem, esiz);
5320
sprintf(vbuf, "%d", dseq);
5321
if(!dpput(metadb, ESTKEYDSEQ, -1, vbuf, -1, DP_DOVER)) err = TRUE;
5322
sprintf(vbuf, "%d", dnum);
5323
if(!dpput(metadb, ESTKEYDNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
5324
cbmapiterinit(aidxs);
5325
while((elem = cbmapiternext(aidxs, NULL)) != NULL){
5326
attridxp = (ESTATTRIDX *)cbmapiterval(elem, NULL);
5327
switch(attridxp->type){
5330
if(!vlclose(attridxp->db)) err = TRUE;
5333
if(!dpclose(attridxp->db)) err = TRUE;
5338
if(!vlclose(listdb)) err = TRUE;
5339
if(!crclose(kwddb)) err = TRUE;
5340
if(!crclose(textdb)) err = TRUE;
5341
if(!crclose(attrdb)) err = TRUE;
5342
if(!dpclose(metadb)) err = TRUE;
5347
return err ? FALSE : TRUE;
2904
5351
/* Extract words for snippet from hints of search. */
2905
5352
CBLIST *est_hints_to_words(CBMAP *hints){
2907
5354
const char *kbuf;
2910
words = cblistopen();
2911
5358
cbmapiterinit(hints);
2912
5359
while((kbuf = cbmapiternext(hints, &ksiz)) != NULL){
2913
if(ksiz < 1 || atoi(cbmapget(hints, kbuf, ksiz, NULL)) < 1) continue;
2914
cblistpush(words, kbuf, ksiz);
5360
if(ksiz < 1 || atoi(cbmapget(hints, kbuf, ksiz, NULL)) < 0) continue;
5361
CB_LISTPUSH(words, kbuf, ksiz);
5367
/* Reset the environment of the process. */
5368
void est_proc_env_reset(void){
5372
putenv("LANGUAGE=C");
5373
putenv("LC_CTYPE=C");
5374
putenv("LC_COLLATE=C");
5375
putenv("LC_TIME=C");
5376
putenv("LC_NUMERIC=C");
5377
putenv("LC_MONETARY=C");
5379
putenv("EST_VERSION=" _EST_VERSION);
5380
if((value = getenv("PATH")) != NULL){
5381
if(ESTPATHCHR == '\\'){
5382
pbuf = cbsprintf("PATH=%s;C:\\hyperestraier;D:\\hyperestraier;E:\\hyperestraier", value);
5384
pbuf = cbsprintf("PATH=%s:/bin:/usr/bin:/usr/local/bin", value);
5387
cbglobalgc(pbuf, free);
2920
5392
/* Make a directory. */
2921
5393
int est_mkdir(const char *path){
2922
5394
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
6223
/* Categorize a character for character category analyzer.
6224
`c' specifies the UCS number of a character.
6225
The return value is the category of the character. */
6226
static int est_char_category_chrcat(int c){
6228
if(c <= 0x0020) return ESTSPACECHR;
6230
if((c >= 0x0030 && c <= 0x0039) || (c >= 0x0041 && c <= 0x005a) ||
6231
(c >= 0x0061 && c <= 0x007a)) return ESTWESTALPH;
6233
if((c >= 0x00c0 && c <= 0x00ff && c != 0x00d7 && c != 0x00f7) || (c >= 0x0100 && c <= 0x017f))
6235
/* arabic and syriac */
6236
if(c >= 0x0600 && c <= 0x08ff) return ESTEASTALPH;
6237
/* south and south east asia */
6238
if((c >= 0x0900 && c <= 0x109f) || (c >= 0x1700 && c <= 0x1cff)) return ESTEASTALPH;
6240
if(c >= 0x3040 && c <= 0x309f) return ESTHIRAGANA;
6242
if(c >= 0x30a0 && c <= 0x30ff) return ESTKATAKANA;
6244
if((c >= 0x1100 && c <= 0x11ff) || (c >= 0x3130 && c <= 0x318f) ||
6245
(c >= 0xac00 && c <= 0xd7af)) return ESTHANGUL;
6247
if(c >= 0x4e00 && c <= 0x9faf) return ESTKANJI;
6248
/* other cjk and surrogates */
6249
if((c >= 0x2e80 && c <= 0xdfff) || (c >= 0xf900 && c <= 0xfaff) ||
6250
(c >= 0xff00 && c <= 0xffef)) return ESTEASTALPH;
6251
/* asian presentation forms */
6252
if((c >= 0xfb50 && c <= 0xfdff) || (c >= 0xfe30 && c <= 0xfe4f) ||
6253
(c >= 0xfe70 && c <= 0xfeff)) return ESTEASTALPH;
6259
/* Make a snippet of an arbitrary string.
6260
`words' specifies a list object of words to be highlighted.
6261
`wwidth' specifies whole width of the result.
6262
`hwidth' specifies width of strings picked up from the beginning of the text.
6263
`awidth' specifies width of strings picked up around each highlighted word.
6264
The return value is a snippet string of the string. */
6265
static char *est_make_snippet(const char *str, int len, const CBLIST *words,
6266
int wwidth, int hwidth, int awidth){
6270
const char *word, *cval;
6271
const unsigned char *rword;
6272
unsigned char *rtext, *ctext;
6273
int i, j, k, bi, size, wsiz, rwsiz, mywidth, awsiz, csiz;
6274
assert(str && len >= 0 && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
6276
CB_LISTOPEN(rwords);
6277
for(i = 0; i < CB_LISTNUM(words); i++){
6278
word = CB_LISTVAL2(words, i, wsiz);
6279
if(wsiz < 1 || !strcmp(word, ESTOPUVSET)) continue;
6280
rtext = (unsigned char *)est_uconv_in(word, wsiz, &size);
6281
est_canonicalize_text(rtext, size, TRUE);
6282
CB_LISTPUSHBUF(rwords, (char *)rtext, size);
6284
rtext = (unsigned char *)est_uconv_in(str, len, &size);
6285
ctext = (unsigned char *)cbmemdup((char *)rtext, size);
6286
est_canonicalize_text(ctext, size, FALSE);
6288
if(CB_LISTNUM(rwords) < 1) mywidth *= 3;
6289
if(mywidth > wwidth) mywidth = wwidth;
6290
for(i = 0; i < size && mywidth > 0; i += 2){
6291
mywidth -= est_char_category(rtext[i] * 0x100 + rtext[i+1]) == ESTEASTALPH ? 2 : 1;
6294
if(awsiz > ESTWORDMAXLEN) awsiz = ESTWORDMAXLEN;
6295
est_snippet_add_text(rtext, ctext, i, awsiz, res, rwords);
6298
CB_DATUMCAT(res, "\n", 1);
6300
counts = cbmapopenex(ESTMINIBNUM);
6301
for(i = bi; i < size && wwidth >= 0; i += 2){
6302
for(j = 0; j < CB_LISTNUM(rwords); j++){
6303
rword = (unsigned char *)CB_LISTVAL2(rwords, j, rwsiz);
6304
if(est_str_fwmatch_wide(ctext + i, size - i, rword, rwsiz) > 0 &&
6305
(!(cval = cbmapget(counts, (char *)rword, rwsiz, &csiz)) ||
6306
csiz < (wwidth > awidth * 1.2 ? 2 : 1))){
6307
cbmapputcat(counts, (char *)rword, rwsiz, "*", 1);
6308
if(cbmaprnum(counts) >= CB_LISTNUM(rwords)){
6310
counts = cbmapopenex(ESTMINIBNUM);
6312
mywidth = awidth / 2 + 1;
6313
for(k = i - 2; k >= bi && mywidth >= 0; k -= 2){
6314
mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
6317
mywidth = awidth / 2 + 1;
6318
for(k = i + rwsiz + 2; k < size && mywidth >= 0; k += 2){
6319
mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
6321
if(k > size) k = size;
6322
est_snippet_add_text(rtext + bi, ctext + bi, k - bi, 0, res, rwords);
6323
wwidth -= awidth + rwsiz / 2;
6326
CB_DATUMCAT(res, "\n", 1);
6335
CB_LISTCLOSE(rwords);
6336
return cbdatumtomalloc(res, NULL);
6340
/* Check whether a string is composed of CJK characters only.
6341
`str' specifies a string of UTF-8.
6342
The return value is whether the string is composed of CJK characters only. */
6343
static int est_check_cjk_only(const char *str){
6344
const unsigned char *rp;
6346
rp = (unsigned char *)str;
6348
while(rp < (unsigned char *)str + size){
6351
} else if(*rp < 0xdf){
6353
} else if(*rp < 0xf0){
6354
if(rp >= (unsigned char *)str + size - 2) break;
6356
} else if(*rp < 0xf8){
6357
if(rp >= (unsigned char *)str + size - 3) break;
6359
} else if(*rp < 0xfb){
6360
if(rp >= (unsigned char *)str + size - 4) break;
6362
} else if(*rp < 0xfd){
6363
if(rp >= (unsigned char *)str + size - 5) break;
3489
6373
/* Convert a simplified phrase into complete form.
3490
6374
`sphrase' specifies a simplified phrase.
3491
6375
The return value is the complete form of the phrase. */
3492
static char *est_phrase_from_thumb(const char *sphrase){
6376
static char *est_phrase_from_simple(const char *sphrase){
3493
6377
CBDATUM *datum;
3494
6378
const char *oper, *rp, *pv;
3495
6379
unsigned char *utext;
3497
6381
int size, quote, lw;
3498
6382
assert(sphrase);
3499
datum = cbdatumopen("", 0);
6383
CB_DATUMOPEN(datum);
3500
6384
utext = (unsigned char *)est_uconv_in(sphrase, strlen(sphrase), &size);
3501
6385
est_normalize_text(utext, size, &size);
3502
6386
est_canonicalize_text(utext, size, FALSE);
3548
6432
if(pv > rp + 1 && pv[-1] == '*'){
3549
6433
if(rp[0] == '*'){
3550
cbdatumcat(datum, ESTOPWCRX " ", -1);
6434
CB_DATUMCAT(datum, ESTOPWCRX " ", strlen(ESTOPWCRX) + 1);
3552
cbdatumcat(datum, ESTOPWCBW " ", -1);
6436
CB_DATUMCAT(datum, ESTOPWCBW " ", strlen(ESTOPWCBW) + 1);
3554
6438
} else if(pv > rp + 1 && rp[0] == '*'){
3555
6439
if(pv[-1] == '*'){
3556
cbdatumcat(datum, ESTOPWCRX " ", -1);
6440
CB_DATUMCAT(datum, ESTOPWCRX " ", strlen(ESTOPWCRX) + 1);
3558
cbdatumcat(datum, ESTOPWCEW " ", -1);
6442
CB_DATUMCAT(datum, ESTOPWCEW " ", strlen(ESTOPWCEW) + 1);
3562
if(*rp != '*' || (lw && rp[1] != '\0' && rp[1] != ' ')) cbdatumcat(datum, rp, 1);
6446
if(*rp != '*' || (lw && rp[1] != '\0' && rp[1] != ' ')) CB_DATUMCAT(datum, rp, 1);
6452
return cbdatumtomalloc(datum, NULL);
6456
/* Convert a rough phrase into complete form.
6457
`rphrase' specifies a rough phrase.
6458
The return value is the complete form of the phrase. */
6459
static char *est_phrase_from_rough(const char *rphrase){
6461
const char *oper, *rp;
6462
unsigned char *utext;
6464
int size, quote, lw;
6466
CB_DATUMOPEN(datum);
6467
utext = (unsigned char *)est_uconv_in(rphrase, strlen(rphrase), &size);
6468
est_normalize_text(utext, size, &size);
6469
est_canonicalize_text(utext, size, FALSE);
6470
rtext = est_uconv_out((char *)utext, size, NULL);
6475
for(rp = rtext; *rp != '\0'; rp++){
6478
CB_DATUMCAT(datum, oper, strlen(oper));
6485
CB_DATUMCAT(datum, rp, 1);
6490
if(!oper) oper = " AND ";
6503
CB_DATUMCAT(datum, rp, 1);
6510
CB_DATUMCAT(datum, oper, strlen(oper));
6513
CB_DATUMCAT(datum, rp, 1);
6519
return cbdatumtomalloc(datum, NULL);
6523
/* Convert a union phrase into complete form.
6524
`uphrase' specifies a union phrase.
6525
The return value is the complete form of the phrase. */
6526
static char *est_phrase_from_union(const char *uphrase){
6530
unsigned char *utext;
6534
CB_DATUMOPEN(datum);
6535
utext = (unsigned char *)est_uconv_in(uphrase, strlen(uphrase), &size);
6536
est_normalize_text(utext, size, &size);
6537
est_canonicalize_text(utext, size, FALSE);
6538
rtext = est_uconv_out((char *)utext, size, NULL);
6540
terms = cbsplit(rtext, -1, " ");
6541
for(i = 0; i < CB_LISTNUM(terms); i++){
6542
term = CB_LISTVAL2(terms, i, size);
6543
if(size < 1) continue;
6544
if(CB_DATUMSIZE(datum) > 0) CB_DATUMCAT(datum, " OR ", 4);
6545
CB_DATUMCAT(datum, term, size);
6547
CB_LISTCLOSE(terms);
6550
return cbdatumtomalloc(datum, NULL);
6554
/* Convert an intersection phrase into complete form.
6555
`iphrase' specifies an intersection phrase.
6556
The return value is the complete form of the phrase. */
6557
static char *est_phrase_from_isect(const char *iphrase){
6561
unsigned char *utext;
6565
CB_DATUMOPEN(datum);
6566
utext = (unsigned char *)est_uconv_in(iphrase, strlen(iphrase), &size);
6567
est_normalize_text(utext, size, &size);
6568
est_canonicalize_text(utext, size, FALSE);
6569
rtext = est_uconv_out((char *)utext, size, NULL);
6571
terms = cbsplit(rtext, -1, " ");
6572
for(i = 0; i < CB_LISTNUM(terms); i++){
6573
term = CB_LISTVAL2(terms, i, size);
6574
if(size < 1) continue;
6575
if(CB_DATUMSIZE(datum) > 0) CB_DATUMCAT(datum, " AND ", 5);
6576
CB_DATUMCAT(datum, term, size);
6578
CB_LISTCLOSE(terms);
3568
6581
return cbdatumtomalloc(datum, NULL);
7127
/* Store a record related to the ID number of a document.
7128
`curia' specifies a database object.
7129
`zmode' specifies a compression mode.
7130
`id' specifies the ID number of a document.
7131
`vbuf' specifies the pointer to the value of a record.
7132
`vsiz' specifies the size of the value.
7133
The return value is true if success, else it is false. */
7134
static int est_crput(CURIA *curia, int zmode, int id, const char *vbuf, int vsiz, int dmode){
7137
assert(curia && id > 0 && vbuf && vsiz >= 0);
7140
if(!(zbuf = est_deflate(vbuf, vsiz, &zsiz, -1))){
7144
if(!crput(curia, (char *)&id, sizeof(int), zbuf, zsiz, dmode)){
7151
if(!(zbuf = est_lzoencode(vbuf, vsiz, &zsiz))){
7155
if(!crput(curia, (char *)&id, sizeof(int), zbuf, zsiz, dmode)){
7162
if(!(zbuf = est_bzencode(vbuf, vsiz, &zsiz))){
7166
if(!crput(curia, (char *)&id, sizeof(int), zbuf, zsiz, dmode)){
7173
if(!crput(curia, (char *)&id, sizeof(int), vbuf, vsiz, dmode)) return FALSE;
7180
/* Remove a record related to the ID number of a document.
7181
`curia' specifies a database object.
7182
`id' specifies the ID number of a document.
7183
The return value is true if success, else it is false. */
7184
static int est_crout(CURIA *curia, int id){
7185
assert(curia && id > 0);
7186
return crout(curia, (char *)&id, sizeof(int));
7190
/* Get a record related to the ID number of a document.
7191
`curia' specifies a database object.
7192
`zmode' specifies a compression mode.
7193
`id' specifies the ID number of a document.
7194
`sp' specifies the pointer to a variable to which the size of the region of the return value
7196
The return value is the pointer to the region of the value of the corresponding record. */
7197
static char *est_crget(CURIA *curia, int zmode, int id, int *sp){
7200
assert(curia && id > 0 && sp);
7203
if(!(zbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, &zsiz))) return NULL;
7204
if(!(vbuf = est_inflate(zbuf, zsiz, sp, -1))){
7211
if(!(zbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, &zsiz))) return NULL;
7212
if(!(vbuf = est_lzodecode(zbuf, zsiz, sp))){
7219
if(!(zbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, &zsiz))) return NULL;
7220
if(!(vbuf = est_bzdecode(zbuf, zsiz, sp))){
7227
if(!(vbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, sp))) return NULL;
7234
/* Add an attribute of a document to a sequential attribute index.
7235
`db' specifies a handle of a sequential attribute index.
7236
`id' specifies the ID number of a document.
7237
`vbuf' specifies the pointer to the attribute value.
7238
`vsiz' specifies the size of the attribute value.
7239
The return value is true if success, else it is false. */
7240
static int est_aidx_seq_put(DEPOT *db, int id, const char *vbuf, int vsiz){
7242
assert(db && id >= 0 && vbuf && vsiz >= 0);
7244
if(!dpput(db, (char *)&id, sizeof(int), vbuf, vsiz, DP_DKEEP)) err = TRUE;
7245
return err ? FALSE : TRUE;
7249
/* Remove an attribute of a document from a sequential attribute index.
7250
`db' specifies a handle of a sequential attribute index.
7251
`id' specifies the ID number of a document.
7252
The return value is true if success, else it is false. */
7253
static int est_aidx_seq_out(DEPOT *db, int id){
7255
assert(db && id >= 0);
7257
if(!dpout(db, (char *)&id, sizeof(int))) err = TRUE;
7258
return err ? FALSE : TRUE;
7262
/* Retrieve the value of an attribute of a document in a sequential attribute index.
7263
`db' specifies a handle of a sequential attribute index.
7264
`id' specifies the ID number of a document.
7265
The return value is the value of the attribute or `NULL' if no attribute. */
7266
static char *est_aidx_seq_get(DEPOT *db, int id, int *sp){
7267
assert(db && id >= 0 && sp);
7268
return dpget(db, (char *)&id, sizeof(int), 0, -1, sp);
7272
/* Narrow scores of search candidates with a sequential attribute index.
7273
`db' specifies a handle of a sequential attribute index.
7274
`pdocs' specifies a list of pseudo documents.
7275
`cop' specifies the pointer to the operator.
7276
`sign' specifies the sign of operation.
7277
`oval' specifies the operation value.
7278
`osiz' specifies the size of the operation value.
7279
`sval' specifies the operation value of small cases.
7280
`ssiz' specifies the size of the operation value of small cases.
7281
`regex' specifies the regular expressions.
7282
`onum' specifies the numeric value.
7283
`scores' specifies an array of scores of search candidates.
7284
`snum' specifies the number of the array.
7285
`limit' specifies the limit number to check.
7286
`restp' specifies the pointer to a variable to which rest number to be checked is assigned.
7287
The return value is the new number of the array. */
7288
static int est_aidx_seq_narrow(DEPOT *db, const CBLIST *pdocs, const char *cop, int sign,
7289
const char *oval, int osiz, const char *sval, int ssiz,
7290
const void *regex, int onum, ESTSCORE *scores, int snum,
7291
int limit, int *restp){
7292
char vbuf[ESTAIKBUFSIZ];
7294
assert(db && cop && oval && osiz >= 0 && scores && snum >= 0 && limit >= 0 && restp);
7296
for(i = 0; i < snum; i++){
7301
if(scores[i].id >= ESTPDOCIDMIN){
7302
scores[nnum].id = scores[i].id;
7303
scores[nnum].score = scores[i].score;
7307
if((vsiz = dpgetwb(db, (char *)&(scores[i].id), sizeof(int), 0, ESTAIKBUFSIZ - 1, vbuf)) < 0)
7310
if(est_match_attr(vbuf, vsiz, cop, sign, oval, osiz, sval, ssiz, regex, onum)){
7311
scores[nnum].id = scores[i].id;
7312
scores[nnum].score = scores[i].score;
7320
/* Compare two records in numeric order.
7321
`aptr' specifies the pointer to the region of one key.
7322
`asiz' specifies the size of the region of one key.
7323
`bptr' specifies the pointer to the region of the other key.
7324
`bsiz' specifies the size of the region of the other key.
7325
The return value is positive if the former is big, negative if the latter is big, 0 if both
7327
static int est_aidx_numcmp(const char *aptr, int asiz, const char *bptr, int bsiz){
7329
if((rv = cbstrmktime(aptr) - cbstrmktime(bptr)) != 0) return rv;
7330
return VL_CMPLEX(aptr, asiz, bptr, bsiz);
7334
/* Add an attribute of a document to an attribute narrowing index.
7335
`db' specifies a handle of an attribute narrowing index.
7336
`id' specifies the ID number of a document.
7337
`vbuf' specifies the pointer to the attribute value.
7338
`vsiz' specifies the size of the attribute value.
7339
The return value is true if success, else it is false. */
7340
static int est_aidx_attr_put(VILLA *db, int id, const char *vbuf, int vsiz){
7343
assert(db && id >= 0 && vbuf && vsiz >= 0);
7345
tsiz = vsiz + sizeof(int) + 1;
7346
CB_MALLOC(tbuf, tsiz);
7347
memcpy(tbuf, vbuf, vsiz + 1);
7348
memcpy(tbuf + vsiz + 1, &id, sizeof(int));
7349
if(!vlput(db, tbuf, tsiz, "", 0, VL_DKEEP)) err = TRUE;
7351
return err ? FALSE : TRUE;
7355
/* Remove an attribute of a document from an attribute narrowing index.
7356
`db' specifies a handle of an attribute narrowing index.
7357
`id' specifies the ID number of a document.
7358
`vbuf' specifies the pointer to the attribute value.
7359
`vsiz' specifies the size of the attribute value.
7360
The return value is true if success, else it is false. */
7361
static int est_aidx_attr_out(VILLA *db, int id, const char *vbuf, int vsiz){
7364
assert(db && id >= 0 && vbuf && vsiz >= 0);
7366
tsiz = vsiz + sizeof(int) + 1;
7367
CB_MALLOC(tbuf, tsiz);
7368
memcpy(tbuf, vbuf, vsiz + 1);
7369
memcpy(tbuf + vsiz + 1, &id, sizeof(int));
7370
if(!vlout(db, tbuf, tsiz)) err = TRUE;
7372
return err ? FALSE : TRUE;
7376
/* Narrow scores of search candidates with an attribute narrowing index.
7377
`db' specifies a handle of an attribute narrowing index.
7378
`pdocs' specifies a list of pseudo documents.
7379
`cop' specifies the pointer to the operator.
7380
`sign' specifies the sign of operation.
7381
`oval' specifies the operation value.
7382
`osiz' specifies the size of the operation value.
7383
`sval' specifies the operation value of small cases.
7384
`ssiz' specifies the size of the operation value of small cases.
7385
`regex' specifies the regular expressions.
7386
`onum' specifies the numeric value.
7387
`scores' specifies an array of scores of search candidates.
7388
`snum' specifies the number of the array.
7389
The return value is the new number of the array. */
7390
static int est_aidx_attr_narrow(VILLA *db, const CBLIST *pdocs, const char *cop, int sign,
7391
const char *oval, int osiz, const char *sval, int ssiz,
7392
const void *regex, int onum, ESTSCORE *scores, int snum){
7396
char numbuf[ESTNUMBUFSIZ], *tmp, *wp;
7397
int i, j, ksiz, len, esc, jmp, id, nnum, *ary, anum;
7398
time_t lower, upper;
7399
assert(db && pdocs && cop && oval && osiz >= 0 && scores && snum >= 0);
7401
if(cop == ESTOPSTROREQ && sign && !sval){
7402
tokens = cbsplit(oval, osiz, " ,");
7404
for(i = 0; i < CB_LISTNUM(tokens); i++){
7405
oval = CB_LISTVAL2(tokens, i, osiz);
7406
if(osiz < 1) continue;
7407
vlcurjump(db, oval, osiz, VL_JFORWARD);
7408
while((kbuf = vlcurkeycache(db, &ksiz)) != NULL && !strcmp(kbuf, oval)){
7409
CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
7413
CB_LISTCLOSE(tokens);
7414
} else if(cop == ESTOPNUMBT && sign && !sval){
7415
CB_MEMDUP(tmp, oval, osiz);
7416
if((wp = strchr(tmp, ' ')) != NULL || (wp = strchr(tmp, '\t')) != NULL){
7418
while(*wp == ' ' || *wp == '\t'){
7421
lower = cbstrmktime(tmp);
7422
upper = cbstrmktime(wp);
7424
lower = cbstrmktime(tmp);
7427
len = sprintf(numbuf, "%.0f", (double)lower);
7428
vlcurjump(db, numbuf, len, VL_JFORWARD);
7429
while((kbuf = vlcurkeycache(db, &ksiz)) != NULL && cbstrmktime(kbuf) <= upper){
7430
CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
7434
} else if(!sign || sval){
7437
if(sign && (cop == ESTOPSTREQ || cop == ESTOPSTRBW) && osiz > 0){
7438
if(*sval > 0x0 && *sval < 0x7f){
7441
esc = *(unsigned char *)sval;
7442
if(*sval >= 'a' && *sval <= 'z'){
7443
numbuf[0] -= 'a' - 'A';
7444
jmp = *sval - 'a' + 'A';
7446
vlcurjump(db, numbuf, 1, VL_JFORWARD);
7447
} else if(*(unsigned char *)sval >= 0xc0){
7450
esc = *(unsigned char *)sval;
7451
vlcurjump(db, numbuf, 1, VL_JFORWARD);
7458
while((kbuf = vlcurkeycache(db, &ksiz)) != NULL){
7459
if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
7460
cop, sign, oval, osiz, sval, ssiz, regex, onum))
7461
CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
7462
if(*(unsigned char *)kbuf > jmp && *(unsigned char *)kbuf < *(unsigned char *)sval){
7465
vlcurjump(db, numbuf, 1, VL_JFORWARD);
7467
} else if(*(unsigned char *)kbuf > esc){
7474
if(cop == ESTOPSTREQ || cop == ESTOPSTRBW ||
7475
cop == ESTOPNUMEQ || cop == ESTOPNUMGT || cop == ESTOPNUMGE){
7476
vlcurjump(db, oval, osiz, VL_JFORWARD);
7477
if(cop == ESTOPNUMGT){
7478
while((kbuf = vlcurkeycache(db, NULL)) != NULL && cbstrmktime(kbuf) <= onum){
7482
} else if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
7483
len = sprintf(numbuf, "%.0f", (double)cbstrmktime(oval) + 1);
7484
vlcurjump(db, numbuf, len, VL_JBACKWARD);
7485
if(cop == ESTOPNUMLT){
7486
while((kbuf = vlcurkeycache(db, NULL)) != NULL && cbstrmktime(kbuf) >= onum){
7493
while((kbuf = vlcurkeycache(db, &ksiz)) != NULL){
7494
if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
7495
cop, TRUE, oval, osiz, sval, ssiz, regex, onum)){
7496
CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
7497
} else if(cop == ESTOPSTREQ || cop == ESTOPSTRBW || cop == ESTOPNUMEQ){
7500
if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
7507
for(i = 0; i < CB_LISTNUM(pdocs); i++){
7508
id = ESTPDOCIDMIN + i;
7509
CB_DATUMCAT(abuf, &id, sizeof(int));
7512
ary = (int *)CB_DATUMPTR(abuf);
7513
anum = CB_DATUMSIZE(abuf) / sizeof(int);
7514
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
7515
qsort(ary, anum, sizeof(int), est_int_compare);
7516
for(i = 0, j = 0; i < snum; i++){
7517
while(j < anum && ary[j] < scores[i].id){
7520
if(j < anum && scores[i].id == ary[j]){
7521
scores[nnum].id = scores[i].id;
7522
scores[nnum].score = scores[i].score;
7526
CB_DATUMCLOSE(abuf);
7531
/* Compare two integers.
7532
`ap' specifies the pointer to one element.
7533
`bp' specifies the pointer to the other element.
7534
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
7535
static int est_int_compare(const void *ap, const void *bp){
7537
return *(int *)ap - *(int *)bp;
7541
/* Compare elements of a record for effective compression.
7542
`ap' specifies the pointer to one element.
7543
`bp' specifies the pointer to the other element.
7544
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
7545
static int est_short_compare(const void *ap, const void *bp){
7547
return ((((unsigned char *)ap)[0] << 8) + ((unsigned char *)ap)[1]) -
7548
((((unsigned char *)bp)[0] << 8) + ((unsigned char *)bp)[1]);
7552
/* Clean up the inode map.
7553
`arg' specifies a dummy argument. */
7554
static void est_inodes_delete(void *arg){
7560
if(cbmaprnum(est_inodes) > 0){
7561
cbmapiterinit(est_inodes);
7562
while((kbuf = cbmapiternext(est_inodes, NULL)) != NULL){
7563
db = *(ESTDB **)cbmapiterval(kbuf, NULL);
7564
est_db_set_informer(db, est_inodes_delete_informer, NULL);
7565
est_db_close(db, &ecode);
7568
cbmapclose(est_inodes);
7574
if(cbmaprnum(est_inodes) > 0){
7575
cbmapiterinit(est_inodes);
7576
while((kbuf = cbmapiternext(est_inodes, NULL)) != NULL){
7577
db = *(ESTDB **)cbmapiterval(kbuf, NULL);
7578
fprintf(stderr, "\nWARNING: %s is not closed.\n\n", cbmemdup(est_db_name(db), -1));
7579
est_db_set_informer(db, est_inodes_delete_informer, NULL);
7580
est_db_close(db, &ecode);
7583
cbmapclose(est_inodes);
7588
/* Inform a database event while cleaning up database handles.
7589
`msg' specifies the message of each event.
7590
`opaque' is ignored. */
7591
static void est_inodes_delete_informer(const char *msg, void *opaque){
7592
#if !defined(NDEBUG)
7593
fprintf(stderr, "estraier: %s\n", msg);
3924
7598
/* Write meta data to the database.
3925
7599
`db' specifies a database object.
3926
7600
The return value is true if success, else it is false. */
7656
/* Score a document object matching the phrase of a search condition object definitely.
7657
`db' specifies a database object.
7658
`doc' specifies a document object.
7659
`cond' specifies a search condition object.
7660
`scp' specifies the pointer to a variable to which the score is assigned.
7661
The return value is true if the document matches the phrase of the condition object
7662
definitely, else it is false. */
7663
static int est_db_score_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond, int *scp){
7664
struct { char *word; int num; } wsets[ESTSCANWNUM], nsets[ESTSCANWNUM];
7665
CBLIST *terms, *words;
7666
const char *term, *text, *rp;
7667
unsigned char *rbuf;
7669
int i, j, k, sc, wsnum, nsnum, asiz, tsiz, add, rsiz, hit;
7671
assert(db && doc && cond && scp);
7673
if(!cond->phrase || cbstrfwmatch(cond->phrase, ESTOPSIMILAR) ||
7674
cbstrfwmatch(cond->phrase, ESTOPID) || cbstrfwmatch(cond->phrase, ESTOPURI)) return FALSE;
7675
if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
7676
switch(cond->pmode){
7678
terms = est_phrase_terms(cond->phrase);
7681
tmp = est_phrase_from_simple(cond->phrase);
7682
terms = est_phrase_terms(tmp);
7686
tmp = est_phrase_from_rough(cond->phrase);
7687
terms = est_phrase_terms(tmp);
7691
tmp = est_phrase_from_union(cond->phrase);
7692
terms = est_phrase_terms(tmp);
7696
tmp = est_phrase_from_isect(cond->phrase);
7697
terms = est_phrase_terms(tmp);
7704
for(i = 0; i < CB_LISTNUM(terms); i++){
7705
term = CB_LISTVAL(terms, i);
7706
if(!strcmp(term, ESTOPISECT)){
7708
} else if(!strcmp(term, ESTOPDIFF)){
7710
} else if(strcmp(term, ESTOPUVSET)){
7715
} else if(term[0] == 'e'){
7719
words = cbsplit(term, -1, "\t");
7721
while(wsnum < ESTSCANWNUM && CB_LISTNUM(words) > 0){
7722
wsets[wsnum].word = cblistshift(words, NULL);
7723
wsets[wsnum].num = i;
7727
while(nsnum < ESTSCANWNUM && CB_LISTNUM(words) > 0){
7728
nsets[nsnum].word = cblistshift(words, NULL);
7729
nsets[nsnum].num = i;
7733
CB_LISTCLOSE(words);
7738
if((rp = cbmapget(doc->attrs, "\t", 1, NULL)) != NULL) sc = -1 - atoi(rp);
7739
for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
7741
if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
7742
asiz += strlen(text);
7744
text = CB_LISTVAL2(doc->dtexts, i, tsiz);
7747
rbuf = (unsigned char *)est_uconv_in(text, strlen(text), &rsiz);
7748
est_canonicalize_text(rbuf, rsiz, FALSE);
7749
tmp = est_uconv_out((char *)rbuf, rsiz, &rsiz);
7750
for(j = 0; j < wsnum; j++){
7751
if(!wsets[j].word) continue;
7752
if((rp = est_strstr_sparse(tmp, wsets[j].word)) != NULL){
7756
rp += strlen(wsets[j].word);
7757
} while((rp = est_strstr_sparse(rp, wsets[j].word)) != NULL);
7759
for(k = 0; k < wsnum; k++){
7760
if(!wsets[k].word) continue;
7761
if(wsets[k].num == wsets[j].num){
7762
free(wsets[k].word);
7763
wsets[k].word = NULL;
7768
for(j = 0; j < nsnum; j++){
7769
if(!nsets[j].word) continue;
7770
if((rp = est_strstr_sparse(tmp, nsets[j].word)) != NULL){
7771
for(k = 0; k < nsnum; k++){
7772
if(!nsets[k].word) continue;
7773
if(nsets[k].num == nsets[j].num){
7774
free(nsets[k].word);
7775
nsets[k].word = NULL;
7784
for(i = 0; i < wsnum; i++){
7785
if(!wsets[i].word) continue;
7786
free(wsets[i].word);
7789
for(i = 0; i < nsnum; i++){
7794
free(nsets[i].word);
7796
CB_LISTCLOSE(terms);
7797
if(sc < 0) sc = -1 - sc;
7798
tune = sqrt(asiz / 8.0 + 128) / 16.0;
7805
if(sc >= 0x80) sc += (0x80 - sc) * 0.75;
7806
if(sc >= 0xc0) sc += (0xc0 - sc) * 0.75;
7807
sc = sc < 0xff ? sc : 0xff;
7820
/* Get the ID of a document specified by URI from pseudo indexes.
7821
`db' specifies a database object.
7822
`uri' specifies the URI of a registered document.
7823
The return value is the ID of the document. On error, -1 is returned. */
7824
static int est_pidx_uri_to_id(ESTDB *db, const char *uri){
7830
db->puris = cbmapopenex(CB_LISTNUM(db->pdocs) + 1);
7831
for(i = 0; i < CB_LISTNUM(db->pdocs); i++){
7832
if((doc = est_db_get_doc(db, ESTPDOCIDMIN + i, 0)) != NULL){
7833
if((vbuf = cbmapget(doc->attrs, ESTDATTRURI, -1, &vsiz)) != NULL)
7834
cbmapput(db->puris, vbuf, vsiz, (char *)&(doc->id), sizeof(int), FALSE);
7835
est_doc_delete(doc);
7839
if((vbuf = cbmapget(db->puris, uri, -1, NULL)) != NULL) return *(int *)vbuf;
3983
7844
/* Create a list of terms for search.
3984
7845
`phrase' specifies a search phrase.
3985
7846
The return value is a list object of the terms of the phrase. */
7991
/* Compare two meta scores by each ID for ascending order.
7992
`ap' specifies the pointer to one meta score
7993
`bp' specifies the pointer to the other meta score
7994
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
7995
static int est_metascore_compare_by_id_asc(const void *ap, const void *bp){
7997
return ((ESTMETASCORE *)ap)->id - ((ESTMETASCORE *)bp)->id;
8001
/* Compare two meta scores by each ID for descending order.
8002
`ap' specifies the pointer to one meta score
8003
`bp' specifies the pointer to the other meta score
8004
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
8005
static int est_metascore_compare_by_id_desc(const void *ap, const void *bp){
8007
return ((ESTMETASCORE *)bp)->id - ((ESTMETASCORE *)ap)->id;
8011
/* Compare two meta scores by each score point for ascending order.
8012
`ap' specifies the pointer to one meta score
8013
`bp' specifies the pointer to the other meta score
8014
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
8015
static int est_metascore_compare_by_score_asc(const void *ap, const void *bp){
8017
return ((ESTMETASCORE *)ap)->score - ((ESTMETASCORE *)bp)->score;
8021
/* Compare two meta scores by each score point for descending order.
8022
`ap' specifies the pointer to one meta score
8023
`bp' specifies the pointer to the other meta score
8024
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
8025
static int est_metascore_compare_by_score_desc(const void *ap, const void *bp){
8027
return ((ESTMETASCORE *)bp)->score - ((ESTMETASCORE *)ap)->score;
8031
/* Compare two meta scores by attributes of strings for ascending order.
8032
`ap' specifies the pointer to one meta score
8033
`bp' specifies the pointer to the other meta score
8034
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
8035
static int est_metascore_compare_by_str_asc(const void *ap, const void *bp){
8037
return strcmp(((ESTMETASCORE *)ap)->value, ((ESTMETASCORE *)bp)->value);
8041
/* Compare two meta scores by attributes of strings for descending order.
8042
`ap' specifies the pointer to one meta score
8043
`bp' specifies the pointer to the other meta score
8044
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
8045
static int est_metascore_compare_by_str_desc(const void *ap, const void *bp){
8047
return strcmp(((ESTMETASCORE *)bp)->value, ((ESTMETASCORE *)ap)->value);
8051
/* Compare two meta scores by attributes of numbers for ascending order.
8052
`ap' specifies the pointer to one meta score
8053
`bp' specifies the pointer to the other meta score
8054
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
8055
static int est_metascore_compare_by_num_asc(const void *ap, const void *bp){
8057
return (time_t)((ESTMETASCORE *)ap)->value - (time_t)((ESTMETASCORE *)bp)->value;
8061
/* Compare two meta scores by attributes of numbers for descending order.
8062
`ap' specifies the pointer to one meta score
8063
`bp' specifies the pointer to the other meta score
8064
The return value is negative if one is small, positive if one is big, 0 if both are equal. */
8065
static int est_metascore_compare_by_num_desc(const void *ap, const void *bp){
8067
return (time_t)((ESTMETASCORE *)bp)->value - (time_t)((ESTMETASCORE *)ap)->value;
4110
8071
/* Get the universal set of documents in a database.
4111
8072
`db' specifies a database object.
4112
8073
`nump' specifies the pointer to which the number of elements in the result is assigned.
4113
8074
`hints' specifies a map object. If it is `NULL', it is not used.
4114
8075
`add' specifies whether the result to be treated in union or difference.
4115
The return value is an array whose elements are ID numbers of corresponding documents. */
8076
The return value is an array of score structures of corresponding documents. */
4116
8077
static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add){
4117
8078
ESTSCORE *scores;
4118
8079
char *vbuf, numbuf[ESTNUMBUFSIZ];
8170
/* Expand a keyword to keywords which begin with it.
8171
`db' specifies a database object.
8172
`word' specifies a word.
8173
`list' specifies a list object to contain the results. */
8174
static void est_expand_keyword_bw(ESTDB *db, const char *word, CBLIST *list){
8177
assert(db && word && list);
8179
vlcurjump(db->xfmdb, word, -1, VL_JFORWARD);
8180
while((kbuf = vlcurkeycache(db->xfmdb, &ksiz)) != NULL){
8181
if(!cbstrfwmatch(kbuf, word)) break;
8182
CB_LISTPUSH(list, kbuf, ksiz);
8183
if(++num >= db->wildmax) break;
8184
vlcurnext(db->xfmdb);
8189
/* Expand a keyword to keywords which end with it.
8190
`db' specifies a database object.
8191
`word' specifies a word.
8192
`list' specifies a list object to contain the results. */
8193
static void est_expand_keyword_ew(ESTDB *db, const char *word, CBLIST *list){
8195
int num, wsiz, ksiz;
8196
assert(db && word && list);
8198
wsiz = strlen(word);
8199
vlcurfirst(db->xfmdb);
8200
while((kbuf = vlcurkeycache(db->xfmdb, &ksiz)) != NULL){
8201
if(ksiz >= wsiz && !memcmp(kbuf + ksiz - wsiz, word, wsiz)){
8202
CB_LISTPUSH(list, kbuf, ksiz);
8203
if(++num >= db->wildmax) break;
8205
vlcurnext(db->xfmdb);
8210
/* Expand regular expressions to keywords which match them.
8211
`db' specifies a database object.
8212
`word' specifies regular expressions.
8213
`list' specifies a list object to contain the results. */
8214
static void est_expand_keyword_rx(ESTDB *db, const char *word, CBLIST *list){
8218
assert(db && word && list);
8219
if(!(regex = est_regex_new(word))) return;
8221
vlcurfirst(db->xfmdb);
8222
while((kbuf = vlcurkeycache(db->xfmdb, &ksiz)) != NULL){
8223
if(est_regex_match(regex, kbuf)){
8224
CB_LISTPUSH(list, kbuf, ksiz);
8225
if(++num >= db->wildmax) break;
8227
vlcurnext(db->xfmdb);
8229
est_regex_delete(regex);
4209
8233
/* Get a corresponding set of documents in a database.
4210
8234
`db' specifies a database object.
4211
8235
`term' specifies a union term.
4212
8236
`gstep' specifies the number of steps of N-gram.
8237
`xpn' specifies the pointer to a function for query expansion. If it is `NULL', it is not
4213
8239
`nump' specifies the pointer to which the number of elements in the result is assigned.
4214
8240
`hints' specifies a map object. If it is `NULL', it is not used.
4215
8241
`add' specifies whether the result to be treated in union or difference.
4216
The return value is an array whose elements are ID numbers of corresponding documents. */
8242
`auxmin' specifies the minimum hits to adopt the auxiliary index. If it is not more than 0,
8243
the auxiliary index is not used.
8244
`auxwords' specifies a map object where keywords used with the auxiliary index are stored. If
8245
it is `NULL', it is not used.
8246
The return value is an array of score structures of corresponding documents. */
4217
8247
static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
4218
int *nump, CBMAP *hints, int add){
8248
void (*xpn)(const char *, CBLIST *),
8249
int *nump, CBMAP *hints, int add, int auxmin, CBMAP *auxwords){
4219
8250
const ESTSCORE *cscores;
4220
8251
ESTSCORE *scores, *tscores;
4221
8253
CBLIST *words, *grams;
4222
8254
const char *ckey, *word, *gram, *rp, *fnext, *snext, *cbuf;
4223
char *vbuf, numbuf[ESTNUMBUFSIZ];
8255
char *vbuf, *wbuf, numbuf[ESTNUMBUFSIZ];
4224
8256
int i, j, k, snum, smax, cksiz, single, tsmax, tsnum, vsiz, gcnum, gsiz, csiz, wgstep, nnum;
4225
int west, wild, mfsiz, mssiz, mfhash, mshash, tfhash, tshash, id, score, hit, hnum;
8257
int west, wild, mfsiz, mssiz, mfhash, mshash, tfhash, tshash, id, vstep, score, hit, hnum;
8258
double avg, sd, dif;
4226
8259
assert(db && term && gstep > 0 && nump);
4227
8260
smax = ESTALLOCUNIT;
4228
8261
CB_MALLOC(scores, smax * sizeof(ESTSCORE));
4230
8263
words = cbsplit(term, -1, "\t");
8265
umap = cbmapopenex(ESTMINIBNUM);
8266
for(i = 0; i < CB_LISTNUM(words); i++){
8267
word = CB_LISTVAL(words, i);
8268
if(word[0] == '\0' || word[0] == ' ') continue;
8271
for(j = 0; j < CB_LISTNUM(grams); j++){
8272
word = CB_LISTVAL(grams, j);
8273
cbmapput(umap, word, -1, "", 0, FALSE);
8275
CB_LISTCLOSE(grams);
8277
CB_LISTCLOSE(words);
8278
words = cbmapkeys(umap);
4231
8281
for(i = 0; i < CB_LISTNUM(words); i++){
4232
ckey = CB_LISTVAL2(words, i, &cksiz);
8282
ckey = CB_LISTVAL2(words, i, cksiz);
8283
if(cksiz < 1) continue;
4234
8286
if((cscores = est_rescc_get(db, ckey, cksiz, &tsnum)) != NULL){
8289
if(word[0] != '\0') word++;
4236
8292
sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
4237
cbmapput(hints, word, -1, numbuf, -1, FALSE);
8293
cbmapput(hints, word, -1, numbuf, -1, TRUE);
4239
8295
for(j = 0; j < tsnum; j++){
4240
8296
if(snum >= smax){
8598
/* Search the auxiliary index.
8599
`db' specifies a database object.
8600
`word' specifies a search word.
8601
`min' specifies the minimum hits to adopt the auxiliary index.
8602
`nump' specifies the pointer to which the number of elements in the result is assigned.
8603
The return value is an array of score structures of corresponding documents. */
8604
static ESTSCORE *est_search_keywords(ESTDB *db, const char *word, int min, int *nump){
8609
int i, rnum, snum, wsiz, nnum, lid;
8610
assert(db && word && min >= 0 && nump);
8611
if(*word != ' ' && (res = (int *)vlgetcache(db->auxdb, word, -1, &rnum)) != NULL &&
8612
(rnum /= sizeof(int)) / 2 >= min){
8613
CB_MALLOC(scores, (rnum / 2) * sizeof(ESTSCORE) + 1);
8615
for(i = 0; i < rnum; i += 2){
8616
scores[snum].id = res[i];
8617
scores[snum].score = res[i+1];
8627
est_expand_keyword_bw(db, word + 1, words);
8628
} else if(*word == 'e'){
8629
est_expand_keyword_ew(db, word + 1, words);
8630
} else if(*word == 'r'){
8631
est_expand_keyword_rx(db, word + 1, words);
8633
} else if(*(unsigned char *)word >= 0xe3){
8634
est_expand_keyword_bw(db, word, words);
8637
for(i = 0; i < CB_LISTNUM(words) &&
8638
CB_DATUMSIZE(rbuf) <= sizeof(int) * 2 * min * ESTAUXEXRAT; i++){
8639
word = CB_LISTVAL2(words, i, wsiz);
8640
if(!(res = (int *)vlgetcache(db->auxdb, word, wsiz, &rnum))) continue;
8641
CB_DATUMCAT(rbuf, (char *)res, rnum);
8643
res = (int *)CB_DATUMPTR(rbuf);
8644
rnum = CB_DATUMSIZE(rbuf);
8645
if((rnum /= sizeof(int)) / 2 < min){
8646
CB_DATUMCLOSE(rbuf);
8647
CB_LISTCLOSE(words);
8650
CB_MALLOC(scores, (rnum / 2) * sizeof(ESTSCORE) + 1);
8652
for(i = 0; i < rnum; i += 2){
8653
scores[snum].id = res[i];
8654
scores[snum].score = res[i+1];
8657
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8660
for(i = 0; i < snum; i++){
8661
if(nnum > 0 && scores[i].id == lid){
8662
scores[nnum-1].score += scores[i].score;
8665
scores[nnum].id = scores[i].id;
8666
scores[nnum].score = scores[i].score;
8670
CB_DATUMCLOSE(rbuf);
8671
CB_LISTCLOSE(words);
8677
/* Weight scores with the auxiliary index.
8678
`db' specifies a database object.
8679
`word' specifies a search word.
8680
`scores' specifies an array of scores of search candidates.
8681
`snum' specifies the number of the array. */
8682
static void est_weight_keywords(ESTDB *db, const char *word, ESTSCORE *scores, int snum){
8687
if(!(res = (int *)vlgetcache(db->auxdb, word, -1, &knum)) || knum < 2) return;
8688
knum /= sizeof(int);
8689
CB_MALLOC(kscores, knum / 2 * sizeof(ESTSCORE));
8690
rank = knum / 2 + 1;
8692
for(i = 0; i < knum; i += 2){
8693
kscores[nnum].id = res[i];
8694
kscores[nnum].score = (pow(rank, 0.7) / 8.0 + 1.0) * 10000.0;
8699
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8700
qsort(kscores, knum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8702
for(i = 0; i < snum; i++){
8703
while(nnum < knum && kscores[nnum].id < scores[i].id){
8706
if(nnum < knum && kscores[nnum].id == scores[i].id)
8707
scores[i].score *= kscores[nnum].score / 10000.0;
8713
/* Get scores corresponding to a ranking search with an attribute narrowing index.
8714
`db' specifies a database object.
8715
`name' specifies the name of an attribute.
8716
`nump' specifies the pointer to which the number of elements in the result is assigned.
8717
The return value is an array of score structures of corresponding documents. */
8718
static ESTSCORE *est_search_rank(ESTDB *db, const char *name, int top, int *nump){
8719
ESTATTRIDX *attridx;
8723
assert(db && name && nump);
8724
if(top == 0 || !(attridx = (ESTATTRIDX *)cbmapget(db->aidxs, name, -1, NULL)) ||
8725
(attridx->type != ESTIDXATTRSTR && attridx->type != ESTIDXATTRNUM)){
8730
if(snum > db->dnum) snum = db->dnum;
8731
CB_MALLOC(scores, snum * sizeof(ESTSCORE) + 1);
8734
vlcurfirst(attridx->db);
8735
while(snum < top && (kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
8736
if(ksiz < sizeof(int)){
8737
vlcurnext(attridx->db);
8740
memcpy(&id, kbuf + ksiz - sizeof(int), sizeof(int));
8742
vlcurnext(attridx->db);
8745
scores[snum].id = id;
8746
scores[snum].score = 0;
8747
scores[snum].value = NULL;
8749
vlcurnext(attridx->db);
8753
vlcurlast(attridx->db);
8754
while(snum < top && (kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
8755
if(ksiz < sizeof(int)){
8756
vlcurprev(attridx->db);
8759
memcpy(&id, kbuf + ksiz - sizeof(int), sizeof(int));
8761
vlcurprev(attridx->db);
8764
scores[snum].id = id;
8765
scores[snum].score = 0;
8766
scores[snum].value = NULL;
8768
vlcurprev(attridx->db);
8776
/* Get scores corresponding to an attribute expression with an attribute narrowing index.
8777
`db' specifies a database object.
8778
`expr' specifies an attribute search expression.
8779
`nump' specifies the pointer to which the number of elements in the result is assigned.
8780
The return value is an array of score structures of corresponding documents or `NULL' if no
8781
index is available. */
8782
static ESTSCORE *est_search_aidx_attr(ESTDB *db, const char *expr, int *nump){
8783
ESTATTRIDX *attridx;
8788
const char *cop, *pv, *kbuf, *tbuf;
8789
unsigned char *utmp;
8790
char *name, *oper, *val, *sval, *wp, numbuf[ESTNUMBUFSIZ];
8791
int i, nsiz, vsiz, ksiz, tsiz, sign, ic, ssiz, esc, jmp, len, *ary, anum;
8792
time_t num, lower, upper;
8793
assert(db && expr && nump);
8799
while(*expr > 0 && *expr <= ' '){
8802
if((pv = strchr(expr, ' ')) != NULL){
8804
name = cbmemdup(expr, nsiz);
8806
while(*expr > 0 && *expr <= ' '){
8809
if((pv = strchr(expr, ' ')) != NULL){
8810
oper = cbmemdup(expr, pv - expr);
8812
while(*expr > 0 && *expr <= ' '){
8815
vsiz = strlen(expr);
8816
val = cbmemdup(expr, vsiz);
8818
oper = cbmemdup(expr, -1);
8821
nsiz = strlen(expr);
8822
name = cbmemdup(expr, nsiz);
8825
oper = cbmemdup("", 0);
8829
val = cbmemdup("", 0);
8838
if(*cop == 'I' || *cop == 'i'){
8839
ic = !est_check_cjk_only(val);
8845
if(!cbstricmp(cop, ESTOPSTREQ)){
8847
} else if(!cbstricmp(cop, ESTOPSTRNE)){
8849
} else if(!cbstricmp(cop, ESTOPSTRINC)){
8851
} else if(!cbstricmp(cop, ESTOPSTRBW)){
8853
} else if(!cbstricmp(cop, ESTOPSTREW)){
8855
} else if(!cbstricmp(cop, ESTOPSTRAND)){
8857
} else if(!cbstricmp(cop, ESTOPSTROR)){
8859
} else if(!cbstricmp(cop, ESTOPSTROREQ)){
8861
} else if(!cbstricmp(cop, ESTOPSTRRX)){
8863
regex = est_regex_new(val);
8864
} else if(!cbstricmp(cop, ESTOPNUMEQ)){
8866
} else if(!cbstricmp(cop, ESTOPNUMNE)){
8868
} else if(!cbstricmp(cop, ESTOPNUMGT)){
8870
} else if(!cbstricmp(cop, ESTOPNUMGE)){
8872
} else if(!cbstricmp(cop, ESTOPNUMLT)){
8874
} else if(!cbstricmp(cop, ESTOPNUMLE)){
8876
} else if(!cbstricmp(cop, ESTOPNUMBT)){
8883
num = cbstrmktime(val);
8884
if(!(attridx = (ESTATTRIDX *)cbmapget(db->aidxs, name, nsiz, NULL)) ||
8885
(attridx->type != ESTIDXATTRSTR && attridx->type != ESTIDXATTRNUM) ||
8886
(attridx->type == ESTIDXATTRNUM &&
8887
cop != ESTOPNUMEQ && cop != ESTOPNUMNE && cop != ESTOPNUMGT && cop != ESTOPNUMGE &&
8888
cop != ESTOPNUMLT && cop != ESTOPNUMLE && cop != ESTOPNUMBT)){
8889
if(regex) est_regex_delete(regex);
8898
utmp = (unsigned char *)est_uconv_in(val, vsiz, &tsiz);
8899
est_normalize_text(utmp, tsiz, &tsiz);
8900
est_canonicalize_text(utmp, tsiz, FALSE);
8901
sval = (char *)est_uconv_out((char *)utmp, tsiz, &ssiz);
8909
if(sign && (cop == ESTOPSTREQ || cop == ESTOPSTRBW) && vsiz > 0){
8910
if(*sval > 0x0 && *sval < 0x7f){
8913
esc = *(unsigned char *)sval;
8914
if(*sval >= 'a' && *sval <= 'z'){
8915
numbuf[0] -= 'a' - 'A';
8916
jmp = *sval - 'a' + 'A';
8918
vlcurjump(attridx->db, numbuf, 1, VL_JFORWARD);
8919
} else if(*(unsigned char *)sval >= 0xc0){
8922
esc = *(unsigned char *)sval;
8923
vlcurjump(attridx->db, numbuf, 1, VL_JFORWARD);
8925
vlcurfirst(attridx->db);
8928
vlcurfirst(attridx->db);
8930
while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
8931
if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
8932
cop, sign, val, vsiz, sval, ssiz, regex, num))
8933
CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
8934
if(*(unsigned char *)kbuf > jmp && *(unsigned char *)kbuf < *(unsigned char *)sval){
8937
vlcurjump(attridx->db, numbuf, 1, VL_JFORWARD);
8939
} else if(*(unsigned char *)kbuf > esc){
8942
vlcurnext(attridx->db);
8945
if(sval) free(sval);
8946
} else if(cop == ESTOPSTROREQ){
8947
tokens = cbsplit(val, vsiz, " ,");
8949
for(i = 0; i < CB_LISTNUM(tokens); i++){
8950
tbuf = CB_LISTVAL2(tokens, i, tsiz);
8951
vlcurjump(attridx->db, tbuf, tsiz, VL_JFORWARD);
8952
while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL && !strcmp(kbuf, tbuf)){
8953
CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
8954
vlcurnext(attridx->db);
8957
CB_LISTCLOSE(tokens);
8958
} else if(cop == ESTOPNUMBT){
8959
if((wp = strchr(val, ' ')) != NULL || (wp = strchr(val, '\t')) != NULL){
8961
while(*wp == ' ' || *wp == '\t'){
8964
lower = cbstrmktime(val);
8965
upper = cbstrmktime(wp);
8967
lower = cbstrmktime(val);
8970
len = sprintf(numbuf, "%.0f", (double)lower);
8971
vlcurjump(attridx->db, numbuf, len, VL_JFORWARD);
8972
while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL && cbstrmktime(kbuf) <= upper){
8973
CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
8974
vlcurnext(attridx->db);
8977
if(cop == ESTOPSTREQ || cop == ESTOPSTRBW ||
8978
cop == ESTOPNUMEQ || cop == ESTOPNUMGT || cop == ESTOPNUMGE){
8979
vlcurjump(attridx->db, val, vsiz, VL_JFORWARD);
8980
if(cop == ESTOPNUMGT){
8981
while((kbuf = vlcurkeycache(attridx->db, NULL)) != NULL && cbstrmktime(kbuf) <= num){
8982
vlcurnext(attridx->db);
8985
} else if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
8986
len = sprintf(numbuf, "%.0f", (double)cbstrmktime(val) + 1);
8987
vlcurjump(attridx->db, numbuf, len, VL_JBACKWARD);
8988
if(cop == ESTOPNUMLT){
8989
while((kbuf = vlcurkeycache(attridx->db, NULL)) != NULL && cbstrmktime(kbuf) >= num){
8990
vlcurprev(attridx->db);
8994
vlcurfirst(attridx->db);
8996
while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
8997
if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
8998
cop, TRUE, val, vsiz, NULL, 0, regex, num)){
8999
CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
9000
} else if(cop == ESTOPSTREQ || cop == ESTOPSTRBW || cop == ESTOPNUMEQ){
9003
if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
9004
vlcurprev(attridx->db);
9006
vlcurnext(attridx->db);
9010
ary = (int *)CB_DATUMPTR(abuf);
9011
anum = CB_DATUMSIZE(abuf) / sizeof(int);
9012
CB_MALLOC(scores, anum * sizeof(ESTSCORE) + 1);
9013
for(i = 0; i < anum; i++){
9014
scores[i].id = ary[i];
9015
scores[i].score = 0;
9016
scores[i].value = NULL;
9019
CB_DATUMCLOSE(abuf);
9020
if(regex) est_regex_delete(regex);
9028
/* Get a corresponding set of documents in pseudo indexes.
9029
`db' specifies a database object.
9030
`cond' specifies a search condition object.
9031
`scores' specifies an array of scores of search candidates.
9032
`nump' specifies the pointer to which the number of elements in the parameter and result is
9034
`ordattrs' specifies a map object into which ordering attributes are stored.
9035
The return value is an array of re-allocated score structures. */
9036
static ESTSCORE *est_search_pidxs(ESTDB *db, ESTCOND *cond, ESTSCORE *scores, int *nump,
9040
const char *otype, *lbuf, *vbuf;
9042
int i, j, k, snum, anum, id, hit, sc, miss, lsiz, vsiz;
9043
double avg, sd, dif, tune;
9044
assert(db && cond && scores && nump && ordattrs);
9046
CB_REALLOC(scores, (snum + CB_LISTNUM(db->pdocs)) * sizeof(ESTSCORE) + 1);
9048
if(cbstrfwmatch(cond->phrase, ESTOPID)){
9050
} else if(cbstrfwmatch(cond->phrase, ESTOPURI)){
9052
} else if(cbstrfwmatch(cond->phrase, ESTOPSIMILAR)){
9059
oname = cbmemdup(cond->order, -1);
9062
if((wp = strchr(oname, ' ')) != NULL){
9072
if(cond->attrs) list = est_make_cattr_list(cond->attrs, &anum);
9073
for(i = 0; i < CB_LISTNUM(db->pdocs); i++){
9074
id = ESTPDOCIDMIN + i;
9078
if(!cond->phrase || cond->phrase[0] == '\0'){
9079
hit = cond->attrs ? TRUE : FALSE;
9080
} else if(cbstrfwmatch(cond->phrase, ESTOPUVSET)){
9083
if((doc = est_db_get_doc(db, id, 0)) != NULL){
9084
hit = est_db_score_doc(db, doc, cond, &sc);
9090
if(!doc && !(doc = est_db_get_doc(db, id, 0))){
9094
for(j = 0; !miss && j < anum; j++){
9095
if(list[j].nsiz < 1) continue;
9098
for(k = 0; k < CB_LISTNUM(list[j].nlist); k++){
9099
lbuf = CB_LISTVAL2(list[j].nlist, k, lsiz);
9100
if(lsiz < 1) continue;
9101
if(!(vbuf = cbmapget(doc->attrs, lbuf, lsiz, &vsiz))) continue;
9102
if(est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
9103
list[j].sval, list[j].ssiz, list[j].regex, list[j].num)){
9108
if(!hit) miss = TRUE;
9109
} else if(!(vbuf = cbmapget(doc->attrs, list[j].name, list[j].nsiz, &vsiz))){
9111
} else if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign,
9112
list[j].val, list[j].vsiz, list[j].sval, list[j].ssiz,
9113
list[j].regex, list[j].num)){
9121
scores[snum].id = id;
9122
scores[snum].score = sc;
9123
scores[snum].value = NULL;
9125
if(oname && (doc || (doc = est_db_get_doc(db, id, 0)) != NULL)){
9126
if(!(vbuf = cbmapget(doc->attrs, oname, -1, &vsiz))){
9130
cbmapput(ordattrs, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
9133
if(doc) est_doc_delete(doc);
9135
if(list) est_free_cattr_list(list, anum);
9136
if(oname) free(oname);
9137
if(db->smode != ESTDFSCASIS && snum > *nump){
9139
for(i = *nump; i < snum; i++){
9140
avg += scores[i].score;
9142
avg /= snum - *nump;
9144
for(i = *nump; i < snum; i++){
9145
dif = avg - scores[i].score;
9151
for(i = *nump; i < snum; i++){
9152
scores[i].score = ESTSCOREUNIT / 2;
9155
for(i = *nump; i < snum; i++){
9156
scores[i].score = (int)(((scores[i].score - avg) * (ESTSCOREUNIT / 10.0) / sd) +
9157
ESTSCOREUNIT / 2.0);
9161
tune = pow(snum - *nump + 64, 0.4);
9162
for(i = *nump; i < snum; i++){
9163
scores[i].score *= 100.0 / tune;
9166
for(i = *nump; i < snum; i++){
9167
scores[i].score *= 10;
4463
9176
/* Narrow and sort scores of search candidates.
4464
9177
`db' specifies a database object.
4465
9178
`attrs' specifies a list object of narrowing attributes.
9179
`ign' specifies the offset of an attribute to be ignored.
4466
9180
`order' specifies an expression for sorting.
9181
`distinct' specifies the name of the distinct attribute.
4467
9182
`scores' specifies an array of scores of search candidates.
4468
9183
`snum' specifies the number of the array.
4469
9184
`limit' specifies the limit number to check.
4470
9185
`restp' specifies the pointer to a variable to which rest number to be checked is assigned.
9186
`ordattrs' specifies a map object of cached ordering attributes.
4471
9187
The return value is the new number of the array. */
4472
static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, const char *order,
4473
ESTSCORE *scores, int snum, int limit, int *restp){
9188
static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, int ign,
9189
const char *order, const char *distinct, ESTSCORE *scores, int snum,
9190
int limit, int *restp, CBMAP *ordattrs){
4474
9191
ESTCATTR *list;
4475
const char *otype, *cbuf, *rp, *pv, *ibuf;
4476
unsigned char *utmp;
9192
ESTATTRIDX *attridx;
9194
const char *otype, *cbuf, *ibuf, *lbuf;
4477
9195
char *oname, *wp, *mbuf, *vbuf;
4478
int i, j, k, ci, oi, anum, tsiz, nnum, csiz, msiz, miss, vsiz, num, isiz, onlen;
9196
int i, j, k, ci, oi, anum, done, mixed, nnum, csiz, msiz;
9197
int miss, vsiz, num, isiz, lsiz, hit, onlen, dnlen;
4480
assert(db && scores && snum >= 0 && restp);
9199
assert(db && scores && snum >= 0 && limit >= 0 && restp && ordattrs);
4488
9207
cbstrtrim(oname);
4489
9208
otype = ESTORDSTRA;
4490
9209
if((wp = strchr(oname, ' ')) != NULL){
4500
anum = CB_LISTNUM(attrs);
4501
CB_MALLOC(list, sizeof(ESTCATTR) * anum + 1);
4502
for(i = 0; i < anum; i++){
4503
list[i].name = NULL;
4504
list[i].oper = NULL;
4506
rp = CB_LISTVAL(attrs, i, NULL);
4507
while(*rp > 0 && *rp <= ' '){
4510
if((pv = strchr(rp, ' ')) != NULL){
4511
list[i].nsiz = pv - rp;
4512
list[i].name = cbmemdup(rp, list[i].nsiz);
4514
while(*rp > 0 && *rp <= ' '){
9218
list = est_make_cattr_list(attrs, &anum);
9219
if(cbmaprnum(db->aidxs) > 0){
9222
for(i = 0; i < anum; i++){
9223
if(i == ign) continue;
9224
if(!(attridx = (ESTATTRIDX *)cbmapget(db->aidxs, list[i].name, list[i].nsiz, NULL)) ||
9225
(attridx->type == ESTIDXATTRNUM &&
9226
list[i].cop != ESTOPNUMEQ && list[i].cop != ESTOPNUMNE &&
9227
list[i].cop != ESTOPNUMGT && list[i].cop != ESTOPNUMGE &&
9228
list[i].cop != ESTOPNUMLT && list[i].cop != ESTOPNUMLE &&
9229
list[i].cop != ESTOPNUMBT)){
4517
if((pv = strchr(rp, ' ')) != NULL){
4518
list[i].oper = cbmemdup(rp, pv - rp);
4520
while(*rp > 0 && *rp <= ' '){
9233
switch(attridx->type){
9236
snum = est_aidx_attr_narrow(attridx->db, db->pdocs, list[i].cop, list[i].sign,
9237
list[i].val, list[i].vsiz, list[i].sval, list[i].ssiz,
9238
list[i].regex, list[i].num, scores, snum);
9242
if(done && i == anum - 1 && !order && mixed){
9243
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
4523
list[i].vsiz = strlen(rp);
4524
list[i].val = cbmemdup(rp, list[i].vsiz);
4526
list[i].oper = cbmemdup(rp, -1);
9246
snum = est_aidx_seq_narrow(attridx->db, db->pdocs, list[i].cop, list[i].sign,
9247
list[i].val, list[i].vsiz, list[i].sval, list[i].ssiz,
9248
list[i].regex, list[i].num, scores, snum,
9249
done && i == anum - 1 ? limit : INT_MAX, restp);
4529
list[i].nsiz = strlen(rp);
4530
list[i].name = cbmemdup(rp, list[i].nsiz);
4533
list[i].oper = cbmemdup("", 0);
4537
list[i].val = cbmemdup("", 0);
4540
for(i = 0; i < anum; i++){
4543
list[i].sign = FALSE;
4546
list[i].sign = TRUE;
4548
if(*rp == 'I' || *rp == 'i'){
4549
utmp = (unsigned char *)est_uconv_in(list[i].val, list[i].vsiz, &tsiz);
4550
est_normalize_text(utmp, tsiz, &tsiz);
4551
est_canonicalize_text(utmp, tsiz, FALSE);
4552
list[i].sval = (char *)est_uconv_out((char *)utmp, tsiz, &(list[i].ssiz));
4556
list[i].sval = NULL;
4559
list[i].regex = NULL;
4560
list[i].num = cbstrmktime(list[i].val);
4561
if(!cbstricmp(rp, ESTOPSTREQ)){
4562
list[i].cop = ESTOPSTREQ;
4563
} else if(!cbstricmp(rp, ESTOPSTRNE)){
4564
list[i].cop = ESTOPSTRNE;
4565
} else if(!cbstricmp(rp, ESTOPSTRINC)){
4566
list[i].cop = ESTOPSTRINC;
4567
} else if(!cbstricmp(rp, ESTOPSTRBW)){
4568
list[i].cop = ESTOPSTRBW;
4569
} else if(!cbstricmp(rp, ESTOPSTREW)){
4570
list[i].cop = ESTOPSTREW;
4571
} else if(!cbstricmp(rp, ESTOPSTRAND)){
4572
list[i].cop = ESTOPSTRAND;
4573
} else if(!cbstricmp(rp, ESTOPSTROR)){
4574
list[i].cop = ESTOPSTROR;
4575
} else if(!cbstricmp(rp, ESTOPSTRRX)){
4576
list[i].cop = ESTOPSTRRX;
4577
list[i].regex = list[i].sval ? est_regex_new(list[i].sval) : est_regex_new(list[i].val);
4578
} else if(!cbstricmp(rp, ESTOPNUMEQ)){
4579
list[i].cop = ESTOPNUMEQ;
4580
} else if(!cbstricmp(rp, ESTOPNUMNE)){
4581
list[i].cop = ESTOPNUMNE;
4582
} else if(!cbstricmp(rp, ESTOPNUMGT)){
4583
list[i].cop = ESTOPNUMGT;
4584
} else if(!cbstricmp(rp, ESTOPNUMGE)){
4585
list[i].cop = ESTOPNUMGE;
4586
} else if(!cbstricmp(rp, ESTOPNUMLT)){
4587
list[i].cop = ESTOPNUMLT;
4588
} else if(!cbstricmp(rp, ESTOPNUMLE)){
4589
list[i].cop = ESTOPNUMLE;
4590
} else if(!cbstricmp(rp, ESTOPNUMBT)){
4591
list[i].cop = ESTOPNUMBT;
9252
list[i].cop = ESTOPDUMMY;
9254
if(mixed && !order) qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
4597
9259
for(i = 0; i < anum; i++){
9276
for(i = 0; i < snum; i++){
9281
scores[i].value = NULL;
9283
if((cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL)
9284
cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
9290
if(scores[i].id >= ESTPDOCIDMIN){
9291
scores[nnum++] = scores[i];
9292
} else if((cbuf && anum == 1) ||
9293
(mbuf = est_crget(db->attrdb, db->zmode, scores[i].id, &msiz)) != NULL){
9295
for(j = 0; !miss && j < anum; j++){
9296
if(list[j].nsiz < 1) continue;
9299
for(k = 0; k < CB_LISTNUM(list[j].nlist); k++){
9300
lbuf = CB_LISTVAL2(list[j].nlist, k, lsiz);
9301
if(lsiz < 1) continue;
9302
if(!(vbuf = cbmaploadone(mbuf, msiz, lbuf, lsiz, &vsiz))) continue;
9303
if(est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
9304
list[j].sval, list[j].ssiz, list[j].regex, list[j].num)){
9311
if(!hit) miss = TRUE;
9315
vbuf = cbmaploadone(mbuf, msiz, list[j].name, list[j].nsiz, &vsiz);
9316
} else if(csiz != 1 || cbuf[0] != '\0'){
9317
vbuf = cbmemdup(cbuf, csiz);
9322
if(list[j].oper[0] == '\0'){
9323
if(!vbuf) miss = TRUE;
9326
vbuf = cbmemdup("", 0);
9329
if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign,
9330
list[j].val, list[j].vsiz, list[j].sval, list[j].ssiz,
9331
list[j].regex, list[j].num)) miss = TRUE;
9334
if(j == ci && !cbuf){
9336
cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
9338
cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
9340
if(cbmaprnum(db->spacc) > db->scmnum){
9341
num = db->scmnum * 0.1 + 1;
9342
cbmapiterinit(db->spacc);
9343
for(k = 0; k < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; k++){
9344
cbmapout(db->spacc, ibuf, isiz);
9349
scores[i].value = vbuf;
9355
free(scores[i].value);
9357
scores[nnum++] = scores[i];
9364
for(i = 0; i < snum; i++){
9365
scores[i].value = NULL;
9368
est_free_cattr_list(list, anum);
4613
9370
for(i = 0; i < snum; i++){
4618
9371
scores[i].value = NULL;
4620
if((cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL)
9375
if(!cbstricmp(oname, ESTORDIDA)){
9376
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
9377
} else if(!cbstricmp(oname, ESTORDIDD)){
9378
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_desc);
9379
} else if(!cbstricmp(oname, ESTORDSCA)){
9380
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_asc);
9381
} else if(!cbstricmp(oname, ESTORDSCD)){
9382
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
9384
ci = db->spacc && !strcmp(oname, db->scname);
9385
onlen = strlen(oname);
9386
attridx = (ESTATTRIDX *)cbmapget(db->aidxs, oname, onlen, NULL);
9387
if(attridx && attridx->type != ESTIDXATTRSEQ) attridx = NULL;
9388
for(i = 0; i < snum; i++){
9389
if(scores[i].value) continue;
9391
(cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
4621
9392
cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
4627
if((cbuf && anum == 1) ||
4628
(mbuf = crget(db->attrdb, (char *)&(scores[i].id), sizeof(int), 0, -1, &msiz)) != NULL){
4630
for(j = 0; !miss && j < anum; j++){
4631
if(list[j].nsiz < 1) continue;
4633
vbuf = cbmaploadone(mbuf, msiz, list[j].name, list[j].nsiz, &vsiz);
4634
} else if(csiz != 1 || cbuf[0] != '\0'){
4635
vbuf = cbmemdup(cbuf, csiz);
4640
if(list[j].oper[0] == '\0'){
4641
if(!vbuf) miss = TRUE;
4644
vbuf = cbmemdup("", 0);
4647
if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
4648
list[j].sval, list[j].ssiz, list[j].regex, list[j].num))
4651
if(j == ci && !cbuf){
4653
cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
4655
cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
4657
if(cbmaprnum(db->spacc) > db->scmnum){
4658
num = db->scmnum * 0.1 + 1;
4659
cbmapiterinit(db->spacc);
4660
for(k = 0; k < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; k++){
4661
cbmapout(db->spacc, ibuf, isiz);
4666
scores[i].value = vbuf;
4672
free(scores[i].value);
4674
scores[nnum++] = scores[i];
4680
for(i = 0; i < anum; i++){
4681
if(list[i].regex) est_regex_delete(list[i].regex);
4689
for(i = 0; i < snum; i++){
4690
scores[i].value = NULL;
4694
ci = db->spacc && !strcmp(oname, db->scname);
4695
onlen = strlen(oname);
4696
for(i = 0; i < snum; i++){
4697
if(scores[i].value) continue;
4698
if(ci && (cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
4699
cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
4700
if(csiz == 1 && cbuf[0] == '\0'){
4701
scores[i].value = cbmemdup("", 0);
9393
if(csiz == 1 && cbuf[0] == '\0'){
9394
scores[i].value = cbmemdup("", 0);
9396
scores[i].value = cbmemdup(cbuf, csiz);
9400
if((cbuf = cbmapget(ordattrs, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
4703
9401
scores[i].value = cbmemdup(cbuf, csiz);
4707
if((mbuf = crget(db->attrdb, (char *)&(scores[i].id), sizeof(int), 0, -1, &msiz)) != NULL){
4708
if((vbuf = cbmaploadone(mbuf, msiz, oname, onlen, &vsiz)) != NULL){
4709
if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
9405
if(!(vbuf = est_aidx_seq_get(attridx->db, scores[i].id, &vsiz))) vbuf = cbmemdup("", 0);
4710
9406
scores[i].value = vbuf;
9409
if((mbuf = est_crget(db->attrdb, db->zmode, scores[i].id, &msiz)) != NULL){
9410
if((vbuf = cbmaploadone(mbuf, msiz, oname, onlen, &vsiz)) != NULL){
9411
if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
9412
scores[i].value = vbuf;
9414
if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
9415
scores[i].value = cbmemdup("", 0);
9417
if(ci && cbmaprnum(db->spacc) > db->scmnum){
9418
num = db->scmnum * 0.1 + 1;
9419
cbmapiterinit(db->spacc);
9420
for(j = 0; j < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; j++){
9421
cbmapout(db->spacc, ibuf, isiz);
4712
if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
4713
9426
scores[i].value = cbmemdup("", 0);
4715
if(ci && cbmaprnum(db->spacc) > db->scmnum){
4716
num = db->scmnum * 0.1 + 1;
4717
cbmapiterinit(db->spacc);
4718
for(j = 0; j < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; j++){
4719
cbmapout(db->spacc, ibuf, isiz);
9429
if(!cbstricmp(otype, ESTORDSTRA)){
9430
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_asc);
9431
} else if(!cbstricmp(otype, ESTORDSTRD)){
9432
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_desc);
9433
} else if(!cbstricmp(otype, ESTORDNUMA)){
9434
for(i = 0; i < snum; i++){
9435
tval = cbstrmktime(scores[i].value);
9436
free(scores[i].value);
9437
scores[i].value = (void *)tval;
9439
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_asc);
9440
for(i = 0; i < snum; i++){
9441
scores[i].value = NULL;
9443
} else if(!cbstricmp(otype, ESTORDNUMD)){
9444
for(i = 0; i < snum; i++){
9445
tval = cbstrmktime(scores[i].value);
9446
free(scores[i].value);
9447
scores[i].value = (void *)tval;
9449
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_desc);
9450
for(i = 0; i < snum; i++){
9451
scores[i].value = NULL;
9454
for(i = 0; i < snum; i++){
9455
free(scores[i].value);
9461
dnlen = strlen(distinct);
9462
umap = cbmapopenex(snum + 1);
9463
attridx = (ESTATTRIDX *)cbmapget(db->aidxs, distinct, dnlen, NULL);
9464
if(attridx && attridx->type != ESTIDXATTRSEQ) attridx = NULL;
9466
for(i = 0; i < snum; i++){
9467
if(scores[i].id >= ESTPDOCIDMIN){
9468
if(!(vbuf = est_db_get_doc_attr(db, scores[i].id, distinct))) vbuf = cbmemdup("", 0);
9469
vsiz = strlen(vbuf);
9471
if(!(vbuf = est_aidx_seq_get(attridx->db, scores[i].id, &vsiz))){
9472
vbuf = cbmemdup("", 0);
4724
scores[i].value = cbmemdup("", 0);
4727
if(!cbstricmp(otype, ESTORDSTRA)){
4728
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_asc);
4729
} else if(!cbstricmp(otype, ESTORDSTRD)){
4730
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_desc);
4731
} else if(!cbstricmp(otype, ESTORDNUMA)){
4732
for(i = 0; i < snum; i++){
4733
tval = cbstrmktime(scores[i].value);
4734
free(scores[i].value);
4735
scores[i].value = (void *)tval;
4737
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_asc);
4738
for(i = 0; i < snum; i++){
4739
scores[i].value = NULL;
4741
} else if(!cbstricmp(otype, ESTORDNUMD)){
4742
for(i = 0; i < snum; i++){
4743
tval = cbstrmktime(scores[i].value);
4744
free(scores[i].value);
4745
scores[i].value = (void *)tval;
4747
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_desc);
4748
for(i = 0; i < snum; i++){
4749
scores[i].value = NULL;
4752
for(i = 0; i < snum; i++){
4753
free(scores[i].value);
9476
if((mbuf = est_crget(db->attrdb, db->zmode, scores[i].id, &msiz)) != NULL){
9477
if(!(vbuf = cbmaploadone(mbuf, msiz, distinct, dnlen, &vsiz))){
9478
vbuf = cbmemdup("", 0);
9483
vbuf = cbmemdup("", 0);
9487
if(cbmapput(umap, vbuf, vsiz, "", 0, FALSE)) scores[nnum++] = scores[i];
9497
/* Make a list of condition attributes.
9498
`attrs' specifies a list object of attribute expressions.
9499
`nump' specifies the pointer to which the number of elements in the result is assigned.
9500
The return value is a list of condition attributes. */
9501
static ESTCATTR *est_make_cattr_list(const CBLIST *attrs, int *nump){
9503
const char *rp, *pv;
9504
unsigned char *utmp;
9506
assert(attrs && nump);
9507
anum = CB_LISTNUM(attrs);
9508
CB_MALLOC(list, sizeof(ESTCATTR) * anum + 1);
9509
for(i = 0; i < anum; i++){
9510
list[i].name = NULL;
9511
list[i].oper = NULL;
9513
rp = CB_LISTVAL(attrs, i);
9514
while(*rp > 0 && *rp <= ' '){
9517
if((pv = strchr(rp, ' ')) != NULL){
9518
list[i].nsiz = pv - rp;
9519
list[i].name = cbmemdup(rp, list[i].nsiz);
9521
while(*rp > 0 && *rp <= ' '){
9524
if((pv = strchr(rp, ' ')) != NULL){
9525
list[i].oper = cbmemdup(rp, pv - rp);
9527
while(*rp > 0 && *rp <= ' '){
9530
list[i].vsiz = strlen(rp);
9531
list[i].val = cbmemdup(rp, list[i].vsiz);
9533
list[i].oper = cbmemdup(rp, -1);
9536
list[i].nsiz = strlen(rp);
9537
list[i].name = cbmemdup(rp, list[i].nsiz);
9539
if(strchr(list[i].name, ',')){
9540
list[i].nlist = cbsplit(list[i].name, list[i].nsiz, ",");
9542
list[i].nlist = NULL;
9545
list[i].oper = cbmemdup("", 0);
9549
list[i].val = cbmemdup("", 0);
9552
for(i = 0; i < anum; i++){
9555
list[i].sign = FALSE;
9558
list[i].sign = TRUE;
9560
if(*rp == 'I' || *rp == 'i'){
9561
if(est_check_cjk_only(list[i].val)){
9562
list[i].sval = NULL;
9565
utmp = (unsigned char *)est_uconv_in(list[i].val, list[i].vsiz, &tsiz);
9566
est_normalize_text(utmp, tsiz, &tsiz);
9567
est_canonicalize_text(utmp, tsiz, FALSE);
9568
list[i].sval = (char *)est_uconv_out((char *)utmp, tsiz, &(list[i].ssiz));
9573
list[i].sval = NULL;
9576
list[i].regex = NULL;
9577
list[i].num = cbstrmktime(list[i].val);
9578
if(!cbstricmp(rp, ESTOPSTREQ)){
9579
list[i].cop = ESTOPSTREQ;
9580
} else if(!cbstricmp(rp, ESTOPSTRNE)){
9581
list[i].cop = ESTOPSTRNE;
9582
} else if(!cbstricmp(rp, ESTOPSTRINC)){
9583
list[i].cop = ESTOPSTRINC;
9584
} else if(!cbstricmp(rp, ESTOPSTRBW)){
9585
list[i].cop = ESTOPSTRBW;
9586
} else if(!cbstricmp(rp, ESTOPSTREW)){
9587
list[i].cop = ESTOPSTREW;
9588
} else if(!cbstricmp(rp, ESTOPSTRAND)){
9589
list[i].cop = ESTOPSTRAND;
9590
} else if(!cbstricmp(rp, ESTOPSTROR)){
9591
list[i].cop = ESTOPSTROR;
9592
} else if(!cbstricmp(rp, ESTOPSTROREQ)){
9593
list[i].cop = ESTOPSTROREQ;
9594
} else if(!cbstricmp(rp, ESTOPSTRRX)){
9595
list[i].cop = ESTOPSTRRX;
9596
list[i].regex = list[i].sval ? est_regex_new(list[i].sval) : est_regex_new(list[i].val);
9597
} else if(!cbstricmp(rp, ESTOPNUMEQ)){
9598
list[i].cop = ESTOPNUMEQ;
9599
} else if(!cbstricmp(rp, ESTOPNUMNE)){
9600
list[i].cop = ESTOPNUMNE;
9601
} else if(!cbstricmp(rp, ESTOPNUMGT)){
9602
list[i].cop = ESTOPNUMGT;
9603
} else if(!cbstricmp(rp, ESTOPNUMGE)){
9604
list[i].cop = ESTOPNUMGE;
9605
} else if(!cbstricmp(rp, ESTOPNUMLT)){
9606
list[i].cop = ESTOPNUMLT;
9607
} else if(!cbstricmp(rp, ESTOPNUMLE)){
9608
list[i].cop = ESTOPNUMLE;
9609
} else if(!cbstricmp(rp, ESTOPNUMBT)){
9610
list[i].cop = ESTOPNUMBT;
9612
list[i].cop = ESTOPSTRINC;
9613
list[i].val[0] = '\0';
9616
list[i].sval[0] = '\0';
9626
/* Release resources of a list of condition attributes.
9627
`list' specifies a list of condition attributes.
9628
`anum' specifies the number of elements of the list. */
9629
static void est_free_cattr_list(ESTCATTR *list, int anum){
9631
assert(list && anum >= 0);
9632
for(i = 0; i < anum; i++){
9633
if(list[i].regex) est_regex_delete(list[i].regex);
9637
if(list[i].nlist) CB_LISTCLOSE(list[i].nlist);
4761
9644
/* Eclipse search candidates that are similar to an earlier candidate in the array.
4762
9645
`db' specifies a database object.
4763
9646
`scores' specifies an array of scores of search candidates.
4767
9650
`vnum' specifies the number of dimensions of the vector.
4768
9651
`tfidf' specifies whether to perform TF-IDF tuning.
4769
9652
`limit' specifies the upper limit of similarity for documents to survive.
9653
`opts' specifies options for eclipse.
4770
9654
`shadows' specifies a map object to store shadow document information.
4771
9655
The return value is the new number of the array. */
4772
9656
static int est_eclipse_scores(ESTDB *db, ESTSCORE *scores, int snum, int num,
4773
9657
int vnum, int tfidf, double limit, CBMAP *shadows){
4774
9658
CBMAP *svmap, *tvmap;
4775
int i, j, max, *svec, *tvec, pair[2], nnum;
9659
const char *suri, *turi;
9661
int i, j, ubase, simurl, max, *svec, *tvec, pair[2], nnum;
4777
9663
assert(db && scores && snum >= 0 && num >= 0 && vnum > 0 && limit > 0.0 && shadows);
4778
max = limit < 0.1 ? snum : num * ((2.0 / limit) + 0.5);
4779
if(max > snum) max = snum;
4780
CB_MALLOC(svec, vnum * sizeof(int));
4781
CB_MALLOC(tvec, vnum * sizeof(int));
4782
for(i = 0; i < max; i++){
4783
svmap = est_get_tvmap(db, scores[i].id, vnum, tfidf);
4784
scores[i].value = (char *)svmap;
4786
for(i = 0; i < max; i++){
4787
svmap = (CBMAP *)(scores[i].value);
4788
if(!svmap || cbmaprnum(svmap) < 1) continue;
4789
if(num-- < 1) continue;
4790
est_vector_set_seed(svmap, svec, vnum);
4791
for(j = i + 1; j < max; j++){
4792
tvmap = (CBMAP *)(scores[j].value);
4793
if(!tvmap || cbmaprnum(tvmap) < 1) continue;
4794
est_vector_set_target(svmap, tvmap, tvec, vnum);
4795
dval = est_vector_cosine(svec, tvec, vnum);
4798
scores[j].value = NULL;
4799
pair[0] = scores[j].id;
4800
pair[1] = (int)(dval * 10000.0);
4801
cbmapputcat(shadows, (char *)&(scores[i].id), sizeof(int),
4802
(char *)pair, sizeof(int) * 2);
9666
if(limit == ESTECLSERV || limit == ESTECLDIR || limit == ESTECLFILE){
9668
} else if(limit >= ESTECLSIMURL){
9670
limit -= ESTECLSIMURL;
9671
if(limit < 0.01) limit = 0.01;
9672
if(limit > 1.0) limit = 1.0;
4807
for(i = 0; i < max; i++){
4808
if(scores[i].value){
4809
cbmapclose((CBMAP *)(scores[i].value));
4810
scores[nnum++] = scores[i];
4813
for(i = max; i < snum; i++){
4814
scores[nnum++] = scores[i];
9676
if(limit == ESTECLSERV){
9677
max = num * 14.8 + 8;
9678
} else if(limit == ESTECLDIR){
9679
max = num * 6.8 + 8;
9681
max = num * 4.8 + 8;
9683
if(max > snum) max = snum;
9684
for(i = 0; i < max; i++){
9685
scores[i].value = est_db_get_doc_attr(db, scores[i].id, ESTDATTRURI);
9687
for(i = 0; i < max; i++){
9688
if(!scores[i].value) continue;
9689
for(j = i + 1; j < max; j++){
9691
if(scores[j].value){
9692
switch(est_url_sameness(scores[i].value, scores[j].value)){
9705
free(scores[j].value);
9706
scores[j].value = NULL;
9707
pair[0] = scores[j].id;
9709
cbmapputcat(shadows, (char *)&(scores[i].id), sizeof(int),
9710
(char *)pair, sizeof(int) * 2);
9714
for(i = 0; i < max; i++){
9715
if(scores[i].value){
9716
free(scores[i].value);
9717
scores[nnum++] = scores[i];
9720
for(i = max; i < snum; i++){
9721
scores[nnum++] = scores[i];
9724
max = limit < 0.1 ? snum : num * ((2.4 / (limit - 0.05)) + 0.8) + 8;
9725
if(simurl) max *= 1.4;
9726
if(max > snum) max = snum;
9727
CB_MALLOC(svec, vnum * sizeof(int));
9728
CB_MALLOC(tvec, vnum * sizeof(int));
9729
for(i = 0; i < max; i++){
9730
if((svmap = est_get_tvmap(db, scores[i].id, vnum, tfidf)) != NULL){
9731
scores[i].value = (char *)svmap;
9732
if(simurl && (tmp = est_db_get_doc_attr(db, scores[i].id, ESTDATTRURI)) != NULL){
9733
cbmapput(svmap, "", 0, tmp, -1, TRUE);
9737
scores[i].value = NULL;
9740
for(i = 0; i < max; i++){
9741
svmap = (CBMAP *)(scores[i].value);
9742
if(!svmap || cbmaprnum(svmap) < 1) continue;
9743
suri = cbmapget((CBMAP *)scores[i].value, "", -1, NULL);
9744
if(num-- < 1) continue;
9745
est_vector_set_seed(svmap, svec, vnum);
9746
for(j = i + 1; j < max; j++){
9747
tvmap = (CBMAP *)(scores[j].value);
9748
if(!tvmap || cbmaprnum(tvmap) < 1) continue;
9749
est_vector_set_target(svmap, tvmap, tvec, vnum);
9750
dval = est_vector_cosine(svec, tvec, vnum);
9751
if(dval > 0.01 && suri &&
9752
(turi = cbmapget((CBMAP *)scores[j].value, "", -1, NULL)) != NULL){
9753
switch(est_url_sameness(suri, turi)){
9755
dval = pow(cos(acos(dval) * (1.0 - pow(dval, 9.9))), 1.07);
9758
dval = pow(cos(acos(dval) * (1.0 - pow(dval, 4.1))), 1.05);
9761
dval = pow(cos(acos(dval) * (1.0 - pow(dval, 2.9))), 1.03);
9764
dval = pow(cos(acos(dval) * (1.0 - pow(dval, 2.1))), 1.01);
9770
scores[j].value = NULL;
9771
pair[0] = scores[j].id;
9772
pair[1] = (int)(dval * 10000.0);
9773
cbmapputcat(shadows, (char *)&(scores[i].id), sizeof(int),
9774
(char *)pair, sizeof(int) * 2);
9778
for(i = 0; i < max; i++){
9779
if(scores[i].value){
9780
cbmapclose((CBMAP *)(scores[i].value));
9781
scores[nnum++] = scores[i];
9784
for(i = max; i < snum; i++){
9785
scores[nnum++] = scores[i];
5007
10012
`unum' specifies the number of adopted documents for a keyword.
5008
10013
`tfidf' specifies whether to perform TF-IDF tuning.
5009
10014
`nmin' specifies the minimum value for narrowing.
5010
The return value is an array whose elements are ID numbers of similar documents. */
10015
`auxmin' specifies the minimum hits to adopt the auxiliary index. If it is not more than 0,
10016
the auxiliary index is not used.
10017
`auxwords' specifies a map object where keywords used with the auxiliary index are stored. If
10018
it is `NULL', it is not used.
10019
The return value is an array of score structures of corresponding documents. */
5011
10020
static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
5012
int knum, int unum, int tfidf, double nmin){
10021
int knum, int unum, int mnum, int tfidf,
10022
double nmin, int auxmin, CBMAP *auxwords){
5013
10023
ESTSCORE *scores, *tscores;
5015
10025
const char *word;
5016
10026
int i, j, vnum, snum, tmax, tsnum, nnum, lid, *svec, *tvec;
5018
10028
assert(db && svmap && nump && knum >= 0 && unum >= 0 && nmin >= 0.0);
5019
CB_MALLOC(scores, sizeof(ESTSCORE) * unum * knum);
10029
CB_MALLOC(scores, sizeof(ESTSCORE) * (unum * knum + CB_LISTNUM(db->pdocs)) + 1);
5021
10031
if((vnum = cbmaprnum(svmap)) < 1) vnum = 1;
5022
10032
cbmapiterinit(svmap);
5024
for(i = 0; i < knum && (word = cbmapiternext(svmap, NULL)) != NULL; i++){
5025
tscores = est_search_union(db, word, 1, &tsnum, NULL, TRUE);
5026
qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_score);
10034
for(i = 0; (i < knum || (i < knum * 2 && snum < unum * 2)) &&
10035
(word = cbmapiternext(svmap, NULL)) != NULL; i++){
10036
while(*word > '\0' && *word <= ' '){
10039
tscores = est_search_union(db, word, 1, NULL, &tsnum, NULL, TRUE, auxmin, auxwords);
10040
qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
5027
10041
for(j = 0; j < tmax && j < tsnum; j++){
5028
10042
scores[snum].id = tscores[j].id;
5029
scores[snum].score = tscores[j].score;
10043
scores[snum].score = tscores[j].score * (knum * 2.2 - i);
5032
10046
free(tscores);
5033
10047
tmax -= unum / knum / 1.25;
5035
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id);
10048
if(tmax < unum / 4) tmax = unum / 4;
10050
for(i = 0; i < CB_LISTNUM(db->pdocs); i++){
10051
scores[snum].id = ESTPDOCIDMIN + i;
10052
scores[snum].score = 1;
10055
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
10058
for(i = 0; i < snum; i++){
10059
if(nnum > 0 && scores[i].id == lid){
10060
scores[nnum-1].score += scores[i].score;
10063
scores[nnum].id = scores[i].id;
10064
scores[nnum].score = scores[i].score;
10066
lid = scores[i].id;
10069
qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
5038
10071
CB_MALLOC(svec, vnum * sizeof(int));
5039
10072
CB_MALLOC(tvec, vnum * sizeof(int));
5040
10073
est_vector_set_seed(svmap, svec, vnum);
5041
for(i = 0; i < snum; i++){
5042
if(scores[i].id != lid){
5043
tvmap = est_get_tvmap(db, scores[i].id, vnum, tfidf);
5045
est_vector_set_target(svmap, tvmap, tvec, vnum);
5046
if((dval = est_vector_cosine(svec, tvec, vnum)) >= nmin){
5047
scores[nnum].id = scores[i].id;
5048
scores[nnum].score = (int)(dval * 10000);
5049
if(scores[nnum].score == 9999) scores[nnum].score = 10000;
10074
for(i = 0; i < snum && nnum < mnum; i++){
10075
tvmap = est_get_tvmap(db, scores[i].id, vnum, tfidf);
10077
est_vector_set_target(svmap, tvmap, tvec, vnum);
10078
if((dval = est_vector_cosine(svec, tvec, vnum)) >= nmin){
10079
scores[nnum].id = scores[i].id;
10080
scores[nnum].score = (int)(dval * 10000);
10081
if(scores[nnum].score == 9999) scores[nnum].score = 10000;
10082
scores[nnum].value = NULL;