/* Copyright (C) 1995-2011 Edward Der-Hua Liu, Hsin-Chu, Taiwan
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
25
/* Hash table loaded from the index file by load_tsin_db0(). The visible
 * lookups treat hashidx[h] .. hashidx[h+1] as the range of index slots
 * scanned for hash value h. */
int hashidx[TSIN_HASH_N];
27
// Stream over the index file: positioned by seek_fp_phidx(), read by get_phidx().
static FILE *fp_phidx;
30
// Size of one phonetic key; used as the element size for key copies and reads.
int ph_key_sz; // bytes
31
// Copy of the is_gtab_i flag given to load_tsin_db0(); passed back in by reload_tsin_db().
gboolean tsin_is_gtab;
32
// Hash shift selected by load_tsin_db0() (one of the TSIN_HASH_SHIFT* constants).
static int tsin_hash_shift;
34
// Bytes at the start of the index file (phrase count, then hash table)
// that precede the first int-sized index slot.
#define PHIDX_SKIP (sizeof(phcount) + sizeof(hashidx))
36
// strdup()ed path of the currently loaded database; reset to NULL when it is released.
char *current_tsin_fname;
38
//static int *ts_gtab_hash;
43
// Defined elsewhere; called in this file with a base name and an output buffer.
// Presumably resolves the name to a per-user or system path -- confirm against its definition.
void get_hime_user_or_sys_fname(char *name, char fname[]);
45
/* Open the phrase database `infname` together with its companion index file
 * (`infname` + ".idx"), load the phrase count and hash table from the index,
 * and pick the hash shift from the data file's header.
 *
 *   infname   - path of the phrase data file.
 *   is_gtab_i - stored into the global tsin_is_gtab.
 *
 * Both files are opened "rb+" (read/update, binary). Failure to open either
 * one is reported through p_err().
 */
void load_tsin_db0(char *infname, gboolean is_gtab_i)
48
// dbg("cur %s %s\n", infname, current_tsin_fname);
50
// Same file as the one already loaded. The statement this guards is not
// visible here -- presumably an early return; confirm.
if (current_tsin_fname && !strcmp(current_tsin_fname, infname))
53
// Build the index file name: data file name + ".idx".
strcpy(tsidxfname, infname);
54
strcat(tsidxfname, ".idx");
56
// dbg("tsidxfname %s\n", tsidxfname);
60
if ((fr=fopen(tsidxfname,"rb+"))==NULL) {
61
p_err("load_tsin_db0 A Cannot open '%s'\n", tsidxfname);
65
// Index file starts with the phrase count ...
// NOTE(review): reads a hard-coded 4 bytes while PHIDX_SKIP uses
// sizeof(phcount); the two agree only if phcount is 4 bytes wide.
fread(&phcount,4,1,fr);
67
// Written to stdout unconditionally (unlike the dbg() calls).
printf("phcount:%d\n",phcount);
69
a_phcount=phcount+256;
70
// ... followed by the whole hash table.
fread(&hashidx,1,sizeof(hashidx),fr);
77
dbg("tsfname: %s\n", infname);
79
if ((fph=fopen(infname,"rb+"))==NULL)
80
p_err("load_tsin_db0 B Cannot open '%s'", infname);
82
// Remember which file is loaded (replacing any previous name).
free(current_tsin_fname);
83
current_tsin_fname = strdup(infname);
87
// Read the data file header into `head`.
fread(&head, sizeof(head), 1, fph);
88
// Keys wider than 32 bits in total use the 64-bit hash shift. The conditions
// selecting the other two assignments below are not visible here.
if (head.keybits*head.maxkey > 32) {
90
tsin_hash_shift = TSIN_HASH_SHIFT_64;
94
tsin_hash_shift = TSIN_HASH_SHIFT_32;
98
tsin_hash_shift = TSIN_HASH_SHIFT;
100
tsin_is_gtab = is_gtab_i;
109
// NOTE(review): the function headers enclosing the statements below are not
// visible in this view; the comments describe only the statements themselves.
//
// Release the loaded database: drop the cached file name and close both
// streams, clearing each pointer after use.
free(current_tsin_fname); current_tsin_fname=NULL;
112
fclose(fph); fph = NULL;
116
fclose(fp_phidx); fp_phidx=NULL;
125
// Load the database named by tsin32_f: resolve it into tsfname, then load
// it with the is_gtab flag set to FALSE.
char *fname = tsin32_f;
127
get_hime_user_or_sys_fname(fname, tsfname);
128
load_tsin_db0(tsfname, FALSE);
132
/* Position fp_phidx at index slot `i`. Slots are sizeof(int) bytes each and
 * start PHIDX_SKIP bytes into the index file. */
static void seek_fp_phidx(int i)
134
fseek(fp_phidx, PHIDX_SKIP + i*sizeof(int), SEEK_SET);
137
/* Reload the currently loaded database through load_tsin_db0(), keeping the
 * current tsin_is_gtab flag. */
void reload_tsin_db()
140
// Nothing loaded. The guarded statement is not visible here -- presumably
// an early return; confirm.
if (!current_tsin_fname)
143
// Copy the name before freeing it, then clear current_tsin_fname so the
// "same file already loaded" test in load_tsin_db0() is false.
strcpy(tt, current_tsin_fname);
144
free(current_tsin_fname); current_tsin_fname = NULL;
145
load_tsin_db0(tt, tsin_is_gtab);
148
/* Read one int from the index stream fp_phidx for slot `i`.
 * The seek that positions the stream and the return statement are not
 * visible here -- presumably seek_fp_phidx(i) and `return t`; confirm. */
inline static int get_phidx(int i)
152
fread(&t, sizeof(int), 1, fp_phidx);
155
// Adjust by the size of TSIN_GTAB_HEAD (save_phrase_to_db() subtracts the
// same amount before storing an offset). The condition under which this
// applies is not visible here.
t += sizeof(TSIN_GTAB_HEAD);
161
/* Element-wise ordering of two phokey_t key arrays over the first `len`
 * entries: 1 if `a` is larger at the first difference, -1 if smaller.
 * The result for equal arrays is not visible here (presumably 0). */
static inline int phokey_t_seq16(phokey_t *a, phokey_t *b, int len)
165
for (i=0;i<len;i++) {
166
if (a[i] > b[i]) return 1;
168
if (a[i] < b[i]) return -1;
175
/* Element-wise ordering of two u_int key arrays over the first `len`
 * entries: 1 if `a` is larger at the first difference, -1 if smaller.
 * The result for equal arrays is not visible here (presumably 0). */
static inline int phokey_t_seq32(u_int *a, u_int *b, int len)
179
for (i=0;i<len;i++) {
180
if (a[i] > b[i]) return 1;
182
if (a[i] < b[i]) return -1;
189
/* Element-wise ordering of two u_int64_t key arrays over the first `len`
 * entries: 1 if `a` is larger at the first difference, -1 if smaller.
 * The result for equal arrays is not visible here (presumably 0). */
static inline int phokey_t_seq64(u_int64_t *a, u_int64_t *b, int len)
193
for (i=0;i<len;i++) {
194
if (a[i] > b[i]) return 1;
196
if (a[i] < b[i]) return -1;
203
/* Compare two key arrays of `len` elements by forwarding to the 16-, 32- or
 * 64-bit comparison. The conditions choosing between the three calls are not
 * visible here -- presumably they test ph_key_sz; confirm. */
static int phokey_t_seq(void *a, void *b, int len)
206
return phokey_t_seq16((phokey_t *)a, (phokey_t *)b, len);
208
return phokey_t_seq32((u_int *)a, (u_int *)b, len);
210
return phokey_t_seq64((u_int64_t*)a, (u_int64_t*)b, len);
215
/* Order two serialized phrase records. Each record starts with a one-byte
 * phrase length, followed by a usecount_t, followed by the keys.
 * Keys are compared over `mlen` entries first; when that comparison does not
 * decide (handling of `d` is not visible here), the longer record sorts later. */
static int phseq(u_char *a, u_char *b)
217
u_char lena, lenb, mlen;
219
// First byte of each record is its phrase length.
lena=*(a++); lenb=*(b++);
220
a+=sizeof(usecount_t); b+=sizeof(usecount_t); // skip usecount
223
// The assignment of mlen is not visible here -- presumably min(lena, lenb).
u_int64_t ka[MAX_PHRASE_LEN], kb[MAX_PHRASE_LEN];
225
memcpy(ka, a, ph_key_sz * mlen);
226
memcpy(kb, b, ph_key_sz * mlen);
228
// NOTE(review): the keys were just copied into ka/kb, but the comparison is
// given a and b (pointers into the byte records) rather than the copies.
// Confirm whether ka/kb were intended here.
int d = phokey_t_seq(a, b, mlen);
232
// Length tie-break.
if (lena > lenb) return 1;
233
if (lena < lenb) return -1;
237
// Forward declaration: save_phrase_to_db() calls this before its definition appears.
void inc_dec_tsin_use_count(void *pho, char *ch, int N);
239
// File-local flag; its reads and writes are not visible in this view.
static gboolean saved_phrase;
241
/* Add a phrase to the database, or bump its use count if it is already there.
 *
 *   phkeys   - `len` phonetic keys, each ph_key_sz bytes.
 *   utf8str  - the phrase text (UTF-8).
 *   len      - number of keys / characters in the phrase.
 *   usecount - initial use count stored with the record.
 *
 * The record is assembled in tbuf as: byte 0 (its assignment is not visible
 * here -- presumably `len`), the use count, the keys, then the UTF-8 bytes.
 * The hash bucket for the first key is scanned in index order; a new record is
 * appended to the data file and its offset inserted into the index at the
 * position found, after which the phrase count and hash table at the start of
 * the index file are rewritten. Return statements are not visible in this view.
 */
gboolean save_phrase_to_db(void *phkeys, char *utf8str, int len, usecount_t usecount)
243
int mid, ord = 0, ph_ofs, hashno;
244
// tbuf: record being saved; sbuf: record read back from the file for comparison.
// Both are sized for the widest (64-bit) key.
u_char tbuf[MAX_PHRASE_LEN*(sizeof(u_int64_t)+CH_SZ) + 1 + sizeof(usecount_t)],
245
sbuf[MAX_PHRASE_LEN*(sizeof(u_int64_t)+CH_SZ) + 1 + sizeof(usecount_t)];
250
memcpy(&tbuf[1], &usecount, sizeof(usecount)); // usecount
251
// Byte length of the first `len` characters of utf8str (per utf8_tlen, defined elsewhere).
int tlen = utf8_tlen(utf8str, len);
253
dbg("tlen %d '", tlen);
254
for(i=0; i < tlen; i++)
259
dbg("save_phrase_to_db '%s' tlen:%d\n", utf8str, tlen);
261
// Keys go right after the use count, text right after the keys.
memcpy(&tbuf[1 + sizeof(usecount_t)], phkeys, ph_key_sz * len);
262
memcpy(&tbuf[ph_key_sz*len + 1 + sizeof(usecount_t)], utf8str, tlen);
265
// Hash bucket = first key shifted right; the key width selects the shift
// constant. The leading `if` of this chain is not visible here.
hashno= *((phokey_t *)phkeys) >> TSIN_HASH_SHIFT;
266
else if (ph_key_sz==4)
267
hashno= *((u_int *)phkeys) >> TSIN_HASH_SHIFT_32;
269
hashno= *((u_int64_t *)phkeys) >> TSIN_HASH_SHIFT_64;
271
// dbg("hashno %d\n", hashno);
273
// Out-of-range bucket; the guarded statement is not visible here.
if (hashno >= TSIN_HASH_N)
276
// Walk the bucket's index slots looking for the insertion point or a duplicate.
for(mid=hashidx[hashno]; mid<hashidx[hashno+1]; mid++) {
277
ph_ofs=get_phidx(mid);
279
fseek(fph, ph_ofs, SEEK_SET);
281
// The read of the length byte into sbuf[0] is not visible here.
fread(&sbuf[1], sizeof(usecount_t), 1, fph); // use count
282
fread(&sbuf[1+sizeof(usecount_t)], 1, (ph_key_sz + CH_SZ) * sbuf[0], fph);
283
// Stored record sorts after the new one: insertion point reached
// (the guarded statement is not visible -- presumably a break).
if ((ord=phseq(sbuf,tbuf)) > 0)
286
// Same keys and same text bytes: the phrase is already stored.
if (!ord && !memcmp(&sbuf[sbuf[0]*ph_key_sz+1+sizeof(usecount_t)], utf8str, tlen)) {
288
dbg("Phrase already exists\n");
289
inc_dec_tsin_use_count(phkeys, utf8str, len);
294
// Number of index slots from the insertion point to the end.
int wN = phcount - mid;
296
// dbg("wN %d phcount:%d mid:%d\n", wN, phcount, mid);
299
// Shift those slots up by one: read wN ints, then write them back starting
// at slot mid+1. The seek before the read is not visible here.
int *phidx = tmalloc(int, wN);
301
fread(phidx, sizeof(int), wN, fp_phidx);
302
seek_fp_phidx(mid+1);
303
fwrite(phidx, sizeof(int), wN, fp_phidx);
307
// The new record is appended at the end of the data file.
fseek(fph,0,SEEK_END);
311
// Offsets are stored without the TSIN_GTAB_HEAD size (get_phidx() adds it
// back). The assignment of ph_ofs and any condition on this line are not visible.
ph_ofs -= sizeof(TSIN_GTAB_HEAD);
313
// dbg("ph_ofs %d ph_key_sz:%d\n", ph_ofs, ph_key_sz);
315
fwrite(&ph_ofs, sizeof(int), 1, fp_phidx);
318
// Record length: 1 length byte + use count + keys + text bytes.
fwrite(tbuf, 1, ph_key_sz*len + tlen + 1+ sizeof(usecount_t), fph);
321
// Hash table maintenance after the insert; the statements guarded by the
// next two lines are not visible here.
if (hashidx[hashno]>mid)
324
for(hashno++; hashno<TSIN_HASH_N; hashno++)
328
// Rewrite the phrase count and hash table. The seek to the start of the
// index file is not visible here.
fwrite(&phcount, sizeof(phcount), 1, fp_phidx);
329
fwrite(&hashidx,sizeof(hashidx),1, fp_phidx);
332
// dbg("ofs %d\n", get_phidx(mid));
338
#include <sys/stat.h>
341
/* Read one phrase record from the current position of fph.
 *
 *   len      - out: phrase length (one byte).
 *   usecount - out: stored use count.
 *   pho      - out: *len keys of ph_key_sz bytes each.
 *   ch       - out: *len * CH_SZ bytes of phrase text.
 *
 * NOTE(review): tsin_seek() passes ch == NULL; any guard around the text
 * read is not visible here -- confirm before relying on it.
 */
void load_tsin_entry0(char *len, usecount_t *usecount, void *pho, u_char *ch)
345
fread(len, 1, 1, fph);
347
// An impossible length is taken as a sign that the file changed underneath us.
if (*len > MAX_PHRASE_LEN || *len <= 0) {
348
dbg("err: tsin db changed reload\n");
349
reload_tsin_db(); // probably db changed, reload;
354
fread(usecount, sizeof(usecount_t), 1, fph); // use count
355
fread(pho, ph_key_sz, (int)(*len), fph);
357
fread(ch, CH_SZ, (int)(*len), fph);
358
// Byte length of the *len characters just read; how tlen is used is not visible here.
int tlen = utf8_tlen((char *)ch, *len);
364
/* Load the phrase record referenced by index slot `idx`: look up its file
 * offset with get_phidx(), seek fph there, and read it with
 * load_tsin_entry0(). Output parameters are the same as load_tsin_entry0(). */
void load_tsin_entry(int idx, char *len, usecount_t *usecount, void *pho, u_char *ch)
369
// Slot beyond the known phrase count: reload the database. What follows the
// reload inside this branch is not visible here.
if (idx >= phcount) {
370
reload_tsin_db(); // probably db changed, reload;
375
int ph_ofs=get_phidx(idx);
376
// dbg("idx %d:%d\n", idx, ph_ofs);
378
fseek(fph, ph_ofs , SEEK_SET);
379
load_tsin_entry0(len, usecount, pho, ch);
382
// tone_mask : 1 -> pho has tone
// Iterates over the `plen` entries of `pho`; the per-entry operation and any
// early exit are not visible in this view.
// tsin_seek() calls this with its own tone_mask argument, which its callers may pass as NULL.
383
void mask_tone(phokey_t *pho, int plen, char *tone_mask)
386
// dbg("mask_tone\n");
390
for(i=0; i < plen; i++) {
397
// *** r_sti<= range < r_edi
// Look up the key sequence `pho` (plen keys) in the index and report the
// matching slots through r_sti / r_edi as the half-open range noted above.
// tone_mask is forwarded to mask_tone(); several conditions below also
// branch on whether it is NULL.
// Many statements of this function (loop headers of the searches, the
// assignments of mid/mlen, the writes to r_sti/r_edi, the returns) are not
// visible in this view; the comments cover the visible lines only.
398
gboolean tsin_seek(void *pho, int plen, int *r_sti, int *r_edi, char *tone_mask)
401
// Key buffers for records loaded during the search, sized for 64-bit keys.
u_int64_t ss[MAX_PHRASE_LEN], stk[MAX_PHRASE_LEN];
407
dbg("tsin_seek %d\n", plen);
412
// Applied to the caller's key array itself, not to a copy.
mask_tone((phokey_t *)pho, plen, tone_mask);
416
// Hash bucket from the first key, as in save_phrase_to_db(). The leading
// `if` of this chain is not visible here.
hashi= *((phokey_t *)pho) >> TSIN_HASH_SHIFT;
417
else if (ph_key_sz==4)
418
hashi= *((u_int *)pho) >> TSIN_HASH_SHIFT_32;
420
hashi= *((u_int64_t *)pho) >> TSIN_HASH_SHIFT_64;
422
if (hashi >= TSIN_HASH_N) {
423
// dbg("hashi >= TSIN_HASH_N\n");
427
// Search bounds: the index slots of this hash bucket.
int top=hashidx[hashi];
428
int bot=hashidx[hashi+1];
431
// dbg("top>=phcount\n");
437
// Only the keys are needed here; the text buffer argument is NULL.
load_tsin_entry(mid, &len, &usecount, ss, NULL);
446
// mask_tone((phokey_t *)ss, mlen, tone_mask);
455
cmp=phokey_t_seq(ss, pho, mlen);
457
// Keys equal so far but the stored phrase is shorter than the query.
if (!cmp && len < plen)
469
// No key match and no tone mask given.
if (cmp && !tone_mask) {
470
// dbg("no match %d\n", cmp);
475
// seek to the first match because binary search is used
476
gboolean found=FALSE;
479
// Walk downwards from mid to locate the start of the matching range.
for(sti = mid; sti>=0; sti--) {
480
load_tsin_entry(sti, &len, &usecount, stk, NULL);
490
mask_tone((phokey_t *)stk, mlen, tone_mask);
492
int v = phokey_t_seq(stk, pho, plen);
498
dbg("%d] %d*> ", sti, mlen);
503
if ((!tone_mask && !v && len>=plen) ||
504
((tone_mask && v>0) || (!v && len >= plen)))
513
// Second pass over the same bucket.
int top=hashidx[hashi];
514
int bot=hashidx[hashi+1];
517
// dbg("top>=phcount\n");
521
// Copy of the query keys with the low three bits of every key set.
// Presumably those bits hold the tone, making tpho the highest key with the
// same non-tone part -- confirm against the phokey_t layout.
phokey_t tpho[MAX_PHRASE_LEN];
524
for(i=0; i < plen; i++)
525
tpho[i]=((phokey_t*)pho)[i] | 7;
529
load_tsin_entry(mid, &len, &usecount, ss, NULL);
546
cmp=phokey_t_seq(ss, tpho, mlen);
548
if (!cmp && len < plen)
562
// Walk upwards from mid to locate the end of the matching range.
for(edi = mid; edi < phcount; edi++) {
563
load_tsin_entry(edi, &len, &usecount, stk, NULL);
573
mask_tone((phokey_t *)stk, mlen, tone_mask);
575
int v = phokey_t_seq(stk, pho, plen);
579
dbg("edi%d -> ", edi);
584
if ((!tone_mask && !v && len >= plen)
585
|| ((tone_mask && v<0) || (!v && len >= plen)))
591
dbg("sti%d edi:%d found:%d\n", sti, edi, found);
600
void inc_dec_tsin_use_count(void *pho, char *ch, int N)
604
// dbg("inc_dec_tsin_use_count '%s'\n", ch);
606
if (!tsin_seek(pho, N, &sti, &edi, NULL))
612
int tlen = strlen(ch);
614
dbg("otlen %d ", tlen);
616
for(i=0; i < tlen; i++)
621
for(idx=sti; idx < edi; idx++) {
623
usecount_t usecount, n_usecount;
624
u_int64_t phi[MAX_PHRASE_LEN];
625
char stch[MAX_PHRASE_LEN * CH_SZ * 2];
627
load_tsin_entry(idx, &len, &usecount, phi, (u_char *)stch);
628
n_usecount = usecount;
630
if (len!=N || phokey_t_seq(phi, pho, N))
633
for(i=0; i < tlen; i++)
638
// dbg("stch %s\n", stch);
639
if (strcmp(stch, ch))
642
dbg("found match\n");
644
int ph_ofs=get_phidx(idx);
646
fseek(fph, ph_ofs + 1, SEEK_SET);
648
if (usecount < 0x3fffffff)
651
if (n_usecount != usecount) {
652
fwrite(&n_usecount, sizeof(usecount_t), 1, fph); // use count