1
/* Copyright (C) 1995-2011 Edward Der-Hua Liu, Hsin-Chu, Taiwan
3
* This library is free software; you can redistribute it and/or
4
* modify it under the terms of the GNU Lesser General Public
5
* License as published by the Free Software Foundation; either
6
* version 2.1 of the License, or (at your option) any later version.
8
* This library is distributed in the hope that it will be useful,
9
* but WITHOUT ANY WARRANTY; without even the implied warranty of
10
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
* Lesser General Public License for more details.
13
* You should have received a copy of the GNU Lesser General Public
14
* License along with this library; if not, write to the Free Software
15
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
#include <sys/types.h>
27
/* Helpers declared here and defined outside this excerpt. */
void load_pin_juyin();
28
phokey_t pinyin2phokey(char *s);
31
//static int bfN_a = 0;
32
/* NOTE(review): presumably set when the input begins with "!!pinyin"; the
 * assignment itself is not visible in this excerpt. */
static gboolean b_pinyin;
34
/* phidx: offsets of the phrase records inside bf (appended in main).
 * sidx:  offsets handed to qcmp_usecount, which indexes sf with them.
 * phcount: number of entries stored in phidx. */
int *phidx, *sidx, phcount;
35
/* bfsize: allocated byte size of bf; phidxsize: allocated entry count of phidx. */
int bfsize, phidxsize;
37
gboolean is_gtab, gtabkey64;
39
/* Key comparator chosen at run time; called by the qsort callbacks of this
 * file. NOTE(review): where it is assigned is not visible here. */
int (*key_cmp)(char *a, char *b, char len);
41
/*
 * Compares two key sequences entry by entry over `len` entries.
 * Returns 1 at the first entry where a's key is greater than b's, and -1 at
 * the first entry where it is smaller.
 * NOTE(review): the loads of ka/kb and the final return are missing from this
 * excerpt; presumably 16-bit keys and 0 when all entries match -- confirm.
 * NOTE(review): `len` is a plain char while the qsort callbacks pass a u_char;
 * where char is signed, a length above 127 would make the loop run zero times.
 */
int key_cmp16(char *a, char *b, char len)
44
for(i=0; i < len; i++) {
48
if (ka > kb) return 1;
49
if (kb > ka) return -1;
57
/*
 * Compares two key sequences entry by entry over `len` entries.
 * Returns 1 at the first entry where a's key is greater than b's, and -1 at
 * the first entry where it is smaller.
 * NOTE(review): the loads of ka/kb and the final return are missing from this
 * excerpt; presumably 32-bit keys and 0 when all entries match -- confirm.
 * NOTE(review): `len` is a plain char while the qsort callbacks pass a u_char;
 * where char is signed, a length above 127 would make the loop run zero times.
 */
int key_cmp32(char *a, char *b, char len)
60
for(i=0; i < len; i++) {
64
if (ka > kb) return 1;
65
if (kb > ka) return -1;
72
/*
 * Compares two key sequences entry by entry over `len` entries.
 * Returns 1 at the first entry where a's key is greater than b's, and -1 at
 * the first entry where it is smaller.
 * NOTE(review): the loads of ka/kb and the final return are missing from this
 * excerpt; presumably 64-bit keys and 0 when all entries match -- confirm.
 * NOTE(review): `len` is a plain char while the qsort callbacks pass a u_char;
 * where char is signed, a length above 127 would make the loop run zero times.
 */
int key_cmp64(char *a, char *b, char len)
75
for(i=0; i < len; i++) {
79
if (ka > kb) return 1;
80
if (kb > ka) return -1;
87
/*
 * qsort callback used on phidx: a and b point to ints that are offsets of
 * records inside bf.
 * A record starts with a 1-byte length followed by a usecount_t, which is
 * copied out with memcpy before the pointer is advanced past it.
 * Visible ordering: the key comparison through key_cmp, then a memcmp of the
 * text bytes, and finally descending use count.
 * NOTE(review): the statements that derive ka, kb and len, and that act on d
 * and on tlena/tlenb, are missing from this excerpt.
 */
static int qcmp(const void *a, const void *b)
89
int idxa=*((int *)a); char *pa = (char *)&bf[idxa];
90
int idxb=*((int *)b); char *pb = (char *)&bf[idxb];
91
u_char lena,lenb, len;
92
usecount_t usecounta, usecountb;
94
lena=*(pa++); memcpy(&usecounta, pa, sizeof(usecount_t)); pa+= sizeof(usecount_t);
97
lenb=*(pb++); memcpy(&usecountb, pb, sizeof(usecount_t)); pb+= sizeof(usecount_t);
102
int d = (*key_cmp)(ka, kb, len);
111
int tlena = utf8_tlen(pa, lena);
112
int tlenb = utf8_tlen(pb, lenb);
119
if ((d=memcmp(pa, pb, tlena)))
122
// large first, so large one will be kept after delete
/* NOTE(review): the difference is converted to int; depending on how
 * usecount_t is defined this may truncate or overflow -- confirm. */
123
return usecountb - usecounta;
126
/*
 * Compares two records of bf, addressed through int offsets, while skipping
 * over the use count field instead of reading it.
 * Visible steps: the key comparison through key_cmp, then the result of a
 * memcmp over the text bytes is returned. main treats a zero result for
 * neighbouring sorted entries as a special case.
 * NOTE(review): the statements that derive ka, kb and len, and that act on d
 * and on tlena/tlenb, are missing from this excerpt.
 */
static int qcmp_eq(const void *a, const void *b)
128
int idxa=*((int *)a); char *pa = (char *)&bf[idxa];
129
int idxb=*((int *)b); char *pb = (char *)&bf[idxb];
130
u_char lena,lenb, len;
132
lena=*(pa++); pa+= sizeof(usecount_t);
135
lenb=*(pb++); pb+= sizeof(usecount_t);
140
int d = (*key_cmp)(ka, kb, len);
149
int tlena = utf8_tlen(pa, lena);
150
int tlenb = utf8_tlen(pb, lenb);
157
return memcmp(pa, pb, tlena);
160
/*
 * qsort callback used on sidx: a and b point to ints that are offsets of
 * records inside sf (not bf).
 * Reads the 1-byte length and the usecount_t of each record, calls key_cmp
 * directly on the bytes that follow, and ends with descending use count.
 * NOTE(review): the assignment of len and the handling of d and of
 * tlena/tlenb are missing from this excerpt.
 */
static int qcmp_usecount(const void *a, const void *b)
162
int idxa=*((int *)a); char *pa = (char *)&sf[idxa];
163
int idxb=*((int *)b); char *pb = (char *)&sf[idxb];
164
u_char lena,lenb, len;
165
usecount_t usecounta, usecountb;
167
lena=*(pa++); memcpy(&usecounta, pa, sizeof(usecount_t)); pa+= sizeof(usecount_t);
168
lenb=*(pb++); memcpy(&usecountb, pb, sizeof(usecount_t)); pb+= sizeof(usecount_t);
171
int d = (*key_cmp)(pa, pb, len);
183
int tlena = utf8_tlen(pa, lena);
184
int tlenb = utf8_tlen(pb, lenb);
/* NOTE(review): the difference is converted to int; depending on how
 * usecount_t is defined this may truncate or overflow -- confirm. */
191
return usecountb - usecounta;
194
/* Defined elsewhere; main calls it with GDK_DISPLAY() and RELOAD_TSIN_DB. */
void send_hime_message(Display *dpy, char *s);
196
/* Defined elsewhere; no call is visible in this excerpt. */
void init_TableDir();
198
/*
 * Program entry point.
 * As far as this excerpt shows, it opens argv[1], looks for a "!!pinyin" or
 * TSIN_GTAB_KEY header, parses each input line into text bytes (chbuf), keys
 * (phbuf / phbuf32 / phbuf64) and a use count, appends the record to bf,
 * sorts the record offsets, copies records into sf, fills hashidx, opens the
 * output file and a companion ".idx" file, and finally calls
 * send_hime_message with RELOAD_TSIN_DB.
 * NOTE(review): many lines of the body, including its end, are missing from
 * this view; the comments below describe only what is visible.
 */
int main(int argc, char **argv)
202
u_char chbuf[MAX_PHRASE_LEN * CH_SZ];
205
u_int64_t phbuf64[80];
206
int i,j,idx,len, ofs;
209
int hashidx[TSIN_HASH_N];
212
/* reload is true unless the HIME_NO_RELOAD environment variable is set. */
gboolean reload = getenv("HIME_NO_RELOAD")==NULL;
215
dbg("need reload\n");
217
dbg("NO_GTK_INIT\n");
220
/* gtk_init is skipped when NO_GTK_INIT is present in the environment. */
if (getenv("NO_GTK_INIT")==NULL)
221
gtk_init(&argc, &argv);
223
dbg("enter %s\n", argv[0]);
226
p_err("must specify input file");
231
if ((fp=fopen(argv[1], "rb"))==NULL) {
232
printf("Cannot open %s\n", argv[1]);
236
skip_utf8_sigature(fp);
238
/* Remember the position so the first line can be re-read after peeking.
 * NOTE(review): ftell returns long; it is narrowed to int here. */
int fofs = ftell(fp);
239
myfgets(s, sizeof(s), fp);
240
if (strstr(s, "!!pinyin")) {
242
printf("is pinyin\n");
245
fseek(fp, fofs, SEEK_SET);
248
int keybits=0, maxkey=0;
251
bzero(kno, sizeof(kno));
252
myfgets(s, sizeof(s), fp);
254
if (strstr(s, TSIN_GTAB_KEY)) {
259
/* NOTE(review): the message text spells "usage" as "useage". */
p_err("useage %s input_file output_file", argv[0]);
263
len=strlen((char *)s);
268
/* NOTE(review): both %s conversions are unbounded and the return value is
 * not checked in the visible lines; keybits and maxkey stay 0 on failure. */
sscanf(s, "%s %d %d %s", aa, &keybits, &maxkey, keymap+1);
269
for(i=0; keymap[i]; i++)
272
/* More than 32 key bits per phrase entry; presumably selects 64-bit keys. */
if (maxkey * keybits > 32)
280
fseek(fp, fofs, SEEK_SET);
285
INMD inmd, *cur_inmd = &inmd;
289
cur_inmd->keybits = keybits;
291
cphbuf = (char *)phbuf64;
294
hash_shift = TSIN_HASH_SHIFT_64;
295
cur_inmd->key64 = TRUE;
297
cphbuf = (char *)phbuf32;
299
hash_shift = TSIN_HASH_SHIFT_32;
301
cur_inmd->key64 = FALSE;
/* Bit position of the last key slot inside a 32- or 64-bit word.
 * NOTE(review): divides by keybits; confirm it cannot be 0 at this point. */
303
cur_inmd->last_k_bitn = (((cur_inmd->key64 ? 64:32) / cur_inmd->keybits) - 1) * cur_inmd->keybits;
304
dbg("cur_inmd->last_k_bitn %d\n", cur_inmd->last_k_bitn);
306
cphbuf = (char *)phbuf;
309
hash_shift = TSIN_HASH_SHIFT;
312
dbg("phsz: %d\n", phsz);
316
usecount_t usecount=0;
320
myfgets((char *)s,sizeof(s),fp);
321
len=strlen((char *)s);
325
if (strstr(s, TSIN_GTAB_KEY))
/* Copy the phrase text, one UTF-8 character at a time, into chbuf.
 * NOTE(review): s[i] is read before the i<len test, and the inner `len`
 * declared on the next statement shadows the line length used here. */
337
while (s[i]!=' ' && i<len) {
338
int len = utf8_sz((char *)&s[i]);
340
memcpy(&chbuf[chbufN], &s[i], len);
/* NOTE(review): the '\t' test sits outside the i<len guard because of
 * operator precedence. */
347
while ((i < len && s[i]==' ') || s[i]=='\t')
351
while (i<len && phbufN < charN && s[i]!=' ') {
355
while (s[i]!=' ' && i<len) {
357
kk64|=(u_int64_t)k << ( LAST_K_bitN - idx*keybits);
363
phbuf64[phbufN++]=kk64;
365
phbuf32[phbufN++]=(u_int)kk64;
369
kk = pinyin2phokey(s+i);
370
while (s[i]!=' ' && i<len)
373
while (s[i]!=' ' && i<len) {
374
if (kk==(BACK_QUOTE_NO << 9))
377
kk |= lookup((u_char *)&s[i]);
379
i+=utf8_sz((char *)&s[i]);
390
p_err("%s Line %d problem in phbufN!=chbufN %d != %d\n", s, lineCnt, phbufN, chbufN);
395
while (i<len && s[i]==' ')
/* NOTE(review): atoi gives no indication of malformed input. */
401
usecount = atoi((char *)&s[i]);
403
/* printf("len:%d\n", clen); */
405
if (phcount >= phidxsize) {
/* NOTE(review): the realloc result overwrites phidx, so the old block is lost
 * on failure; the element size is the literal 4, whereas the sidx allocation
 * further down uses sizeof(int). */
407
if (!(phidx=(int *)realloc(phidx,phidxsize*4))) {
413
phidx[phcount++]=ofs;
/* Bytes needed once this record (length byte, use count, keys, text) is added. */
415
int new_bfN = ofs + 1 + sizeof(usecount_t)+ phsz * clen + chbufN;
417
if (bfsize < new_bfN) {
/* Grow with 1 MiB of headroom.
 * NOTE(review): no check of the realloc result is visible here. */
418
bfsize = new_bfN + 1024*1024;
419
bf = (char *)realloc(bf, bfsize);
/* Record layout in bf: 1 byte clen, usecount_t, clen * phsz key bytes, text. */
422
memcpy(&bf[ofs++],&clen,1);
423
memcpy(&bf[ofs],&usecount, sizeof(usecount_t)); ofs+=sizeof(usecount_t);
425
memcpy(&bf[ofs], cphbuf, clen * phsz);
428
memcpy(&bf[ofs], chbuf, chbufN);
433
/* dumpbf(bf,phidx); */
435
puts("Sorting ....");
437
qsort(phidx,phcount, sizeof(phidx[0]),qcmp);
439
if (!(sf=(u_char *)malloc(bfsize))) {
444
if (!(sidx=(int *)malloc(phidxsize*sizeof(int)))) {
453
for(i=0;i<phcount;i++) {
457
int tlen = utf8_tlen(&bf[idx + 1 + sizeof(usecount_t) + phsz*len], len);
/* Here clen holds the byte size of the whole record.
 * NOTE(review): clen is also the value stored as the 1-byte length above; its
 * declared type is not visible -- confirm it can hold a record size. */
458
clen= phsz*len + tlen + 1 + sizeof(usecount_t);
/* True when this entry equals the previous one under qcmp_eq; presumably the
 * duplicate is then skipped (the branch body is not visible). */
460
if (i && !qcmp_eq(&phidx[i-1], &phidx[i]))
463
memcpy(&sf[ofs], &bf[idx], clen);
470
puts("Sorting by usecount ....");
/* NOTE(review): element size is the literal 4 rather than sizeof(sidx[0]). */
471
qsort(sidx, phcount, 4, qcmp_usecount);
477
for(i=0;i<phcount;i++) {
/* Step over the length byte and the use count to reach the first key. */
479
idx+= 1 + sizeof(usecount_t);
/* v is the first key shifted right by the shift that matches the key width. */
484
memcpy(&kk, &sf[idx], phsz);
485
v = kk >> TSIN_HASH_SHIFT;
486
} else if (phsz==4) {
488
memcpy(&kk32, &sf[idx], phsz);
489
v = kk32 >> TSIN_HASH_SHIFT_32;
493
memcpy(&kk64, &sf[idx], phsz);
494
v = kk64 >> TSIN_HASH_SHIFT_64;
497
if (v >= TSIN_HASH_N)
498
p_err("error found %d", v);
500
if (hashidx[v] < 0) {
/* The last slot is set to phcount; the loops below copy neighbouring entries
 * downwards and then upwards. NOTE(review): the conditions guarding those
 * copies are not visible in this excerpt. */
508
hashidx[TSIN_HASH_N-1]=phcount;
509
for(i=TSIN_HASH_N-2;i>=0;i--) {
511
hashidx[i]=hashidx[i+1];
514
for(i=1; i< TSIN_HASH_N; i++) {
516
hashidx[i]=hashidx[i-1];
519
printf("Writing data %s %d\n", outfile, ofs);
520
if ((fw=fopen(outfile,"wb"))==NULL) {
521
p_err("create err %s", outfile);
526
bzero(&head, sizeof(head));
/* NOTE(review): both strcpy calls into head are unbounded; keymap was filled
 * by the unbounded sscanf above. */
527
strcpy(head.signature, TSIN_GTAB_KEY);
528
head.keybits = keybits;
529
head.maxkey = maxkey;
530
strcpy(head.keymap, keymap);
531
fwrite(&head, sizeof(head), 1, fw);
537
char outfileidx[512];
/* Index file name is the output name plus ".idx".
 * NOTE(review): no length check on outfile before copying into 512 bytes. */
538
strcat(strcpy(outfileidx, outfile), ".idx");
540
dbg("Writing data %s\n", outfileidx);
541
if ((fw=fopen(outfileidx,"wb"))==NULL) {
542
p_err("cannot create %s", outfileidx);
/* Index file contents in order: phcount, the hashidx table, then sidx.
 * NOTE(review): element sizes are the literal 4 and the fwrite results are
 * not checked in the visible lines. */
545
fwrite(&phcount,4,1,fw);
546
fwrite(hashidx,1,sizeof(hashidx),fw);
547
fwrite(sidx,4,phcount,fw);
548
printf("%d phrases\n",phcount);
556
printf("reload....\n");
557
send_hime_message(GDK_DISPLAY(), RELOAD_TSIN_DB);