/* Copyright (C) 1995-2007	Edward Der-Hua Liu, Hsin-Chu, Taiwan */

/*
 * Access to the "tsin32" phrase database and its ".idx" index file.
 *
 * Record layout in the data file, as written by save_phrase_to_db():
 *   1 byte                 phrase length in characters (len)
 *   sizeof(usecount_t)     use count
 *   len * sizeof(phokey_t) phonetic keys
 *   variable               the phrase text in UTF-8 (no terminator)
 *
 * Index file layout, as written by save_phrase_to_db():
 *   4 bytes                phcount
 *   sizeof(hashidx)        hashidx[]: first phidx slot of every hash bucket
 *   4 * phcount            phidx[]: data-file offset of every record,
 *                          kept in phseq() order
 *
 * NOTE(review): the on-disk format uses 4-byte integers while the in-memory
 * arrays are `int`; this assumes sizeof(int) == 4, as the original code did.
 */

#include "gcin.h"
#include "pho.h"
#include "tsin.h"

int hashidx[TSIN_HASH_N];
int *phidx;
FILE *fph;
int phcount;
char tsidxfname[64]="";
static int a_phcount;          /* number of ints allocated for phidx */
static char tsfname[64]="";

/* Largest possible raw record: length byte + use count + keys + text. */
#define TSIN_RAW_REC_MAX \
  (MAX_PHRASE_LEN * (sizeof(phokey_t) + CH_SZ) + 1 + sizeof(usecount_t))


/*
 * Return the exclusive upper phidx slot of hash bucket `hashno`.
 *
 * The next bucket's start is the natural end, but hashidx[] has only
 * TSIN_HASH_N elements, so the last bucket ends at phcount instead of
 * reading hashidx[TSIN_HASH_N].  The result is never larger than phcount.
 */
static int tsin_bucket_end(int hashno)
{
  int end = phcount;

  if (hashno + 1 < TSIN_HASH_N && hashidx[hashno + 1] < end)
    end = hashidx[hashno + 1];

  return end;
}


#if USE_TSIN
/*
 * (Re)load the index file into phcount / hashidx / phidx and (re)open the
 * phrase data file in fph for reading and writing.
 *
 * The data file name is resolved once: the per-user file when GCIN_TABLE_DIR
 * is not set, the system table file otherwise.
 *
 * Failures are reported through p_err(), like the rest of this file.
 * NOTE(review): p_err() is presumed not to return; the explicit returns
 * below only keep the function safe should it ever do so.
 */
void load_tsin_db()
{
  if (!tsfname[0]) {
    if (!getenv("GCIN_TABLE_DIR"))
      get_gcin_user_fname("tsin32", tsfname);
    else
      get_sys_table_file_name("tsin32", tsfname);
  }

  /* Bounded: tsfname may already use all 64 bytes, ".idx" must not overflow. */
  snprintf(tsidxfname, sizeof(tsidxfname), "%s.idx", tsfname);

  FILE *fr = fopen(tsidxfname, "rb");
  if (fr == NULL) {
    p_err("Cannot open %s\n", tsidxfname);
    return;
  }

  int count = 0;
  /* The upper bound keeps `count + 256` below from overflowing. */
  if (fread(&count, 4, 1, fr) != 1 || count < 0 || count > 0x7fffffff - 256) {
    fclose(fr);
    p_err("Bad phrase count in %s\n", tsidxfname);
    return;
  }

  if (fread(hashidx, 1, sizeof(hashidx), fr) != sizeof(hashidx)) {
    fclose(fr);
    p_err("Cannot read hash index from %s\n", tsidxfname);
    return;
  }

  free(phidx);
  phidx = NULL;
  phcount = count;
  a_phcount = count + 256;   /* head room for phrases added later */

  if ((phidx = tmalloc(int, a_phcount)) == NULL) {
    fclose(fr);
    p_err("malloc err pp 1");
    return;
  }

  if (fread(phidx, 4, phcount, fr) != (size_t)phcount) {
    fclose(fr);
    p_err("Cannot read phrase index from %s\n", tsidxfname);
    return;
  }

  fclose(fr);

  if (fph) {
    fclose(fph);
    fph = NULL;
  }

  dbg("tsfname: %s\n", tsfname);

  if ((fph = fopen(tsfname, "rb+")) == NULL)
    p_err("Cannot open %s", tsfname);
}
#endif


#if USE_TSIN
/* Close the phrase data file and release the in-memory offset table. */
void free_tsin()
{
  if (fph) {
    fclose(fph);
    fph = NULL;
  }

  free(phidx);
  phidx = NULL;
}
#endif


/*
 * Order two raw records (length byte, use count, keys, ...) by their
 * phonetic keys; when one key sequence is a prefix of the other the shorter
 * record sorts first.  The use count is skipped.
 *
 * Returns 1, -1 or 0.  Keys are copied out with memcpy because they sit at
 * arbitrary byte offsets inside the buffers.
 *
 * NOTE(review): the loop header and the key copies were missing from the
 * damaged source and have been restored.
 */
static int phseq(u_char *a, u_char *b)
{
  u_char lena, lenb, mlen;
  int i;
  phokey_t ka, kb;

  lena = *(a++);
  lenb = *(b++);
  a += sizeof(usecount_t);
  b += sizeof(usecount_t);   // skip usecount

  mlen = Min(lena, lenb);

  for (i = 0; i < mlen; i++) {
    memcpy(&ka, a, sizeof(phokey_t));
    memcpy(&kb, b, sizeof(phokey_t));

    if (ka > kb) return 1;
    if (ka < kb) return -1;

    a += sizeof(phokey_t);
    b += sizeof(phokey_t);
  }

  if (lena > lenb) return 1;
  if (lena < lenb) return -1;

  return 0;
}


void inc_dec_tsin_use_count(phokey_t *pho, char *ch, int N, gboolean b_dec);


/*
 * Read the raw record at data-file offset `ofs` into `buf`, which must hold
 * TSIN_RAW_REC_MAX bytes.  Up to CH_SZ text bytes per character are read, so
 * the tail of `buf` may contain bytes of the following record; a short read
 * of the text part is accepted because the last record in the file can be
 * shorter than that.
 *
 * Returns FALSE when the record cannot be read or its length byte is not in
 * 1..MAX_PHRASE_LEN.
 */
static gboolean tsin_read_raw(int ofs, u_char *buf)
{
  memset(buf, 0, TSIN_RAW_REC_MAX);

  if (fseek(fph, ofs, SEEK_SET) != 0)
    return FALSE;
  if (fread(buf, 1, 1, fph) != 1)
    return FALSE;
  if (buf[0] == 0 || buf[0] > MAX_PHRASE_LEN)
    return FALSE;
  if (fread(&buf[1], sizeof(usecount_t), 1, fph) != 1)   // use count
    return FALSE;

  size_t keys_sz = sizeof(phokey_t) * buf[0];
  size_t got = fread(&buf[1 + sizeof(usecount_t)], 1,
                     keys_sz + (size_t)CH_SZ * buf[0], fph);

  return got >= keys_sz;
}


/*
 * Add a phrase to the database.
 *
 *   phkeys   - `len` phonetic keys
 *   utf8str  - the phrase text, `len` UTF-8 characters
 *   len      - phrase length in characters, 1..MAX_PHRASE_LEN
 *   usecount - initial use count stored with the record
 *
 * The record is appended to the data file, its offset is inserted into
 * phidx[] at the sorted position inside its hash bucket, the start of every
 * later bucket is shifted by one, and the index file is rewritten.
 *
 * Returns TRUE when the phrase was added.  Returns FALSE when the arguments
 * are out of range, when an existing record cannot be read, when the index
 * file cannot be rewritten, or when the phrase already exists - in that last
 * case its use count is incremented instead.
 *
 * NOTE(review): the scan loop was partly missing from the damaged source; it
 * has been rebuilt from the record layout this function writes.
 */
gboolean save_phrase_to_db(phokey_t *phkeys, char *utf8str, int len, usecount_t usecount)
{
  int mid, ord, ph_ofs, hashno, i;
  FILE *fw;
  u_char tbuf[TSIN_RAW_REC_MAX], sbuf[TSIN_RAW_REC_MAX];

  /* tbuf is sized for MAX_PHRASE_LEN characters; reject anything larger. */
  if (len <= 0 || len > MAX_PHRASE_LEN)
    return FALSE;

  int tlen = utf8_tlen(utf8str, len);
  if (tlen <= 0 || tlen > len * CH_SZ)
    return FALSE;

  size_t text_ofs = sizeof(phokey_t) * len + 1 + sizeof(usecount_t);

  tbuf[0] = len;
  memcpy(&tbuf[1], &usecount, sizeof(usecount));  // usecount
  memcpy(&tbuf[1 + sizeof(usecount_t)], phkeys, sizeof(phokey_t) * len);
  memcpy(&tbuf[text_ofs], utf8str, tlen);

  hashno = phkeys[0] >> TSIN_HASH_SHIFT;
  if (hashno >= TSIN_HASH_N)
    return FALSE;

  /*
   * Walk the bucket up to the first record that sorts after the new one.
   * Records with identical keys (ord == 0) are all compared against the
   * new text, so a duplicate is found even when it is not the first one.
   */
  int bucket_end = tsin_bucket_end(hashno);
  int first_equal = -1;

  for (mid = hashidx[hashno]; mid < bucket_end; mid++) {
    if (!tsin_read_raw(phidx[mid], sbuf)) {
      dbg("err: cannot read tsin entry %d\n", mid);
      return FALSE;
    }

    ord = phseq(sbuf, tbuf);
    if (ord > 0)
      break;
    if (ord < 0)
      continue;

    /* ord == 0: same length and keys, so the text starts at text_ofs. */
    if (!memcmp(&sbuf[text_ofs], utf8str, tlen)) {
//    bell();
      dbg("Phrase already exists\n");
      inc_dec_tsin_use_count(phkeys, utf8str, len, FALSE);
      return FALSE;
    }

    if (first_equal < 0)
      first_equal = mid;
  }

  /* Insert in front of the first record with equal keys, if there is one. */
  if (first_equal >= 0)
    mid = first_equal;

  /* Make room for one more offset BEFORE shifting the table. */
  if (phcount + 1 > a_phcount) {
    int new_cap = a_phcount + 256;
    int *grown = trealloc(phidx, int, new_cap);

    if (!grown) {
      p_err("tsin.c:realloc err");
      return FALSE;
    }

    phidx = grown;
    a_phcount = new_cap;
  }

  for (i = phcount - 1; i >= mid; i--)
    phidx[i + 1] = phidx[i];

  fseek(fph, 0, SEEK_END);
  ph_ofs = ftell(fph);
  phidx[mid] = ph_ofs;
  phcount++;

  fwrite(tbuf, 1, text_ofs + tlen, fph);
  fflush(fph);

  /* Every later bucket now starts one slot further down. */
  for (i = hashno + 1; i < TSIN_HASH_N; i++)
    hashidx[i]++;

  if ((fw = fopen(tsidxfname, "wb")) == NULL) {
    dbg("%s create err", tsidxfname);
    return FALSE;
  }

  gboolean ok = fwrite(&phcount, 4, 1, fw) == 1 &&
                fwrite(&hashidx, sizeof(hashidx), 1, fw) == 1 &&
                fwrite(phidx, 4, phcount, fw) == (size_t)phcount;

  if (fclose(fw) != 0)
    ok = FALSE;

  if (!ok) {
    dbg("%s write err", tsidxfname);
    return FALSE;
  }

  return TRUE;
}


int *ts_gtab;
extern int ts_gtabN;


/*
 * Read the record at the current position of fph and store its text in
 * `str` as a NUL-terminated UTF-8 string.  The use count and the phonetic
 * keys are read and discarded.
 *
 * `str` must provide MAX_PHRASE_STR_LEN bytes; both callers in this file
 * pass a buffer of that size.
 *
 * Returns the text length in bytes, or 0 at end of file, on a short read,
 * on a length byte outside 1..MAX_PHRASE_LEN, or when the text would not
 * fit into MAX_PHRASE_STR_LEN bytes.
 */
int read_tsin_phrase(char *str)
{
  u_char len = 0;
  usecount_t usecount;
  u_char pho[sizeof(phokey_t) * MAX_PHRASE_LEN];

  if (fread(&len, 1, 1, fph) != 1)
    return 0;
  if (len > MAX_PHRASE_LEN || len <= 0)
    return 0;

  if (fread(&usecount, sizeof(usecount_t), 1, fph) != 1)  // use count
    return 0;
  if (fread(pho, sizeof(phokey_t), len, fph) != len)
    return 0;

  int i;
  int tlen = 0;

  for (i = 0; i < len; i++) {
    /* The lead byte tells how many bytes the character occupies. */
    if (fread(&str[tlen], 1, 1, fph) != 1)
      return 0;

    int sz = utf8_sz(&str[tlen]);
    if (sz < 1 || tlen + sz >= MAX_PHRASE_STR_LEN)
      return 0;

    if (sz > 1 && fread(&str[tlen + 1], 1, sz - 1, fph) != (size_t)(sz - 1))
      return 0;

    tlen += sz;
  }

  str[tlen] = 0;
  return tlen;
}


/* One phrase text plus the data-file offset of its record. */
typedef struct {
  char ts[MAX_PHRASE_STR_LEN];
  int ofs;
} TS_TMP;


/* qsort() comparator: order TS_TMP entries by phrase text (strcmp). */
static int qcmp_ts_gtab(const void *aa, const void *bb)
{
  const TS_TMP *a = (const TS_TMP *)aa;
  const TS_TMP *b = (const TS_TMP *)bb;

  return strcmp(a->ts, b->ts);
}


#if USE_TSIN
/*
 * Rebuild ts_gtab[]: the data-file offsets of all records, sorted by phrase
 * text.  The database is reloaded first and the data file is read from the
 * beginning until read_tsin_phrase() returns 0.  ts_gtabN receives the
 * number of entries.
 */
void build_ts_gtab()
{
  load_tsin_db();

  fseek(fph, 0, SEEK_SET);

  free(ts_gtab);
  ts_gtab = NULL;
  ts_gtabN = 0;

  TS_TMP *tstmp = NULL;
  int tstmpN = 0;

  while (!feof(fph)) {
    /* Keep the old pointer until the reallocation is known to have worked. */
    TS_TMP *grown = trealloc(tstmp, TS_TMP, tstmpN + 1);

    if (!grown) {
      free(tstmp);
      p_err("tsin.c:realloc err");
      return;
    }
    tstmp = grown;

    tstmp[tstmpN].ofs = ftell(fph);
    if (!read_tsin_phrase(tstmp[tstmpN].ts))
      break;

    tstmpN++;
  }

  qsort(tstmp, tstmpN, sizeof(TS_TMP), qcmp_ts_gtab);

  if (tstmpN > 0) {
    ts_gtab = tmalloc(int, tstmpN);
    if (!ts_gtab) {
      free(tstmp);
      p_err("malloc err pp 1");
      return;
    }

    int i;
    for (i = 0; i < tstmpN; i++)
      ts_gtab[i] = tstmp[i].ofs;
  }

  ts_gtabN = tstmpN;
  free(tstmp);
}
#endif


/*
 * Read the phrase text of ts_gtab[idx] into `tstr`.
 * Returns what read_tsin_phrase() returns (0 on failure).
 */
static int load_ts_gtab(int idx, char *tstr)
{
  int ofs = ts_gtab[idx];

  fseek(fph, ofs, SEEK_SET);
  return read_tsin_phrase(tstr);
}


#if USE_TSIN
/*
 * If entry `idx` of ts_gtab is longer than `len` bytes and starts with the
 * first `len` bytes of `str`, append the character that follows that prefix
 * to match_chars at *totlen and advance *totlen.
 *
 * Returns 1 when a character was appended, 0 otherwise.
 */
static int tsin_append_next_char(int idx, const char *str, int len,
                                 char *match_chars, int *totlen)
{
  char tstr[MAX_PHRASE_STR_LEN];
  int tlen = load_ts_gtab(idx, tstr);

  /* Test tlen first: tstr is not filled in when the read failed. */
  if (tlen <= len || strncmp(str, tstr, len))
    return 0;

  *totlen += u8cpy(&match_chars[*totlen], &tstr[len]);
  return 1;
}


/*
 * Look for stored phrases that start with the first `len` bytes of `str`
 * and are longer than that prefix.  (The original comment reads
 * "len is in CH_SZ".)
 *
 * On a hit `str` is overwritten with the phrase the binary search landed
 * on, so the caller's buffer must be able to hold MAX_PHRASE_STR_LEN bytes.
 *
 * When match_chars is NULL the function returns 1 on a hit.  Otherwise the
 * character following the prefix is collected from every neighbouring
 * entry that shares the prefix - first towards lower indices, then towards
 * higher ones - up to match_chars_max characters; match_chars is
 * NUL-terminated and the number of collected characters is returned.
 *
 * Returns 0 when nothing matches, when the table is empty, or when an entry
 * cannot be read (the table is rebuilt in that case).
 */
int find_match(char *str, int len, char *match_chars, int match_chars_max)
{
  if (len <= 0)
    return 0;

  if (!ts_gtabN)
    build_ts_gtab();

  if (match_chars)
    match_chars[0] = 0;

  /* Nothing to search; the loop below would otherwise read ts_gtab[0]. */
  if (ts_gtabN <= 0)
    return 0;

  int bottom = 0;
  int top = ts_gtabN - 1;
  char tstr[MAX_PHRASE_STR_LEN];

  while (bottom <= top) {
    int mid = (bottom + top) / 2;

//    dbg("tstr:%s  %d %d %d\n", tstr, bottom, mid, top);
    if (!load_ts_gtab(mid, tstr)) {  // error in db
      dbg("error in db\n");
      build_ts_gtab();
      return 0;
    }

    int r = strncmp(str, tstr, len);

    if (r < 0) {
      top = mid - 1;
      continue;
    }

    /* An entry equal to the prefix itself offers nothing to complete. */
    if (r > 0 || strlen(tstr) == (size_t)len) {
      bottom = mid + 1;
      continue;
    }

    strcpy(str, tstr);

    if (!match_chars)
      return 1;

    int matchN = 0;
    int totlen = 0;
    int i;

    for (i = mid; i >= 0 && matchN < match_chars_max; i--) {
      if (!tsin_append_next_char(i, str, len, match_chars, &totlen))
        break;
      matchN++;
    }

    for (i = mid + 1; i < ts_gtabN && matchN < match_chars_max; i++) {
      if (!tsin_append_next_char(i, str, len, match_chars, &totlen))
        break;
      matchN++;
    }

    match_chars[totlen] = 0;
    return matchN;
  }

//  dbg("%d %d\n", bottom, top);
  return 0;
}
#endif


/*
 * Load record number `idx` (a slot in phidx[]).
 *
 *   len      - receives the phrase length in characters; 0 on failure
 *   usecount - receives the use count; 0 on failure
 *   pho      - receives `*len` phonetic keys
 *   ch       - may be NULL; otherwise receives CH_SZ * (*len) bytes read
 *              from the text position, which can include bytes of the
 *              following record because the stored text is variable-length
 *
 * When idx is outside the table or the length byte is not in
 * 1..MAX_PHRASE_LEN the database is assumed to have changed on disk: it is
 * reloaded and *len is set to 0.
 */
void load_tsin_entry(int idx, char *len, usecount_t *usecount,
                     phokey_t *pho, u_char *ch)
{
  *usecount = 0;
  *len = 0;   /* defined result even if the length byte cannot be read */

  if (idx < 0 || idx >= phcount) {
    load_tsin_db(); // probably db changed, reload;
    return;
  }

  int ph_ofs = phidx[idx];

  fseek(fph, ph_ofs, SEEK_SET);
  fread(len, 1, 1, fph);

  if (*len > MAX_PHRASE_LEN || *len <= 0) {
    dbg("err: tsin db changed reload");
    load_tsin_db(); // probably db changed, reload;
    *len = 0;
    return;
  }

  fread(usecount, sizeof(usecount_t), 1, fph); // use count
  fread(pho, sizeof(phokey_t), (int)(*len), fph);

  if (ch)
    fread(ch, CH_SZ, (int)(*len), fph);
}


/*
 * Compare the first `len` keys of a and b.
 * Returns 1 when a sorts after b, -1 when it sorts before, 0 when equal.
 *
 * NOTE(review): the loop header was missing from the damaged source and has
 * been restored.
 */
int phokey_t_seq(phokey_t *a, phokey_t *b, int len)
{
  int i;

  for (i = 0; i < len; i++) {
    if (a[i] > b[i])
      return 1;
    else if (a[i] < b[i])
      return -1;
  }

  return 0;
}


/*
 * Find all records whose first `plen` keys equal `pho`.
 *
 * On success returns TRUE and the half-open slot range
 *   *r_sti <= range < *r_edi
 * in phidx[].  Returns FALSE when the hash bucket is out of range or empty,
 * or when no record matches.
 */
gboolean tsin_seek(phokey_t *pho, int plen, int *r_sti, int *r_edi)
{
  int mid = 0;
  int cmp = -1;   /* "no match" unless the search below finds one */
  phokey_t ss[MAX_PHRASE_LEN], stk[MAX_PHRASE_LEN];
  u_char mlen, stch[MAX_PHRASE_LEN * CH_SZ];
  char len;
  usecount_t usecount;

  int hashi = *pho >> TSIN_HASH_SHIFT;
  if (hashi >= TSIN_HASH_N)
    return FALSE;

  int top = hashidx[hashi];
  /* Last slot of this bucket; slots beyond it belong to other first keys. */
  int bot = tsin_bucket_end(hashi) - 1;

  if (top >= phcount)
    return FALSE;

  while (top <= bot) {
    mid = (top + bot) / 2;
    load_tsin_entry(mid, &len, &usecount, ss, stch);

    if (len > plen)
      mlen = plen;
    else
      mlen = len;

    cmp = phokey_t_seq(ss, pho, mlen);

    /* A record shorter than the query that matches so far sorts before it. */
    if (!cmp && len < plen)
      cmp = -2;

    if (cmp > 0)
      bot = mid - 1;
    else if (cmp < 0)
      top = mid + 1;
    else
      break;
  }

  if (cmp) {
//    dbg("no match %d\n", cmp);
    return FALSE;
  }

  // seek to the first match because binary search is used
  int sti;
  for (sti = mid; sti >= 0; sti--) {
    load_tsin_entry(sti, &len, &usecount, stk, stch);

    if (len >= plen && !phokey_t_seq(stk, pho, plen))
      continue;
    break;
  }
  sti++;

  // seek to the tail
  int edi;
  for (edi = mid; edi < phcount; edi++) {
    load_tsin_entry(edi, &len, &usecount, stk, stch);

    if (len >= plen && !phokey_t_seq(stk, pho, plen))
      continue;
    break;
  }

  *r_sti = sti;
  *r_edi = edi;

  return TRUE;
}


/*
 * Increment (b_dec false) or decrement (b_dec true) the stored use count of
 * the phrase with keys `pho` and text `ch`, both N characters long.
 *
 * Every record with exactly these keys is examined and each one whose text
 * equals `ch` is updated in place in the data file.  The count is not
 * decremented at or below -127 and not incremented at or above 0x3fffffff.
 * Nothing happens when the phrase is not found.
 */
void inc_dec_tsin_use_count(phokey_t *pho, char *ch, int N, gboolean b_dec)
{
  int sti, edi;

  if (!tsin_seek(pho, N, &sti, &edi))
    return;

  int idx;

  for (idx = sti; idx < edi; idx++) {
    char len;
    usecount_t usecount, n_usecount;
    phokey_t phi[MAX_PHRASE_LEN];
    char stch[MAX_PHRASE_LEN * CH_SZ];

    load_tsin_entry(idx, &len, &usecount, phi, (u_char *)stch);
    n_usecount = usecount;

    /* Longer phrases sharing the prefix follow the exact-length ones. */
    if (len != N || phokey_t_seq(phi, pho, N))
      break;

    if (!utf8_str_eq(stch, ch, N))
      continue;

    int ph_ofs = phidx[idx];

    /* The use count sits right after the one-byte length. */
    fseek(fph, ph_ofs + 1, SEEK_SET);

    if (b_dec) {
      if (usecount > -127)
        n_usecount--;
//      dbg("dec %d\n", n_usecount);
    } else {
      if (usecount < 0x3fffffff)
        n_usecount++;
//      dbg("inc %d\n", n_usecount);
    }

    if (n_usecount != usecount) {
      fwrite(&n_usecount, sizeof(usecount_t), 1, fph); // use count
      fflush(fph);
    }
  }
}