1
/**************************************************************************
4
* Index (.xjdx) generator program from XJDIC
6
* V2.3 - indexes JIS X 0212 (3-byte EUC) kanji
7
* Copyright 1998 Jim Breen <jwb@csse.monash.edu.au>
8
***************************************************************************/
9
/* This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation; either version 1, or (at your option)
14
This program is distributed in the hope that it will be useful,
15
but WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
GNU General Public License for more details.
19
You should have received a copy of the GNU General Public License
20
along with this program; if not, write to the Free Software
21
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
23
/* Changed: ignore all rc stuff. use args 1 and 2 for input/output file.
26
/* Heavily commented, removed the unused header file, split off the
27
readDictionary function, removed unused functions and variables... cleaned
28
up the code in general. Preparing for integration to the rest of the program
30
Note that this indexer has been hacked off of Jim Breen's xjdic program,
31
and a lot of the things which have been removed were relevant to that
32
program, but not to this one.
36
#include <config-kiten.h>
47
#ifdef HAVE_INTTYPES_H
55
/* Index format version, added to the dictionary length stored in the first
   index slot. The index structure last changed in version 1.4.
   No trailing semicolon, so the macro can be used inside any expression. */
#define INDEX_VERSION 14
61
/*====== prototypes=================================================*/
62
void jqsort(int32_t i, int32_t j);
63
int Kstrcmp(uint32_t lhs, uint32_t rhs);
64
int alphaoreuc(unsigned char x);
65
unsigned char* readDictionary(const char* dictName,uint32_t *filesize);
66
uint32_t buildIndex(unsigned char* dict, uint32_t dictLength);
68
/*====function to Load Dictionary and load/create index table=======*/
79
printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n");
80
printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n Copyright J.W. Breen, 1998\n");
84
printf("\nUSAGE: kitengen input output.xjdx\n");
88
Dname = argv[1]; /*Name of the dictionary being scanned */
89
JDXname = argv[2]; /*Name of the output file */
90
printf("Commandline request to use files %s and %s \n", Dname, JDXname);
91
printf("\nWARNING!! This program may take a long time to run .....\n");
93
db = readDictionary(Dname,&diclen); /*Reads the dict, but leaves a space at the beginning*/
94
diclen++; /*add one to the number of bytes considered in the file */
95
db[diclen] = 10; /*set the first and final entry in the database to 10 */
97
printf("Dictionary size: %d bytes.\n",diclen);
100
indlen = (diclen * 3)/4; /*Make a wild guess at the index file length */
101
jindex = (uint32_t *)malloc(indlen); /* and allocate it */
104
fprintf(stderr,"malloc() for index table failed.\n");
108
printf("Parsing.... \n");
109
/*this is the dictionary parser. It places an entry in jindex for every
110
kana/kanji string and every alphabetic string it finds which is >=3
112
indptr = buildIndex(db,diclen);
114
printf("Index entries: %d \nSorting (this is slow)......\n",indptr);
115
jqsort((int32_t)1,indptr);
117
printf("Sorted\nWriting index file ....\n");
118
fp = fopen(JDXname,"wb");
121
printf("\nCannot open %s output file\n",JDXname);
124
jindex[0] = diclen+INDEX_VERSION; /* prepend the index file size + version # */
125
fwrite(jindex,sizeof(int32_t),indptr+1,fp);
131
/*=========function to parse the dict file and fill the jindex global with the index====*/
132
/*=========returns the size of the index file ====*/
134
A bit of explanation on what this thing generates is probably in order.
135
Essentially, it fills jindex with a large number of numbers... each number
136
being an offset to a byte location inside of the dictionary file. Starting
137
at position index 1 (second pos)
138
In other words... feeding this thing the dict file
139
"Llama X1\nJT Fred Flintstone X"
140
would generate: {<unmodified>,0,6,12,17}.
141
"X" is skipped because it is only 1 byte long.
142
"JT" is skipped because it is only two bytes long, the J is regular ascii
143
(<127), and the T is not a digit. If any of those were different, (it
144
was longer than 2 bytes, was an euc (kana or kanji) character, or T was
145
a digit) it would be included in the index.
148
/*First... an ugly #define to make our code a bit more readable*/
149
/* Abort with a diagnostic when index slot x lies outside the index table.
   indlen is the byte size passed to malloc() for jindex, so the table holds
   indlen / sizeof(int32_t) slots and the last valid slot is one less than
   that count; x equal to the count is already out of bounds (hence >=).
   Wrapped in do { } while (0) so the macro behaves as a single statement,
   including inside an unbraced if/else. */
#define INDEX_OVERFLOW_CHECK(x) \
	do { \
		if ((x) >= indlen / sizeof(int32_t)) { \
			printf("Index table overflow. Dictionary too large?\n"); \
			exit(1); \
		} \
	} while (0)
152
uint32_t buildIndex(unsigned char *dict, uint32_t dictLength) {
153
int nowReadingWord = FALSE; /*Boolean to track if we're mid-word in the dict */
154
int currentDictCharacter; /*Current character index in the dict */
155
unsigned char c; /*the current reading character*/
156
unsigned char currstr[TOKENLIM]; /* String that we're currently getting */
157
int currstrIndex = 0;
158
uint32_t indptr = 1; /* next 'slot' in the index to fill */
159
int saving = FALSE; /*is what we are doing right now slated for salvation?*/
161
for (currentDictCharacter =0; currentDictCharacter < dictLength;
162
currentDictCharacter++)
164
c = dict[currentDictCharacter]; /* Fetch the next character */
166
if(!nowReadingWord) /*if we are NOT in the middle of reading a word */
168
if (alphaoreuc(c) || c == SPTAG) /* if character or priority entry */
170
nowReadingWord = TRUE; /* Mark that we're mid word */
171
jindex[indptr] = currentDictCharacter;
172
/* copy the location of this character to our index structure */
174
/*mark the next position in the string to copy a char into */
176
/*set the current string to be equal to this character so far */
180
} else { /*If we're in the middle of parsing a word atm */
182
/*if it's alphanumeric or - or . copy it and increment where the
184
if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c<='9')))
186
currstr[currstrIndex] = c;
187
if(currstrIndex < TOKENLIM-1)
190
else /* We were reading a word... and we just encountered the
193
currstr[currstrIndex] = '\0'; /*null terminate the string */
194
nowReadingWord = FALSE;
196
/*Don't save single or dual character items where the
197
first item is ascii */
198
if ((strlen(currstr) <= 2) && (currstr[0] < 127))
200
/*EXCEPT: Save anything that's two character where the second
202
Note that this might catch single 2-byte kanji as well...
204
if ((strlen(currstr) == 2) && (currstr[1] <= '9'))
207
/* This is a latin-character string, either longer than 2 bytes
208
or having an ascii digit for a second byte */
209
if (saving && (currstr[0] < 127))
212
INDEX_OVERFLOW_CHECK(indptr);
214
/* If this is non-Japanese, and has a 'SPTAGn' tag, generate
216
if ( currstr[0] == SPTAG)
218
/*make a separate entry pointing to
219
the non-SPTAG'd entry (the next byte)*/
220
jindex[indptr] = jindex[indptr-1]+1;
221
/*overwrite the SPTAG marker*/
222
strcpy(currstr,currstr+1);
224
INDEX_OVERFLOW_CHECK(indptr);
228
/*For strings that start with non latin characters*/
229
if (saving && (currstr[0] > 127))
232
uint32_t possav = jindex[indptr]; /*Save the current marker*/
234
INDEX_OVERFLOW_CHECK(indptr);
236
/* generate index for *every* kanji in key */
238
/*if this is a three byte kanji, ignore the 0x8f marker */
239
if (currstr[0] == 0x8f)
241
/*step through... two by two*/
242
for ( ; i < strlen(currstr); i+=2)
244
if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f))
246
/*Add in a specific reference to the kanji*/
247
jindex[indptr] = possav+i;
249
INDEX_OVERFLOW_CHECK(indptr);
251
/*again the check if it's a three byte kanji*/
252
if(currstr[i] == 0x8f)
259
indptr--; /*correct for the overshoot */
263
/*===function to read the dictionary files into array, returning filesize===*/
264
/*Note: We leave a blank byte in the first byte of the returned dictionary, and
265
allocate an extra 99 bytes at the end */
267
readDictionary(const char* dictName,uint32_t *filesize) {
270
unsigned char *memDictionary;
273
if(stat(dictName, &buf) != 0) /* if the dict file doesn't exist */
276
printf("Cannot stat: %s \n",dictName);
280
*filesize = buf.st_size; /*file size in bytes*/
282
puts ("\nLoading Dictionary file. Please wait.....\n");
283
fp=fopen(dictName,"rb");
286
printf("\nCannot open dictionary file\n");
289
/*Allocate the database index 100 bytes larger than the dict filesize*/
290
memDictionary=(unsigned char*)malloc((*filesize+100)*sizeof(unsigned char));
291
if(memDictionary == NULL)
293
fprintf(stderr,"malloc() for dictionary failed.\n");
298
nodread = (*filesize)/1024; /*number of kilobytes in the file */
299
/*reads 1024 x nodread bytes from fp, storing in memDictionary at offset 1*/
300
fread((unsigned char *)memDictionary+1, 1024, nodread, fp);
301
nodread = (*filesize) % 1024; /* "leftover" bytes after the previous read */
302
/*reads the remaining bytes from fp... for what filesystem is this split-read needed?*/
303
fread((unsigned char *)(memDictionary+((*filesize)/1024)*1024)+1, nodread,1, fp);
306
return memDictionary;
309
/*======function to sort jindex table====================*/
310
/*see the index generator for information about what jindex contains
311
This simply sorts that output according to the data in the dictionary*/
312
void jqsort(int32_t lhs, int32_t rhs)
317
if (lhs >= rhs) return;
319
midp = (lhs+rhs)/2; /* calculate the midpoint */
323
jindex[lhs] = jindex[midp];
327
for (i = lhs+1;i <= rhs; i++)
329
if (Kstrcmp(jindex[i],jindex[lhs]) < 0)
334
jindex[i] = jindex[last];
339
/* Swap (lhs,last);*/
341
jindex[lhs] = jindex[last];
348
/*=====string comparison used by jqsort==========================*/
349
int Kstrcmp(uint32_t lhs, uint32_t rhs)
351
int i,c1 = 0, c2 = 0;
352
/* effectively does a strnicmp on two "strings" within the dictionary,
353
except it will make katakana and hiragana match (EUC A4 & A5) */
355
for (i = 0; i<20 ; i++) /*Compare up to 20 chars*/
360
if ((i % 2) == 0) /*If we're reading the first byte*/
362
if (c1 == 0xA5) /*Change hiragana to katakana for */
363
c1 = 0xA4; /*The purposes of this comparison */
368
/*If this is ascii, remove the difference between capitals and small*/
369
if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20;
370
if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20;
372
if (c1 != c2 ) break;
377
/*=======function to test a character for alpha or kana/kanji====*/
378
int alphaoreuc(unsigned char x)
383
if(((c >= 65) && (c <= 90)) || ((c >= 97) && (c <= 122)))
388
if ((c >= '0') && (c <= '9'))