~ubuntu-branches/ubuntu/quantal/kiten/quantal-proposed

« back to all changes in this revision

Viewing changes to xjdxgen.c

Committer: Bazaar Package Importer
Author(s): Harald Sitter
Date: 2011-07-10 11:23:47 UTC
Revision ID: james.westby@ubuntu.com-20110710112347-ykfhtvam3kgssspo

Tags: upstream-4.6.90+repack

Import upstream version 4.6.90+repack

files added:

AUTHORS

CMakeLists.txt

COPYING

COPYING.DOC

COPYING.LIB

Messages.sh

README

TODO

app/CMakeLists.txt

app/configdictionaryselector.cpp

app/configdictionaryselector.h

app/configdictselect.ui

app/configfont.ui

app/configlearn.ui

app/configsearching.ui

app/configsorting.ui

app/configsortingpage.cpp

app/configsortingpage.h

app/configuredialog.cpp

app/configuredialog.h

app/entrylistmodel.cpp

app/entrylistmodel.h

app/entrylistview.cpp

app/entrylistview.h

app/kiten.cpp

app/kiten.desktop

app/kiten.h

app/kiten.kcfg

app/kitenconfig.kcfgc

app/kitenui.rc

app/main.cpp

app/resultsview.cpp

app/resultsview.h

app/searchstringinput.cpp

app/searchstringinput.h

app/wordtype.cpp

app/wordtype.h

config-kiten.h.cmake

data

data/edict

data/edict_doc.html

data/edict_kanjidic_licence.html

data/kanjidic

data/pics

data/pics/CMakeLists.txt

data/pics/hi128-apps-kiten.png

data/pics/hi16-apps-kiten.png

data/pics/hi22-apps-kiten.png

data/pics/hi32-apps-kiten.png

data/pics/hi48-apps-kiten.png

data/pics/hi64-apps-kiten.png

data/pics/hisc-apps-kiten.svgz

data/radkfile

data/romkana.cnv

data/vconj

doc/CMakeLists.txt

doc/index.cache.bz2

doc/index.docbook

doc/kiten1.png

doc/kiten2.png

lib/CMakeLists.txt

lib/DictDeinflect

lib/DictDeinflect/dictfiledeinflect.cpp

lib/DictDeinflect/dictfiledeinflect.h

lib/DictDeinflect/entrydeinflect.cpp

lib/DictDeinflect/entrydeinflect.h

lib/DictEdict

lib/DictEdict/dictfileedict.cpp

lib/DictEdict/dictfileedict.h

lib/DictEdict/dictfilefieldselector.cpp

lib/DictEdict/dictfilefieldselector.h

lib/DictEdict/entryedict.cpp

lib/DictEdict/entryedict.h

lib/DictEdict/indexededictfile.cpp

lib/DictEdict/indexededictfile.h

lib/DictEdict/linearedictfile.cpp

lib/DictEdict/linearedictfile.h

lib/DictKanjidic

lib/DictKanjidic/dictfilekanjidic.cpp

lib/DictKanjidic/dictfilekanjidic.h

lib/DictKanjidic/entrykanjidic.cpp

lib/DictKanjidic/entrykanjidic.h

lib/Mainpage.dox

lib/dictfile.h

lib/dictionarymanager.cpp

lib/dictionarymanager.h

lib/dictionarypreferencedialog.cpp

lib/dictionarypreferencedialog.h

lib/dictquery.cpp

lib/dictquery.h

lib/entry.cpp

lib/entry.h

lib/entrylist.cpp

lib/entrylist.h

lib/historyptrlist.cpp

lib/historyptrlist.h

lib/kromajiedit.cpp

lib/kromajiedit.h

lib/libkitenexport.h

radselect

radselect/CMakeLists.txt

radselect/buttongrid.cpp

radselect/buttongrid.h

radselect/kanji.cpp

radselect/kanji.h

radselect/main.cpp

radselect/radical.cpp

radselect/radical.h

radselect/radical_selector.ui

radselect/radicalbutton.cpp

radselect/radicalbutton.h

radselect/radicalfile.cpp

radselect/radicalfile.h

radselect/radselect.cpp

radselect/radselect.h

radselect/radselect.lsm

radselect/radselectconfig.kcfg

radselect/radselectconfig.kcfgc

radselect/radselectprefdialog.ui

radselect/radselectui.rc

radselect/radselectview.cpp

radselect/radselectview.h

xjdxgen.c

Show diffs side-by-side

added added

removed removed

xjdxgen.c

/**************************************************************************

* X J D X G E N

* Author: Jim Breen

* Index (.xjdx) generator program from XJDIC

* V2.3 - indexes JIS X 0212 (3-byte EUC) kanji

***************************************************************************/

/* This program is free software; you can redistribute it and/or modify

it under the terms of the GNU General Public License as published by

the Free Software Foundation; either version 1, or (at your option)

any later version.

This program is distributed in the hope that it will be useful,

but WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

GNU General Public License for more details.

You should have received a copy of the GNU General Public License

along with this program; if not, write to the Free Software

Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */

/* Changed: ignore all rc stuff. use args 1 and 2 for input/output file.

-- jason */

/* Heavily commented, removed the unused header file, split off the

readDictionary function, removed unused functions and variables... cleaned

up the code in general. Preparing for integration to the rest of the program

Note that this indexer has been hacked off of Jim Breen's xjdic program,

and a lot of the things which have been removed were relevant to that

program, but not to this one.

--Joe */

#include <config-kiten.h>

#include <sys/stat.h>

#include <unistd.h>

#include <stdio.h>

#include <stdlib.h>

#include <ctype.h>

#include <string.h>

#ifdef HAVE_STDINT_H

#include <stdint.h>

#endif

#ifdef HAVE_INTTYPES_H

#include <inttypes.h>

#endif

/* Boolean values used throughout this file. */
#define TRUE 1
#define FALSE 0

/* Tag character that may start a token (a "priority entry" marker). */
#define SPTAG '@'

/* Size of the scratch buffer used to hold the token currently being read. */
#define TOKENLIM 40

/* The last time the index structure changed was Version 1.4.
   No trailing semicolon, so the macro can be used inside any expression. */
#define INDEX_VERSION 14

/* In-memory copy of the dictionary file (set from readDictionary in main). */
unsigned char *db;

/* Index table: byte offsets into db; slot 0 holds the size/version header. */
uint32_t *jindex;

/* Size, in bytes, of the allocation behind jindex. */
uint32_t indlen;

/*====== prototypes=================================================*/

/* Sorts slots i..j (inclusive) of the global jindex table in place,
   ordering them with Kstrcmp. */
void jqsort(int32_t i, int32_t j);

/* Compares the dictionary text found at byte offsets lhs and rhs of db;
   returns <0, 0 or >0. */
int Kstrcmp(uint32_t lhs, uint32_t rhs);

/* Returns TRUE for an ASCII letter, an ASCII digit, or any byte with the
   high bit set (EUC kana/kanji); FALSE otherwise. */
int alphaoreuc(unsigned char x);

/* Loads the file dictName into a newly allocated buffer (data starts at
   offset 1) and stores the file size in *filesize. */
unsigned char* readDictionary(const char* dictName,uint32_t *filesize);

/* Scans dictLength bytes of dict, fills jindex from slot 1 onwards and
   returns the number of entries recorded. */
uint32_t buildIndex(unsigned char* dict, uint32_t dictLength);

/*====function to Load Dictionary and load/create index table=======*/

int main(argc,argv)

int argc;

unsigned char **argv;

{

const char *Dname;

const char *JDXname;

FILE *fp;

uint32_t diclen;

uint32_t indptr;

printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n");

printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n Copyright J.W. Breen, 1998\n");

if (argc < 3)

{

printf("\nUSAGE: kitengen input output.xjdx\n");

exit(2);

}

Dname = argv[1]; /*Name of the dictionary being scanned */

JDXname = argv[2]; /*Name of the output file */

printf("Commandline request to use files %s and %s \n", Dname, JDXname);

printf("\nWARNING!! This program may take a long time to run .....\n");

db = readDictionary(Dname,&diclen); /*Reads the dict, but leaves a space at the beginning*/

diclen++; /*add one to the number of bytes considered in the file */

db[diclen] = 10; /*set the first and final entry in the database to 10 */

db[0] = 10;

printf("Dictionary size: %d bytes.\n",diclen);

100

indlen = (diclen * 3)/4; /*Make a wild guess at the index file length */

101

jindex = (uint32_t *)malloc(indlen); /* and allocate it */

102

if(jindex == NULL)

103

{

104

fprintf(stderr,"malloc() for index table failed.\n");

105

exit(1);

106

}

107

108

printf("Parsing.... \n");

109

/*this is the dictionary parser. It places an entry in jindex for every

110

kana/kanji string and every alphabetic string it finds which is >=3

111

characters */

112

indptr = buildIndex(db,diclen);

113

114

printf("Index entries: %d \nSorting (this is slow)......\n",indptr);

115

jqsort((int32_t)1,indptr);

116

117

printf("Sorted\nWriting index file ....\n");

118

fp = fopen(JDXname,"wb");

119

if (fp==NULL )

120

{

121

printf("\nCannot open %s output file\n",JDXname);

122

exit(1);

123

}

124

jindex[0] = diclen+INDEX_VERSION; /* prepend the index file size + version # */

125

fwrite(jindex,sizeof(int32_t),indptr+1,fp);

126

fclose(fp);

127

128

return 0;

129

}

130

131

/*=========function to parse the dict file and fill the jindex global with the index====*/
/*=========returns the number of entries written to the index ====*/
/*
A bit of explanation on what this thing generates is probably in order.
Essentially, it fills jindex with a large number of numbers... each number
being an offset to a byte location inside of the dictionary file, starting
at index position 1 (the second position).
In other words... feeding this thing the dict file
"Llama X1\nJT Fred Flintstone X"
would generate: {<unmodified>,0,6,12,17}.
"X" is skipped because it is only 1 byte long.
"JT" is skipped because it is only two bytes long, the J is regular ascii
(<127), and the T is not a digit. If any of those were different (it
was longer than 2 bytes, was an EUC (kana or kanji) character, or T was
a digit), it would be included in the index.
*/

146

147

148

/*First... an ugly #define to make our code a bit more readable*/

149

/* Abort when slot number x does not fit in the jindex table.
   indlen is the table size in bytes, so the table holds
   indlen/sizeof(int32_t) slots and the last valid slot number is one less
   than that; hence the >= comparison.  Wrapped in do/while(0) so the macro
   behaves as a single statement. */
#define INDEX_OVERFLOW_CHECK(x) do { \
	if ((x) >= indlen / sizeof(int32_t)) { \
		printf("Index table overflow. Dictionary too large?\n"); \
		exit(1); \
	} \
} while (0)

151

152

uint32_t buildIndex(unsigned char *dict, uint32_t dictLength) {

153

int nowReadingWord = FALSE; /*Boolean to track if we're mid-word in the dict */

154

int currentDictCharacter; /*Current character index in the dict */

155

unsigned char c; /*the current reading character*/

156

unsigned char currstr[TOKENLIM]; /* String that we're currently getting */

157

int currstrIndex = 0;

158

uint32_t indptr = 1; /* next 'slot' in the index to fill */

159

int saving = FALSE; /*is what we are doing right now slated for salvation?*/

160

161

for (currentDictCharacter =0; currentDictCharacter < dictLength;

162

currentDictCharacter++)

163

{

164

c = dict[currentDictCharacter]; /* Fetch the next character */

165

166

if(!nowReadingWord) /*if we are NOT in the middle of reading a word */

167

{

168

if (alphaoreuc(c) || c == SPTAG) /* if character or priority entry */

169

{

170

nowReadingWord = TRUE; /* Mark that we're mid word */

171

jindex[indptr] = currentDictCharacter;

172

/* copy the location of this character to our index structure */

173

currstrIndex = 1;

174

/*mark the next position in the string to copy a char into */

175

currstr[0] = c;

176

/*set the current string to be equal to this character so far */

177

currstr[1] = '\0';

178

saving = TRUE;

179

}

180

} else { /*If we're in the middle of parsing a word atm */

181

182

/*if it's alphanumeric or - or . copy it and increment where the

183

next one goes */

184

if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c<='9')))

185

{

186

currstr[currstrIndex] = c;

187

if(currstrIndex < TOKENLIM-1)

188

currstrIndex++;

189

}

190

else /* We were reading a word... and we just encountered the

191

end of the word */

192

{

193

currstr[currstrIndex] = '\0'; /*null terminate the string */

194

nowReadingWord = FALSE;

195

196

/*Don't save single or dual character items where the

197

first item is ascii */

198

if ((strlen(currstr) <= 2) && (currstr[0] < 127))

199

saving = FALSE;

200

/*EXCEPT: Save anything that's two character where the second

201

is a number

202

Note that this might catch single 2-byte kanji as well...

203

but it might not*/

204

if ((strlen(currstr) == 2) && (currstr[1] <= '9'))

205

saving = TRUE;

206

207

/* This is a latin-character string, either longer than 2 bytes

208

or having an ascii digit for a second byte */

209

if (saving && (currstr[0] < 127))

210

{

211

indptr++;

212

INDEX_OVERFLOW_CHECK(indptr);

213

214

/* If this is non-Japanese, and has a 'SPTAGn' tag, generate

215

two indices */

216

if ( currstr[0] == SPTAG)

217

{

218

/*make a separate entry pointing to

219

the non-SPTAG'd entry (the next byte)*/

220

jindex[indptr] = jindex[indptr-1]+1;

221

/*overwrite the SPTAG marker*/

222

strcpy(currstr,currstr+1);

223

indptr++;

224

INDEX_OVERFLOW_CHECK(indptr);

225

}

226

}

227

228

/*For strings that start with non latin characters*/

229

if (saving && (currstr[0] > 127))

230

{

231

int i;

232

uint32_t possav = jindex[indptr]; /*Save the current marker*/

233

indptr++;

234

INDEX_OVERFLOW_CHECK(indptr);

235

236

/* generate index for *every* kanji in key */

237

i = 2;

238

/*if this is a three byte kanji, ignore the 0x8f marker */

239

if (currstr[0] == 0x8f)

240

i++;

241

/*step through... two by two*/

242

for ( ; i < strlen(currstr); i+=2)

243

{

244

if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f))

245

{

246

/*Add in a specific reference to the kanji*/

247

jindex[indptr] = possav+i;

248

indptr++;

249

INDEX_OVERFLOW_CHECK(indptr);

250

}

251

/*again the check if it's a three byte kanji*/

252

if(currstr[i] == 0x8f)

253

i++;

254

}

255

}

256

}

257

}

258

}

259

indptr--; /*correct for the overshoot */

260

return indptr;

261

}

262

263

/*===function to read the dictionary files into array, returning filesize===*/

264

/*Note: We leave a blank byte in the first byte of the returned dictionary, and

265

allocate an extra 99 bytes at the end */

266

/*
 * Load a dictionary file into memory.
 *
 * dictName  path of the file to read
 * filesize  receives the size of the file in bytes
 *
 * Returns a newly allocated buffer of (*filesize + 100) bytes.  The file
 * contents start at offset 1; byte 0 and the bytes after the data are
 * zero-filled so that later look-ahead reads past the last entry see
 * defined values.  The caller owns the buffer.
 *
 * Prints a message and calls exit(1) if the file cannot be stat'ed, opened
 * or read completely, if it is too large for 32-bit offsets, or if the
 * allocation fails.
 */
unsigned char*
readDictionary(const char* dictName,uint32_t *filesize) {
	FILE *fp;
	struct stat buf;
	unsigned char *memDictionary;
	size_t length;
	size_t nread;

	if (stat(dictName, &buf) != 0) /* if the dict file doesn't exist */
	{
		perror(NULL);
		printf("Cannot stat: %s \n",dictName);
		exit(1);
	}

	/* Offsets are stored as uint32_t, and 100 bytes of padding are added,
	   so reject anything that would not fit instead of truncating. */
	if (buf.st_size < 0
	    || (unsigned long long)buf.st_size > 0xFFFFFFFFULL - 100ULL)
	{
		fprintf(stderr,"Dictionary file too large: %s\n",dictName);
		exit(1);
	}

	*filesize = (uint32_t)buf.st_size; /*file size in bytes*/
	length = (size_t)*filesize;

	puts ("\nLoading Dictionary file. Please wait.....\n");
	fp=fopen(dictName,"rb");
	if (fp==NULL )
	{
		printf("\nCannot open dictionary file\n");
		exit(1);
	}

	/*Allocate the buffer 100 bytes larger than the dict filesize;
	  calloc zero-fills the leading byte and the trailing padding*/
	memDictionary = calloc(length + 100, sizeof(unsigned char));
	if(memDictionary == NULL)
	{
		fprintf(stderr,"malloc() for dictionary failed.\n");
		fclose(fp);
		exit(1);
	}

	/*read the whole file in one call, storing it at offset 1, and make
	  sure every byte actually arrived*/
	nread = fread(memDictionary + 1, 1, length, fp);
	if (nread != length)
	{
		fprintf(stderr,"Cannot read dictionary file\n");
		free(memDictionary);
		fclose(fp);
		exit(1);
	}
	fclose(fp);

	return memDictionary;
}

308

309

/*======function to sort jindex table====================*/

310

/*see the index generator for information about what jindex contains

311

This simply sorts that output according to the data in the dictionary*/

312

void jqsort(int32_t lhs, int32_t rhs)

313

{

314

int32_t i,last,midp;

315

uint32_t temp;

316

317

if (lhs >= rhs) return;

318

319

midp = (lhs+rhs)/2; /* calculate the midpoint */

320

321

/*Swap (midp,lhs) */

322

temp = jindex[lhs];

323

jindex[lhs] = jindex[midp];

324

jindex[midp] = temp;

325

326

last = lhs;

327

for (i = lhs+1;i <= rhs; i++)

328

{

329

if (Kstrcmp(jindex[i],jindex[lhs]) < 0)

330

{

331

/* Swap(++last,i);*/

332

last++;

333

temp = jindex[i];

334

jindex[i] = jindex[last];

335

jindex[last] = temp;

336

}

337

}

338

339

/* Swap (lhs,last);*/

340

temp = jindex[lhs];

341

jindex[lhs] = jindex[last];

342

jindex[last] = temp;

343

344

jqsort(lhs,last-1);

345

jqsort(last+1,rhs);

346

}

347

348

/*=====string comparison used by jqsort==========================*/

349

int Kstrcmp(uint32_t lhs, uint32_t rhs)

350

{

351

int i,c1 = 0, c2 = 0;

352

/* effectively does a strnicmp on two "strings" within the dictionary,

353

except it will make katakana and hirgana match (EUC A4 & A5) */

354

355

for (i = 0; i<20 ; i++) /*Compare up to 20 chars*/

356

{

357

c1 = db[lhs+i];

358

c2 = db[rhs+i];

359

360

if ((i % 2) == 0) /*If we're reading the first byte*/

361

{

362

if (c1 == 0xA5) /*Change hiragana to katakana for */

363

c1 = 0xA4; /*The purposes of this comparison */

364

if (c2 == 0xA5)

365

c2 = 0xA4;

366

}

367

368

/*If this is ascii, remove the difference between capitals and small*/

369

if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20;

370

if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20;

371

372

if (c1 != c2 ) break;

373

}

374

return(c1-c2);

375

}

376

377

/*=======function to test a character for alpha or kana/kanji====*/

378

int alphaoreuc(unsigned char x)

379

{

380

int c;

381

382

c = x & 0xff;

383

if(((c >= 65) && (c <= 90)) || ((c >= 97) && (c <= 122)))

384

/*ASCII alphabet*/

385

{

386

return (TRUE);

387

}

388

if ((c >= '0') && (c <= '9'))

389

/*digits*/

390

{

391

return(TRUE);

392

}

393

if ((c & 0x80) > 0)

394

/*EUC kanji/kana*/

395

{

396

return(TRUE);

397

}

398

return (FALSE);

399

}

400

Older »