~slub.team/goobi-indexserver/3.x

public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {

this.version = version;

100

this.ignoreCase = ignoreCase;

101

String encoding = getDictionaryEncoding(affix);

102

CharsetDecoder decoder = getJavaEncoding(encoding);

103

readAffixFile(affix, decoder);

104

words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);

105

for (InputStream dictionary : dictionaries) {

106

readDictionaryFile(dictionary, decoder);

107

}

108

}

109

110

/**

111

* Looks up HunspellWords that match the String created from the given char array, offset and length

112

113

* @param word Char array to generate the String from

114

* @param offset Offset in the char array that the String starts at

115

* @param length Length from the offset that the String is

116

* @return List of HunspellWords that match the generated String, or {@code null} if none are found

117

118

public List<HunspellWord> lookupWord(char word[], int offset, int length) {

119

return words.get(word, offset, length);

120

}

121

122

/**

123

* Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length

124

125

* @param word Char array to generate the String from

126

* @param offset Offset in the char array that the String starts at

127

* @param length Length from the offset that the String is

128

* @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found

129

130

public List<HunspellAffix> lookupPrefix(char word[], int offset, int length) {

131

return prefixes.get(word, offset, length);

132

}

133

134

/**

135

* Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length

136

137

* @param word Char array to generate the String from

138

* @param offset Offset in the char array that the String starts at

139

* @param length Length from the offset that the String is

140

* @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found

141

142

public List<HunspellAffix> lookupSuffix(char word[], int offset, int length) {

143

return suffixes.get(word, offset, length);

144

}

145

146

/**

147

* Reads the affix file through the provided InputStream, building up the prefix and suffix maps

148

149

* @param affixStream InputStream to read the content of the affix file from

150

* @param decoder CharsetDecoder to decode the content of the file

151

* @throws IOException Can be thrown while reading from the InputStream

152

153

private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {

154

prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);

155

suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);

156

157

BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));

158

String line = null;

159

while ((line = reader.readLine()) != null) {

160

if (line.startsWith(PREFIX_KEY)) {

161

parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);

162

} else if (line.startsWith(SUFFIX_KEY)) {

163

parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);

164

} else if (line.startsWith(FLAG_KEY)) {

165

// Assume that the FLAG line comes before any prefix or suffixes

166

// Store the strategy so it can be used when parsing the dic file

167

flagParsingStrategy = getFlagParsingStrategy(line);

168

}

169

}

170

reader.close();

171

}

172

173

/**

174

* Parses a specific affix rule putting the result into the provided affix map

175

176

* @param affixes Map where the result of the parsing will be put

177

* @param header Header line of the affix rule

178

* @param reader BufferedReader to read the content of the rule from

179

* @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex

180

* pattern

181

* @throws IOException Can be thrown while reading the rule

182

183

private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,

184

String header,

185

BufferedReader reader,

186

String conditionPattern) throws IOException {

187

String args[] = header.split("\\s+");

188

189

boolean crossProduct = args[2].equals("Y");

190

191

int numLines = Integer.parseInt(args[3]);

192

for (int i = 0; i < numLines; i++) {

193

String line = reader.readLine();

194

String ruleArgs[] = line.split("\\s+");

195

196

HunspellAffix affix = new HunspellAffix();

197

198

affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));

199

affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);

200

201

String affixArg = ruleArgs[3];

202

203

int flagSep = affixArg.lastIndexOf('/');

204

if (flagSep != -1) {

205

char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1));

206

Arrays.sort(appendFlags);

207

affix.setAppendFlags(appendFlags);

208

affix.setAppend(affixArg.substring(0, flagSep));

209

} else {

210

affix.setAppend(affixArg);

211

}

212

213

String condition = ruleArgs[4];

214

affix.setCondition(condition, String.format(conditionPattern, condition));

215

affix.setCrossProduct(crossProduct);

216

217

List<HunspellAffix> list = affixes.get(affix.getAppend());

218

if (list == null) {

219

list = new ArrayList<HunspellAffix>();

220

affixes.put(affix.getAppend(), list);

221

}

222

223

list.add(affix);

224

}

225

}

226

227

/**

228

* Parses the encoding specificed in the affix file readable through the provided InputStream

229

230

* @param affix InputStream for reading the affix file

231

* @return Encoding specified in the affix file

232

* @throws IOException Can be thrown while reading from the InputStream

233

* @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}

234

235

private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {

236

final StringBuilder encoding = new StringBuilder();

237

for (;;) {

238

encoding.setLength(0);

239

int ch;

240

while ((ch = affix.read()) >= 0) {

241

if (ch == '\n') {

242

break;

243

}

244

if (ch != '\r') {

245

encoding.append((char)ch);

246

}

247

}

248

if (

249

encoding.length() == 0 || encoding.charAt(0) == '#' ||

250

// this test only at the end as ineffective but would allow lines only containing spaces:

251

encoding.toString().trim().length() == 0

252

) {

253

if (ch < 0) {

254

throw new ParseException("Unexpected end of affix file.", 0);

255

}

256

continue;

257

}

258

if ("SET ".equals(encoding.substring(0, 4))) {

259

// cleanup the encoding string, too (whitespace)

260

return encoding.substring(4).trim();

261

}

262

throw new ParseException("The first non-comment line in the affix file must "+

263

"be a 'SET charset', was: '" + encoding +"'", 0);

264

}

265

}

266

267

/**

268

* Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and

269

* MICROSOFT-CP1251 etc are allowed...

270

271

* @param encoding Encoding to retrieve the CharsetDecoder for

272

* @return CharSetDecoder for the given encoding

273

274

private CharsetDecoder getJavaEncoding(String encoding) {

275

Charset charset = Charset.forName(encoding);

276

return charset.newDecoder();

277

}

278

279

/**

280

* Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definiton line taken from the affix file

281

282

* @param flagLine Line containing the flag information

283

* @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definiton

284

285

private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {

286

String flagType = flagLine.substring(5);

287

288

if (NUM_FLAG_TYPE.equals(flagType)) {

289

return new NumFlagParsingStrategy();

290

} else if (UTF8_FLAG_TYPE.equals(flagType)) {

291

return new SimpleFlagParsingStrategy();

292

} else if (LONG_FLAG_TYPE.equals(flagType)) {

293

return new DoubleASCIIFlagParsingStrategy();

294

}

295

296

throw new IllegalArgumentException("Unknown flag type: " + flagType);

297

}

298

299

/**

300

* Reads the dictionary file through the provided InputStream, building up the words map

301

302

* @param dictionary InputStream to read the dictionary file through

303

* @param decoder CharsetDecoder used to decode the contents of the file

304

* @throws IOException Can be thrown while reading from the file

305

306

private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {

307

BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));

308

// TODO: don't create millions of strings.

309

String line = reader.readLine(); // first line is number of entries

310

int numEntries = Integer.parseInt(line);

311

312

// TODO: the flags themselves can be double-chars (long) or also numeric

313

// either way the trick is to encode them as char... but they must be parsed differently

314

while ((line = reader.readLine()) != null) {

315

String entry;

316

HunspellWord wordForm;

317

318

int flagSep = line.lastIndexOf('/');

319

if (flagSep == -1) {

320

wordForm = NOFLAGS;

321

entry = line;

322

} else {

323

// note, there can be comments (morph description) after a flag.

324

// we should really look for any whitespace

325

int end = line.indexOf('\t', flagSep);

326

if (end == -1)

327

end = line.length();

328

329

330

wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));

331

Arrays.sort(wordForm.getFlags());

332

entry = line.substring(0, flagSep);

333

if(ignoreCase) {

334

entry = entry.toLowerCase(Locale.ENGLISH);

335

}

336

}

337

338

List<HunspellWord> entries = words.get(entry);

339

if (entries == null) {

340

entries = new ArrayList<HunspellWord>();

341

words.put(entry, entries);

342

}

343

entries.add(wordForm);

344

}

345

}

346

347

public Version getVersion() {

348

return version;

349

}

350

351

/**

352

* Abstraction of the process of parsing flags taken from the affix and dic files

353

354

private static abstract class FlagParsingStrategy {

355

356

/**

357

* Parses the given String into a single flag

358

359

* @param rawFlag String to parse into a flag

360

* @return Parsed flag

361

362

char parseFlag(String rawFlag) {

363

return parseFlags(rawFlag)[0];

364

}

365

366

/**

367

* Parses the given String into multiple flags

368

369

* @param rawFlags String to parse into flags

370

* @return Parsed flags

371

372

abstract char[] parseFlags(String rawFlags);

373

}

374

375

/**

376

* Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.

377

* Can be used with both the ASCII and UTF-8 flag types.

378

379

private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {

380

/**

381

* {@inheritDoc}

382

383

public char[] parseFlags(String rawFlags) {

384

return rawFlags.toCharArray();

385

}

386

}

387

388

/**

389

* Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case

390

* of multiple flags, each number is separated by a comma.

391

392

private static class NumFlagParsingStrategy extends FlagParsingStrategy {

393

/**

394

* {@inheritDoc}

395

396

public char[] parseFlags(String rawFlags) {

397

String[] rawFlagParts = rawFlags.trim().split(",");

398

char[] flags = new char[rawFlagParts.length];

399

400

for (int i = 0; i < rawFlagParts.length; i++) {

401

// note, removing the trailing X/leading I for nepali... what is the rule here?!

402

flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));

403

}

404

405

return flags;

406

}

407

}

408

409

/**

410

* Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes

411

* must be combined into a single character.

412

413

* TODO (rmuir) test

414

415

private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {

416

417

/**

418

* {@inheritDoc}

419

420

public char[] parseFlags(String rawFlags) {

421

if (rawFlags.length() == 0) {

422

return new char[0];

423

}

424

425

StringBuilder builder = new StringBuilder();

426

for (int i = 0; i < rawFlags.length(); i+=2) {

427

char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));

428

builder.append(cookedFlag);

429

}

430

431

char flags[] = new char[builder.length()];

432

builder.getChars(0, builder.length(), flags, 0);

433

return flags;

434

}

435

}

436

437

public boolean isIgnoreCase() {

438

return ignoreCase;

439

}

440

}

Older »