~slub.team/goobi-indexserver/3.x

* @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain

235

* lower case strings.

236

* @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set)} instead.

237

238

@Deprecated
public HyphenationCompoundWordTokenFilter(TokenStream input, HyphenationTree hyphenator,
    Set<?> dictionary) {
  // Legacy entry point: pins the match version to LUCENE_30 and fills in the
  // default word/subword size limits before delegating.
  this(Version.LUCENE_30, input, hyphenator, dictionary,
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE,
      false);
}

244

245

/**

246

* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.

247

248

* @param input the {@link TokenStream} to process

249

* @param hyphenator the hyphenation pattern tree to use for hyphenation

250

* @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain

251

* lower case strings.

252

* @param minWordSize only words longer than this get processed

253

* @param minSubwordSize only subwords longer than this get to the output

254

* stream

255

* @param maxSubwordSize only subwords shorter than this get to the output

256

* stream

257

* @param onlyLongestMatch Add only the longest matching subword to the stream

258

* @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)} instead.

259

260

@Deprecated

261

public HyphenationCompoundWordTokenFilter(TokenStream input,

262

HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,

263

int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {

264

super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,

265

onlyLongestMatch);

266

267

this.hyphenator = hyphenator;

268

}

269

270

/**

271

* Create a hyphenator tree

272

273

* @param hyphenationFilename the filename of the XML grammar to load

274

* @return An object representing the hyphenation patterns

275

* @throws Exception

276

277

public static HyphenationTree getHyphenationTree(String hyphenationFilename)

278

throws Exception {

279

return getHyphenationTree(new InputSource(hyphenationFilename));

280

}

281

282

/**

283

* Create a hyphenator tree

284

285

* @param hyphenationFile the file of the XML grammar to load

286

* @return An object representing the hyphenation patterns

287

* @throws Exception

288

289

public static HyphenationTree getHyphenationTree(File hyphenationFile)

290

throws Exception {

291

return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));

292

}

293

294

/**

295

* Create a hyphenator tree

296

297

* @param hyphenationReader the reader of the XML grammar to load from

298

* @return An object representing the hyphenation patterns

299

* @throws Exception

300

* @deprecated Don't use Readers with fixed charset to load XML files, unless programatically created.

301

* Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input

302

* stream, if you like.

303

304

@Deprecated

305

public static HyphenationTree getHyphenationTree(Reader hyphenationReader)

306

throws Exception {

307

final InputSource is = new InputSource(hyphenationReader);

308

// we need this to load the DTD in very old parsers (like the one in JDK 1.4).

309

// The DTD itsself is provided via EntityResolver, so it should always load, but

310

// some parsers still want to have a base URL (Crimson).

311

is.setSystemId("urn:java:" + HyphenationTree.class.getName());

312

return getHyphenationTree(is);

313

}

314

315

/**

316

* Create a hyphenator tree

317

318

* @param hyphenationSource the InputSource pointing to the XML grammar

319

* @return An object representing the hyphenation patterns

320

* @throws Exception

321

322

public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)

323

throws Exception {

324

HyphenationTree tree = new HyphenationTree();

325

tree.loadPatterns(hyphenationSource);

326

return tree;

327

}

328

329

@Override

330

protected void decompose() {

331

// get the hyphenation points

332

Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);

333

// No hyphen points found -> exit

334

if (hyphens == null) {

335

return;

336

}

337

338

final int[] hyp = hyphens.getHyphenationPoints();

339

340

for (int i = 0; i < hyp.length; ++i) {

341

int remaining = hyp.length - i;

342

int start = hyp[i];

343

CompoundToken longestMatchToken = null;

344

for (int j = 1; j < remaining; j++) {

345

int partLength = hyp[i + j] - start;

346

347

// if the part is longer than maxSubwordSize we

348

// are done with this round

349

if (partLength > this.maxSubwordSize) {

350

break;

351

}

352

353

// we only put subwords to the token stream

354

// that are longer than minPartSize

355

if (partLength < this.minSubwordSize) {

356

continue;

357

}

358

359

// check the dictionary

360

if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {

361

if (this.onlyLongestMatch) {

362

if (longestMatchToken != null) {

363

if (longestMatchToken.txt.length() < partLength) {

364

longestMatchToken = new CompoundToken(start, partLength);

365

}

366

} else {

367

longestMatchToken = new CompoundToken(start, partLength);

368

}

369

} else {

370

tokens.add(new CompoundToken(start, partLength));

371

}

372

} else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {

373

// check the dictionary again with a word that is one character

374

// shorter

375

// to avoid problems with genitive 's characters and other binding

376

// characters

377

if (this.onlyLongestMatch) {

378

if (longestMatchToken != null) {

379

if (longestMatchToken.txt.length() < partLength - 1) {

380

longestMatchToken = new CompoundToken(start, partLength - 1);

381

}

382

} else {

383

longestMatchToken = new CompoundToken(start, partLength - 1);

384

}

385

} else {

386

tokens.add(new CompoundToken(start, partLength - 1));

387

}

388

}

389

}

390

if (this.onlyLongestMatch && longestMatchToken!=null) {

391

tokens.add(longestMatchToken);

392

}

393

}

394

}

395

}

Older »