1
package org.apache.lucene.analysis.compound;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
21
import java.io.StringReader;
23
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24
import org.apache.lucene.analysis.MockTokenizer;
25
import org.apache.lucene.analysis.TokenFilter;
26
import org.apache.lucene.analysis.TokenStream;
27
import org.apache.lucene.analysis.Tokenizer;
28
import org.apache.lucene.analysis.WhitespaceTokenizer;
29
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
30
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
31
import org.apache.lucene.util.Attribute;
32
import org.apache.lucene.util.AttributeImpl;
33
import org.xml.sax.InputSource;
35
public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
36
public void testHyphenationCompoundWordsDA() throws Exception {
37
String[] dict = { "læse", "hest" };
39
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
40
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
41
.getHyphenationTree(is);
43
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
44
new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false),
46
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
47
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
48
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
49
assertTokenStreamContents(tf,
50
new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
51
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
55
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
56
String[] dict = { "basketball", "basket", "ball", "kurv" };
58
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
59
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
60
.getHyphenationTree(is);
62
// the word basket will not be added due to the longest match option
63
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
64
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
66
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
67
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
68
assertTokenStreamContents(tf,
69
new String[] { "basketballkurv", "basketball", "ball", "kurv" },
70
new int[] { 1, 0, 0, 0 }
76
* With hyphenation-only, you can get a lot of nonsense tokens.
77
* This can be controlled with the min/max subword size.
79
public void testHyphenationOnly() throws Exception {
80
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
81
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
82
.getHyphenationTree(is);
84
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
86
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
88
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
92
assertTokenStreamContents(tf,
93
new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
96
tf = new HyphenationCompoundWordTokenFilter(
98
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
100
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
104
assertTokenStreamContents(tf,
105
new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
108
tf = new HyphenationCompoundWordTokenFilter(
109
TEST_VERSION_CURRENT,
110
new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
112
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
116
assertTokenStreamContents(tf,
117
new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
118
"sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
123
public void testDumbCompoundWordsSE() throws Exception {
124
String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
125
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
126
"Sko", "Vind", "Rute", "Torkare", "Blad" };
128
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
131
"Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
132
MockTokenizer.WHITESPACE, false),
135
assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
136
"Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
137
"Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
138
"Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
139
"fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
140
"fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
141
"Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
142
"Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17,
143
17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72,
144
77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137,
145
137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32,
146
28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110,
147
87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145,
148
155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
149
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
153
public void testDumbCompoundWordsSELongestMatch() throws Exception {
154
String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
155
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
156
"Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
158
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
159
new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false),
160
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
161
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
162
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
164
assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
165
"fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
166
14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
170
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
171
String[] dict = {"ab", "cd", "ef"};
173
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
174
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
179
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
180
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
181
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
183
assertTokenStreamContents(tf,
184
new String[] { "abcdef", "ab", "cd", "ef" },
185
new int[] { 0, 0, 2, 4},
186
new int[] { 6, 2, 4, 6},
187
new int[] { 1, 0, 0, 0}
191
public void testWordComponentWithLessThanMinimumLength() throws Exception {
192
String[] dict = {"abc", "d", "efg"};
194
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
195
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
200
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
201
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
202
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
204
// since "d" is shorter than the minimum subword size, it should not be added to the token stream
205
assertTokenStreamContents(tf,
206
new String[] { "abcdefg", "abc", "efg" },
207
new int[] { 0, 0, 4},
208
new int[] { 7, 3, 7},
213
public void testReset() throws Exception {
214
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
215
"Aufgabe", "Überwachung" };
217
Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
218
"Rindfleischüberwachungsgesetz"));
219
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
221
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
222
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
223
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
225
CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
226
assertTrue(tf.incrementToken());
227
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
228
assertTrue(tf.incrementToken());
229
assertEquals("Rind", termAtt.toString());
230
wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
232
assertTrue(tf.incrementToken());
233
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
236
public void testRetainMockAttribute() throws Exception {
237
String[] dict = { "abc", "d", "efg" };
238
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
239
new StringReader("abcdefg"));
240
TokenStream stream = new MockRetainAttributeFilter(tokenizer);
241
stream = new DictionaryCompoundWordTokenFilter(
242
TEST_VERSION_CURRENT, stream, dict,
243
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
244
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
245
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
246
MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
247
while (stream.incrementToken()) {
248
assertTrue("Custom attribute value was lost", retAtt.getRetain());
253
public static interface MockRetainAttribute extends Attribute {
254
void setRetain(boolean attr);
258
public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
259
private boolean retain = false;
261
public void clear() {
264
public boolean getRetain() {
267
public void setRetain(boolean retain) {
268
this.retain = retain;
271
public void copyTo(AttributeImpl target) {
272
MockRetainAttribute t = (MockRetainAttribute) target;
277
private static class MockRetainAttributeFilter extends TokenFilter {
279
MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
281
MockRetainAttributeFilter(TokenStream input) {
286
public boolean incrementToken() throws IOException {
287
if (input.incrementToken()){
288
retainAtt.setRetain(true);