~slub.team/goobi-indexserver/3.x

protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {

// Legacy overload without a Version argument: pins Version.LUCENE_30, converts the
// String[] with makeDictionary and forwards the size limits and flag unchanged.
this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);

}

/**

* @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], boolean)} instead

@Deprecated

protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {

this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);

}

/**

100

* @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, boolean)} instead

101

102

@Deprecated

103

protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {

104

this(Version.LUCENE_30, input, dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);

105

}

/**
 * Legacy overload taking only a {@code String[]} dictionary.
 * Pins {@link Version#LUCENE_30}, converts the array with
 * {@link #makeDictionary(Version, String[])}, applies the default word and subword size
 * limits and passes {@code false} for {@code onlyLongestMatch}.
 *
 * @param input upstream token stream
 * @param dictionary word list; {@code null} is passed through as a {@code null} dictionary
 * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[])} instead
 */
@Deprecated
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
  this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}

/**
 * Legacy overload taking only a {@code Set} dictionary.
 * Pins {@link Version#LUCENE_30}, applies the default word and subword size limits and
 * passes {@code false} for {@code onlyLongestMatch}.
 *
 * @param input upstream token stream
 * @param dictionary word set, forwarded unchanged to the primary constructor
 * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set)} instead
 */
@Deprecated
protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary) {
  this(Version.LUCENE_30, input, dictionary,
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}

/**
 * Legacy overload with explicit size limits and no {@link Version}.
 * Pins {@link Version#LUCENE_30} and forwards every other argument unchanged.
 *
 * @param input upstream token stream
 * @param dictionary word set, forwarded unchanged to the primary constructor
 * @param minWordSize minimum term length for a token to be decomposed
 * @param minSubwordSize lower subword size limit, stored for the decomposition logic
 * @param maxSubwordSize upper subword size limit, stored for the decomposition logic
 * @param onlyLongestMatch flag stored for the decomposition logic
 * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, int, int, int, boolean)} instead
 */
@Deprecated
protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary,
    int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  this(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}

/**
 * Creates a filter from a {@code String[]} dictionary with explicit size limits.
 * The array is converted with {@link #makeDictionary(Version, String[])} before
 * delegating to the {@code Set}-based constructor.
 *
 * @param matchVersion version forwarded to the dictionary conversion and the primary constructor
 * @param input upstream token stream
 * @param dictionary word list; {@code null} is passed through as a {@code null} dictionary
 * @param minWordSize minimum term length for a token to be decomposed
 * @param minSubwordSize lower subword size limit, stored for the decomposition logic
 * @param maxSubwordSize upper subword size limit, stored for the decomposition logic
 * @param onlyLongestMatch flag stored for the decomposition logic
 */
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary,
    int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  this(matchVersion, input, makeDictionary(matchVersion, dictionary),
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}

/**
 * Creates a filter from a {@code String[]} dictionary using the default word and subword
 * size limits. The array is converted with {@link #makeDictionary(Version, String[])}.
 *
 * @param matchVersion version forwarded to the dictionary conversion and the primary constructor
 * @param input upstream token stream
 * @param dictionary word list; {@code null} is passed through as a {@code null} dictionary
 * @param onlyLongestMatch flag stored for the decomposition logic
 */
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary,
    boolean onlyLongestMatch) {
  this(matchVersion, input, makeDictionary(matchVersion, dictionary),
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}

/**
 * Creates a filter from a {@code Set} dictionary using the default word and subword size
 * limits.
 *
 * @param matchVersion version forwarded to the primary constructor
 * @param input upstream token stream
 * @param dictionary word set, forwarded unchanged to the primary constructor
 * @param onlyLongestMatch flag stored for the decomposition logic
 */
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary,
    boolean onlyLongestMatch) {
  this(matchVersion, input, dictionary,
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}

/**
 * Creates a filter from a {@code String[]} dictionary using the default word and subword
 * size limits and {@code false} for {@code onlyLongestMatch}. The array is converted with
 * {@link #makeDictionary(Version, String[])}.
 *
 * @param matchVersion version forwarded to the dictionary conversion and the primary constructor
 * @param input upstream token stream
 * @param dictionary word list; {@code null} is passed through as a {@code null} dictionary
 */
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
  this(matchVersion, input, makeDictionary(matchVersion, dictionary),
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}

/**
 * Creates a filter from a {@code Set} dictionary using the default word and subword size
 * limits and {@code false} for {@code onlyLongestMatch}.
 *
 * @param matchVersion version forwarded to the primary constructor
 * @param input upstream token stream
 * @param dictionary word set, forwarded unchanged to the primary constructor
 */
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
  this(matchVersion, input, dictionary,
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}

/**
 * Primary constructor: stores the configuration and normalises the dictionary to a
 * {@link CharArraySet}.
 *
 * @param matchVersion version handed to {@link CharArraySet} when the dictionary must be copied
 * @param input upstream token stream, passed to the superclass
 * @param dictionary word set; kept as-is when it is {@code null} or already a
 *        {@link CharArraySet}, otherwise copied into a new {@code CharArraySet}
 * @param minWordSize minimum term length for a token to be decomposed
 * @param minSubwordSize lower subword size limit, stored for the decomposition logic
 * @param maxSubwordSize upper subword size limit, stored for the decomposition logic
 * @param onlyLongestMatch flag stored for the decomposition logic
 */
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary,
    int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  super(input);

  this.tokens = new LinkedList<CompoundToken>();
  this.minWordSize = minWordSize;
  this.minSubwordSize = minSubwordSize;
  this.maxSubwordSize = maxSubwordSize;
  this.onlyLongestMatch = onlyLongestMatch;

  if (dictionary == null || dictionary instanceof CharArraySet) {
    // Reuse the caller's set directly (or keep null) instead of copying it.
    this.dictionary = (CharArraySet) dictionary;
  } else {
    // NOTE(review): the trailing 'true' is CharArraySet's ignoreCase argument per the
    // Lucene 3.x API, so copied dictionaries match case-insensitively - confirm.
    this.dictionary = new CharArraySet(matchVersion, dictionary, true);
  }
}

166

167

/** @deprecated Only available for backwards compatibility. */

168

@Deprecated

169

public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) {

170

if (dictionary == null) {

171

return null;

172

}

173

return new CharArraySet(matchVersion, Arrays.asList(dictionary), true);

174

}

175

176

@Override

177

public final boolean incrementToken() throws IOException {

178

if (!tokens.isEmpty()) {

179

assert current != null;

180

CompoundToken token = tokens.removeFirst();

181

restoreState(current); // keep all other attributes untouched

182

termAtt.setEmpty().append(token.txt);

183

offsetAtt.setOffset(token.startOffset, token.endOffset);

184

posIncAtt.setPositionIncrement(0);

185

return true;

186

}

187

188

current = null; // not really needed, but for safety

189

if (input.incrementToken()) {

190

// Only words longer than minWordSize get processed

191

if (termAtt.length() >= this.minWordSize) {

192

decompose();

193

// only capture the state if we really need it for producing new tokens

194

if (!tokens.isEmpty()) {

195

current = captureState();

196

}

197

}

198

// return original token:

199

return true;

200

} else {

201

return false;

202

}

203

}

204

205

/** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list.

206

* The original token may not be placed in the list, as it is automatically passed through this filter.

207

208

protected abstract void decompose();

209

210

@Override

211

public void reset() throws IOException {

212

super.reset();

213

tokens.clear();

214

current = null;

215

}

216

217

/**

218

* Helper class to hold decompounded token information

219

220

protected class CompoundToken {

221

public final CharSequence txt;

222

public final int startOffset, endOffset;

223

224

/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */

225

public CompoundToken(int offset, int length) {

226

final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;

227

this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);

228

// TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed

229

// chars from the term, offsets may not match correctly (other filters producing tokens

230

// may also have this problem):

231

this.startOffset = newStart;

232

this.endOffset = newStart + length;

233

}

234

235

}

236

}

Older »