/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.AttributeSource;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;

import org.apache.commons.lang.ArrayUtils;

import java.io.IOException;
import java.io.StringReader;
import java.math.BigInteger;
import java.util.*;
/**
 * A base class for all analysis request handlers.
 *
 * @version $Id: AnalysisRequestHandlerBase.java 1143785 2011-07-07 11:59:59Z uschindler $
 */
public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
60
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
61
rsp.add("analysis", doAnalysis(req));
65
* Performs the analysis based on the given solr request and returns the analysis result as a named list.
67
* @param req The solr request.
69
* @return The analysis result as a named list.
71
* @throws Exception When analysis fails.
73
protected abstract NamedList doAnalysis(SolrQueryRequest req) throws Exception;
76
* Analyzes the given value using the given Analyzer.
78
* @param value Value to analyze
79
* @param context The {@link AnalysisContext analysis context}.
81
* @return NamedList containing the tokens produced by analyzing the given value
83
protected NamedList<List<NamedList>> analyzeValue(String value, AnalysisContext context) {
85
Analyzer analyzer = context.getAnalyzer();
87
if (!TokenizerChain.class.isInstance(analyzer)) {
89
TokenStream tokenStream = null;
91
tokenStream = analyzer.reusableTokenStream(context.getFieldName(), new StringReader(value));
92
} catch (IOException e) {
93
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
95
NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
96
namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
100
TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
101
CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
102
TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
103
TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();
105
NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
107
if( cfiltfacs != null ){
108
String source = value;
109
for(CharFilterFactory cfiltfac : cfiltfacs ){
110
CharStream reader = CharReader.get(new StringReader(source));
111
reader = cfiltfac.create(reader);
112
source = writeCharStream(namedList, reader);
116
TokenStream tokenStream = tfac.create(tokenizerChain.charStream(new StringReader(value)));
117
List<AttributeSource> tokens = analyzeTokenStream(tokenStream);
119
namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
121
ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens);
123
for (TokenFilterFactory tokenFilterFactory : filtfacs) {
124
for (final AttributeSource tok : tokens) {
125
tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
127
tokenStream = tokenFilterFactory.create(listBasedTokenStream);
128
tokens = analyzeTokenStream(tokenStream);
129
namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
130
listBasedTokenStream = new ListBasedTokenStream(tokens);
137
* Analyzes the given text using the given analyzer and returns the produced tokens.
139
* @param value The value to analyze.
140
* @param analyzer The analyzer to use.
142
* @return The produces token list.
143
* @deprecated This method is no longer used by Solr
144
* @see #getQueryTokenSet
147
protected List<AttributeSource> analyzeValue(String value, Analyzer analyzer) {
148
TokenStream tokenStream = analyzer.tokenStream("", new StringReader(value));
149
return analyzeTokenStream(tokenStream);
153
* Analyzes the given text using the given analyzer and returns the produced tokens.
155
* @param query The query to analyze.
156
* @param analyzer The analyzer to use.
158
protected Set<String> getQueryTokenSet(String query, Analyzer analyzer) {
159
final Set<String> tokens = new HashSet<String>();
160
final TokenStream tokenStream = analyzer.tokenStream("", new StringReader(query));
161
final CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
164
while (tokenStream.incrementToken()) {
165
tokens.add(termAtt.toString());
167
} catch (IOException ioe) {
168
throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
174
* Analyzes the given TokenStream, collecting the Tokens it produces.
176
* @param tokenStream TokenStream to analyze
178
* @return List of tokens produced from the TokenStream
180
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
181
final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
182
final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
183
final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
184
// for backwards compatibility, add all "common" attributes
185
tokenStream.addAttribute(CharTermAttribute.class);
186
tokenStream.addAttribute(OffsetAttribute.class);
187
tokenStream.addAttribute(TypeAttribute.class);
191
while (tokenStream.incrementToken()) {
192
position += posIncrAtt.getPositionIncrement();
193
trackerAtt.setActPosition(position);
194
tokens.add(tokenStream.cloneAttributes());
196
} catch (IOException ioe) {
197
throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
203
// a static mapping of the reflected attribute keys to the names used in Solr 1.4
204
static Map<String,String> ATTRIBUTE_MAPPING = Collections.unmodifiableMap(new HashMap<String,String>() {{
205
put(OffsetAttribute.class.getName() + "#startOffset", "start");
206
put(OffsetAttribute.class.getName() + "#endOffset", "end");
207
put(TypeAttribute.class.getName() + "#type", "type");
208
put(TokenTrackingAttribute.class.getName() + "#position", "position");
209
put(TokenTrackingAttribute.class.getName() + "#positionHistory", "positionHistory");
213
* Converts the list of Tokens to a list of NamedLists representing the tokens.
215
* @param tokens Tokens to convert
216
* @param context The analysis context
218
* @return List of NamedLists containing the relevant information taken from the tokens
220
private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokenList, AnalysisContext context) {
221
final List<NamedList> tokensNamedLists = new ArrayList<NamedList>();
222
final FieldType fieldType = context.getFieldType();
223
final AttributeSource[] tokens = tokenList.toArray(new AttributeSource[tokenList.size()]);
225
// sort the tokens by absoulte position
226
ArrayUtil.mergeSort(tokens, new Comparator<AttributeSource>() {
227
public int compare(AttributeSource a, AttributeSource b) {
229
a.getAttribute(TokenTrackingAttribute.class).getPositions(),
230
b.getAttribute(TokenTrackingAttribute.class).getPositions()
234
private int arrayCompare(int[] a, int[] b) {
236
final int stop = Math.min(a.length, b.length);
238
int diff = a[p] - b[p];
239
if (diff != 0) return diff;
242
// One is a prefix of the other, or, they are equal:
243
return a.length - b.length;
247
for (int i = 0; i < tokens.length; i++) {
248
AttributeSource token = tokens[i];
249
final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
250
final String rawText = token.addAttribute(CharTermAttribute.class).toString();
252
String text = fieldType.indexedToReadable(rawText);
253
tokenNamedList.add("text", text);
254
if (!text.equals(rawText)) {
255
tokenNamedList.add("raw_text", rawText);
258
if (context.getTermsToMatch().contains(rawText)) {
259
tokenNamedList.add("match", true);
262
token.reflectWith(new AttributeReflector() {
263
public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
264
// leave out position and term
265
if (CharTermAttribute.class.isAssignableFrom(attClass))
267
if (PositionIncrementAttribute.class.isAssignableFrom(attClass))
270
String k = attClass.getName() + '#' + key;
272
// map keys for "standard attributes":
273
if (ATTRIBUTE_MAPPING.containsKey(k)) {
274
k = ATTRIBUTE_MAPPING.get(k);
277
// TODO: special handling for payloads - move this to ResponseWriter?
278
if (value instanceof Payload) {
279
Payload p = (Payload) value;
281
BigInteger bi = new BigInteger( p.getData() );
282
String ret = bi.toString( 16 );
283
if (ret.length() % 2 != 0) {
293
tokenNamedList.add(k, value);
297
tokensNamedLists.add(tokenNamedList);
300
return tokensNamedLists;
303
private String writeCharStream(NamedList out, CharStream input ){
304
final int BUFFER_SIZE = 1024;
305
char[] buf = new char[BUFFER_SIZE];
307
StringBuilder sb = new StringBuilder();
310
len = input.read( buf, 0, BUFFER_SIZE );
311
} catch (IOException e) {
312
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
315
sb.append(buf, 0, len);
316
} while( len == BUFFER_SIZE );
317
out.add( input.getClass().getName(), sb.toString());
318
return sb.toString();
  // ================================================= Inner classes =================================================
325
* TokenStream that iterates over a list of pre-existing Tokens
328
protected final static class ListBasedTokenStream extends TokenStream {
329
private final List<AttributeSource> tokens;
330
private Iterator<AttributeSource> tokenIterator;
333
* Creates a new ListBasedTokenStream which uses the given tokens as its token source.
335
* @param tokens Source of tokens to be used
337
ListBasedTokenStream(List<AttributeSource> tokens) {
338
this.tokens = tokens;
339
tokenIterator = tokens.iterator();
343
public boolean incrementToken() throws IOException {
344
if (tokenIterator.hasNext()) {
346
AttributeSource next = tokenIterator.next();
347
Iterator<Class<? extends Attribute>> atts = next.getAttributeClassesIterator();
348
while (atts.hasNext()) // make sure all att impls in the token exist here
349
addAttribute(atts.next());
358
public void reset() throws IOException {
360
tokenIterator = tokens.iterator();
364
/** This is an {@link Attribute} used to track the positions of tokens
365
* in the analysis chain.
366
* @lucene.internal This class is only public for usage by the {@link AttributeSource} API.
368
public interface TokenTrackingAttribute extends Attribute {
370
void setActPosition(int pos);
371
int[] getPositions();
372
void reset(int[] basePositions, int position);
375
/** Implementation of {@link TokenTrackingAttribute}.
376
* @lucene.internal This class is only public for usage by the {@link AttributeSource} API.
378
public static final class TokenTrackingAttributeImpl extends AttributeImpl implements TokenTrackingAttribute {
379
private int[] basePositions = new int[0];
380
private int position = 0;
381
private transient int[] cachedPositions = null;
383
public void freezeStage() {
384
this.basePositions = getPositions();
386
this.cachedPositions = null;
389
public void setActPosition(int pos) {
391
this.cachedPositions = null;
394
public int[] getPositions() {
395
if (cachedPositions == null) {
396
cachedPositions = ArrayUtils.add(basePositions, position);
398
return cachedPositions;
401
public void reset(int[] basePositions, int position) {
402
this.basePositions = basePositions;
403
this.position = position;
404
this.cachedPositions = null;
408
public void clear() {
409
// we do nothing here, as all attribute values are controlled externally by consumer
413
public void reflectWith(AttributeReflector reflector) {
414
reflector.reflect(TokenTrackingAttribute.class, "position", position);
415
// convert to Integer[] array, as only such one can be serialized by ResponseWriters
416
reflector.reflect(TokenTrackingAttribute.class, "positionHistory", ArrayUtils.toObject(getPositions()));
420
public void copyTo(AttributeImpl target) {
421
final TokenTrackingAttribute t = (TokenTrackingAttribute) target;
422
t.reset(basePositions, position);
  /**
   * Serves as the context of an analysis process. This context contains the following constructs:
   * the name and type of the field being analyzed, the analyzer to use, and the set of terms
   * that should be marked as matches.
   */
protected static class AnalysisContext {
431
private final String fieldName;
432
private final FieldType fieldType;
433
private final Analyzer analyzer;
434
private final Set<String> termsToMatch;
437
* Constructs a new AnalysisContext with a given field tpe, analyzer and
438
* termsToMatch. By default the field name in this context will be
439
* {@code null}. During the analysis processs, The produced tokens will
440
* be compaired to the terms in the {@code termsToMatch} set. When found,
441
* these tokens will be marked as a match.
443
* @param fieldType The type of the field the analysis is performed on.
444
* @param analyzer The analyzer to be used.
445
* @param termsToMatch Holds all the terms that should match during the
448
public AnalysisContext(FieldType fieldType, Analyzer analyzer, Set<String> termsToMatch) {
449
this(null, fieldType, analyzer, termsToMatch);
453
* Constructs an AnalysisContext with a given field name, field type
454
* and analyzer. By default this context will hold no terms to match
456
* @param fieldName The name of the field the analysis is performed on
457
* (may be {@code null}).
458
* @param fieldType The type of the field the analysis is performed on.
459
* @param analyzer The analyzer to be used during the analysis process.
462
public AnalysisContext(String fieldName, FieldType fieldType, Analyzer analyzer) {
463
this(fieldName, fieldType, analyzer, Collections.EMPTY_SET);
467
* Constructs a new AnalysisContext with a given field tpe, analyzer and
468
* termsToMatch. During the analysis processs, The produced tokens will be
469
* compaired to the termes in the {@code termsToMatch} set. When found,
470
* these tokens will be marked as a match.
472
* @param fieldName The name of the field the analysis is performed on
473
* (may be {@code null}).
474
* @param fieldType The type of the field the analysis is performed on.
475
* @param analyzer The analyzer to be used.
476
* @param termsToMatch Holds all the terms that should match during the
479
public AnalysisContext(String fieldName, FieldType fieldType, Analyzer analyzer, Set<String> termsToMatch) {
480
this.fieldName = fieldName;
481
this.fieldType = fieldType;
482
this.analyzer = analyzer;
483
this.termsToMatch = termsToMatch;
486
public String getFieldName() {
490
public FieldType getFieldType() {
494
public Analyzer getAnalyzer() {
498
public Set<String> getTermsToMatch() {