/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_search_highlight_tokensources_
#define _lucene_search_highlight_tokensources_

#include "CLucene/analysis/AnalysisHeader.h"
CL_CLASS_DEF(index, IndexReader)
CL_CLASS_DEF(index, TermPositionVector)
//#include "CLucene/index/IndexReader.h"
//#include "CLucene/index/TermVector.h"

CL_NS_DEF2(search,highlight)
class CLUCENE_CONTRIBS_EXPORT TokenSources: LUCENE_BASE
21
//an object used to iterate across an array of tokens
22
class StoredTokenStream:public CL_NS(analysis)::TokenStream
25
CL_NS(analysis)::Token** tokens;
28
StoredTokenStream(CL_NS(analysis)::Token** tokens, size_t len);
29
CL_NS(analysis)::Token* next(CL_NS(analysis)::Token* token);
37
* A convenience method that tries a number of approaches to getting a token stream.
38
* The cost of finding there are no termVectors in the index is minimal (1000 invocations still
39
* registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable
44
* @return null if field not stored correctly
47
static CL_NS(analysis)::TokenStream* getAnyTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field, CL_NS(analysis)::Analyzer* analyzer);
49
static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::TermPositionVector* tpv);
53
* Returns a token stream or null if no offset info available in index.
54
* This can be used to feed the highlighter with a pre-parsed token stream
56
* In my tests the speeds to recreate 1000 token streams using this method are:
57
* - with TermVector offset only data stored - 420 milliseconds
58
* - with TermVector offset AND position data stored - 271 milliseconds
59
* (nb timings for TermVector with position data are based on a tokenizer with contiguous
60
* positions - no overlaps or gaps)
61
* The cost of not using TermPositionVector to store
62
* pre-parsed content and using an analyzer to re-parse the original content:
63
* - reanalyzing the original content - 980 milliseconds
65
* The re-analyze timings will typically vary depending on -
66
* 1) The complexity of the analyzer code (timings above were using a
67
* stemmer/lowercaser/stopword combo)
68
* 2) The number of other fields (Lucene reads ALL fields off the disk
69
* when accessing just one document field - can cost dear!)
70
* 3) Use of compression on field storage - could be faster cos of compression (less disk IO)
71
* or slower (more CPU burn) depending on the content.
74
* @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking
75
* to eek out the last drops of performance, set to true. If in doubt, set to false.
77
static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::TermPositionVector* tpv, bool tokenPositionsGuaranteedContiguous);
79
static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field);
82
static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field,CL_NS(analysis)::Analyzer* analyzer);