1
/* This file is part of Strigi Desktop Search
3
* Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
5
* This library is free software; you can redistribute it and/or
6
* modify it under the terms of the GNU Library General Public
7
* License as published by the Free Software Foundation; either
8
* version 2 of the License, or (at your option) any later version.
10
* This library is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
* Library General Public License for more details.
15
* You should have received a copy of the GNU Library General Public License
16
* along with this library; see the file COPYING.LIB. If not, write to
17
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18
* Boston, MA 02110-1301, USA.
20
#ifndef STRIGI_ANALYZERCONFIGURATION_H
21
#define STRIGI_ANALYZERCONFIGURATION_H
23
#include "streamendanalyzer.h"
24
#include "streamsaxanalyzer.h"
25
#include "streamlineanalyzer.h"
26
#include "streamthroughanalyzer.h"
27
#include "streameventanalyzer.h"
28
#include "fieldtypes.h"
31
class AnalyzerConfigurationPrivate;
33
* @brief This class provides information and functions to control
36
* For example, it allows the files to be indexed to be limited based
37
* on the name and path of the files. It also stores the field
38
* register (see AnalyzerConfiguration::fieldRegister and
39
* Strigi::FieldRegister).
41
* It can be subclassed to provide finer control over the analysis
44
class STREAMANALYZER_EXPORT AnalyzerConfiguration {
47
* @brief Provides hints about how the IndexWriter should store
50
* This is a flag type - the different attributes can be
53
* Note that if neither Indexed nor Stored is set for a field,
54
* the field may not be stored in the index. If it is Stored
55
* but not Indexed, the field value will not be able to be
56
* searched for quickly, but may be returned as part of the
57
* results from a search.
60
None = 0x0000 /**< No hint. */,
61
Binary = 0x0001 /**< The field should be stored as binary data. */,
62
Compressed = 0x0002 /**< If the field is stored, the data
63
should be compressed. */,
64
Indexed = 0x0004 /**< The field should be indexed. */,
65
Stored = 0x0020 /**< The field should be stored. */,
66
Tokenized = 0x0040 /**< If the field contains text, it
67
should be tokenized. */
70
AnalyzerConfigurationPrivate* const p;
72
AnalyzerConfiguration();
73
virtual ~AnalyzerConfiguration();
75
* @brief Whether a given file should be indexed.
77
* In the default implementation, the path and filename
78
* are checked against the filters specified by setFilters().
79
* @p path is used if the filter pattern contains a /,
80
* and @p filename is checked otherwise.
82
* The default implementation only checks against patterns
83
* that do not end with @c /
85
* @param path the path to the file (eg: "/folder/a.txt")
86
* @param filename the name of the file (eg: "a.txt")
88
virtual bool indexFile(const char* path, const char* filename) const;
90
* @brief Whether a given directory should be indexed.
92
* In the default implementation, the path and filename
93
* are checked against the filters specified by setFilters().
94
* @p path is used if the filter pattern contains a /,
95
* and @p filename is checked otherwise.
97
* The default implementation only checks against patterns
100
* @param path the path to the directory, including
102
* @param filename the name of the directory
104
virtual bool indexDir(const char* path, const char* filename) const;
106
* @brief Whether to use the given factory.
108
* Allows you to prevent the analyzers produced by a particular
109
* factory from being used.
111
* The default implementation allows all factories.
113
virtual bool useFactory(StreamAnalyzerFactory*) const {
117
* @brief Whether to use the given factory.
119
* This is an overloaded function. See
120
* useFactory(StreamAnalyzerFactory*)
121
* for more information.
123
virtual bool useFactory(StreamEndAnalyzerFactory* f) const {
124
return useFactory(static_cast<StreamAnalyzerFactory*>(f));
127
* @brief Whether to use the given factory.
129
* This is an overloaded function. See
130
* useFactory(StreamAnalyzerFactory*)
131
* for more information.
133
virtual bool useFactory(StreamThroughAnalyzerFactory* f) const {
134
return useFactory(static_cast<StreamAnalyzerFactory*>(f));
137
* @brief Whether to use the given factory.
139
* This is an overloaded function. See
140
* useFactory(StreamAnalyzerFactory*)
141
* for more information.
143
virtual bool useFactory(StreamSaxAnalyzerFactory* f) const {
144
return useFactory(static_cast<StreamAnalyzerFactory*>(f));
147
* @brief Whether to use the given factory.
149
* This is an overloaded function. See
150
* useFactory(StreamAnalyzerFactory*)
151
* for more information.
153
virtual bool useFactory(StreamEventAnalyzerFactory* f) const {
154
return useFactory(static_cast<StreamAnalyzerFactory*>(f));
157
* @brief Whether to use the given factory.
159
* This is an overloaded function. See
160
* useFactory(StreamAnalyzerFactory*)
161
* for more information.
163
virtual bool useFactory(StreamLineAnalyzerFactory* f) const {
164
return useFactory(static_cast<StreamAnalyzerFactory*>(f));
167
* @brief Allows end analyzers to check whether they should continue
170
* This should be called by end analyzers at convenient points to check
171
* whether they should continue indexing. For example, an end analyzer
172
* analyzing a tar archive might call this to check whether it should
173
* index the archive's children.
175
* This can be used to stop the indexing process at the next convenient
176
* time. For example, if the user wishes to interrupt the indexing
177
* process, or if the tool @c deepgrep was asked to find the first
178
* occurrence of a term and then stop.
180
* @return true if indexing should continue, false if it should stop
182
virtual bool indexMore() const {
    // Default policy: never ask the analyzers to stop indexing.
    return true;
}
183
/**
 * NOTE(review): this accessor is undocumented in this header. Presumably
 * it reports the flag set through setIndexArchiveContents(bool), i.e.
 * whether the contents of archives are indexed -- confirm against the
 * implementation. Unlike the neighbouring query functions it is declared
 * non-virtual.
 */
bool indexArchiveContents() const;
185
* @brief Allows end analyzers to check whether they should continue
186
* adding text fragments to the index.
188
* This should be called by end analyzers before adding text
189
* fragments with AnalysisResult::addText().
191
* This can be used to prevent the text index from being created,
192
* or to prevent it from expanding.
194
* @return true if more text should be added to the index, false
195
* if no more text should be added
197
virtual bool addMoreText() const {
201
* @brief Return the maximal number of bytes that may be read from the
202
* stream whose results are being written into @p ar.
204
* This function allows one to do analyses that only look at the first
205
* bytes of streams for performance reasons. A scenario could be for getting
206
* metadata for showing in a file manager.
208
* The individual analyzers should honour the value that is returned from
209
* this function. They should also not assume that this value is constant
210
* during the analysis and should regularly check whether they have not
213
* @return the maximal number of bytes that may be read, or -1 if there is
216
virtual int64_t maximalStreamReadLength(const Strigi::AnalysisResult&/*ar*/) {
220
* @brief Determine the field indexing properties of a field.
222
* See AnalyzerConfiguration::FieldType for more information.
224
* @param f the field to determine the indexing properties for
226
virtual FieldType indexType(const Strigi::RegisteredField* f) const;
228
* @brief Set the list of patterns used to filter out files and directories
231
* These are used in the default implementations of indexFile() and
232
* indexDir(). They are parsed in strict order of occurrence in the
235
* Each filter is a pair, linking a boolean to a string. The string
236
* should be a shell wildcard pattern (note that wildcards can match
237
* @c /, but will not match a leading @c . in a filename). The boolean
238
* indicates whether files that match should be indexed (true) or not
241
* If the pattern ends with a @c /, it will only match directories.
242
* Otherwise, the pattern will only match files.
244
* The first pattern that matches will be used to determine whether
245
* a file or directory should be indexed, and subsequent patterns
248
* TODO: write proper documentation of the pattern syntax.
250
* @param filters a list of pairs of patterns together with whether
251
* files or directories matching the pattern should be indexed
253
void setFilters(const std::vector<std::pair<bool,std::string> >& filters);
255
* @brief Get the list of patterns used to filter out files and directories
258
* See setFilters() for more details.
260
const std::vector<std::pair<bool,std::string> >& filters() const;
262
* @brief Get the field register.
264
* This gets the field register for this analysis. This will contain
265
* all the fields that may be assigned to during this analysis.
266
* There is no guarantee that all these fields will be assigned to.
267
* In fact, it is pretty certain that not all the fields in the
268
* field register will be assigned to, since this will depend on the
269
* files being analyzed.
271
* The field register is populated by the registerFields() function
272
* of the analyzer factories.
274
* @return the field register
276
FieldRegister& fieldRegister();
278
* @brief Get the field register.
280
* See the documentation for the non-const version of this function.
282
const FieldRegister& fieldRegister() const;
286
* indexArchiveContents is not virtual to keep binary compatibility.
288
void setIndexArchiveContents( bool );
292
* Overloaded operator| that retains the type of the flag when |'ing two
295
//AnalyzerConfiguration::FieldType
296
//operator|(AnalyzerConfiguration::FieldType a, AnalyzerConfiguration::FieldType b);