* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
16
#include "CLucene/StdHeader.h"
17
#include "CLucene/_clucene-config.h"
9
19
#include "CLucene.h"
10
#include "CLucene/util/Reader.h"
20
#include "CLucene/util/CLStreams.h"
21
#include "CLucene/util/dirent.h"
22
#include "CLucene/config/repl_tchar.h"
11
23
#include "CLucene/util/Misc.h"
12
#include "CLucene/util/dirent.h"
24
#include "CLucene/util/StringBuffer.h"
16
26
using namespace std;
17
27
using namespace lucene::index;
20
30
using namespace lucene::store;
21
31
using namespace lucene::document;
23
Document* FileDocument(const char* f){
24
// make a new, empty document
25
Document* doc = _CLNEW Document();
27
// Add the path of the file as a field named "path". Use a Tex t field, so
28
// that the index stores the path, and so that the path is searchable
30
STRCPY_AtoT(tf,f,CL_MAX_DIR);
31
doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );
33
// Add the last modified date of the file a field named "modified". Use a
34
// Keyword field, so that it's searchable, but so that no attempt is made
35
// to tokenize the field into words.
36
//doc->add( *Field.Keyword("modified", DateField.timeToString(f->lastModified())));
38
// Add the contents of the file a field named "contents". Use a Text
39
// field, specifying a Reader, so that the text of the file is tokenized.
41
//read the data without any encoding. if you want to use special encoding
42
//see the contrib/jstreams - they contain various types of stream readers
33
void FileDocument(const char* f, Document* doc){
35
// Add the path of the file as a field named "path". Use an indexed and stored field, so
36
// that the index stores the path, and so that the path is searchable.
38
STRCPY_AtoT(tf,f,CL_MAX_DIR);
39
doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );
41
// Add the last modified date of the file a field named "modified". Again, we make it
42
// searchable, but no attempt is made to tokenize the field into words.
43
//doc->add( *_CLNEW Field(_T("modified"), DateTools::timeToString(f->lastModified()), Field::STORE_YES | Field::INDEX_NO));
45
// Add the contents of the file a field named "contents". This time we use a tokenized
46
// field so that the text can be searched for words in it.
48
// Here we read the data without any encoding. If you want to use special encoding
49
// see the contrib/jstreams - they contain various types of stream readers
43
50
FILE* fh = fopen(f,"r");
46
// use fstat for portability
50
str.reserve(filestat.st_size);
51
//str.reserve(fileSize(fh->_file));
64
doc->add( *_CLNEW Field(_T("contents"),str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED) );
67
//_tprintf(_T("%s\n"),doc->toString());
68
// return the document
72
void indexDocs(IndexWriter* writer, char* directory) {
73
DIR* dir = opendir(directory);
79
char path[CL_MAX_DIR];
80
strcpy(path,directory);
81
strcat(path,PATH_DELIMITERA);
82
char* pathP = path + strlen(path);
86
if ( (strcmp(fl->d_name, ".")) && (strcmp(fl->d_name, "..")) ) {
88
strcat(pathP,fl->d_name);
89
int32_t ret = fileStat(path,&buf);
90
if ( buf.st_mode & S_IFDIR ) {
91
indexDocs(writer, path );
93
printf( "adding: %s\n", fl->d_name );
95
Document* doc = FileDocument( path );
96
writer->addDocument( doc );
105
printf( "adding: %s\n", directory);
107
Document* doc = FileDocument( directory );
108
writer->addDocument( doc );
112
void IndexFiles(char* path, char* target, const bool clearIndex){
65
doc->add( *_CLNEW Field(_T("contents"), str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED) );
69
void indexDocs(IndexWriter* writer, const char* directory) {
71
std::sort(files.begin(),files.end());
72
Misc::listFiles(directory,files,true);
73
vector<string>::iterator itr = files.begin();
75
// Re-use the document object
78
while ( itr != files.end() ){
79
const char* path = itr->c_str();
80
printf( "adding file %d: %s\n", ++i, path );
83
FileDocument( path, &doc );
84
writer->addDocument( &doc );
88
void IndexFiles(const char* path, const char* target, const bool clearIndex){
113
89
IndexWriter* writer = NULL;
114
//lucene::analysis::SimpleAnalyzer* an = *_CLNEW lucene::analysis::SimpleAnalyzer();
115
lucene::analysis::standard::StandardAnalyzer an;
90
lucene::analysis::WhitespaceAnalyzer an;
117
92
if ( !clearIndex && IndexReader::indexExists(target) ){
118
93
if ( IndexReader::isLocked(target) ){
125
100
writer = _CLNEW IndexWriter( target ,&an, true);
127
writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH);
128
/*printf("Set MaxFieldLength: ");
130
fgets(mfl,250,stdin);
131
mfl[strlen(mfl)-1] = 0;
133
writer->setMaxFieldLength(atoi(mfl));*/
134
//writer->infoStream = cout; //TODO: infoStream - unicode
136
uint64_t str = lucene::util::Misc::currentTimeMillis();
103
//writer->setInfoStream(&std::cout);
105
// We can tell the writer to flush at certain occasions
106
//writer->setRAMBufferSizeMB(0.5);
107
//writer->setMaxBufferedDocs(3);
109
// To bypass a possible exception (we have no idea what we will be indexing...)
110
writer->setMaxFieldLength(0x7FFFFFFFL); // LUCENE_INT32_MAX_SHOULDBE
112
// Turn this off to make indexing faster; we'll turn it on later before optimizing
113
writer->setUseCompoundFile(false);
115
uint64_t str = Misc::currentTimeMillis();
138
117
indexDocs(writer, path);
119
// Make the index use as little files as possible, and optimize it
120
writer->setUseCompoundFile(true);
123
// Close and clean up
143
printf("Indexing took: %d ms.\n\n", lucene::util::Misc::currentTimeMillis() - str);
127
printf("Indexing took: %d ms.\n\n", (int32_t)(Misc::currentTimeMillis() - str));