1
package org.apache.lucene.benchmark.byTask.feeds;
4
* Licensed to the Apache Software Foundation (ASF) under one or more
5
* contributor license agreements. See the NOTICE file distributed with
6
* this work for additional information regarding copyright ownership.
7
* The ASF licenses this file to You under the Apache License, Version 2.0
8
* (the "License"); you may not use this file except in compliance with
9
* the License. You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
20
import java.io.Closeable;
21
import java.io.IOException;
22
import java.io.UnsupportedEncodingException;
23
import java.util.HashMap;
24
import java.util.Calendar;
26
import java.util.Properties;
27
import java.util.Locale;
28
import java.util.Random;
29
import java.util.Date;
30
import java.util.concurrent.atomic.AtomicInteger;
31
import java.text.SimpleDateFormat;
32
import java.text.ParsePosition;
34
import org.apache.lucene.benchmark.byTask.utils.Config;
35
import org.apache.lucene.document.Document;
36
import org.apache.lucene.document.Field;
37
import org.apache.lucene.document.NumericField;
38
import org.apache.lucene.document.Field.Index;
39
import org.apache.lucene.document.Field.Store;
40
import org.apache.lucene.document.Field.TermVector;
43
* Creates {@link Document} objects. Uses a {@link ContentSource} to generate
44
* {@link DocData} objects. Supports the following parameters:
46
* <li><b>content.source</b> - specifies the {@link ContentSource} class to use
47
* (default <b>SingleDocSource</b>).
48
* <li><b>doc.stored</b> - specifies whether fields should be stored (default <b>false</b>).
50
* <li><b>doc.body.stored</b> - specifies whether the body field should be stored (default
51
* = <b>doc.stored</b>).
52
* <li><b>doc.tokenized</b> - specifies whether fields should be tokenized
53
* (default <b>true</b>).
54
* <li><b>doc.body.tokenized</b> - specifies whether the
55
* body field should be tokenized (default = <b>doc.tokenized</b>).
56
* <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
57
* the index or not. (default <b>false</b>).
58
* <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
59
* stored in the index for the body field. This can be set to true, while
60
* <code>doc.tokenized.norms</code> is set to false, to allow norms storing just
61
* for the body field. (default <b>true</b>).
62
* <li><b>doc.term.vector</b> - specifies whether term vectors should be stored
63
* for fields (default <b>false</b>).
64
* <li><b>doc.term.vector.positions</b> - specifies whether term vectors should
65
* be stored with positions (default <b>false</b>).
66
* <li><b>doc.term.vector.offsets</b> - specifies whether term vectors should be
67
* stored with offsets (default <b>false</b>).
68
* <li><b>doc.store.body.bytes</b> - specifies whether to store the raw bytes of
69
* the document's content in the document (default <b>false</b>).
70
* <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
71
* should be reused (default <b>true</b>).
72
* <li><b>doc.index.props</b> - specifies whether the properties returned by
73
* {@link DocData#getProps()} will be indexed. (default <b>false</b>).
74
* <li><b>doc.random.id.limit</b> - if specified, docs will be assigned random
75
* IDs from 0 to this limit. This is useful with UpdateDoc
76
* for testing performance of IndexWriter.updateDocument.
79
public class DocMaker implements Closeable {
81
private static class LeftOver {
88
private int updateDocIDLimit;
90
static class DocState {
92
private final Map<String,Field> fields;
93
private final Map<String,NumericField> numericFields;
94
private final boolean reuseFields;
96
DocData docData = new DocData();
98
public DocState(boolean reuseFields, Store store, Store bodyStore, Index index, Index bodyIndex, TermVector termVector) {
100
this.reuseFields = reuseFields;
103
fields = new HashMap<String,Field>();
104
numericFields = new HashMap<String,NumericField>();
106
// Initialize the map with the default fields.
107
fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyStore, bodyIndex, termVector));
108
fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector));
109
fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector));
110
fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
111
fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));
113
numericFields.put(DATE_MSEC_FIELD, new NumericField(DATE_MSEC_FIELD));
114
numericFields.put(TIME_SEC_FIELD, new NumericField(TIME_SEC_FIELD));
116
doc = new Document();
118
numericFields = null;
125
* Returns a field corresponding to the field name. If
126
* <code>reuseFields</code> was set to true, then it attempts to reuse a
127
* Field instance. If such a field does not exist, it creates a new one.
129
Field getField(String name, Store store, Index index, TermVector termVector) {
131
return new Field(name, "", store, index, termVector);
134
Field f = fields.get(name);
136
f = new Field(name, "", store, index, termVector);
142
NumericField getNumericField(String name) {
144
return new NumericField(name);
147
NumericField f = numericFields.get(name);
149
f = new NumericField(name);
150
numericFields.put(name, f);
156
private boolean storeBytes = false;
158
private static class DateUtil {
159
public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
160
public Calendar cal = Calendar.getInstance();
161
public ParsePosition pos = new ParsePosition(0);
163
parser.setLenient(true);
167
// leftovers are thread local, because it is unsafe to share residues between threads
168
private ThreadLocal<LeftOver> leftovr = new ThreadLocal<LeftOver>();
169
private ThreadLocal<DocState> docState = new ThreadLocal<DocState>();
170
private ThreadLocal<DateUtil> dateParsers = new ThreadLocal<DateUtil>();
172
public static final String BODY_FIELD = "body";
173
public static final String TITLE_FIELD = "doctitle";
174
public static final String DATE_FIELD = "docdate";
175
public static final String DATE_MSEC_FIELD = "docdatenum";
176
public static final String TIME_SEC_FIELD = "doctimesecnum";
177
public static final String ID_FIELD = "docid";
178
public static final String BYTES_FIELD = "bytes";
179
public static final String NAME_FIELD = "docname";
181
protected Config config;
183
protected Store storeVal = Store.NO;
184
protected Store bodyStoreVal = Store.NO;
185
protected Index indexVal = Index.ANALYZED_NO_NORMS;
186
protected Index bodyIndexVal = Index.ANALYZED;
187
protected TermVector termVecVal = TermVector.NO;
189
protected ContentSource source;
190
protected boolean reuseFields;
191
protected boolean indexProperties;
193
private final AtomicInteger numDocsCreated = new AtomicInteger();
196
// use only part of the body, modify it to keep the rest (or use all if size==0).
197
// reset the docdata properties so they are not added more than once.
198
private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
200
final DocState ds = getDocState();
201
final Document doc = reuseFields ? ds.doc : new Document();
202
doc.getFields().clear();
205
Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
208
id = r.nextInt(updateDocIDLimit);
210
id = docData.getID();
212
id = numDocsCreated.getAndIncrement();
215
idField.setValue(Integer.toString(id));
219
String name = docData.getName();
220
if (name == null) name = "";
221
name = cnt < 0 ? name : name + "_" + cnt;
222
Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal);
223
nameField.setValue(name);
227
DateUtil util = dateParsers.get();
229
util = new DateUtil();
230
dateParsers.set(util);
233
String dateString = docData.getDate();
234
if (dateString != null) {
235
util.pos.setIndex(0);
236
date = util.parser.parse(dateString, util.pos);
237
//System.out.println(dateString + " parsed to " + date);
241
Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
242
dateStringField.setValue(dateString);
243
doc.add(dateStringField);
246
// just set to right now
250
NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD);
251
dateField.setLongValue(date.getTime());
254
util.cal.setTime(date);
255
final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND);
257
NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD);
258
timeSecField.setIntValue(sec);
259
doc.add(timeSecField);
262
String title = docData.getTitle();
263
Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
264
titleField.setValue(title == null ? "" : title);
267
String body = docData.getBody();
268
if (body != null && body.length() > 0) {
270
if (size <= 0 || size >= body.length()) {
271
bdy = body; // use all
272
docData.setBody(""); // nothing left
274
// attempt not to break words - if whitespace found within next 20 chars...
275
for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
276
if (Character.isWhitespace(body.charAt(n))) {
281
bdy = body.substring(0, size); // use part
282
docData.setBody(body.substring(size)); // some left
284
Field bodyField = ds.getField(BODY_FIELD, bodyStoreVal, bodyIndexVal, termVecVal);
285
bodyField.setValue(bdy);
289
Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
290
bytesField.setValue(bdy.getBytes("UTF-8"));
295
if (indexProperties) {
296
Properties props = docData.getProps();
298
for (final Map.Entry<Object,Object> entry : props.entrySet()) {
299
Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
300
f.setValue((String) entry.getValue());
303
docData.setProps(null);
307
//System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
311
private void resetLeftovers() {
315
protected DocState getDocState() {
316
DocState ds = docState.get();
318
ds = new DocState(reuseFields, storeVal, bodyStoreVal, indexVal, bodyIndexVal, termVecVal);
325
* Closes the {@link DocMaker}. The base implementation closes the
326
* {@link ContentSource}, and it can be overridden to do more work (but make
327
* sure to call super.close()).
329
public void close() throws IOException {
334
* Returns the number of bytes generated by the content source since last
337
public synchronized long getBytesCount() {
338
return source.getBytesCount();
342
* Returns the total number of bytes that were generated by the content source
343
* defined for this doc maker.
345
public long getTotalBytesCount() {
346
return source.getTotalBytesCount();
350
* Creates a {@link Document} object ready for indexing. This method uses the
351
* {@link ContentSource} to get the next document from the source, and creates
352
* a {@link Document} object from the returned fields. If
353
* <code>reuseFields</code> was set to true, it will reuse {@link Document}
354
* and {@link Field} instances.
356
public Document makeDocument() throws Exception {
358
DocData docData = source.getNextDocData(getDocState().docData);
359
Document doc = createDocument(docData, 0, -1);
364
* Same as {@link #makeDocument()}, only this method creates a document of the
365
* size given by <code>size</code>.
367
public Document makeDocument(int size) throws Exception {
368
LeftOver lvr = leftovr.get();
369
if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null
370
|| lvr.docdata.getBody().length() == 0) {
373
DocData docData = getDocState().docData;
374
DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
375
int cnt = (lvr == null ? 0 : lvr.cnt);
376
while (dd.getBody() == null || dd.getBody().length() < size) {
378
dd = source.getNextDocData(new DocData());
380
dd.setBody(dd2.getBody() + dd.getBody());
382
Document doc = createDocument(dd, size, cnt);
383
if (dd.getBody() == null || dd.getBody().length() == 0) {
387
lvr = new LeftOver();
396
/** Reset inputs so that the test run would behave, input wise, as if it just started. */
397
public synchronized void resetInputs() throws IOException {
398
source.printStatistics("docs");
399
// re-initiate since properties by round may have changed.
401
source.resetInputs();
402
numDocsCreated.set(0);
406
/** Set the configuration parameters of this doc maker. */
407
public void setConfig(Config config) {
408
this.config = config;
410
String sourceClass = config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource");
411
source = Class.forName(sourceClass).asSubclass(ContentSource.class).newInstance();
412
source.setConfig(config);
413
} catch (Exception e) {
414
// Should not get here. Throw runtime exception.
415
throw new RuntimeException(e);
418
boolean stored = config.get("doc.stored", false);
419
boolean bodyStored = config.get("doc.body.stored", stored);
420
boolean tokenized = config.get("doc.tokenized", true);
421
boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
422
boolean norms = config.get("doc.tokenized.norms", false);
423
boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
424
boolean termVec = config.get("doc.term.vector", false);
425
storeVal = (stored ? Field.Store.YES : Field.Store.NO);
426
bodyStoreVal = (bodyStored ? Field.Store.YES : Field.Store.NO);
428
indexVal = norms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
430
indexVal = norms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;
434
bodyIndexVal = bodyNorms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
436
bodyIndexVal = bodyNorms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;
439
boolean termVecPositions = config.get("doc.term.vector.positions", false);
440
boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
441
if (termVecPositions && termVecOffsets) {
442
termVecVal = TermVector.WITH_POSITIONS_OFFSETS;
443
} else if (termVecPositions) {
444
termVecVal = TermVector.WITH_POSITIONS;
445
} else if (termVecOffsets) {
446
termVecVal = TermVector.WITH_OFFSETS;
447
} else if (termVec) {
448
termVecVal = TermVector.YES;
450
termVecVal = TermVector.NO;
452
storeBytes = config.get("doc.store.body.bytes", false);
454
reuseFields = config.get("doc.reuse.fields", true);
456
// In a multi-rounds run, it is important to reset DocState since settings
457
// of fields may change between rounds, and this is the only way to reset
458
// the cache of all threads.
459
docState = new ThreadLocal<DocState>();
461
indexProperties = config.get("doc.index.props", false);
463
updateDocIDLimit = config.get("doc.random.id.limit", -1);
464
if (updateDocIDLimit != -1) {