2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
17
package org.apache.lucene.benchmark.quality.trec;
19
import java.io.BufferedReader;
20
import java.io.IOException;
21
import java.util.ArrayList;
22
import java.util.Arrays;
23
import java.util.HashMap;
25
import org.apache.lucene.benchmark.quality.QualityQuery;
30
* Expects the following topic format:
33
* &lt;num&gt; Number: nnn
35
* &lt;title&gt; title of the topic
37
* &lt;desc&gt; Description:
38
* description of the topic
40
* &lt;narr&gt; Narrative:
41
* "story" composed by assessors.
45
* Comment lines starting with '#' are ignored.
47
public class TrecTopicsReader {
49
private static final String newline = System.getProperty("line.separator");
52
* Creates a new TREC topics reader.
54
public TrecTopicsReader() {
59
* Reads quality queries from a TREC-format topics file.
60
* @param reader where queries are read from.
61
* @return the result quality queries.
62
* @throws IOException if the queries cannot be read.
64
public QualityQuery[] readQueries(BufferedReader reader) throws IOException {
65
ArrayList res = new ArrayList();
68
while (null!=(sb=read(reader,"<top>",null,false,false))) {
69
HashMap fields = new HashMap();
71
sb = read(reader,"<num>",null,true,false);
72
int k = sb.indexOf(":");
73
String id = sb.substring(k+1).trim();
75
sb = read(reader,"<title>",null,true,false);
77
String title = sb.substring(k+1).trim();
79
sb = read(reader,"<desc>",null,false,false);
80
sb = read(reader,"<narr>",null,false,true);
81
String descripion = sb.toString().trim();
83
fields.put("title",title);
84
fields.put("description",descripion);
85
QualityQuery topic = new QualityQuery(id,fields);
87
// skip narrative, get to end of doc
88
read(reader,"</top>",null,false,false);
93
// sort result array (by ID)
94
QualityQuery qq[] = (QualityQuery[]) res.toArray(new QualityQuery[0]);
99
// read until finding a line that starts with the specified prefix
100
private StringBuffer read (BufferedReader reader, String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws IOException {
101
sb = (sb==null ? new StringBuffer() : sb);
104
String line = reader.readLine();
108
if (line.startsWith(prefix)) {
109
if (collectMatchLine) {
120
//System.out.println("read: "+sb);