~ubuntu-branches/ubuntu/wily/ruby-ferret/wily

« back to all changes in this revision

Viewing changes to lib/ferret/index.rb

Committer: Bazaar Package Importer
Author(s): Antonio Terceiro
Date: 2011-07-28 00:02:49 UTC
Revision ID: james.westby@ubuntu.com-20110728000249-v0443y69ftcpxwi6

Tags: upstream-0.11.6

Import upstream version 0.11.6

files added:

CHANGELOG

MIT-LICENSE

README

Rakefile

TODO

TUTORIAL

bin/ferret-browser

ext/analysis.c

ext/analysis.h

ext/api.c

ext/api.h

ext/array.c

ext/array.h

ext/bitvector.c

ext/bitvector.h

ext/compound_io.c

ext/config.h

ext/document.c

ext/document.h

ext/except.c

ext/except.h

ext/extconf.rb

ext/ferret.c

ext/ferret.h

ext/filter.c

ext/fs_store.c

ext/global.c

ext/global.h

ext/hash.c

ext/hash.h

ext/hashset.c

ext/hashset.h

ext/header.h

ext/helper.c

ext/helper.h

ext/inc

ext/inc/lang.h

ext/inc/threading.h

ext/index.c

ext/index.h

ext/lang.h

ext/libstemmer.c

ext/libstemmer.h

ext/mempool.c

ext/mempool.h

ext/modules.h

ext/multimapper.c

ext/multimapper.h

ext/posh.c

ext/posh.h

ext/priorityqueue.c

ext/priorityqueue.h

ext/q_boolean.c

ext/q_const_score.c

ext/q_filtered_query.c

ext/q_fuzzy.c

ext/q_match_all.c

ext/q_multi_term.c

ext/q_parser.c

ext/q_phrase.c

ext/q_prefix.c

ext/q_range.c

ext/q_span.c

ext/q_term.c

ext/q_wildcard.c

ext/r_analysis.c

ext/r_index.c

ext/r_qparser.c

ext/r_search.c

ext/r_store.c

ext/r_utils.c

ext/ram_store.c

ext/search.c

ext/search.h

ext/similarity.c

ext/similarity.h

ext/sort.c

ext/stem_ISO_8859_1_danish.c

ext/stem_ISO_8859_1_danish.h

ext/stem_ISO_8859_1_dutch.c

ext/stem_ISO_8859_1_dutch.h

ext/stem_ISO_8859_1_english.c

ext/stem_ISO_8859_1_english.h

ext/stem_ISO_8859_1_finnish.c

ext/stem_ISO_8859_1_finnish.h

ext/stem_ISO_8859_1_french.c

ext/stem_ISO_8859_1_french.h

ext/stem_ISO_8859_1_german.c

ext/stem_ISO_8859_1_german.h

ext/stem_ISO_8859_1_italian.c

ext/stem_ISO_8859_1_italian.h

ext/stem_ISO_8859_1_norwegian.c

ext/stem_ISO_8859_1_norwegian.h

ext/stem_ISO_8859_1_porter.c

ext/stem_ISO_8859_1_porter.h

ext/stem_ISO_8859_1_portuguese.c

ext/stem_ISO_8859_1_portuguese.h

ext/stem_ISO_8859_1_spanish.c

ext/stem_ISO_8859_1_spanish.h

ext/stem_ISO_8859_1_swedish.c

ext/stem_ISO_8859_1_swedish.h

ext/stem_KOI8_R_russian.c

ext/stem_KOI8_R_russian.h

ext/stem_UTF_8_danish.c

ext/stem_UTF_8_danish.h

ext/stem_UTF_8_dutch.c

ext/stem_UTF_8_dutch.h

ext/stem_UTF_8_english.c

ext/stem_UTF_8_english.h

ext/stem_UTF_8_finnish.c

ext/stem_UTF_8_finnish.h

ext/stem_UTF_8_french.c

ext/stem_UTF_8_french.h

ext/stem_UTF_8_german.c

ext/stem_UTF_8_german.h

ext/stem_UTF_8_italian.c

ext/stem_UTF_8_italian.h

ext/stem_UTF_8_norwegian.c

ext/stem_UTF_8_norwegian.h

ext/stem_UTF_8_porter.c

ext/stem_UTF_8_porter.h

ext/stem_UTF_8_portuguese.c

ext/stem_UTF_8_portuguese.h

ext/stem_UTF_8_russian.c

ext/stem_UTF_8_russian.h

ext/stem_UTF_8_spanish.c

ext/stem_UTF_8_spanish.h

ext/stem_UTF_8_swedish.c

ext/stem_UTF_8_swedish.h

ext/stopwords.c

ext/store.c

ext/store.h

ext/term_vectors.c

ext/threading.h

ext/utilities.c

ext/win32.h

lib/ferret

lib/ferret.rb

lib/ferret/browser

lib/ferret/browser.rb

lib/ferret/browser/s

lib/ferret/browser/s/global.js

lib/ferret/browser/s/style.css

lib/ferret/browser/views

lib/ferret/browser/views/document

lib/ferret/browser/views/document/list.rhtml

lib/ferret/browser/views/document/show.rhtml

lib/ferret/browser/views/error

lib/ferret/browser/views/error/index.rhtml

lib/ferret/browser/views/help

lib/ferret/browser/views/help/index.rhtml

lib/ferret/browser/views/home

lib/ferret/browser/views/home/index.rhtml

lib/ferret/browser/views/layout.rhtml

lib/ferret/browser/views/term

lib/ferret/browser/views/term-vector

lib/ferret/browser/views/term-vector/index.rhtml

lib/ferret/browser/views/term/index.rhtml

lib/ferret/browser/views/term/termdocs.rhtml

lib/ferret/browser/webrick.rb

lib/ferret/document.rb

lib/ferret/field_infos.rb

lib/ferret/index.rb

lib/ferret/number_tools.rb

lib/ferret_version.rb

setup.rb

test

test/test_all.rb

test/test_helper.rb

test/threading

test/threading/number_to_spoken.rb

test/threading/thread_safety_index_test.rb

test/threading/thread_safety_read_write_test.rb

test/threading/thread_safety_test.rb

test/unit

test/unit/analysis

test/unit/analysis/tc_analyzer.rb

test/unit/analysis/tc_token_stream.rb

test/unit/index

test/unit/index/tc_index.rb

test/unit/index/tc_index_reader.rb

test/unit/index/tc_index_writer.rb

test/unit/index/th_doc.rb

test/unit/largefile

test/unit/largefile/tc_largefile.rb

test/unit/query_parser

test/unit/query_parser/tc_query_parser.rb

test/unit/search

test/unit/search/tc_filter.rb

test/unit/search/tc_fuzzy_query.rb

test/unit/search/tc_index_searcher.rb

test/unit/search/tc_multi_searcher.rb

test/unit/search/tc_multiple_search_requests.rb

test/unit/search/tc_search_and_sort.rb

test/unit/search/tc_sort.rb

test/unit/search/tc_sort_field.rb

test/unit/search/tc_spans.rb

test/unit/search/tm_searcher.rb

test/unit/store

test/unit/store/tc_fs_store.rb

test/unit/store/tc_ram_store.rb

test/unit/store/tm_store.rb

test/unit/store/tm_store_lock.rb

test/unit/tc_document.rb

test/unit/ts_analysis.rb

test/unit/ts_index.rb

test/unit/ts_largefile.rb

test/unit/ts_query_parser.rb

test/unit/ts_search.rb

test/unit/ts_store.rb

test/unit/ts_utils.rb

test/unit/utils

test/unit/utils/tc_bit_vector.rb

test/unit/utils/tc_number_tools.rb

test/unit/utils/tc_priority_queue.rb

Show diffs side-by-side

added added

removed removed

lib/ferret/index.rb

require 'monitor'

module Ferret::Index

# Mixin for objects that already provide +synchronize+ (it is extended onto
# directories right after MonitorMixin). Adds a critical section that
# tolerates transient lock failures.
module SynchroLockMixin
  # Runs the given block inside +synchronize+. If the block raises
  # Ferret::Store::Lock::LockError it is run again, up to 5 attempts in
  # total; the error from the last attempt is re-raised to the caller.
  # Returns whatever the block returns.
  def synchrolock
    trys = 5
    begin
      synchronize { yield }
    rescue Ferret::Store::Lock::LockError
      # out of attempts: surface the lock error instead of looping forever
      raise if (trys -= 1) <= 0
      retry
    end
  end
end

# This is a simplified interface to the index. See the TUTORIAL for more

# information on how to use this class.

class Index

include MonitorMixin

include Ferret::Store

include Ferret::Search

attr_reader :options

# If you create an Index without any options, it'll simply create an index

# in memory. But this class is highly configurable and every option that

# you can supply to IndexWriter and QueryParser, you can also set here.

# Please look at the options for the constructors to these classes.

# === Options

# See;

# * QueryParser

# * IndexWriter

# default_input_field:: Default: "id". This specifies the default field

# that will be used when you add a simple string

# to the index using #add_document or <<.

# id_field:: Default: "id". This field is used as the field to

# search when doing searches on a term. For

# example, if you do a lookup by term "cat", ie

# index["cat"], this will be the field that is

# searched.

# key:: Default: nil. Expert: This should only be used

# if you really know what you are doing. Basically

# you can set a field or an array of fields to be

# the key for the index. So if you add a document

# with a same key as an existing document, the

# existing document will be replaced by the new

# object. Using a multiple field key will slow

# down indexing so it should not be done if

# performance is a concern. A single field key (or

# id) should be fine however. Also, you must make

# sure that your key/keys are either untokenized

# or that they are not broken up by the analyzer.

# auto_flush:: Default: false. Set this option to true if you

# want the index automatically flushed every time

# you do a write (includes delete) to the index.

# This is useful if you have multiple processes

# accessing the index and you don't want lock

# errors. Setting :auto_flush to true has a huge

# performance impact so don't use it if you are

# concerned about performance. In that case you

# should think about setting up a DRb indexing

# service.

# lock_retry_time:: Default: 2 seconds. This parameter specifies how

# long to wait before retrying to obtain the

# commit lock when detecting if the IndexReader is

# at the latest version.

# close_dir:: Default: false. If you explicitly pass a

# Directory object to this class and you want

# Index to close it when it is closed itself then

# set this to true.

# Some examples;

# index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())

# index = Index::Index.new(:path => '/path/to/index',

# :create_if_missing => false,

# :auto_flush => true)

# index = Index::Index.new(:dir => directory,

# :default_slop => 2,

# :handle_parse_errors => false)

# You can also pass a block if you like. The index will be yielded and

# closed at the end of the block. For example;

# Ferret::I.new() do |index|

# # do stuff with index. Most of your actions will be cached.

# end

def initialize(options = {}, &block)
  super()

  if options[:key]
    @key = options[:key]
    if @key.is_a?(Array)
      # Normalise a multi-field key to a flat array of symbols. The result
      # must be assigned back; a bare map call leaves @key untouched.
      @key = @key.flatten.map { |k| k.to_s.intern }
    end
  else
    @key = nil
  end

  # :field_infos given as a String is handed to FieldInfos.load
  if (fi = options[:field_infos]).is_a?(String)
    options[:field_infos] = FieldInfos.load(fi)
  end

  @close_dir = options[:close_dir]
  # a String passed as :dir is treated the same as :path
  if options[:dir].is_a?(String)
    options[:path] = options[:dir]
  end
  if options[:path]
    # a directory opened here is owned (and later closed) by this index
    @close_dir = true
    begin
      @dir = FSDirectory.new(options[:path], options[:create])
    rescue IOError
      # second attempt, creating the directory unless :create_if_missing
      # was explicitly set to false
      @dir = FSDirectory.new(options[:path],
                             options[:create_if_missing] != false)
    end
  elsif options[:dir]
    @dir = options[:dir]
  else
    options[:create] = true # this should always be true for a new RAMDir
    @close_dir = true
    @dir = RAMDirectory.new
  end

  # every public method synchronises on the directory object
  @dir.extend(MonitorMixin).extend(SynchroLockMixin)
  options[:dir] = @dir
  options[:lock_retry_time] ||= 2
  @options = options
  if (!@dir.exists?("segments")) || options[:create]
    IndexWriter.new(options).close
  end
  options[:analyzer] ||= Ferret::Analysis::StandardAnalyzer.new

  @searcher = nil
  @writer = nil
  @reader = nil

  @options.delete(:create) # only create the first time if at all
  @auto_flush = @options[:auto_flush] || false
  # a single-symbol key doubles as the id field unless one was given
  if (@options[:id_field].nil? and @key.is_a?(Symbol))
    @id_field = @key
  else
    @id_field = @options[:id_field] || :id
  end
  @default_field = (@options[:default_field] ||= :*)
  @default_input_field = options[:default_input_field] || @id_field

  if @default_input_field.respond_to?(:intern)
    @default_input_field = @default_input_field.intern
  end
  @open = true
  @qp = nil
  # block form: hand the index to the block, then close it
  if block
    yield self
    self.close
  end
end

166

167

# Returns an array of strings with the matches highlighted. The +query+ can be

168

# either a query String or a Ferret::Search::Query object. The doc_id is

169

# the id of the document you want to highlight (usually returned by the

170

# search methods). There are also a number of options you can pass;

171

172

# === Options

173

174

# field:: Default: @options[:default_field]. The default_field

175

# is the field that is usually highlighted but you can

176

# specify which field you want to highlight here. If

177

# you want to highlight multiple fields then you will

178

# need to call this method multiple times.

179

# excerpt_length:: Default: 150. Length of excerpt to show. Highlighted

180

# terms will be in the centre of the excerpt. Set to

181

# :all to highlight the entire field.

182

# num_excerpts:: Default: 2. Number of excerpts to return.

183

# pre_tag:: Default: "<b>". Tag to place to the left of the

184

# match. You'll probably want to change this to a

185

# "<span>" tag with a class. Try "\033[36m" for use in

186

# a terminal.

187

# post_tag:: Default: "</b>". This tag should close the

188

# +:pre_tag+. Try tag "\033[m" in the terminal.

189

# ellipsis:: Default: "...". This is the string that is appended

190

# at the beginning and end of excerpts (unless the

191

# excerpt hits the start or end of the field).

192

# Alternatively you may want to use the HTML entity

193

# … or the UTF-8 string "\342\200\246".

194

def highlight(query, doc_id, options = {})
  @dir.synchronize do
    ensure_searcher_open()
    parsed = do_process_query(query)
    # fall back to the index-wide default field when no :field is given
    field = options[:field] || @options[:default_field]
    @searcher.highlight(parsed, doc_id, field, options)
  end
end

203

204

# Closes this index by closing its associated reader and writer objects.

205

def close
  @dir.synchronize do
    unless @open
      raise(StandardError, "tried to close an already closed directory")
    end
    # searcher first, then reader, then writer; each only if it exists
    [@searcher, @reader, @writer].each { |handle| handle.close() if handle }
    # the directory is closed only when @close_dir is set
    @dir.close() if @close_dir

    @open = false
  end
end

218

219

# Get the reader for this index.

220

# NOTE:: This will close the writer from this index.

221

def reader
  # make sure a reader exists before handing it out
  ensure_reader_open
  return @reader
end

225

226

# Get the searcher for this index.

227

# NOTE:: This will close the writer from this index.

228

def searcher
  # make sure a searcher exists before handing it out
  ensure_searcher_open
  return @searcher
end

232

233

# Get the writer for this index.

234

# NOTE:: This will close the reader from this index.

235

def writer
  # make sure a writer exists before handing it out
  ensure_writer_open
  return @writer
end

239

240

# Adds a document to this index, using the provided analyzer instead of

241

# the local analyzer if provided. If the document contains more than

242

# IndexWriter::MAX_FIELD_LENGTH terms for a given field, the remainder are

243

# discarded.

244

245

# There are three ways to add a document to the index.

246

# To add a document you can simply add a string or an array of strings.

247

# This will store all the strings in the default input field (the id field,

248

# unless you set :default_input_field when you create the index).

249

250

# index << "This is a new document to be indexed"

251

# index << ["And here", "is another", "new document", "to be indexed"]

252

253

# But these are pretty simple documents. If this is all you want to index

254

# you could probably just use SimpleSearch. So let's give our documents

255

# some fields;

256

257

# index << {:title => "Programming Ruby", :content => "blah blah blah"}

258

# index << {:title => "Programming Ruby", :content => "yada yada yada"}

259

260

# Or if you are indexing data stored in a database, you'll probably want

261

# to store the id;

262

263

# index << {:id => row.id, :title => row.title, :date => row.date}

264

265

# See FieldInfos for more information on how to set field properties.

266

def add_document(doc, analyzer = nil)
  @dir.synchrolock do
    ensure_writer_open()
    # a bare String or Array becomes a one-field document
    if doc.is_a?(String) or doc.is_a?(Array)
      doc = {@default_input_field => doc}
    end

    # delete existing documents with the same key
    if @key
      if @key.is_a?(Array)
        # one :must term clause per key field; built with each so the result
        # does not depend on what add_query returns
        query = BooleanQuery.new()
        @key.each do |field|
          query.add_query(TermQuery.new(field, doc[field].to_s), :must)
        end
        query_delete(query)
      else
        # to_s never returns nil, so no presence check is needed here
        @writer.delete(@key, doc[@key].to_s)
        @writer.commit
      end
    end
    # query_delete opens a searcher, which closes the writer; reopen it
    ensure_writer_open()

    if analyzer
      old_analyzer = @writer.analyzer
      @writer.analyzer = analyzer
      begin
        @writer.add_document(doc)
      ensure
        # put the writer's own analyzer back even if indexing raised
        @writer.analyzer = old_analyzer
      end
    else
      @writer.add_document(doc)
    end

    flush() if @auto_flush
  end
end

304

alias :<< :add_document

305

306

# Run a query through the Searcher on the index. A TopDocs object is

307

# returned with the relevant results. The +query+ is a built in Query

308

# object or a query string that can be parsed by the Ferret::QueryParser.

309

# Here are the options;

310

311

# === Options

312

313

# offset:: Default: 0. The offset of the start of the section of the

314

# result-set to return. This is used for paging through

315

# results. Let's say you have a page size of 10. If you

316

# don't find the result you want among the first 10 results

317

# then set +:offset+ to 10 and look at the next 10 results,

318

# then 20 and so on.

319

# limit:: Default: 10. This is the number of results you want

320

# returned, also called the page size. Set +:limit+ to

321

# +:all+ to return all results

322

# sort:: A Sort object or sort string describing how the field

323

# should be sorted. A sort string is made up of field names

324

# which cannot contain spaces and the word "DESC" if you

325

# want the field reversed, all separated by commas. For

326

# example; "rating DESC, author, title". Note that Ferret

327

# will try to determine a field's type by looking at the

328

# first term in the index and seeing if it can be parsed as

329

# an integer or a float. Keep this in mind as you may need

330

# to specify a fields type to sort it correctly. For more

331

# on this, see the documentation for SortField

332

# filter:: a Filter object to filter the search results with

333

# filter_proc:: a filter Proc is a Proc which takes the doc_id, the score

334

# and the Searcher object as its parameters and returns a

335

# Boolean value specifying whether the result should be

336

# included in the result set.

337

def search(query, options = {})
  # the actual work is in do_search; this only adds the directory lock
  @dir.synchronize { return do_search(query, options) }
end

342

343

# Run a query through the Searcher on the index. A TopDocs object is

344

# returned with the relevant results. The +query+ is a Query object or a

345

# query string that can be validly parsed by the Ferret::QueryParser. The

346

# Searcher#search_each method yields the internal document id (used to

347

# reference documents in the Searcher object like this;

348

# +searcher[doc_id]+) and the search score for that document. It is

349

# possible for the score to be greater than 1.0 for some queries and

350

# taking boosts into account. This method will also normalize scores to

351

# the range 0.0..1.0 when the max-score is greater than 1.0. Here are the

352

# options;

353

354

# === Options

355

356

# offset:: Default: 0. The offset of the start of the section of the

357

# result-set to return. This is used for paging through

358

# results. Let's say you have a page size of 10. If you

359

# don't find the result you want among the first 10 results

360

# then set +:offset+ to 10 and look at the next 10 results,

361

# then 20 and so on.

362

# limit:: Default: 10. This is the number of results you want

363

# returned, also called the page size. Set +:limit+ to

364

# +:all+ to return all results

365

# sort:: A Sort object or sort string describing how the field

366

# should be sorted. A sort string is made up of field names

367

# which cannot contain spaces and the word "DESC" if you

368

# want the field reversed, all separated by commas. For

369

# example; "rating DESC, author, title". Note that Ferret

370

# will try to determine a field's type by looking at the

371

# first term in the index and seeing if it can be parsed as

372

# an integer or a float. Keep this in mind as you may need

373

# to specify a fields type to sort it correctly. For more

374

# on this, see the documentation for SortField

375

# filter:: a Filter object to filter the search results with

376

# filter_proc:: a filter Proc is a Proc which takes the doc_id, the score

377

# and the Searcher object as its parameters and returns a

378

# Boolean value specifying whether the result should be

379

# included in the result set.

380

381

# returns:: The total number of hits.

382

383

# === Example

384

# eg.

385

# index.search_each(query, options = {}) do |doc, score|

386

# puts "hit document number #{doc} with a score of #{score}"

387

# end

388

389

def search_each(query, options = {}) # :yield: doc, score
  @dir.synchronize do
    ensure_searcher_open()
    parsed = do_process_query(query)

    # pass every hit straight on to the caller's block
    @searcher.search_each(parsed, options) { |doc_id, hit_score| yield doc_id, hit_score }
  end
end

399

400

# Retrieves a document/documents from the index. The method for retrieval

401

# depends on the type of the argument passed.

402

403

# If +arg+ is an Integer then return the document based on the internal

404

# document number.

405

406

# If +arg+ is a Range, then return the documents within the range based on

407

# internal document number.

408

409

# If +arg+ is a String then search for the first document with +arg+ in

410

# the +id+ field. The +id+ field is either :id or whatever you set

411

# :id_field parameter to when you create the Index object.

412

def doc(*arg)
  @dir.synchronize do
    id = arg[0]
    unless id.kind_of?(String) or id.kind_of?(Symbol)
      # any other argument is passed straight to the reader, without the
      # up-to-date check (get_latest = false)
      ensure_reader_open(false)
      return @reader[*arg]
    end

    # String/Symbol: look the value up as a term in the id field and return
    # the first matching document, or nil when there is none
    ensure_reader_open()
    matches = @reader.term_docs_for(@id_field, id.to_s)
    return matches.next? ? @reader[matches.doc] : nil
  end
end

425

alias :[] :doc

426

427

# Deletes a document/documents from the index. The method for determining

428

# the document to delete depends on the type of the argument passed.

429

430

# If +arg+ is an Integer then delete the document based on the internal

431

# document number. Will raise an error if the document does not exist.

432

433

# If +arg+ is a String then search for the documents with +arg+ in the

434

# +id+ field. The +id+ field is either :id or whatever you set :id_field

435

# parameter to when you create the Index object. Will fail quietly if the

436

# document does not exist.

437

def delete(arg)
  @dir.synchrolock do
    ensure_writer_open()
    case arg
    when String, Symbol
      # delete by term in the id field, through the writer
      ensure_writer_open()
      @writer.delete(@id_field, arg.to_s)
    when Integer
      # delete by document number, through the reader
      ensure_reader_open()
      @reader.delete(arg)
    else
      raise ArgumentError, "Cannot delete for arg of type #{arg.class}"
    end
    flush() if @auto_flush
  end
  return self
end

453

454

# Delete all documents returned by the query.

455

456

# query:: The query to find documents you wish to delete. Can either be a

457

# string (in which case it is parsed by the standard query parser)

458

# or an actual query object.

459

def query_delete(query)
  @dir.synchrolock do
    ensure_writer_open()
    ensure_searcher_open()
    parsed = do_process_query(query)
    # :limit => :all so that every hit is deleted, not just the first page
    @searcher.search_each(parsed, :limit => :all) { |doc_id, _score| @reader.delete(doc_id) }
    flush() if @auto_flush
  end
end

470

471

# Returns true if document +n+ has been deleted

472

def deleted?(n)
  @dir.synchronize do
    ensure_reader_open
    # the reader answers for document number n
    return @reader.deleted?(n)
  end
end

478

479

# Update the document referenced by the document number +id+ if +id+ is an

480

# integer or all of the documents which have the term +id+ if +id+ is a

481

# term.

482

483

# id:: The number of the document to update. Can also be a string

484

# representing the value in the +id+ field. Also consider using

485

# the :key attribute.

486

# new_doc:: The document to replace the old document with

487

def update(id, new_doc)
  @dir.synchrolock do
    ensure_writer_open()
    delete(id)
    case id
    when String, Symbol
      # term deletes go through the writer: commit before re-adding
      @writer.commit
    else
      # other ids take delete's reader path, so re-open the writer
      ensure_writer_open()
    end
    @writer << new_doc
    flush() if @auto_flush
  end
end

500

501

# Update all the documents returned by the query.

502

503

# query:: The query to find documents you wish to update. Can either be

504

# a string (in which case it is parsed by the standard query

505

# parser) or an actual query object.

506

# new_val:: The values we are updating. This can be a string in which case

507

# the default field is updated, or it can be a hash, in which

508

# case, all fields in the hash are merged into the old hash.

509

# That is, the old fields are replaced by values in the new hash

510

# if they exist.

511

512

# === Example

513

514

# index << {:id => "26", :title => "Babylon", :artist => "David Grey"}

515

# index << {:id => "29", :title => "My Oh My", :artist => "David Grey"}

516

517

# # correct

518

# index.query_update('artist:"David Grey"', {:artist => "David Gray"})

519

520

# index["26"]

521

# #=> {:id => "26", :title => "Babylon", :artist => "David Gray"}

522

# index["29"]

523

# #=> {:id => "29", :title => "My Oh My", :artist => "David Gray"}

524

525

def query_update(query, new_val)
  @dir.synchrolock do
    ensure_writer_open()
    ensure_searcher_open()
    docs_to_add = []
    query = do_process_query(query)
    # :limit => :all (as in query_delete) so that every matching document is
    # updated rather than only the searcher's default first page of hits
    @searcher.search_each(query, :limit => :all) do |id, score|
      document = @searcher[id].load
      if new_val.is_a?(Hash)
        # hash: merge the new values over the stored fields
        document.merge!(new_val)
      else
        # anything else is stringified into the default input field
        document[@default_input_field] = new_val.to_s
      end
      docs_to_add << document
      @reader.delete(id)
    end
    # the old versions were deleted through the reader; switch back to the
    # writer to add the updated copies
    ensure_writer_open()
    docs_to_add.each {|doc| @writer << doc }
    flush() if @auto_flush
  end
end

546

547

# Returns true if any documents have been deleted since the index was last

548

# flushed.

549

def has_deletions?()
  @dir.synchronize do
    ensure_reader_open
    # delegated to the reader
    return @reader.has_deletions?
  end
end

555

556

# Flushes all writes to the index. This will not optimize the index but it

557

# will make sure that all writes are written to it.

558

559

# NOTE: this is not necessary if you are only using this class. All writes

560

# will automatically flush when you perform an operation that reads the

561

# index.

562

def flush()
  @dir.synchronize do
    if !@reader
      # no reader open: commit the writer, if there is one
      @writer.commit if @writer
    else
      # reader open: drop any searcher first, then commit the reader
      if @searcher
        @searcher.close
        @searcher = nil
      end
      @reader.commit
    end
  end
end

575

alias :commit :flush

576

577

# optimizes the index. This should only be called when the index will no

578

# longer be updated very often, but will be read a lot.

579

def optimize()
  @dir.synchrolock do
    ensure_writer_open
    # optimise, then close and forget the writer
    @writer.optimize
    @writer.close
    @writer = nil
  end
end

587

588

# returns the number of documents in the index

589

def size()
  @dir.synchronize do
    ensure_reader_open
    # the count comes from the reader's num_docs
    return @reader.num_docs
  end
end

595

596

# Merges all segments from an index or an array of indexes into this

597

# index. You can pass a single Index::Index, Index::Reader,

598

# Store::Directory or an array of any single one of these.

599

600

# This may be used to parallelize batch indexing. A large document

601

# collection can be broken into sub-collections. Each sub-collection can

602

# be indexed in parallel, on a different thread, process or machine and

603

# perhaps all in memory. The complete index can then be created by

604

# merging sub-collection indexes with this method.

605

606

# After this completes, the index is optimized.

607

def add_indexes(indexes)
  @dir.synchrolock do
    ensure_writer_open()
    indexes = [indexes].flatten # make sure we have an array
    return if indexes.empty? # nothing to do

    # the first element decides how the whole list is turned into readers
    case indexes[0]
    when Index
      indexes.delete(self) # don't merge with self
      indexes = indexes.map { |index| index.reader }
    when Ferret::Store::Directory
      indexes.delete(@dir) # don't merge with self
      indexes = indexes.map { |dir| IndexReader.new(dir) }
    when IndexReader
      indexes.delete(@reader) # don't merge with self
    else
      raise ArgumentError, "Unknown index type when trying to merge indexes"
    end
    ensure_writer_open
    @writer.add_readers(indexes)
  end
end

627

628

# This is a simple utility method for saving an in memory or RAM index to

629

# the file system. The same thing can be achieved by using the

630

# Index::Index#add_indexes method and you will have more options when

631

# creating the new index, however this is a simple way to turn a RAM index

632

# into a file system index.

633

634

# directory:: This can either be a Store::Directory object or a String

635

# representing the path to the directory where you would

636

# like to store the index.

637

638

# create:: True if you'd like to create the directory if it doesn't

639

# exist or copy over an existing directory. False if you'd

640

# like to merge with the existing directory. This defaults to

641

# true.

642

def persist(directory, create = true)
  synchronize do
    close_all()
    old_dir = @dir
    # a String is opened as a file-system directory; a Directory is used
    # as-is; anything else leaves @dir unchanged
    case directory
    when String
      @dir = FSDirectory.new(directory, create)
    when Ferret::Store::Directory
      @dir = directory
    end
    @dir.extend(MonitorMixin).extend(SynchroLockMixin)
    @options[:dir] = @dir
    @options[:create_if_missing] = true
    # merge the previous directory's contents into the new one
    add_indexes([old_dir])
  end
end

657

658

# Concatenates the string form of every non-deleted document, one per line.
def to_s
  (0...(size)).inject("") do |text, i|
    deleted?(i) ? text : (text << self[i].to_s + "\n")
  end
end

665

666

# Returns an Explanation that describes how +doc+ scored against

667

# +query+.

668

669

# This is intended to be used in developing Similarity implementations,

670

# and, for good performance, should not be displayed with every hit.

671

# Computing an explanation is as expensive as executing the query over the

672

# entire index.

673

def explain(query, doc)
  @dir.synchronize do
    ensure_searcher_open
    # parse string queries before handing them to the searcher
    parsed = do_process_query(query)
    return @searcher.explain(parsed, doc)
  end
end

681

682

# Turn a query string into a Query object with the Index's QueryParser

683

def process_query(query)
  @dir.synchronize do
    # do_process_query reads field lists from @reader, so open the
    # reader/searcher first
    ensure_searcher_open
    return do_process_query(query)
  end
end

689

690

# Returns the field_infos object so that you can add new fields to the

691

# index.

692

def field_infos
  @dir.synchrolock do
    # the field infos are read from the writer
    ensure_writer_open
    return @writer.field_infos
  end
end

698

699

700

protected

701

# Makes sure @writer is set, closing any open reader and searcher first.
# Raises if the index has been closed.
def ensure_writer_open()
  raise "tried to use a closed index" unless @open
  unless @writer
    if @reader
      @searcher.close if @searcher
      @reader.close
      @reader = nil
      @searcher = nil
    end
    @writer = IndexWriter.new(@options)
  end
end

712

713

# returns the new reader if one is opened

714

# Makes sure @reader is set. Returns the new reader if one had to be opened,
# false if the existing reader was kept. With get_latest false the
# up-to-date check on an existing reader is skipped.
def ensure_reader_open(get_latest = true)
  raise "tried to use a closed index" unless @open

  unless @reader
    # no reader yet: close the writer, if any, then open one
    if @writer
      @writer.close
      @writer = nil
    end
    return @reader = IndexReader.new(@dir)
  end

  return false unless get_latest

  up_to_date = begin
    @reader.latest?
  rescue Lock::LockError
    # wait :lock_retry_time seconds, then try exactly once more
    sleep(@options[:lock_retry_time])
    @reader.latest?
  end
  return false if up_to_date

  # stale reader: replace it (the caller rebuilds the searcher)
  @searcher.close if @searcher
  @reader.close
  return @reader = IndexReader.new(@dir)
end

740

741

# Makes sure @searcher is set and is built on the current @reader.
def ensure_searcher_open()
  raise "tried to use a closed index" unless @open
  # always run the reader check; a truthy result means the reader was replaced
  reader_replaced = ensure_reader_open()
  @searcher = Searcher.new(@reader) if reader_replaced or not @searcher
end

747

748

private

749

# Turns a query String into a Query object using a lazily created
# QueryParser; anything that is not a String is returned unchanged.
# Reads field lists from @reader, so a reader must already be open.
def do_process_query(query)
  if query.is_a?(String)
    @qp ||= Ferret::QueryParser.new(@options)
    # we need to set this every time, in case a new field has been added
    unless @options[:all_fields] || @options[:fields]
      @qp.fields = @reader.fields
    end
    unless @options[:tokenized_fields]
      @qp.tokenized_fields = @reader.tokenized_fields
    end
    query = @qp.parse(query)
  end
  return query
end

763

764

# Unsynchronised search: opens the searcher, parses the query if needed and
# returns the searcher's result.
def do_search(query, options)
  ensure_searcher_open
  parsed = do_process_query(query)
  return @searcher.search(parsed, options)
end

770

771

# Closes whichever of the searcher, reader and writer exist, then clears
# all three references. The directory itself is left open.
def close_all()
  @dir.synchronize do
    [@searcher, @reader, @writer].each { |handle| handle.close if handle }
    @reader = @searcher = @writer = nil
  end
end

781

end

782

end

783

784

module Ferret

785

# Shorthand: Ferret::I is the same class object as Ferret::Index::Index.

I = Index::Index

786

end

Older »