4
module SynchroLockMixin
9
rescue Ferret::Store::Lock::LockError => e
18
# This is a simplified interface to the index. See the TUTORIAL for more
19
# information on how to use this class.
24
include Ferret::Search
28
# If you create an Index without any options, it'll simply create an index
29
# in memory. But this class is highly configurable and every option that
30
# you can supply to IndexWriter and QueryParser, you can also set here.
31
# Please look at the options for the constructors to these classes.
40
# default_input_field:: Default: "id". This specifies the default field
41
# that will be used when you add a simple string
42
# to the index using #add_document or <<.
43
# id_field:: Default: "id". This field is used as the field to
44
# search when doing searches on a term. For
45
# example, if you do a lookup by term "cat", ie
46
# index["cat"], this will be the field that is
48
# key:: Default: nil. Expert: This should only be used
49
# if you really know what you are doing. Basically
50
# you can set a field or an array of fields to be
51
# the key for the index. So if you add a document
52
# with the same key as an existing document, the
53
# existing document will be replaced by the new
54
# object. Using a multiple field key will slow
55
# down indexing so it should not be done if
56
# performance is a concern. A single field key (or
57
# id) should be fine however. Also, you must make
58
# sure that your key/keys are either untokenized
59
# or that they are not broken up by the analyzer.
60
# auto_flush:: Default: false. Set this option to true if you
61
# want the index automatically flushed every time
62
# you do a write (includes delete) to the index.
63
# This is useful if you have multiple processes
64
# accessing the index and you don't want lock
65
# errors. Setting :auto_flush to true has a huge
66
# performance impact so don't use it if you are
67
# concerned about performance. In that case you
68
# should think about setting up a DRb indexing
70
# lock_retry_time:: Default: 2 seconds. This parameter specifies how
71
# long to wait before retrying to obtain the
72
# commit lock when detecting if the IndexReader is
73
# at the latest version.
74
# close_dir:: Default: false. If you explicitly pass a
75
# Directory object to this class and you want
76
# Index to close it when it is closed itself then
81
# index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
83
# index = Index::Index.new(:path => '/path/to/index',
84
# :create_if_missing => false,
85
# :auto_flush => true)
87
# index = Index::Index.new(:dir => directory,
89
# :handle_parse_errors => false)
91
# You can also pass a block if you like. The index will be yielded and
92
# closed at the end of the block. For example;
94
# Ferret::I.new() do |index|
95
# # do stuff with index. Most of your actions will be cached.
97
def initialize(options = {}, &block)
103
@key.flatten.map {|k| k.to_s.intern}
109
if (fi = options[:field_infos]).is_a?(String)
110
options[:field_infos] = FieldInfos.load(fi)
113
@close_dir = options[:close_dir]
114
if options[:dir].is_a?(String)
115
options[:path] = options[:dir]
120
@dir = FSDirectory.new(options[:path], options[:create])
122
@dir = FSDirectory.new(options[:path],
123
options[:create_if_missing] != false)
128
options[:create] = true # this should always be true for a new RAMDir
130
@dir = RAMDirectory.new
133
@dir.extend(MonitorMixin).extend(SynchroLockMixin)
135
options[:lock_retry_time]||= 2
137
if (!@dir.exists?("segments")) || options[:create]
138
IndexWriter.new(options).close
140
options[:analyzer]||= Ferret::Analysis::StandardAnalyzer.new
146
@options.delete(:create) # only create the first time if at all
147
@auto_flush = @options[:auto_flush] || false
148
if (@options[:id_field].nil? and @key.is_a?(Symbol))
151
@id_field = @options[:id_field] || :id
153
@default_field = (@options[:default_field]||= :*)
154
@default_input_field = options[:default_input_field] || @id_field
156
if @default_input_field.respond_to?(:intern)
157
@default_input_field = @default_input_field.intern
167
# Returns an array of strings with the matches highlighted. The +query+ can
168
# either be a query String or a Ferret::Search::Query object. The doc_id is
169
# the id of the document you want to highlight (usually returned by the
170
# search methods). There are also a number of options you can pass;
174
# field:: Default: @options[:default_field]. The default_field
175
# is the field that is usually highlighted but you can
176
# specify which field you want to highlight here. If
177
# you want to highlight multiple fields then you will
178
# need to call this method multiple times.
179
# excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
180
# terms will be in the centre of the excerpt. Set to
181
# :all to highlight the entire field.
182
# num_excerpts:: Default: 2. Number of excerpts to return.
183
# pre_tag:: Default: "<b>". Tag to place to the left of the
184
# match. You'll probably want to change this to a
185
# "<span>" tag with a class. Try "\033[36m" for use in
187
# post_tag:: Default: "</b>". This tag should close the
188
# +:pre_tag+. Try "\033[m" in the terminal.
189
# ellipsis:: Default: "...". This is the string that is appended
190
# at the beginning and end of excerpts (unless the
191
# excerpt hits the start or end of the field).
192
# Alternatively you may want to use the HTML entity
193
# &#8230; or the UTF-8 string "\342\200\246".
194
def highlight(query, doc_id, options = {})
196
ensure_searcher_open()
197
@searcher.highlight(do_process_query(query),
199
options[:field]||@options[:default_field],
204
# Closes this index by closing its associated reader and writer objects.
208
raise(StandardError, "tried to close an already closed directory")
210
@searcher.close() if @searcher
211
@reader.close() if @reader
212
@writer.close() if @writer
213
@dir.close() if @close_dir
219
# Get the reader for this index.
220
# NOTE:: This will close the writer from this index.
226
# Get the searcher for this index.
227
# NOTE:: This will close the writer from this index.
229
ensure_searcher_open()
233
# Get the writer for this index.
234
# NOTE:: This will close the reader from this index.
240
# Adds a document to this index, using the provided analyzer instead of
241
# the local analyzer if provided. If the document contains more than
242
# IndexWriter::MAX_FIELD_LENGTH terms for a given field, the remainder are
245
# There are three ways to add a document to the index.
246
# To add a document you can simply add a string or an array of strings.
247
# This will store all the strings in the default input field (the id
248
# field, unless you set :default_input_field when you create the index).
250
# index << "This is a new document to be indexed"
251
# index << ["And here", "is another", "new document", "to be indexed"]
253
# But these are pretty simple documents. If this is all you want to index
254
# you could probably just use SimpleSearch. So let's give our documents
257
# index << {:title => "Programming Ruby", :content => "blah blah blah"}
258
# index << {:title => "Programming Ruby", :content => "yada yada yada"}
260
# Or if you are indexing data stored in a database, you'll probably want
263
# index << {:id => row.id, :title => row.title, :date => row.date}
265
# See FieldInfos for more information on how to set field properties.
266
def add_document(doc, analyzer = nil)
269
if doc.is_a?(String) or doc.is_a?(Array)
270
doc = {@default_input_field => doc}
273
# delete existing documents with the same key
276
query = @key.inject(BooleanQuery.new()) do |bq, field|
277
bq.add_query(TermQuery.new(field, doc[field].to_s), :must)
285
@writer.delete(@key, id)
293
old_analyzer = @writer.analyzer
294
@writer.analyzer = analyzer
295
@writer.add_document(doc)
296
@writer.analyzer = old_analyzer
298
@writer.add_document(doc)
301
flush() if @auto_flush
304
alias :<< :add_document
306
# Run a query through the Searcher on the index. A TopDocs object is
307
# returned with the relevant results. The +query+ is a built in Query
308
# object or a query string that can be parsed by the Ferret::QueryParser.
309
# Here are the options;
313
# offset:: Default: 0. The offset of the start of the section of the
314
# result-set to return. This is used for paging through
315
# results. Let's say you have a page size of 10. If you
316
# don't find the result you want among the first 10 results
317
# then set +:offset+ to 10 and look at the next 10 results,
319
# limit:: Default: 10. This is the number of results you want
320
# returned, also called the page size. Set +:limit+ to
321
# +:all+ to return all results
322
# sort:: A Sort object or sort string describing how the field
323
# should be sorted. A sort string is made up of field names
324
# which cannot contain spaces and the word "DESC" if you
325
# want the field reversed, all separated by commas. For
326
# example; "rating DESC, author, title". Note that Ferret
327
# will try to determine a field's type by looking at the
328
# first term in the index and seeing if it can be parsed as
329
# an integer or a float. Keep this in mind as you may need
330
# to specify a field's type to sort it correctly. For more
331
# on this, see the documentation for SortField
332
# filter:: a Filter object to filter the search results with
333
# filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
334
# and the Searcher object as its parameters and returns a
335
# Boolean value specifying whether the result should be
336
# included in the result set.
337
def search(query, options = {})
339
return do_search(query, options)
343
# Run a query through the Searcher on the index. A TopDocs object is
344
# returned with the relevant results. The +query+ is a Query object or a
345
# query string that can be validly parsed by the Ferret::QueryParser. The
346
# Searcher#search_each method yields the internal document id (used to
347
# reference documents in the Searcher object like this;
348
# +searcher[doc_id]+) and the search score for that document. It is
349
# possible for the score to be greater than 1.0 for some queries and
350
# taking boosts into account. This method will also normalize scores to
351
# the range 0.0..1.0 when the max-score is greater than 1.0. Here are the
356
# offset:: Default: 0. The offset of the start of the section of the
357
# result-set to return. This is used for paging through
358
# results. Let's say you have a page size of 10. If you
359
# don't find the result you want among the first 10 results
360
# then set +:offset+ to 10 and look at the next 10 results,
362
# limit:: Default: 10. This is the number of results you want
363
# returned, also called the page size. Set +:limit+ to
364
# +:all+ to return all results
365
# sort:: A Sort object or sort string describing how the field
366
# should be sorted. A sort string is made up of field names
367
# which cannot contain spaces and the word "DESC" if you
368
# want the field reversed, all separated by commas. For
369
# example; "rating DESC, author, title". Note that Ferret
370
# will try to determine a field's type by looking at the
371
# first term in the index and seeing if it can be parsed as
372
# an integer or a float. Keep this in mind as you may need
373
# to specify a field's type to sort it correctly. For more
374
# on this, see the documentation for SortField
375
# filter:: a Filter object to filter the search results with
376
# filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
377
# and the Searcher object as its parameters and returns a
378
# Boolean value specifying whether the result should be
379
# included in the result set.
381
# returns:: The total number of hits.
385
# index.search_each(query, options = {}) do |doc, score|
386
# puts "hit document number #{doc} with a score of #{score}"
389
def search_each(query, options = {}) # :yield: doc, score
391
ensure_searcher_open()
392
query = do_process_query(query)
394
@searcher.search_each(query, options) do |doc, score|
400
# Retrieves a document/documents from the index. The method for retrieval
401
# depends on the type of the argument passed.
403
# If +arg+ is an Integer then return the document based on the internal
406
# If +arg+ is a Range, then return the documents within the range based on
407
# internal document number.
409
# If +arg+ is a String then search for the first document with +arg+ in
410
# the +id+ field. The +id+ field is either :id or whatever you set
411
# :id_field parameter to when you create the Index object.
415
if id.kind_of?(String) or id.kind_of?(Symbol)
417
term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)
418
return term_doc_enum.next? ? @reader[term_doc_enum.doc] : nil
420
ensure_reader_open(false)
427
# Deletes a document/documents from the index. The method for determining
428
# the document to delete depends on the type of the argument passed.
430
# If +arg+ is an Integer then delete the document based on the internal
431
# document number. Will raise an error if the document does not exist.
433
# If +arg+ is a String then search for the documents with +arg+ in the
434
# +id+ field. The +id+ field is either :id or whatever you set :id_field
435
# parameter to when you create the Index object. Will fail quietly if
436
# no document exists.
440
if arg.is_a?(String) or arg.is_a?(Symbol)
442
@writer.delete(@id_field, arg.to_s)
443
elsif arg.is_a?(Integer)
445
cnt = @reader.delete(arg)
447
raise ArgumentError, "Cannot delete for arg of type #{arg.class}"
449
flush() if @auto_flush
454
# Delete all documents returned by the query.
456
# query:: The query to find documents you wish to delete. Can either be a
457
# string (in which case it is parsed by the standard query parser)
458
# or an actual query object.
459
def query_delete(query)
462
ensure_searcher_open()
463
query = do_process_query(query)
464
@searcher.search_each(query, :limit => :all) do |doc, score|
467
flush() if @auto_flush
471
# Returns true if document +n+ has been deleted
475
return @reader.deleted?(n)
479
# Update the document referenced by the document number +id+ if +id+ is an
480
# integer or all of the documents which have the term +id+ if +id+ is a
483
# id:: The number of the document to update. Can also be a string
484
# representing the value in the +id+ field. Also consider using
485
# the :key attribute.
486
# new_doc:: The document to replace the old document with
487
def update(id, new_doc)
491
if id.is_a?(String) or id.is_a?(Symbol)
497
flush() if @auto_flush
501
# Update all the documents returned by the query.
503
# query:: The query to find documents you wish to update. Can either be
504
# a string (in which case it is parsed by the standard query
505
# parser) or an actual query object.
506
# new_val:: The values we are updating. This can be a string in which case
507
# the default field is updated, or it can be a hash, in which
508
# case, all fields in the hash are merged into the old hash.
509
# That is, the old fields are replaced by values in the new hash
514
# index << {:id => "26", :title => "Babylon", :artist => "David Grey"}
515
# index << {:id => "29", :title => "My Oh My", :artist => "David Grey"}
518
# index.query_update('artist:"David Grey"', {:artist => "David Gray"})
521
# #=> {:id => "26", :title => "Babylon", :artist => "David Gray"}
523
# #=> {:id => "29", :title => "My Oh My", :artist => "David Gray"}
525
def query_update(query, new_val)
528
ensure_searcher_open()
530
query = do_process_query(query)
531
@searcher.search_each(query) do |id, score|
532
document = @searcher[id].load
533
if new_val.is_a?(Hash)
534
document.merge!(new_val)
535
else new_val.is_a?(String) or new_val.is_a?(Symbol)
536
document[@default_input_field] = new_val.to_s
538
docs_to_add << document
542
docs_to_add.each {|doc| @writer << doc }
543
flush() if @auto_flush
547
# Returns true if any documents have been deleted since the index was last
552
return @reader.has_deletions?
556
# Flushes all writes to the index. This will not optimize the index but it
557
# will make sure that all writes are written to it.
559
# NOTE: this is not necessary if you are only using this class. All writes
560
# will automatically flush when you perform an operation that reads the
577
# optimizes the index. This should only be called when the index will no
578
# longer be updated very often, but will be read a lot.
588
# returns the number of documents in the index
592
return @reader.num_docs()
596
# Merges all segments from an index or an array of indexes into this
597
# index. You can pass a single Index::Index, Index::IndexReader,
598
# Store::Directory or an array of any single one of these.
600
# This may be used to parallelize batch indexing. A large document
601
# collection can be broken into sub-collections. Each sub-collection can
602
# be indexed in parallel, on a different thread, process or machine and
603
# perhaps all in memory. The complete index can then be created by
604
# merging sub-collection indexes with this method.
606
# After this completes, the index is optimized.
607
def add_indexes(indexes)
610
indexes = [indexes].flatten # make sure we have an array
611
return if indexes.size == 0 # nothing to do
612
if indexes[0].is_a?(Index)
613
indexes.delete(self) # don't merge with self
614
indexes = indexes.map {|index| index.reader }
615
elsif indexes[0].is_a?(Ferret::Store::Directory)
616
indexes.delete(@dir) # don't merge with self
617
indexes = indexes.map {|dir| IndexReader.new(dir) }
618
elsif indexes[0].is_a?(IndexReader)
619
indexes.delete(@reader) # don't merge with self
621
raise ArgumentError, "Unknown index type when trying to merge indexes"
624
@writer.add_readers(indexes)
628
# This is a simple utility method for saving an in memory or RAM index to
629
# the file system. The same thing can be achieved by using the
630
# Index::Index#add_indexes method and you will have more options when
631
# creating the new index, however this is a simple way to turn a RAM index
632
# into a file system index.
634
# directory:: This can either be a Store::Directory object or a String
635
# representing the path to the directory where you would
636
# like to store the index.
638
# create:: True if you'd like to create the directory if it doesn't
639
# exist or copy over an existing directory. False if you'd
640
# like to merge with the existing directory. This defaults to
642
def persist(directory, create = true)
646
if directory.is_a?(String)
647
@dir = FSDirectory.new(directory, create)
648
elsif directory.is_a?(Ferret::Store::Directory)
651
@dir.extend(MonitorMixin).extend(SynchroLockMixin)
652
@options[:dir] = @dir
653
@options[:create_if_missing] = true
654
add_indexes([old_dir])
660
(0...(size)).each do |i|
661
buf << self[i].to_s + "\n" if not deleted?(i)
666
# Returns an Explanation that describes how +doc+ scored against
669
# This is intended to be used in developing Similarity implementations,
670
# and, for good performance, should not be displayed with every hit.
671
# Computing an explanation is as expensive as executing the query over the
673
def explain(query, doc)
675
ensure_searcher_open()
676
query = do_process_query(query)
678
return @searcher.explain(query, doc)
682
# Turn a query string into a Query object with the Index's QueryParser
683
def process_query(query)
685
ensure_searcher_open()
686
return do_process_query(query)
690
# Returns the field_infos object so that you can add new fields to the
695
return @writer.field_infos
701
def ensure_writer_open()
702
raise "tried to use a closed index" if not @open
705
@searcher.close if @searcher
710
@writer = IndexWriter.new(@options)
713
# returns the new reader if one is opened
714
def ensure_reader_open(get_latest = true)
715
raise "tried to use a closed index" if not @open
720
latest = @reader.latest?
721
rescue Lock::LockError => le
722
sleep(@options[:lock_retry_time]) # sleep for 2 seconds and try again
723
latest = @reader.latest?
726
@searcher.close if @searcher
728
return @reader = IndexReader.new(@dir)
736
return @reader = IndexReader.new(@dir)
741
def ensure_searcher_open()
742
raise "tried to use a closed index" if not @open
743
if ensure_reader_open() or not @searcher
744
@searcher = Searcher.new(@reader)
749
def do_process_query(query)
750
if query.is_a?(String)
752
@qp = Ferret::QueryParser.new(@options)
754
# we need to set this every time, in case a new field has been added
756
@reader.fields unless options[:all_fields] || options[:fields]
757
@qp.tokenized_fields =
758
@reader.tokenized_fields unless options[:tokenized_fields]
759
query = @qp.parse(query)
764
def do_search(query, options)
765
ensure_searcher_open()
766
query = do_process_query(query)
768
return @searcher.search(query, options)
773
@searcher.close if @searcher
774
@reader.close if @reader
775
@writer.close if @writer