40
26
# default_input_field:: Default: "id". This specifies the default field
41
27
# that will be used when you add a simple string
42
28
# to the index using #add_document or <<.
43
# id_field: Default: "id". This field is used as the field to
29
# id_field:: Default: "id". This field is used as the field to
44
30
# search when doing searches on a term. For
45
31
# example, if you do a lookup by term "cat", ie
46
32
# index["cat"], this will be the field that is
75
61
# Directory object to this class and you want
76
62
# Index to close it when it is closed itself then
77
63
# set this to true.
64
# use_typed_range_query:: Default: true. Use TypedRangeQuery instead of
65
# the standard RangeQuery when parsing
66
# range queries. This is useful if you have number
67
# fields which you want to perform range queries
68
# on. You won't need to pad or normalize the data
69
# in the field in any way to get correct results.
70
# However, performance will be a lot slower for
71
# large indexes, hence the default.
81
75
# index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
265
262
# See FieldInfos for more information on how to set field properties.
266
263
def add_document(doc, analyzer = nil)
268
265
ensure_writer_open()
269
266
if doc.is_a?(String) or doc.is_a?(Array)
270
267
doc = {@default_input_field => doc}
395
# Run a query through the Searcher on the index, ignoring scoring and
396
# starting at +:start_doc+ and stopping when +:limit+ matches have been
397
# found. It returns an array of the matching document numbers.
399
# There is a big performance advantage when using this search method on a
400
# very large index when there are potentially thousands of matching
401
# documents and you only want say 50 of them. The other search methods need
402
# to look at every single match to decide which one has the highest score.
403
# This search method just needs to find +:limit+ number of matches before
408
# start_doc:: Default: 0. The start document to start the search from.
409
# NOTE very carefully that this is not the same as the
410
# +:offset+ parameter used in the other search methods
411
# which refers to the offset in the result-set. This is the
412
# document to start the scan from. So if you are scanning
413
# through the index in increments of 50 documents at a time
414
# you need to use the last matched doc in the previous
415
# search to start your next search. See the example below.
416
# limit:: Default: 50. This is the number of results you want
417
# returned, also called the page size. Set +:limit+ to
418
# +:all+ to return all results.
419
# TODO: add option to return loaded documents instead
425
# results = @searcher.scan(query, :start_doc => start_doc)
426
# yield results # or do something with them
427
# start_doc = results.last
428
# # start_doc will be nil now if results is empty, ie no more matches
429
# end while start_doc
430
def scan(query, options = {})
432
ensure_searcher_open()
433
query = do_process_query(query)
435
@searcher.scan(query, options)
400
439
# Retrieves a document/documents from the index. The method for retrieval
401
440
# depends on the type of the argument passed.
409
448
# If +arg+ is a String then search for the first document with +arg+ in
410
449
# the +id+ field. The +id+ field is either :id or whatever you set the
411
# :id_field parameter to when you create the Index object.
450
# +:id_field+ parameter to when you create the Index object.
413
452
@dir.synchronize do
466
# Retrieves the term_vector for a document. The document can be referenced
467
# by either a string id to match the id field or an integer corresponding
468
# to Ferret's document number.
470
# See Ferret::Index::IndexReader#term_vector
471
def term_vector(id, field)
474
if id.kind_of?(String) or id.kind_of?(Symbol)
475
term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)
476
if term_doc_enum.next?
477
id = term_doc_enum.doc
482
return @reader.term_vector(id, field)
486
# Iterate through all documents in the index. This method preloads the
487
# documents so you don't need to call #load on the document to load all the
492
(0...@reader.max_doc).each do |i|
493
yield @reader[i].load unless @reader.deleted?(i)
427
498
# Deletes a document/documents from the index. The method for determining
428
499
# the document to delete depends on the type of the argument passed.
431
502
# document number. Will raise an error if the document does not exist.
433
504
# If +arg+ is a String then search for the documents with +arg+ in the
434
# +id+ field. The +id+ field is either :id or whatever you set :id_field
505
# +id+ field. The +id+ field is either :id or whatever you set the +:id_field+
435
506
# parameter to when you create the Index object. Will fail quietly if
436
507
# no document exists.
509
# If +arg+ is a Hash or an Array then a batch delete will be performed.
510
# If +arg+ is an Array then it will be considered an array of +id+'s. If
511
# it is a Hash, then its keys will be used instead as the Array of
512
# document +id+'s. If the +id+ is an Integer then it is considered a
513
# Ferret document number and the corresponding document will be deleted.
514
# If the +id+ is a String or a Symbol then the +id+ will be considered a
515
# term and the documents that contain that term in the +:id_field+ will be
440
519
if arg.is_a?(String) or arg.is_a?(Symbol)
441
520
ensure_writer_open()
442
521
@writer.delete(@id_field, arg.to_s)
443
522
elsif arg.is_a?(Integer)
444
523
ensure_reader_open()
445
524
cnt = @reader.delete(arg)
525
elsif arg.is_a?(Hash) or arg.is_a?(Array)
447
528
raise ArgumentError, "Cannot delete for arg of type #{arg.class}"
479
560
# Update the document referenced by the document number +id+ if +id+ is an
480
561
# integer or all of the documents which have the term +id+ if +id+ is a
563
# For performance reasons, use #batch_update when updating a set of documents.
483
565
# id:: The number of the document to update. Can also be a string
484
566
# representing the value in the +id+ field. Also consider using
485
567
# the :key attribute.
486
568
# new_doc:: The document to replace the old document with
487
569
def update(id, new_doc)
489
571
ensure_writer_open()
491
573
if id.is_a?(String) or id.is_a?(Symbol)
583
# Batch updates the documents in an index. You can pass either a Hash or
586
# === Array (recommended)
588
# If you pass an Array then each value needs to be a Document or a Hash
589
# and each of those documents must have an +:id_field+ which will be used
590
# to delete the old document that this document is replacing.
594
# If you pass a Hash then the keys of the Hash will be considered the
595
# +id+'s and the values will be the new documents to replace the old ones
596
# with. If the +id+ is an Integer then it is considered a Ferret document
597
# number and the corresponding document will be deleted. If the +id+ is a
598
# String or a Symbol then the +id+ will be considered a term and the
599
# documents that contain that term in the +:id_field+ will be deleted.
601
# Note: No error will be raised if the document does not currently
602
# exist. A new document will simply be created.
606
# # will replace the documents with the +id+'s id:133 and id:253
607
# @index.batch_update({
608
# '133' => {:id => '133', :content => 'yada yada yada'},
609
# '253' => {:id => '253', :content => 'bla bla bal'}
612
# # will replace the documents with the Ferret Document numbers 2 and 92
613
# @index.batch_update({
614
# 2 => {:id => '133', :content => 'yada yada yada'},
615
# 92 => {:id => '253', :content => 'bla bla bal'}
618
# # will replace the documents with the +id+'s id:133 and id:253
619
# # this is recommended as it guarantees no duplicate keys
620
# @index.batch_update([
621
# {:id => '133', :content => 'yada yada yada'},
622
# {:id => '253', :content => 'bla bla bal'}
625
# docs:: A Hash of id/document pairs or an Array of documents. The set of documents to be updated
626
def batch_update(docs)
631
ids = docs.collect{|doc| doc[@id_field].to_s}
633
raise ArgumentError, "all documents must have an #{@id_field} "
634
"field when doing a batch update"
640
raise ArgumentError, "must pass Hash or Array, not #{docs.class}"
644
docs.each {|new_doc| @writer << new_doc }
501
650
# Update all the documents returned by the query.
503
652
# query:: The query to find documents you wish to update. Can either be
523
672
# #=> {:id => "28", :title => "My Oh My", :artist => "David Gray"}
525
674
def query_update(query, new_val)
527
676
ensure_writer_open()
528
677
ensure_searcher_open()
530
679
query = do_process_query(query)
531
@searcher.search_each(query) do |id, score|
680
@searcher.search_each(query, :limit => :all) do |id, score|
532
681
document = @searcher[id].load
533
682
if new_val.is_a?(Hash)
534
683
document.merge!(new_val)
932
# If +docs+ is a Hash or an Array then a batch delete will be performed.
933
# If +docs+ is an Array then it will be considered an array of +id+'s. If
934
# it is a Hash, then its keys will be used instead as the Array of
935
# document +id+'s. If the +id+ is an Integer then it is considered a
936
# Ferret document number and the corresponding document will be deleted.
937
# If the +id+ is a String or a Symbol then the +id+ will be considered a
938
# term and the documents that contain that term in the +:id_field+ will
941
# docs:: An Array of docs to be deleted, or a Hash (in which case the keys
943
def batch_delete(docs)
944
docs = docs.keys if docs.is_a?(Hash)
945
raise ArgumentError, "must pass Array or Hash" unless docs.is_a? Array
950
when String then terms << doc
951
when Symbol then terms << doc.to_s
952
when Integer then ids << doc
954
raise ArgumentError, "Cannot delete for arg of type #{id.class}"
959
ids.each {|id| @reader.delete(id)}
963
@writer.delete(@id_field, terms)