~ubuntu-branches/ubuntu/raring/ruby-ferret/raring

Committer: Package Import Robot
Author(s): Cédric Boutillier
Date: 2012-06-14 23:04:48 UTC
mfrom: (2.1.1 sid)
Revision ID: package-import@ubuntu.com-20120614230448-wd5se4ia1yz7dvms

Tags: 0.11.8.4+debian-1

http://bugs.debian.org/672069

http://bugs.debian.org/655636

* New upstream version from a new source
  + the new code fixes format security issues (Closes: #672069)
  + change homepage to https://github.com/jkraemer/ferret/
* Build for all Ruby versions (Closes: #655636)
  + change depends accordingly
  + do not set shebang of bin/ferret to ruby1.8
* Repack source to remove convenience copy of bzlib
  + build-dep on libbz2-dev
  + dversionmangle in debian/watch
  + add debian/README.source explaining how to clean the source
* debian/patches:
  + disable_load_path_manipulation.patch: do not override $LOAD_PATH
  + disable_test_causing_segfault.patch: temporarily disable a test known to
    cause segfaults
  + fix_compatibility_with_minitest.patch: fix a failing test with Ruby1.9
  + use_system_bzlib.patch: adapt the source to use system libbz2
  + fix_typos_in_source_code.patch: correct some spelling errors in the
    source code
  + block_variables_have_local_scopes.patch: fix syntax in
    bin/ferret-browser
* Override dh_auto_clean to remove test/temp when cleaning
* Bump Standards-Version to 3.9.3 (no changes needed)
* Set priority of transitional packages to extra
* Add myself to Uploaders:
* Update copyright to DEP-5 copyright-format/1.0
* Add TUTORIAL and debian/README.source to documents
* Override lintian warnings about duplicate descriptions of transitional
  packages

files added:
.pc/block_variables_have_local_scopes.patch

.pc/block_variables_have_local_scopes.patch/bin

.pc/block_variables_have_local_scopes.patch/bin/ferret-browser

.pc/disable_load_path_manipulation.patch

.pc/disable_load_path_manipulation.patch/bin

.pc/disable_load_path_manipulation.patch/bin/ferret-browser

.pc/disable_load_path_manipulation.patch/lib

.pc/disable_load_path_manipulation.patch/lib/ferret.rb

.pc/disable_load_path_manipulation.patch/test

.pc/disable_load_path_manipulation.patch/test/test_helper.rb

.pc/disable_load_path_manipulation.patch/test/threading

.pc/disable_load_path_manipulation.patch/test/threading/thread_safety_index_test.rb

.pc/disable_test_causing_segfault.patch

.pc/disable_test_causing_segfault.patch/test

.pc/disable_test_causing_segfault.patch/test/unit

.pc/disable_test_causing_segfault.patch/test/unit/index

.pc/disable_test_causing_segfault.patch/test/unit/index/tc_index_writer.rb

.pc/fix_compatibility_with_minitest.patch

.pc/fix_compatibility_with_minitest.patch/test

.pc/fix_compatibility_with_minitest.patch/test/unit

.pc/fix_compatibility_with_minitest.patch/test/unit/tc_field_symbol.rb

.pc/fix_typos_in_source_code.patch

.pc/fix_typos_in_source_code.patch/ext

.pc/fix_typos_in_source_code.patch/ext/compound_io.c

.pc/fix_typos_in_source_code.patch/ext/posh.c

.pc/fix_typos_in_source_code.patch/ext/posh.h

.pc/use_system_bzlib.patch

.pc/use_system_bzlib.patch/ext

.pc/use_system_bzlib.patch/ext/extconf.rb

.pc/use_system_bzlib.patch/ext/index.c

RELEASE_CHANGES

RELEASE_NOTES

debian/README.source

debian/patches/block_variables_have_local_scopes.patch

debian/patches/disable_load_path_manipulation.patch

debian/patches/disable_test_causing_segfault.patch

debian/patches/fix_compatibility_with_minitest.patch

debian/patches/fix_typos_in_source_code.patch

debian/patches/use_system_bzlib.patch

debian/ruby-tests.rb

debian/source/lintian-overrides

ext/STEMMER_api.c

ext/STEMMER_libstemmer.c

ext/STEMMER_stem_ISO_8859_1_danish.c

ext/STEMMER_stem_ISO_8859_1_dutch.c

ext/STEMMER_stem_ISO_8859_1_english.c

ext/STEMMER_stem_ISO_8859_1_finnish.c

ext/STEMMER_stem_ISO_8859_1_french.c

ext/STEMMER_stem_ISO_8859_1_german.c

ext/STEMMER_stem_ISO_8859_1_hungarian.c

ext/STEMMER_stem_ISO_8859_1_italian.c

ext/STEMMER_stem_ISO_8859_1_norwegian.c

ext/STEMMER_stem_ISO_8859_1_porter.c

ext/STEMMER_stem_ISO_8859_1_portuguese.c

ext/STEMMER_stem_ISO_8859_1_spanish.c

ext/STEMMER_stem_ISO_8859_1_swedish.c

ext/STEMMER_stem_ISO_8859_2_romanian.c

ext/STEMMER_stem_KOI8_R_russian.c

ext/STEMMER_stem_UTF_8_danish.c

ext/STEMMER_stem_UTF_8_dutch.c

ext/STEMMER_stem_UTF_8_english.c

ext/STEMMER_stem_UTF_8_finnish.c

ext/STEMMER_stem_UTF_8_french.c

ext/STEMMER_stem_UTF_8_german.c

ext/STEMMER_stem_UTF_8_hungarian.c

ext/STEMMER_stem_UTF_8_italian.c

ext/STEMMER_stem_UTF_8_norwegian.c

ext/STEMMER_stem_UTF_8_porter.c

ext/STEMMER_stem_UTF_8_portuguese.c

ext/STEMMER_stem_UTF_8_romanian.c

ext/STEMMER_stem_UTF_8_russian.c

ext/STEMMER_stem_UTF_8_spanish.c

ext/STEMMER_stem_UTF_8_swedish.c

ext/STEMMER_stem_UTF_8_turkish.c

ext/STEMMER_utilities.c

ext/field_index.c

ext/field_index.h

ext/internal.h

ext/lang.c

ext/scanner.c

ext/scanner.h

ext/scanner_mb.c

ext/scanner_utf8.c

ext/stem_ISO_8859_1_hungarian.h

ext/stem_ISO_8859_2_romanian.h

ext/stem_UTF_8_hungarian.h

ext/stem_UTF_8_romanian.h

ext/stem_UTF_8_turkish.h

ext/symbol.c

ext/symbol.h

lib/ferret/field_symbol.rb

lib/ferret/version.rb

metadata.yml

test/long_running

test/long_running/largefile

test/long_running/largefile/tc_largefile.rb

test/test_installed.rb

test/unit/tc_field_symbol.rb

test/utils

test/utils/content_generator.rb

files removed:
.pc/debian-changes

.pc/debian-changes/bin

.pc/debian-changes/bin/ferret-browser

debian/patches/debian-changes

debian/ruby-tests.rake

ext/api.c

ext/inc

ext/inc/lang.h

ext/inc/threading.h

ext/libstemmer.c

ext/stem_ISO_8859_1_danish.c

ext/stem_ISO_8859_1_dutch.c

ext/stem_ISO_8859_1_english.c

ext/stem_ISO_8859_1_finnish.c

ext/stem_ISO_8859_1_french.c

ext/stem_ISO_8859_1_german.c

ext/stem_ISO_8859_1_italian.c

ext/stem_ISO_8859_1_norwegian.c

ext/stem_ISO_8859_1_porter.c

ext/stem_ISO_8859_1_portuguese.c

ext/stem_ISO_8859_1_spanish.c

ext/stem_ISO_8859_1_swedish.c

ext/stem_KOI8_R_russian.c

ext/stem_UTF_8_danish.c

ext/stem_UTF_8_dutch.c

ext/stem_UTF_8_english.c

ext/stem_UTF_8_finnish.c

ext/stem_UTF_8_french.c

ext/stem_UTF_8_german.c

ext/stem_UTF_8_italian.c

ext/stem_UTF_8_norwegian.c

ext/stem_UTF_8_porter.c

ext/stem_UTF_8_portuguese.c

ext/stem_UTF_8_russian.c

ext/stem_UTF_8_spanish.c

ext/stem_UTF_8_swedish.c

ext/utilities.c

lib/ferret_version.rb

test/unit/largefile

test/unit/largefile/tc_largefile.rb

files modified:
.pc/applied-patches

README

Rakefile

TODO

bin/ferret-browser

debian/changelog

debian/control

debian/copyright

debian/patches/series

debian/ruby-ferret.docs

debian/rules

debian/watch

ext/analysis.c

ext/analysis.h

ext/api.h

ext/array.c

ext/array.h

ext/bitvector.c

ext/bitvector.h

ext/compound_io.c

ext/config.h

ext/document.c

ext/document.h

ext/except.c

ext/except.h

ext/extconf.rb

ext/ferret.c

ext/ferret.h

ext/filter.c

ext/fs_store.c

ext/global.c

ext/global.h

ext/hash.c

ext/hash.h

ext/hashset.c

ext/hashset.h

ext/header.h

ext/helper.c

ext/helper.h

ext/index.c

ext/index.h

ext/lang.h

ext/mempool.c

ext/mempool.h

ext/modules.h

ext/multimapper.c

ext/multimapper.h

ext/posh.c

ext/posh.h

ext/priorityqueue.c

ext/priorityqueue.h

ext/q_boolean.c

ext/q_const_score.c

ext/q_filtered_query.c

ext/q_fuzzy.c

ext/q_match_all.c

ext/q_multi_term.c

ext/q_parser.c

ext/q_phrase.c

ext/q_prefix.c

ext/q_range.c

ext/q_span.c

ext/q_term.c

ext/q_wildcard.c

ext/r_analysis.c

ext/r_index.c

ext/r_qparser.c

ext/r_search.c

ext/r_store.c

ext/r_utils.c

ext/ram_store.c

ext/search.c

ext/search.h

ext/similarity.c

ext/similarity.h

ext/sort.c

ext/stopwords.c

ext/store.c

ext/store.h

ext/term_vectors.c

ext/threading.h

ext/win32.h

lib/ferret.rb

lib/ferret/browser.rb

lib/ferret/index.rb

lib/ferret/number_tools.rb *

test/test_helper.rb

test/threading/thread_safety_index_test.rb

test/threading/thread_safety_read_write_test.rb

test/unit/analysis/tc_analyzer.rb

test/unit/analysis/tc_token_stream.rb

test/unit/index/tc_index.rb

test/unit/index/tc_index_reader.rb

test/unit/index/tc_index_writer.rb

test/unit/index/th_doc.rb

test/unit/search/tc_filter.rb

test/unit/search/tc_index_searcher.rb

test/unit/search/tm_searcher.rb

test/unit/store/tc_fs_store.rb

test/unit/store/tm_store_lock.rb

Show diffs side-by-side

added added

removed removed

lib/ferret/index.rb

require 'monitor'

module Ferret::Index

module SynchroLockMixin

def synchrolock

trys = 5

begin

synchronize {yield}

rescue Ferret::Store::Lock::LockError => e

if (trys -= 1) <= 0

raise e

else

retry

end

# This is a simplified interface to the index. See the TUTORIAL for more

# information on how to use this class.

class Index

# default_input_field:: Default: "id". This specifies the default field

# that will be used when you add a simple string

# to the index using #add_document or <<.

# id_field: Default: "id". This field is as the field to

# id_field:: Default: "id". This field is used as the field to

# search when doing searches on a term. For

# example, if you do a lookup by term "cat", ie

# index["cat"], this will be the field that is

# Directory object to this class and you want

# Index to close it when it is closed itself then

# set this to true.

# Some examples;

# use_typed_range_query:: Default: true. Use TypedRangeQuery instead of

# the standard RangeQuery when parsing

# range queries. This is useful if you have number

# fields which you want to perform range queries

# on. You won't need to pad or normalize the data

# in the field in any way to get correct results.

# However, performance will be a lot slower for

# large indexes, hence the default.

# == Examples

# index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())

130

124

@dir = RAMDirectory.new

131

125

end

132

126

133

@dir.extend(MonitorMixin).extend(SynchroLockMixin)

127

@dir.extend(MonitorMixin) unless @dir.kind_of? MonitorMixin

134

128

options[:dir] = @dir

135

129

options[:lock_retry_time]||= 2

136

130

@options = options

138

132

IndexWriter.new(options).close

139

133

end

140

134

options[:analyzer]||= Ferret::Analysis::StandardAnalyzer.new

135

if options[:use_typed_range_query].nil?

136

options[:use_typed_range_query] = true

137

end

141

138

142

139

@searcher = nil

143

140

@writer = nil

264

261

265

262

# See FieldInfos for more information on how to set field properties.

266

263

def add_document(doc, analyzer = nil)

267

@dir.synchrolock do

264

@dir.synchronize do

268

265

ensure_writer_open()

269

266

if doc.is_a?(String) or doc.is_a?(Array)

270

267

doc = {@default_input_field => doc}

281

278

else

282

279

id = doc[@key].to_s

283

280

if id

284

ensure_writer_open()

285

281

@writer.delete(@key, id)

286

@writer.commit

287

282

end

288

283

end

289

284

end

397

392

end

398

393

end

399

394

395

# Run a query through the Searcher on the index, ignoring scoring and

396

# starting at +:start_doc+ and stopping when +:limit+ matches have been

397

# found. It returns an array of the matching document numbers.

398

399

# There is a big performance advantage when using this search method on a

400

# very large index when there are potentially thousands of matching

401

# documents and you only want say 50 of them. The other search methods need

402

# to look at every single match to decide which one has the highest score.

403

# This search method just needs to find +:limit+ number of matches before

404

# it returns.

405

406

# === Options

407

408

# start_doc:: Default: 0. The start document to start the search from.

409

# NOTE very carefully that this is not the same as the

410

# +:offset+ parameter used in the other search methods

411

# which refers to the offset in the result-set. This is the

412

# document to start the scan from. So if you are scanning

413

# through the index in increments of 50 documents at a time

414

# you need to use the last matched doc in the previous

415

# search to start your next search. See the example below.

416

# limit:: Default: 50. This is the number of results you want

417

# returned, also called the page size. Set +:limit+ to

418

# +:all+ to return all results.

419

# TODO: add option to return loaded documents instead

420

421

# === Example

422

423

# start_doc = 0

424

# begin

425

# results = @searcher.scan(query, :start_doc => start_doc)

426

# yield results # or do something with them

427

# start_doc = results.last

428

# # start_doc will be nil now if results is empty, ie no more matches

429

# end while start_doc

430

def scan(query, options = {})

431

@dir.synchronize do

432

ensure_searcher_open()

433

query = do_process_query(query)

434

435

@searcher.scan(query, options)

436

end

437

end

438

400

439

# Retrieves a document/documents from the index. The method for retrieval

401

440

# depends on the type of the argument passed.

402

441

408

447

409

448

# If +arg+ is a String then search for the first document with +arg+ in

410

449

# the +id+ field. The +id+ field is either :id or whatever you set

411

# :id_field parameter to when you create the Index object.

450

# +:id_field+ parameter to when you create the Index object.

412

451

def doc(*arg)

413

452

@dir.synchronize do

414

453

id = arg[0]

424

463

end

425

464

alias :[] :doc

426

465

466

# Retrieves the term_vector for a document. The document can be referenced

467

# by either a string id to match the id field or an integer corresponding

468

# to Ferret's document number.

469

470

# See Ferret::Index::IndexReader#term_vector

471

def term_vector(id, field)

472

@dir.synchronize do

473

ensure_reader_open()

474

if id.kind_of?(String) or id.kind_of?(Symbol)

475

term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)

476

if term_doc_enum.next?

477

id = term_doc_enum.doc

478

else

479

return nil

480

end

481

end

482

return @reader.term_vector(id, field)

483

end

484

end

485

486

# iterate through all documents in the index. This method preloads the

487

# documents so you don't need to call #load on the document to load all the

488

# fields.

489

def each

490

@dir.synchronize do

491

ensure_reader_open

492

(0...@reader.max_doc).each do |i|

493

yield @reader[i].load unless @reader.deleted?(i)

494

end

495

end

496

end

497

427

498

# Deletes a document/documents from the index. The method for determining

428

499

# the document to delete depends on the type of the argument passed.

429

500

431

502

# document number. Will raise an error if the document does not exist.

432

503

433

504

# If +arg+ is a String then search for the documents with +arg+ in the

434

# +id+ field. The +id+ field is either :id or whatever you set :id_field

505

# +id+ field. The +id+ field is either :id or whatever you set +:id_field+

435

506

# parameter to when you create the Index object. Will fail quietly if

436

507

# no document exists.

508

509

# If +arg+ is a Hash or an Array then a batch delete will be performed.

510

# If +arg+ is an Array then it will be considered an array of +id+'s. If

511

# it is a Hash, then its keys will be used instead as the Array of

512

# document +id+'s. If the +id+ is an Integer then it is considered a

513

# Ferret document number and the corresponding document will be deleted.

514

# If the +id+ is a String or a Symbol then the +id+ will be considered a

515

# term and the documents that contain that term in the +:id_field+ will be

516

# deleted.

437

517

def delete(arg)

438

@dir.synchrolock do

439

ensure_writer_open()

518

@dir.synchronize do

440

519

if arg.is_a?(String) or arg.is_a?(Symbol)

441

520

ensure_writer_open()

442

521

@writer.delete(@id_field, arg.to_s)

443

522

elsif arg.is_a?(Integer)

444

523

ensure_reader_open()

445

524

cnt = @reader.delete(arg)

525

elsif arg.is_a?(Hash) or arg.is_a?(Array)

526

batch_delete(arg)

446

527

else

447

528

raise ArgumentError, "Cannot delete for arg of type #{arg.class}"

448

529

end

457

538

# string (in which case it is parsed by the standard query parser)

458

539

# or an actual query object.

459

540

def query_delete(query)

460

@dir.synchrolock do

541

@dir.synchronize do

461

542

ensure_writer_open()

462

543

ensure_searcher_open()

463

544

query = do_process_query(query)

479

560

# Update the document referenced by the document number +id+ if +id+ is an

480

561

# integer or all of the documents which have the term +id+ if +id+ is a

481

562

# term.

563

# For batch update of set of documents, for performance reasons, see batch_update

482

564

483

565

# id:: The number of the document to update. Can also be a string

484

566

# representing the value in the +id+ field. Also consider using

485

567

# the :key attribute.

486

568

# new_doc:: The document to replace the old document with

487

569

def update(id, new_doc)

488

@dir.synchrolock do

570

@dir.synchronize do

489

571

ensure_writer_open()

490

572

delete(id)

491

573

if id.is_a?(String) or id.is_a?(Symbol)

498

580

end

499

581

end

500

582

583

# Batch updates the documents in an index. You can pass either a Hash or

584

# an Array.

585

586

# === Array (recommended)

587

588

# If you pass an Array then each value needs to be a Document or a Hash

589

# and each of those documents must have an +:id_field+ which will be used

590

# to delete the old document that this document is replacing.

591

592

# === Hash

593

594

# If you pass a Hash then the keys of the Hash will be considered the

595

# +id+'s and the values will be the new documents to replace the old ones

596

# with. If the +id+ is an Integer then it is considered a Ferret document

597

# number and the corresponding document will be deleted. If the +id+ is a

598

# String or a Symbol then the +id+ will be considered a term and the

599

# documents that contain that term in the +:id_field+ will be deleted.

600

601

# Note: No error will be raised if the document does not currently

602

# exist. A new document will simply be created.

603

604

# == Examples

605

606

# # will replace the documents with the +id+'s id:133 and id:253

607

# @index.batch_update({

608

# '133' => {:id => '133', :content => 'yada yada yada'},

609

# '253' => {:id => '253', :content => 'bla bla bal'}

610

# })

611

612

# # will replace the documents with the Ferret Document numbers 2 and 92

613

# @index.batch_update({

614

# 2 => {:id => '133', :content => 'yada yada yada'},

615

# 92 => {:id => '253', :content => 'bla bla bal'}

616

# })

617

618

# # will replace the documents with the +id+'s id:133 and id:253

619

# # this is recommended as it guarantees no duplicate keys

620

# @index.batch_update([

621

# {:id => '133', :content => 'yada yada yada'},

622

# {:id => '253', :content => 'bla bla bal'}

623

# ])

624

625

# docs:: A Hash of id/document pairs. The set of documents to be updated

626

def batch_update(docs)

627

@dir.synchronize do

628

ids = values = nil

629

case docs

630

when Array

631

ids = docs.collect{|doc| doc[@id_field].to_s}

632

if ids.include?(nil)

633

raise ArgumentError, "all documents must have an #{@id_field} "

634

"field when doing a batch update"

635

end

636

when Hash

637

ids = docs.keys

638

docs = docs.values

639

else

640

raise ArgumentError, "must pass Hash or Array, not #{docs.class}"

641

end

642

batch_delete(ids)

643

ensure_writer_open()

644

docs.each {|new_doc| @writer << new_doc }

645

flush()

646

end

647

end

648

649

501

650

# Update all the documents returned by the query.

502

651

503

652

# query:: The query to find documents you wish to update. Can either be

523

672

# #=> {:id => "28", :title => "My Oh My", :artist => "David Gray"}

524

673

525

674

def query_update(query, new_val)

526

@dir.synchrolock do

675

@dir.synchronize do

527

676

ensure_writer_open()

528

677

ensure_searcher_open()

529

678

docs_to_add = []

530

679

query = do_process_query(query)

531

@searcher.search_each(query) do |id, score|

680

@searcher.search_each(query, :limit => :all) do |id, score|

532

681

document = @searcher[id].load

533

682

if new_val.is_a?(Hash)

534

683

document.merge!(new_val)

568

717

end

569

718

@reader.commit

570

719

elsif @writer

571

@writer.commit

720

@writer.close

721

@writer = nil

572

722

end

573

723

end

574

724

end

577

727

# optimizes the index. This should only be called when the index will no

578

728

# longer be updated very often, but will be read a lot.

579

729

def optimize()

580

@dir.synchrolock do

730

@dir.synchronize do

581

731

ensure_writer_open()

582

732

@writer.optimize()

583

733

@writer.close()

605

755

606

756

# After this completes, the index is optimized.

607

757

def add_indexes(indexes)

608

@dir.synchrolock do

758

@dir.synchronize do

609

759

ensure_writer_open()

610

760

indexes = [indexes].flatten # make sure we have an array

611

761

return if indexes.size == 0 # nothing to do

648

798

elsif directory.is_a?(Ferret::Store::Directory)

649

799

@dir = directory

650

800

end

651

@dir.extend(MonitorMixin).extend(SynchroLockMixin)

801

@dir.extend(MonitorMixin) unless @dir.kind_of? MonitorMixin

652

802

@options[:dir] = @dir

653

803

@options[:create_if_missing] = true

654

804

add_indexes([old_dir])

690

840

# Returns the field_infos object so that you can add new fields to the

691

841

# index.

692

842

def field_infos

693

@dir.synchrolock do

843

@dir.synchronize do

694

844

ensure_writer_open()

695

845

return @writer.field_infos

696

846

end

778

928

@writer = nil

779

929

end

780

930

end

931

932

# If +docs+ is a Hash or an Array then a batch delete will be performed.

933

# If +docs+ is an Array then it will be considered an array of +id+'s. If

934

# it is a Hash, then its keys will be used instead as the Array of

935

# document +id+'s. If the +id+ is an Integer then it is considered a

936

# Ferret document number and the corresponding document will be deleted.

937

# If the +id+ is a String or a Symbol then the +id+ will be considered a

938

# term and the documents that contain that term in the +:id_field+ will

939

# be deleted.

940

941

# docs:: An Array of docs to be deleted, or a Hash (in which case the keys

942

# are used)

943

def batch_delete(docs)

944

docs = docs.keys if docs.is_a?(Hash)

945

raise ArgumentError, "must pass Array or Hash" unless docs.is_a? Array

946

ids = []

947

terms = []

948

docs.each do |doc|

949

case doc

950

when String then terms << doc

951

when Symbol then terms << doc.to_s

952

when Integer then ids << doc

953

else

954

raise ArgumentError, "Cannot delete for arg of type #{id.class}"

955

end

956

end

957

if ids.size > 0

958

ensure_reader_open

959

ids.each {|id| @reader.delete(id)}

960

end

961

if terms.size > 0

962

ensure_writer_open()

963

@writer.delete(@id_field, terms)

964

end

965

return self

966

end

967

781

968

end

782

969

end

783

970

Older »