~slub.team/goobi-indexserver/3.x

RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

347

348

Document doc = new Document();

349

doc.add(newField("field", "foo firstname lastname foo", Field.Store.YES, Field.Index.ANALYZED));

350

writer.addDocument(doc);

351

352

Document doc2 = new Document();

353

doc2.add(newField("field", "foo firstname zzz lastname foo", Field.Store.YES, Field.Index.ANALYZED));

354

writer.addDocument(doc2);

355

356

Document doc3 = new Document();

357

doc3.add(newField("field", "foo firstname zzz yyy lastname foo", Field.Store.YES, Field.Index.ANALYZED));

358

writer.addDocument(doc3);

359

360

IndexReader reader = writer.getReader();

361

writer.close();

362

363

IndexSearcher searcher = newSearcher(reader);

364

PhraseQuery query = new PhraseQuery();

365

query.add(new Term("field", "firstname"));

366

query.add(new Term("field", "lastname"));

367

query.setSlop(Integer.MAX_VALUE);

368

ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;

369

assertEquals(3, hits.length);

370

// Make sure that those matches where the terms appear closer to

371

// each other get a higher score:

372

assertEquals(0.71, hits[0].score, 0.01);

373

assertEquals(0, hits[0].doc);

374

assertEquals(0.44, hits[1].score, 0.01);

375

assertEquals(1, hits[1].doc);

376

assertEquals(0.31, hits[2].score, 0.01);

377

assertEquals(2, hits[2].doc);

378

QueryUtils.check(random, query,searcher);

379

searcher.close();

380

reader.close();

381

directory.close();

382

}

383

384

public void testToString() throws Exception {

385

StopAnalyzer analyzer = new StopAnalyzer(TEST_VERSION_CURRENT);

386

QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", analyzer);

387

qp.setEnablePositionIncrements(true);

388

PhraseQuery q = (PhraseQuery)qp.parse("\"this hi this is a test is\"");

389

assertEquals("field:\"? hi ? ? ? test\"", q.toString());

390

q.add(new Term("field", "hello"), 1);

391

assertEquals("field:\"? hi|hello ? ? ? test\"", q.toString());

392

}

393

394

public void testWrappedPhrase() throws IOException {

395

query.add(new Term("repeated", "first"));

396

query.add(new Term("repeated", "part"));

397

query.add(new Term("repeated", "second"));

398

query.add(new Term("repeated", "part"));

399

query.setSlop(100);

400

401

ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;

402

assertEquals("slop of 100 just right", 1, hits.length);

403

QueryUtils.check(random, query,searcher);

404

405

query.setSlop(99);

406

407

hits = searcher.search(query, null, 1000).scoreDocs;

408

assertEquals("slop of 99 not enough", 0, hits.length);

409

QueryUtils.check(random, query,searcher);

410

}

411

412

// work on two docs like this: "phrase exist notexist exist found"

413

public void testNonExistingPhrase() throws IOException {

414

// phrase without repetitions that exists in 2 docs

415

query.add(new Term("nonexist", "phrase"));

416

query.add(new Term("nonexist", "notexist"));

417

query.add(new Term("nonexist", "found"));

418

query.setSlop(2); // would be found this way

419

420

ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;

421

assertEquals("phrase without repetitions exists in 2 docs", 2, hits.length);

422

QueryUtils.check(random, query,searcher);

423

424

// phrase with repetitions that exists in 2 docs

425

query = new PhraseQuery();

426

query.add(new Term("nonexist", "phrase"));

427

query.add(new Term("nonexist", "exist"));

428

query.add(new Term("nonexist", "exist"));

429

query.setSlop(1); // would be found

430

431

hits = searcher.search(query, null, 1000).scoreDocs;

432

assertEquals("phrase with repetitions exists in two docs", 2, hits.length);

433

QueryUtils.check(random, query,searcher);

434

435

// phrase I with repetitions that does not exist in any doc

436

query = new PhraseQuery();

437

query.add(new Term("nonexist", "phrase"));

438

query.add(new Term("nonexist", "notexist"));

439

query.add(new Term("nonexist", "phrase"));

440

query.setSlop(1000); // would not be found no matter how high the slop is

441

442

hits = searcher.search(query, null, 1000).scoreDocs;

443

assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length);

444

QueryUtils.check(random, query,searcher);

445

446

// phrase II with repetitions that does not exist in any doc

447

query = new PhraseQuery();

448

query.add(new Term("nonexist", "phrase"));

449

query.add(new Term("nonexist", "exist"));

450

query.add(new Term("nonexist", "exist"));

451

query.add(new Term("nonexist", "exist"));

452

query.setSlop(1000); // would not be found no matter how high the slop is

453

454

hits = searcher.search(query, null, 1000).scoreDocs;

455

assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length);

456

QueryUtils.check(random, query,searcher);

457

458

}

459

460

/**

461

* Working on a 2 fields like this:

462

* Field("field", "one two three four five")

463

* Field("palindrome", "one two three two one")

464

* Phrase of size 2 occuriong twice, once in order and once in reverse,

465

* because doc is a palyndrome, is counted twice.

466

* Also, in this case order in query does not matter.

467

* Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.

468

469

public void testPalyndrome2() throws Exception {

470

471

// search on non palyndrome, find phrase with no slop, using exact phrase scorer

472

query.setSlop(0); // to use exact phrase scorer

473

query.add(new Term("field", "two"));

474

query.add(new Term("field", "three"));

475

ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;

476

assertEquals("phrase found with exact phrase scorer", 1, hits.length);

477

float score0 = hits[0].score;

478

//System.out.println("(exact) field: two three: "+score0);

479

QueryUtils.check(random, query,searcher);

480

481

// search on non palyndrome, find phrase with slop 2, though no slop required here.

482

query.setSlop(2); // to use sloppy scorer

483

hits = searcher.search(query, null, 1000).scoreDocs;

484

assertEquals("just sloppy enough", 1, hits.length);

485

float score1 = hits[0].score;

486

//System.out.println("(sloppy) field: two three: "+score1);

487

assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH);

488

QueryUtils.check(random, query,searcher);

489

490

// search ordered in palyndrome, find it twice

491

query = new PhraseQuery();

492

query.setSlop(2); // must be at least two for both ordered and reversed to match

493

query.add(new Term("palindrome", "two"));

494

query.add(new Term("palindrome", "three"));

495

hits = searcher.search(query, null, 1000).scoreDocs;

496

assertEquals("just sloppy enough", 1, hits.length);

497

//float score2 = hits[0].score;

498

//System.out.println("palindrome: two three: "+score2);

499

QueryUtils.check(random, query,searcher);

500

501

//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().

502

//assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);

503

504

// search reveresed in palyndrome, find it twice

505

query = new PhraseQuery();

506

query.setSlop(2); // must be at least two for both ordered and reversed to match

507

query.add(new Term("palindrome", "three"));

508

query.add(new Term("palindrome", "two"));

509

hits = searcher.search(query, null, 1000).scoreDocs;

510

assertEquals("just sloppy enough", 1, hits.length);

511

//float score3 = hits[0].score;

512

//System.out.println("palindrome: three two: "+score3);

513

QueryUtils.check(random, query,searcher);

514

515

//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().

516

//assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);

517

//assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);

518

}

519

520

/**

521

* Working on a 2 fields like this:

522

* Field("field", "one two three four five")

523

* Field("palindrome", "one two three two one")

524

* Phrase of size 3 occuriong twice, once in order and once in reverse,

525

* because doc is a palyndrome, is counted twice.

526

* Also, in this case order in query does not matter.

527

* Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.

528

529

public void testPalyndrome3() throws Exception {

530

531

// search on non palyndrome, find phrase with no slop, using exact phrase scorer

532

query.setSlop(0); // to use exact phrase scorer

533

query.add(new Term("field", "one"));

534

query.add(new Term("field", "two"));

535

query.add(new Term("field", "three"));

536

ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;

537

assertEquals("phrase found with exact phrase scorer", 1, hits.length);

538

float score0 = hits[0].score;

539

//System.out.println("(exact) field: one two three: "+score0);

540

QueryUtils.check(random, query,searcher);

541

542

// just make sure no exc:

543

searcher.explain(query, 0);

544

545

// search on non palyndrome, find phrase with slop 3, though no slop required here.

546

query.setSlop(4); // to use sloppy scorer

547

hits = searcher.search(query, null, 1000).scoreDocs;

548

assertEquals("just sloppy enough", 1, hits.length);

549

float score1 = hits[0].score;

550

//System.out.println("(sloppy) field: one two three: "+score1);

551

assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH);

552

QueryUtils.check(random, query,searcher);

553

554

// search ordered in palyndrome, find it twice

555

query = new PhraseQuery();

556

query.setSlop(4); // must be at least four for both ordered and reversed to match

557

query.add(new Term("palindrome", "one"));

558

query.add(new Term("palindrome", "two"));

559

query.add(new Term("palindrome", "three"));

560

hits = searcher.search(query, null, 1000).scoreDocs;

561

562

// just make sure no exc:

563

searcher.explain(query, 0);

564

565

assertEquals("just sloppy enough", 1, hits.length);

566

//float score2 = hits[0].score;

567

//System.out.println("palindrome: one two three: "+score2);

568

QueryUtils.check(random, query,searcher);

569

570

//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().

571

//assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);

572

573

// search reveresed in palyndrome, find it twice

574

query = new PhraseQuery();

575

query.setSlop(4); // must be at least four for both ordered and reversed to match

576

query.add(new Term("palindrome", "three"));

577

query.add(new Term("palindrome", "two"));

578

query.add(new Term("palindrome", "one"));

579

hits = searcher.search(query, null, 1000).scoreDocs;

580

assertEquals("just sloppy enough", 1, hits.length);

581

//float score3 = hits[0].score;

582

//System.out.println("palindrome: three two one: "+score3);

583

QueryUtils.check(random, query,searcher);

584

585

//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().

586

//assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);

587

//assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);

588

}

589

590

// LUCENE-1280

591

public void testEmptyPhraseQuery() throws Throwable {

592

final BooleanQuery q2 = new BooleanQuery();

593

q2.add(new PhraseQuery(), BooleanClause.Occur.MUST);

594

q2.toString();

595

}

596

597

/* test that a single term is rewritten to a term query */

598

public void testRewrite() throws IOException {

599

PhraseQuery pq = new PhraseQuery();

600

pq.add(new Term("foo", "bar"));

601

Query rewritten = pq.rewrite(searcher.getIndexReader());

602

assertTrue(rewritten instanceof TermQuery);

603

}

604

605

public void testRandomPhrases() throws Exception {

606

Directory dir = newDirectory();

607

Analyzer analyzer = new MockAnalyzer(random);

608

609

RandomIndexWriter w = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setMergePolicy(newLogMergePolicy()));

610

List<List<String>> docs = new ArrayList<List<String>>();

611

Document d = new Document();

612

Field f = newField("f", "", Field.Store.NO, Field.Index.ANALYZED);

613

d.add(f);

614

615

Random r = random;

616

617

int NUM_DOCS = atLeast(10);

618

for (int i = 0; i < NUM_DOCS; i++) {

619

// must be > 4096 so it spans multiple chunks

620

int termCount = _TestUtil.nextInt(random, 4097, 8200);

621

622

List<String> doc = new ArrayList<String>();

623

624

StringBuilder sb = new StringBuilder();

625

while(doc.size() < termCount) {

626

if (r.nextInt(5) == 1 || docs.size() == 0) {

627

// make new non-empty-string term

628

String term;

629

while(true) {

630

term = _TestUtil.randomUnicodeString(r);

631

if (term.length() > 0) {

632

break;

633

}

634

}

635

TokenStream ts = analyzer.reusableTokenStream("ignore", new StringReader(term));

636

CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);

637

ts.reset();

638

while(ts.incrementToken()) {

639

String text = termAttr.toString();

640

doc.add(text);

641

sb.append(text).append(' ');

642

}

643

ts.end();

644

ts.close();

645

} else {

646

// pick existing sub-phrase

647

List<String> lastDoc = docs.get(r.nextInt(docs.size()));

648

int len = _TestUtil.nextInt(r, 1, 10);

649

int start = r.nextInt(lastDoc.size()-len);

650

for(int k=start;k<start+len;k++) {

651

String t = lastDoc.get(k);

652

doc.add(t);

653

sb.append(t).append(' ');

654

}

655

}

656

}

657

docs.add(doc);

658

f.setValue(sb.toString());

659

w.addDocument(d);

660

}

661

662

IndexReader reader = w.getReader();

663

IndexSearcher s = newSearcher(reader);

664

w.close();

665

666

// now search

667

int num = atLeast(10);

668

for(int i=0;i<num;i++) {

669

int docID = r.nextInt(docs.size());

670

List<String> doc = docs.get(docID);

671

672

final int numTerm = _TestUtil.nextInt(r, 2, 20);

673

final int start = r.nextInt(doc.size()-numTerm);

674

PhraseQuery pq = new PhraseQuery();

675

StringBuilder sb = new StringBuilder();

676

for(int t=start;t<start+numTerm;t++) {

677

pq.add(new Term("f", doc.get(t)));

678

sb.append(doc.get(t)).append(' ');

679

}

680

681

TopDocs hits = s.search(pq, NUM_DOCS);

682

boolean found = false;

683

for(int j=0;j<hits.scoreDocs.length;j++) {

684

if (hits.scoreDocs[j].doc == docID) {

685

found = true;

686

break;

687

}

688

}

689

690

assertTrue("phrase '" + sb + "' not found; start=" + start, found);

691

}

692

693

reader.close();

694

s.close();

695

dir.close();

696

}

697

}

Older »