/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18
package org.apache.solr.handler.admin;
20
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.PriorityQueue;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.luke.FieldFlag;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.Base64;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;
/**
 * This handler exposes the internal lucene index. It is inspired by and
 * modeled on Luke, the Lucene Index Browser by Andrzej Bialecki.
 *   http://www.getopt.org/luke/
 * <p>
 * NOTE: the response format is still likely to change. It should be designed so
 * that it works nicely with an XSLT transformation. Until we have a nice
 * XSLT front end for /admin, the format is still open to change.
 * </p>
 *
 * For more documentation see:
 *   http://wiki.apache.org/solr/LukeRequestHandler
 *
 * @version $Id: LukeRequestHandler.java 1201265 2011-11-12 14:09:28Z mikemccand $
 */
82
public class LukeRequestHandler extends RequestHandlerBase
84
private static Logger log = LoggerFactory.getLogger(LukeRequestHandler.class);
86
public static final String NUMTERMS = "numTerms";
87
public static final String DOC_ID = "docId";
88
public static final String ID = "id";
89
public static final int DEFAULT_COUNT = 10;
92
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
94
IndexSchema schema = req.getSchema();
95
SolrIndexSearcher searcher = req.getSearcher();
96
IndexReader reader = searcher.getReader();
97
SolrParams params = req.getParams();
98
int numTerms = params.getInt( NUMTERMS, DEFAULT_COUNT );
100
// Always show the core lucene info
101
rsp.add("index", getIndexInfo(reader, numTerms>0 ) );
103
Integer docId = params.getInt( DOC_ID );
104
if( docId == null && params.get( ID ) != null ) {
105
// Look for something with a given solr ID
106
SchemaField uniqueKey = schema.getUniqueKeyField();
107
String v = uniqueKey.getType().toInternal( params.get(ID) );
108
Term t = new Term( uniqueKey.getName(), v );
109
docId = searcher.getFirstMatch( t );
111
throw new SolrException( SolrException.ErrorCode.NOT_FOUND, "Can't find document: "+params.get( ID ) );
115
// Read the document from the index
116
if( docId != null ) {
119
doc = reader.document( docId );
121
catch( Exception ex ) {}
123
throw new SolrException( SolrException.ErrorCode.NOT_FOUND, "Can't find document: "+docId );
126
SimpleOrderedMap<Object> info = getDocumentFieldsInfo( doc, docId, reader, schema );
128
SimpleOrderedMap<Object> docinfo = new SimpleOrderedMap<Object>();
129
docinfo.add( "docId", docId );
130
docinfo.add( "lucene", info );
131
docinfo.add( "solr", doc );
132
rsp.add( "doc", docinfo );
134
else if ( "schema".equals( params.get( "show" ) ) ) {
135
rsp.add( "schema", getSchemaInfo( req.getSchema() ) );
138
// If no doc is given, show all fields and top terms
139
Set<String> fields = null;
140
if( params.get( CommonParams.FL ) != null ) {
141
fields = new HashSet<String>();
142
for( String f : params.getParams( CommonParams.FL ) ) {
146
rsp.add( "fields", getIndexedFieldsInfo( searcher, fields, numTerms ) ) ;
149
// Add some generally helpful information
150
NamedList<Object> info = new SimpleOrderedMap<Object>();
151
info.add( "key", getFieldFlagsKey() );
152
info.add( "NOTE", "Document Frequency (df) is not updated when a document is marked for deletion. df values include deleted documents." );
153
rsp.add( "info", info );
154
rsp.setHttpCaching(false);
160
* @return a string representing a Fieldable's flags.
162
private static String getFieldFlags( Fieldable f )
164
StringBuilder flags = new StringBuilder();
165
flags.append( (f != null && f.isIndexed()) ? FieldFlag.INDEXED.getAbbreviation() : '-' );
166
flags.append( (f != null && f.isTokenized()) ? FieldFlag.TOKENIZED.getAbbreviation() : '-' );
167
flags.append( (f != null && f.isStored()) ? FieldFlag.STORED.getAbbreviation() : '-' );
168
flags.append( (false) ? FieldFlag.MULTI_VALUED.getAbbreviation() : '-' ); // SchemaField Specific
169
flags.append( (f != null && f.isTermVectorStored()) ? FieldFlag.TERM_VECTOR_STORED.getAbbreviation() : '-' );
170
flags.append( (f != null && f.isStoreOffsetWithTermVector()) ? FieldFlag.TERM_VECTOR_OFFSET.getAbbreviation() : '-' );
171
flags.append( (f != null && f.isStorePositionWithTermVector()) ? FieldFlag.TERM_VECTOR_POSITION.getAbbreviation() : '-' );
172
flags.append( (f != null && f.getOmitNorms()) ? FieldFlag.OMIT_NORMS.getAbbreviation() : '-' );
173
flags.append( (f != null && f.isLazy()) ? FieldFlag.LAZY.getAbbreviation() : '-' );
174
flags.append( (f != null && f.isBinary()) ? FieldFlag.BINARY.getAbbreviation() : '-' );
175
flags.append( (false) ? FieldFlag.SORT_MISSING_FIRST.getAbbreviation() : '-' ); // SchemaField Specific
176
flags.append( (false) ? FieldFlag.SORT_MISSING_LAST.getAbbreviation() : '-' ); // SchemaField Specific
177
return flags.toString();
181
* @return a string representing a SchemaField's flags.
183
private static String getFieldFlags( SchemaField f )
185
FieldType t = (f==null) ? null : f.getType();
187
// see: http://www.nabble.com/schema-field-properties-tf3437753.html#a9585549
188
boolean lazy = false; // "lazy" is purely a property of reading fields
189
boolean binary = false; // Currently not possible
191
StringBuilder flags = new StringBuilder();
192
flags.append( (f != null && f.indexed()) ? FieldFlag.INDEXED.getAbbreviation() : '-' );
193
flags.append( (t != null && t.isTokenized()) ? FieldFlag.TOKENIZED.getAbbreviation() : '-' );
194
flags.append( (f != null && f.stored()) ? FieldFlag.STORED.getAbbreviation() : '-' );
195
flags.append( (f != null && f.multiValued()) ? FieldFlag.MULTI_VALUED.getAbbreviation() : '-' );
196
flags.append( (f != null && f.storeTermVector() ) ? FieldFlag.TERM_VECTOR_STORED.getAbbreviation() : '-' );
197
flags.append( (f != null && f.storeTermOffsets() ) ? FieldFlag.TERM_VECTOR_OFFSET.getAbbreviation() : '-' );
198
flags.append( (f != null && f.storeTermPositions() ) ? FieldFlag.TERM_VECTOR_POSITION.getAbbreviation() : '-' );
199
flags.append( (f != null && f.omitNorms()) ? FieldFlag.OMIT_NORMS.getAbbreviation() : '-' );
200
flags.append( (f != null &&
201
f.omitTermFreqAndPositions() ) ? FieldFlag.OMIT_TF.getAbbreviation() : '-' );
202
flags.append( (f != null && f.omitPositions() ) ? FieldFlag.OMIT_POSITIONS.getAbbreviation() : '-' );
203
flags.append( (lazy) ? FieldFlag.LAZY.getAbbreviation() : '-' );
204
flags.append( (binary) ? FieldFlag.BINARY.getAbbreviation() : '-' );
205
flags.append( (f != null && f.sortMissingFirst() ) ? FieldFlag.SORT_MISSING_FIRST.getAbbreviation() : '-' );
206
flags.append( (f != null && f.sortMissingLast() ) ? FieldFlag.SORT_MISSING_LAST.getAbbreviation() : '-' );
207
return flags.toString();
211
* @return a key to what each character means
213
public static SimpleOrderedMap<String> getFieldFlagsKey()
215
SimpleOrderedMap<String> key = new SimpleOrderedMap<String>();
216
key.add(String.valueOf(FieldFlag.INDEXED.getAbbreviation()), FieldFlag.INDEXED.getDisplay() );
217
key.add(String.valueOf(FieldFlag.TOKENIZED.getAbbreviation()), FieldFlag.TOKENIZED.getDisplay() );
218
key.add( String.valueOf(FieldFlag.STORED.getAbbreviation()), FieldFlag.STORED.getDisplay() );
219
key.add( String.valueOf(FieldFlag.MULTI_VALUED.getAbbreviation()), FieldFlag.MULTI_VALUED.getDisplay() );
220
key.add( String.valueOf(FieldFlag.TERM_VECTOR_STORED.getAbbreviation()), FieldFlag.TERM_VECTOR_STORED.getDisplay() );
221
key.add( String.valueOf(FieldFlag.TERM_VECTOR_OFFSET.getAbbreviation()), FieldFlag.TERM_VECTOR_OFFSET.getDisplay() );
222
key.add( String.valueOf(FieldFlag.TERM_VECTOR_POSITION.getAbbreviation()), FieldFlag.TERM_VECTOR_POSITION.getDisplay() );
223
key.add( String.valueOf(FieldFlag.OMIT_NORMS.getAbbreviation()), FieldFlag.OMIT_NORMS.getDisplay() );
224
key.add( String.valueOf(FieldFlag.LAZY.getAbbreviation()), FieldFlag.LAZY.getDisplay() );
225
key.add( String.valueOf(FieldFlag.BINARY.getAbbreviation()), FieldFlag.BINARY.getDisplay() );
226
key.add( String.valueOf(FieldFlag.SORT_MISSING_FIRST.getAbbreviation()), FieldFlag.SORT_MISSING_FIRST.getDisplay() );
227
key.add( String.valueOf(FieldFlag.SORT_MISSING_LAST.getAbbreviation()), FieldFlag.SORT_MISSING_LAST.getDisplay() );
231
private static SimpleOrderedMap<Object> getDocumentFieldsInfo( Document doc, int docId, IndexReader reader, IndexSchema schema ) throws IOException
233
SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
234
for( Object o : doc.getFields() ) {
235
Fieldable fieldable = (Fieldable)o;
236
SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();
238
SchemaField sfield = schema.getFieldOrNull( fieldable.name() );
239
FieldType ftype = (sfield==null)?null:sfield.getType();
241
f.add( "type", (ftype==null)?null:ftype.getTypeName() );
242
f.add( "schema", getFieldFlags( sfield ) );
243
f.add( "flags", getFieldFlags( fieldable ) );
245
Term t = new Term(fieldable.name(), ftype!=null ? ftype.storedToIndexed(fieldable) : fieldable.stringValue());
247
f.add( "value", (ftype==null)?null:ftype.toExternal( fieldable ) );
249
// TODO: this really should be "stored"
250
f.add( "internal", fieldable.stringValue() ); // may be a binary number
252
byte[] arr = fieldable.getBinaryValue();
254
f.add( "binary", Base64.byteArrayToBase64(arr, 0, arr.length));
256
f.add( "boost", fieldable.getBoost() );
257
f.add( "docFreq", t.text()==null ? 0 : reader.docFreq( t ) ); // this can be 0 for non-indexed fields
259
// If we have a term vector, return that
260
if( fieldable.isTermVectorStored() ) {
262
TermFreqVector v = reader.getTermFreqVector( docId, fieldable.name() );
264
SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
265
for( int i=0; i<v.size(); i++ ) {
266
tfv.add( v.getTerms()[i], v.getTermFrequencies()[i] );
268
f.add( "termVector", tfv );
271
catch( Exception ex ) {
272
log.warn( "error writing term vector", ex );
276
finfo.add( fieldable.name(), f );
281
@SuppressWarnings("unchecked")
282
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(
283
final SolrIndexSearcher searcher, final Set<String> fields, final int numTerms )
286
IndexReader reader = searcher.getReader();
287
IndexSchema schema = searcher.getSchema();
289
// Walk the term enum and keep a priority queue for each map in our set
290
Map<String,TopTermQueue> ttinfo = null;
292
ttinfo = getTopTerms(reader, fields, numTerms, null );
294
SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
295
Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
296
for (String fieldName : fieldNames) {
297
if( fields != null && !fields.contains( fieldName ) ) {
298
continue; // if a field is specified, only them
301
SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();
303
SchemaField sfield = schema.getFieldOrNull( fieldName );
304
FieldType ftype = (sfield==null)?null:sfield.getType();
306
f.add( "type", (ftype==null)?null:ftype.getTypeName() );
307
f.add( "schema", getFieldFlags( sfield ) );
308
if (sfield != null && schema.isDynamicField(sfield.getName()) && schema.getDynamicPattern(sfield.getName()) != null) {
309
f.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
312
// If numTerms==0, the call is just asking for a quick field list
313
if( ttinfo != null && sfield != null && sfield.indexed() ) {
314
Query q = new TermRangeQuery(fieldName,null,null,false,false);
315
TopDocs top = searcher.search( q, 1 );
316
if( top.totalHits > 0 ) {
317
// Find a document with this field
319
Document doc = searcher.doc( top.scoreDocs[0].doc );
320
Fieldable fld = doc.getFieldable( fieldName );
322
f.add( "index", getFieldFlags( fld ) );
325
// it is a non-stored field...
326
f.add( "index", "(unstored field)" );
329
catch( Exception ex ) {
330
log.warn( "error reading field: "+fieldName );
333
f.add( "docs", top.totalHits );
335
TopTermQueue topTerms = ttinfo.get( fieldName );
336
if( topTerms != null ) {
337
f.add( "distinct", topTerms.distinctTerms );
340
f.add( "topTerms", topTerms.toNamedList( searcher.getSchema() ) );
343
f.add( "histogram", topTerms.histogram.toNamedList() );
348
finfo.add( fieldName, f );
354
* Return info from the index
356
private static SimpleOrderedMap<Object> getSchemaInfo( IndexSchema schema ) {
357
Map<String, List<String>> typeusemap = new HashMap<String, List<String>>();
358
SimpleOrderedMap<Object> fields = new SimpleOrderedMap<Object>();
359
SchemaField uniqueField = schema.getUniqueKeyField();
360
for( SchemaField f : schema.getFields().values() ) {
361
populateFieldInfo(schema, typeusemap, fields, uniqueField, f);
364
SimpleOrderedMap<Object> dynamicFields = new SimpleOrderedMap<Object>();
365
for (SchemaField f : schema.getDynamicFieldPrototypes()) {
366
populateFieldInfo(schema, typeusemap, dynamicFields, uniqueField, f);
368
SimpleOrderedMap<Object> types = new SimpleOrderedMap<Object>();
369
for( FieldType ft : schema.getFieldTypes().values() ) {
370
SimpleOrderedMap<Object> field = new SimpleOrderedMap<Object>();
371
field.add("fields", typeusemap.get( ft.getTypeName() ) );
372
field.add("tokenized", ft.isTokenized() );
373
field.add("className", ft.getClass().getName());
374
field.add("indexAnalyzer", getAnalyzerInfo(ft.getAnalyzer()));
375
field.add("queryAnalyzer", getAnalyzerInfo(ft.getQueryAnalyzer()));
376
types.add( ft.getTypeName(), field );
379
SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
380
finfo.add("fields", fields);
381
finfo.add("dynamicFields", dynamicFields);
382
finfo.add("uniqueKeyField",
383
null == uniqueField ? null : uniqueField.getName());
384
finfo.add("defaultSearchField", schema.getDefaultSearchFieldName());
385
finfo.add("types", types);
390
private static SimpleOrderedMap<Object> getAnalyzerInfo(Analyzer analyzer) {
391
SimpleOrderedMap<Object> aninfo = new SimpleOrderedMap<Object>();
392
aninfo.add("className", analyzer.getClass().getName());
393
if (analyzer instanceof TokenizerChain) {
395
TokenizerChain tchain = (TokenizerChain)analyzer;
397
CharFilterFactory[] cfiltfacs = tchain.getCharFilterFactories();
398
SimpleOrderedMap<Map<String, Object>> cfilters = new SimpleOrderedMap<Map<String, Object>>();
399
for (CharFilterFactory cfiltfac : cfiltfacs) {
400
Map<String, Object> tok = new HashMap<String, Object>();
401
String className = cfiltfac.getClass().getName();
402
tok.put("className", className);
403
tok.put("args", cfiltfac.getArgs());
404
cfilters.add(className.substring(className.lastIndexOf('.')+1), tok);
406
if (cfilters.size() > 0) {
407
aninfo.add("charFilters", cfilters);
410
SimpleOrderedMap<Object> tokenizer = new SimpleOrderedMap<Object>();
411
TokenizerFactory tfac = tchain.getTokenizerFactory();
412
tokenizer.add("className", tfac.getClass().getName());
413
tokenizer.add("args", tfac.getArgs());
414
aninfo.add("tokenizer", tokenizer);
416
TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
417
SimpleOrderedMap<Map<String, Object>> filters = new SimpleOrderedMap<Map<String, Object>>();
418
for (TokenFilterFactory filtfac : filtfacs) {
419
Map<String, Object> tok = new HashMap<String, Object>();
420
String className = filtfac.getClass().getName();
421
tok.put("className", className);
422
tok.put("args", filtfac.getArgs());
423
filters.add(className.substring(className.lastIndexOf('.')+1), tok);
425
if (filters.size() > 0) {
426
aninfo.add("filters", filters);
432
private static void populateFieldInfo(IndexSchema schema,
433
Map<String, List<String>> typeusemap, SimpleOrderedMap<Object> fields,
434
SchemaField uniqueField, SchemaField f) {
435
FieldType ft = f.getType();
436
SimpleOrderedMap<Object> field = new SimpleOrderedMap<Object>();
437
field.add( "type", ft.getTypeName() );
438
field.add( "flags", getFieldFlags(f) );
439
if( f.isRequired() ) {
440
field.add( "required", f.isRequired() );
442
if( f.getDefaultValue() != null ) {
443
field.add( "default", f.getDefaultValue() );
445
if (f == uniqueField){
446
field.add("uniqueKey", true);
448
if (ft.getAnalyzer().getPositionIncrementGap(f.getName()) != 0) {
449
field.add("positionIncrementGap", ft.getAnalyzer().getPositionIncrementGap(f.getName()));
451
field.add("copyDests", schema.getCopyFields(f.getName()));
452
field.add("copySources", schema.getCopySources(f.getName()));
455
fields.add( f.getName(), field );
457
List<String> v = typeusemap.get( ft.getTypeName() );
459
v = new ArrayList<String>();
461
v.add( f.getName() );
462
typeusemap.put( ft.getTypeName(), v );
465
public static SimpleOrderedMap<Object> getIndexInfo( IndexReader reader, boolean countTerms ) throws IOException {
466
Directory dir = reader.directory();
467
SimpleOrderedMap<Object> indexInfo = new SimpleOrderedMap<Object>();
469
indexInfo.add("numDocs", reader.numDocs());
470
indexInfo.add("maxDoc", reader.maxDoc());
480
indexInfo.add("numTerms", numTerms );
483
if( te != null ) te.close();
487
indexInfo.add("version", reader.getVersion()); // TODO? Is this different then: IndexReader.getCurrentVersion( dir )?
488
indexInfo.add("segmentCount", reader.getSequentialSubReaders().length);
489
indexInfo.add("current", reader.isCurrent() );
490
indexInfo.add("hasDeletions", reader.hasDeletions() );
491
indexInfo.add("directory", dir );
492
indexInfo.add("lastModified", new Date(IndexReader.lastModified(dir)) );
496
//////////////////////// SolrInfoMBeans methods //////////////////////
499
public String getDescription() {
500
return "Lucene Index Browser. Inspired and modeled after Luke: http://www.getopt.org/luke/";
504
public String getVersion() {
505
return "$Revision: 1201265 $";
509
public String getSourceId() {
510
return "$Id: LukeRequestHandler.java 1201265 2011-11-12 14:09:28Z mikemccand $";
514
public String getSource() {
515
return "$URL: http://svn.apache.org/repos/asf/lucene/dev/tags/lucene_solr_3_5_0/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java $";
519
public URL[] getDocs() {
521
return new URL[] { new URL("http://wiki.apache.org/solr/LukeRequestHandler") };
523
catch( MalformedURLException ex ) { return null; }
526
///////////////////////////////////////////////////////////////////////////////////////
528
static class TermHistogram
531
public Map<Integer,Integer> hist = new HashMap<Integer, Integer>();
533
private static final double LOG2 = Math.log( 2 );
534
public static int getPowerOfTwoBucket( int num )
536
return Math.max(1, Integer.highestOneBit(num-1) << 1);
539
public void add( int df )
541
Integer bucket = getPowerOfTwoBucket( df );
542
if( bucket > maxBucket ) {
545
Integer old = hist.get( bucket );
547
hist.put( bucket, 1 );
550
hist.put( bucket, old+1 );
554
// TODO? should this be a list or a map?
555
public NamedList<Integer> toNamedList()
557
NamedList<Integer> nl = new NamedList<Integer>();
558
for( int bucket = 1; bucket <= maxBucket; bucket *= 2 ) {
559
Integer val = hist.get( bucket );
563
nl.add( ""+bucket, val );
570
* Private internal class that counts up frequent terms
572
private static class TopTermQueue extends PriorityQueue
574
static class TermInfo {
575
TermInfo(Term t, int df) {
583
public int minFreq = 0;
584
public int distinctTerms = 0;
585
public TermHistogram histogram;
587
TopTermQueue(int size) {
589
histogram = new TermHistogram();
593
protected final boolean lessThan(Object a, Object b) {
594
TermInfo termInfoA = (TermInfo)a;
595
TermInfo termInfoB = (TermInfo)b;
596
return termInfoA.docFreq < termInfoB.docFreq;
600
* This is a destructive call... the queue is empty at the end
602
public NamedList<Integer> toNamedList( IndexSchema schema )
604
// reverse the list..
605
List<TermInfo> aslist = new LinkedList<TermInfo>();
606
while( size() > 0 ) {
607
aslist.add( 0, (TermInfo)pop() );
610
NamedList<Integer> list = new NamedList<Integer>();
611
for (TermInfo i : aslist) {
612
String txt = i.term.text();
613
SchemaField ft = schema.getFieldOrNull( i.term.field() );
615
txt = ft.getType().indexedToReadable( txt );
617
list.add( txt, i.docFreq );
623
private static Map<String,TopTermQueue> getTopTerms( IndexReader reader, Set<String> fields, int numTerms, Set<String> junkWords ) throws Exception
625
Map<String,TopTermQueue> info = new HashMap<String, TopTermQueue>();
627
TermEnum terms = null;
629
terms = reader.terms();
630
while (terms.next()) {
631
String field = terms.term().field();
632
String t = terms.term().text();
634
// Compute distinct terms for every field
635
TopTermQueue tiq = info.get( field );
637
tiq = new TopTermQueue( numTerms+1 );
638
info.put( field, tiq );
641
tiq.histogram.add( terms.docFreq() ); // add the term to the histogram
643
// Only save the distinct terms for fields we worry about
644
if (fields != null && fields.size() > 0) {
645
if( !fields.contains( field ) ) {
649
if( junkWords != null && junkWords.contains( t ) ) {
653
if( terms.docFreq() > tiq.minFreq ) {
654
tiq.add(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
655
if (tiq.size() > numTerms) { // if tiq full
656
tiq.pop(); // remove lowest in tiq
657
tiq.minFreq = ((TopTermQueue.TermInfo)tiq.top()).docFreq; // reset minFreq
663
if( terms != null ) terms.close();