270
my($class, @args) = @_;
271
my ( $class, @args ) = @_;
272
273
my $self = $class->SUPER::new(@args);
274
275
bless $self, $class;
276
my ($index_dir,$dbname,$format,$write_flag,$primary_pattern,
277
$primary_namespace,$start_pattern,$secondary_patterns) =
278
$self->_rearrange([qw(DIRECTORY
285
SECONDARY_PATTERNS)], @args);
277
my ( $index_dir, $dbname, $format, $write_flag, $primary_pattern,
278
$primary_namespace, $start_pattern, $secondary_patterns )
287
293
$self->index_directory($index_dir);
288
294
$self->dbname($dbname);
290
if ($self->index_directory && $self->read_config_file) {
292
my $fh = $self->primary_index_filehandle;
296
if ( $self->index_directory && $self->read_config_file ) {
298
my $fh = $self->primary_index_filehandle;
293
299
my $record_width = $self->read_header($fh);
294
300
$self->record_size($record_width);
296
302
$format ||= DEFAULT_FORMAT;
297
$self->format ($format);
298
$self->write_flag ($write_flag);
303
$self->format($format);
304
$self->write_flag($write_flag);
300
if ($self->write_flag && ! $primary_namespace) {
301
($primary_namespace,$primary_pattern,
302
$start_pattern,$secondary_patterns) =
303
$self->_guess_patterns($self->format);
306
if ( $self->write_flag && !$primary_namespace ) {
308
$primary_namespace, $primary_pattern,
309
$start_pattern, $secondary_patterns
310
) = $self->_guess_patterns( $self->format );
306
$self->primary_pattern ($primary_pattern);
307
$self->primary_namespace ($primary_namespace);
308
$self->start_pattern ($start_pattern);
313
$self->primary_pattern($primary_pattern);
314
$self->primary_namespace($primary_namespace);
315
$self->start_pattern($start_pattern);
309
316
$self->secondary_patterns($secondary_patterns);
314
321
sub new_from_registry {
315
my ($self,%config) = @_;
322
my ( $self, %config ) = @_;
317
324
my $dbname = $config{'dbname'};
318
325
my $location = $config{'location'};
320
my $index = Bio::DB::Flat::BinarySearch->new(-dbname => $dbname,
321
-index_dir => $location,
327
my $index = Bio::DB::Flat::BinarySearch->new(
329
-index_dir => $location,
325
333
=head2 get_Seq_by_id
327
335
Title : get_Seq_by_id
328
336
Usage : $obj->get_Seq_by_id($newval)
331
339
Returns : value of get_Seq_by_id
332
340
Args : newvalue (optional)
336
344
sub get_Seq_by_id {
345
my ( $self, $id ) = @_;
339
347
# too many uninit variables...
342
my ($fh,$length) = $self->get_stream_by_id($id);
350
my ( $fh, $length ) = $self->get_stream_by_id($id);
344
unless (defined($self->format)) {
345
$self->throw("Can't create sequence - format is not defined");
352
unless ( defined( $self->format ) ) {
353
$self->throw("Can't create sequence - format is not defined");
348
356
return unless $fh;
350
unless ( defined($self->{_seqio}) ) {
358
unless ( defined( $self->{_seqio} ) ) {
352
$self->{_seqio} = Bio::SeqIO->new(-fh => $fh,
353
-format => $self->format);
355
$self->{_seqio}->fh($fh);
360
$self->{_seqio} = Bio::SeqIO->new(
362
-format => $self->format
366
$self->{_seqio}->fh($fh);
358
369
return $self->{_seqio}->next_seq;
396
406
sub get_stream_by_id {
399
unless( $self->record_size ) {
400
if ($self->index_directory && $self->read_config_file) {
402
my $fh = $self->primary_index_filehandle;
403
my $record_width = $self->read_header($fh);
404
$self->record_size($record_width);
407
my ( $self, $id ) = @_;
409
unless ( $self->record_size ) {
410
if ( $self->index_directory && $self->read_config_file ) {
412
my $fh = $self->primary_index_filehandle;
413
my $record_width = $self->read_header($fh);
414
$self->record_size($record_width);
407
417
my $indexfh = $self->primary_index_filehandle;
410
420
my $filesize = systell($indexfh);
412
$self->throw("file was not parsed properly, record size is empty")
413
unless $self->record_size;
415
my $end = ($filesize - $self->{'_start_pos'}) / $self->record_size;
416
my ($newid,$rest,$fhpos) = $self->find_entry($indexfh,0,$end,$id,$self->record_size);
419
my ($fileid,$pos,$length) = split(/\t/,$rest);
421
#print STDERR "BinarySearch Found id entry $newid $fileid $pos $length:$rest\n";
422
$self->throw("file was not parsed properly, record size is empty")
423
unless $self->record_size;
425
my $end = ( $filesize - $self->{'_start_pos'} ) / $self->record_size;
426
my ( $newid, $rest, $fhpos ) =
427
$self->find_entry( $indexfh, 0, $end, $id, $self->record_size );
429
my ( $fileid, $pos, $length ) = split( /\t/, $rest );
431
#print STDERR "BinarySearch Found id entry $newid $fileid $pos $length:$rest\n";
427
437
my $file = $self->{_file}{$fileid};
429
open (my $IN,"<$file");
439
open( my $IN, "<$file" );
435
return ($IN,$length);
443
sysseek( $IN, $pos, 0 );
445
return ( $IN, $length );
438
448
=head2 get_Seq_by_acc
497
509
sub get_Seq_by_secondary {
498
my ($self,$name,$id) = @_;
510
my ( $self, $name, $id ) = @_;
500
512
my @names = $self->secondary_namespaces;
503
515
foreach my $tmpname (@names) {
504
if ($name eq $tmpname) {
516
if ( $name eq $tmpname ) {
510
$self->throw("Secondary index for $name doesn't exist\n");
522
$self->throw("Secondary index for $name doesn't exist\n");
513
525
my $fh = $self->open_secondary_index($name);
517
529
my $filesize = systell($fh);
519
531
my $recsize = $self->{'_secondary_record_size'}{$name};
520
# print "Name " . $recsize . "\n";
522
my $end = ($filesize - $self->{'_start_pos'})/$recsize;
524
# print "End $end $filesize\n";
525
my ($newid,$primary_id,$pos) = $self->find_entry($fh,0,$end,$id,$recsize);
529
# print "Found new id $newid $primary_id\n";
533
# print "Name " . $recsize . "\n";
535
my $end = ( $filesize - $self->{'_start_pos'} ) / $recsize;
537
# print "End $end $filesize\n";
538
my ( $newid, $primary_id, $pos ) =
539
$self->find_entry( $fh, 0, $end, $id, $recsize );
541
sysseek( $fh, $pos, 0 );
543
# print "Found new id $newid $primary_id\n";
530
544
# We now need to shuffle up the index file to find the top secondary entry
532
546
my $record = $newid;
534
while ($record =~ /^$newid/ && $pos >= 0) {
536
$record = $self->read_record($fh,$pos,$recsize);
537
$pos = $pos - $recsize;
538
# print "Up record = $record:$newid\n";
548
while ( $record =~ /^$newid/ && $pos >= 0 ) {
550
$record = $self->read_record( $fh, $pos, $recsize );
551
$pos = $pos - $recsize;
553
# print "Up record = $record:$newid\n";
541
556
$pos += $recsize;
543
# print "Top position is $pos\n";
558
# print "Top position is $pos\n";
545
560
# Now we have to shuffle back down again to read all the secondary entries
550
565
$primary_id{$primary_id} = 1;
552
while ($current_id eq $newid) {
553
$record = $self->read_record($fh,$pos,$recsize);
554
# print "Record is :$record:\n";
555
my ($secid,$primary_id) = split(/\t/,$record,2);
556
$current_id = $secid;
558
if ($current_id eq $newid) {
559
$primary_id =~ s/ //g;
560
# print "Primary $primary_id\n";
561
$primary_id{$primary_id} = 1;
563
$pos = $pos + $recsize;
564
# print "Down record = $record\n";
567
while ( $current_id eq $newid ) {
568
$record = $self->read_record( $fh, $pos, $recsize );
570
# print "Record is :$record:\n";
571
my ( $secid, $primary_id ) = split( /\t/, $record, 2 );
572
$current_id = $secid;
574
if ( $current_id eq $newid ) {
575
$primary_id =~ s/ //g;
577
# print "Primary $primary_id\n";
578
$primary_id{$primary_id} = 1;
580
$pos = $pos + $recsize;
582
# print "Down record = $record\n";
568
if (!defined($newid)) {
586
if ( !defined($newid) ) {
574
foreach my $id (keys %primary_id) {
575
push @entry,$self->get_Seq_by_id($id);
592
foreach my $id ( keys %primary_id ) {
593
push @entry, $self->get_Seq_by_id($id);
577
595
return wantarray ? @entry : $entry[0];
672
my ($self,$fh,$start,$end,$id,$recsize) = @_;
674
my $mid = int( ($end+1+$start) / 2);
675
my $pos = ($mid-1)*$recsize + $self->{'_start_pos'};
677
my ($record) = $self->read_record($fh,$pos,$recsize);
678
my ($entryid,$rest) = split(/\t/,$record,2);
689
my ( $self, $fh, $start, $end, $id, $recsize ) = @_;
691
my $mid = int( ( $end + 1 + $start ) / 2 );
692
my $pos = ( $mid - 1 ) * $recsize + $self->{'_start_pos'};
694
my ($record) = $self->read_record( $fh, $pos, $recsize );
695
my ( $entryid, $rest ) = split( /\t/, $record, 2 );
679
696
$rest =~ s/\s+$//;
681
# print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
682
# print "Entry :$id:$entryid:$rest\n";
684
my ($first,$second) = $id le $entryid ? ($id,$entryid) : ($entryid,$id);
686
if ($id eq $entryid) {
688
return ($id,$rest,$pos-$recsize);
690
} elsif ($first eq $id) {
692
if ($end-$start <= 1) {
696
# print "Moving up $entryid $id\n";
697
$self->find_entry($fh,$start,$end,$id,$recsize);
699
} elsif ($second eq $id ) {
700
# print "Moving down $entryid $id\n";
701
if ($end-$start <= 1) {
707
$self->find_entry($fh,$start,$end,$id,$recsize);
698
# print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
699
# print "Entry :$id:$entryid:$rest\n";
701
my ( $first, $second ) =
702
$id le $entryid ? ( $id, $entryid ) : ( $entryid, $id );
704
if ( $id eq $entryid ) {
706
return ( $id, $rest, $pos - $recsize );
709
elsif ( $first eq $id ) {
711
if ( $end - $start <= 1 ) {
716
# print "Moving up $entryid $id\n";
717
$self->find_entry( $fh, $start, $end, $id, $recsize );
720
elsif ( $second eq $id ) {
722
# print "Moving down $entryid $id\n";
723
if ( $end - $start <= 1 ) {
729
$self->find_entry( $fh, $start, $end, $id, $recsize );
713
734
=head2 build_index
724
745
sub build_index {
725
my ($self,@files) = @_;
727
$self->throw('Cannot build index unless -write_flag is true');
746
my ( $self, @files ) = @_;
748
or $self->throw('Cannot build index unless -write_flag is true');
729
750
my $rootdir = $self->index_directory;
731
if (!defined($rootdir)) {
732
$self->throw("No index directory set - can't build indices");
736
$self->throw("Index directory [$rootdir] is not a directory. Cant' build indices");
739
my $dbpath = File::Spec->catfile($rootdir,$self->dbname);
741
warn "Creating directory $dbpath\n";
742
mkdir $dbpath,0777 or $self->throw("Couldn't create $dbpath: $!");
746
$self->throw("Must enter an array of filenames to index");
752
if ( !defined($rootdir) ) {
753
$self->throw("No index directory set - can't build indices");
756
if ( !-d $rootdir ) {
758
"Index directory [$rootdir] is not a directory. Cant' build indices"
762
my $dbpath = File::Spec->catfile( $rootdir, $self->dbname );
764
warn "Creating directory $dbpath\n";
765
mkdir $dbpath, 0777 or $self->throw("Couldn't create $dbpath: $!");
769
$self->throw("Must enter an array of filenames to index");
749
772
foreach my $file (@files) {
750
$file = File::Spec->rel2abs($file)
751
unless File::Spec->file_name_is_absolute($file);
753
$self->throw("Can't index file [$file] as it doesn't exist");
773
$file = File::Spec->rel2abs($file)
774
unless File::Spec->file_name_is_absolute($file);
775
unless ( -e $file ) {
776
$self->throw("Can't index file [$file] as it doesn't exist");
757
if (my $filehash = $self->{_dbfile}) {
758
push @files,keys %$filehash;
780
if ( my $filehash = $self->{_dbfile} ) {
781
push @files, keys %$filehash;
762
@files = grep {!$seen{$_}++} @files;
785
@files = grep { !$seen{$_}++ } @files;
765
$self->make_config_file(\@files);
788
$self->make_config_file( \@files );
767
790
foreach my $file (@files) {
768
$entries += $self->_index_file($file);
791
$entries += $self->_index_file($file);
771
794
# update alphabet if necessary
772
$self->make_config_file(\@files);
795
$self->make_config_file( \@files );
774
797
# And finally write out the indices
775
798
$self->write_primary_index;
783
806
Title : _index_file
784
807
Usage : $obj->_index_file($newval)
787
810
Returns : value of _index_file
788
811
Args : newvalue (optional)
793
815
sub _index_file {
794
my ($self,$file) = @_;
795
my $v = $self->verbose;
796
open(my $FILE,"<", $file) || $self->throw("Can't open file [$file]");
799
my $fileid = $self->get_fileid_by_filename($file);
804
my $primary = $self->primary_pattern;
805
my $start_pattern = $self->start_pattern;
809
my $new_primary_entry;
817
my @secondary_names = $self->secondary_namespaces;
823
$self->{alphabet} ||= $self->guess_alphabet($_);
824
if ($_ =~ /$start_pattern/) {
826
$id = $new_primary_entry;
827
$self->{alphabet} ||= $self->guess_alphabet($_);
829
my $tmplen = (tell $fh) - length($_);
831
$length = $tmplen - $pos;
833
unless( defined($id)) {
834
$self->throw("No id defined for sequence");
836
unless( defined($fileid)) {
837
$self->throw("No fileid defined for file $file");
839
unless( defined($pos)) {
840
$self->throw("No position defined for " . $id . "\n");
842
unless( defined($length)) {
843
$self->throw("No length defined for " . $id . "\n");
845
$self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
849
if ($count > 0 && $count%1000 == 0) {
850
$self->debug( "Indexed $count ids\n") if $v > 0;
859
if ($_ =~ /$primary/) {
860
$new_primary_entry = $1;
863
my $secondary_patterns = $self->secondary_patterns;
865
foreach my $sec (@secondary_names) {
866
my $pattern = $secondary_patterns->{$sec};
868
if ($_ =~ /$pattern/) {
869
$secondary_id{$sec} = $1;
875
# Remember to add in the last one
877
$id = $new_primary_entry;
878
# my $tmplen = (tell $fh) - length($last_one);
879
my $tmplen = (tell $fh);
881
$length = $tmplen - $pos;
884
$self->throw("No id defined for sequence");
886
if (!defined($fileid)) {
887
$self->throw("No fileid defined for file $file");
889
if (!defined($pos)) {
890
$self->throw("No position defined for " . $id . "\n");
892
if (!defined($length)) {
893
$self->throw("No length defined for " . $id . "\n");
896
$self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
816
my ( $self, $file ) = @_;
817
my $v = $self->verbose;
818
open( my $FILE, "<", $file ) || $self->throw("Can't open file [$file]");
821
my $fileid = $self->get_fileid_by_filename($file);
826
my $primary = $self->primary_pattern;
827
my $start_pattern = $self->start_pattern;
831
my $new_primary_entry;
839
my @secondary_names = $self->secondary_namespaces;
845
$self->{alphabet} ||= $self->guess_alphabet($_);
846
if ( $_ =~ /$start_pattern/ ) {
848
$id = $new_primary_entry;
849
$self->{alphabet} ||= $self->guess_alphabet($_);
851
my $tmplen = ( tell $fh ) - length($_);
853
$length = $tmplen - $pos;
855
unless ( defined($id) ) {
856
$self->throw("No id defined for sequence");
858
unless ( defined($fileid) ) {
859
$self->throw("No fileid defined for file $file");
861
unless ( defined($pos) ) {
862
$self->throw( "No position defined for " . $id . "\n" );
864
unless ( defined($length) ) {
865
$self->throw( "No length defined for " . $id . "\n" );
867
$self->_add_id_position( $id, $pos, $fileid, $length,
872
if ( $count > 0 && $count % 1000 == 0 ) {
873
$self->debug("Indexed $count ids\n") if $v > 0;
883
if ( $_ =~ /$primary/ ) {
884
$new_primary_entry = $1;
887
my $secondary_patterns = $self->secondary_patterns;
889
foreach my $sec (@secondary_names) {
890
my $pattern = $secondary_patterns->{$sec};
892
if ( $_ =~ /$pattern/ ) {
893
$secondary_id{$sec} = $1;
899
# Remember to add in the last one
901
$id = $new_primary_entry;
903
# my $tmplen = (tell $fh) - length($last_one);
904
my $tmplen = ( tell $fh );
906
$length = $tmplen - $pos;
908
if ( !defined($id) ) {
909
$self->throw("No id defined for sequence");
911
if ( !defined($fileid) ) {
912
$self->throw("No fileid defined for file $file");
914
if ( !defined($pos) ) {
915
$self->throw( "No position defined for " . $id . "\n" );
917
if ( !defined($length) ) {
918
$self->throw( "No length defined for " . $id . "\n" );
921
$self->_add_id_position( $id, $pos, $fileid, $length, \%secondary_id );
903
928
=head2 write_primary_index
905
930
Title : write_primary_index
906
931
Usage : $obj->write_primary_index($newval)
909
934
Returns : value of write_primary_index
910
935
Args : newvalue (optional)
915
940
sub write_primary_index {
918
my @ids = keys %{$self->{_id}};
920
@ids = sort {$a cmp $b} @ids;
922
open (my $INDEX,">" . $self->primary_index_file) ||
923
$self->throw("Can't open primary index file [" .
924
$self->primary_index_file . "]");
926
my $recordlength = $self->{_maxidlength} +
927
$self->{_maxfileidlength} +
928
$self->{_maxposlength} +
929
$self->{_maxlengthlength} + 3;
931
print $INDEX sprintf("%04d",$recordlength);
933
foreach my $id (@ids) {
935
if (!defined($self->{_id}{$id}{_fileid})) {
936
$self->throw("No fileid for $id\n");
938
if (!defined($self->{_id}{$id}{_pos})) {
939
$self->throw("No position for $id\n");
941
if (!defined($self->{_id}{$id}{_length})) {
942
$self->throw("No length for $id");
945
my $record = $id . "\t" .
946
$self->{_id}{$id}{_fileid} . "\t" .
947
$self->{_id}{$id}{_pos} . "\t" .
948
$self->{_id}{$id}{_length};
950
print $INDEX sprintf("%-${recordlength}s",$record);
943
my @ids = keys %{ $self->{_id} };
945
@ids = sort { $a cmp $b } @ids;
947
open( my $INDEX, ">" . $self->primary_index_file )
949
"Can't open primary index file [" . $self->primary_index_file . "]" );
952
$self->{_maxidlength} +
953
$self->{_maxfileidlength} +
954
$self->{_maxposlength} +
955
$self->{_maxlengthlength} + 3;
957
print $INDEX sprintf( "%04d", $recordlength );
959
foreach my $id (@ids) {
961
if ( !defined( $self->{_id}{$id}{_fileid} ) ) {
962
$self->throw("No fileid for $id\n");
964
if ( !defined( $self->{_id}{$id}{_pos} ) ) {
965
$self->throw("No position for $id\n");
967
if ( !defined( $self->{_id}{$id}{_length} ) ) {
968
$self->throw("No length for $id");
973
. $self->{_id}{$id}{_fileid} . "\t"
974
. $self->{_id}{$id}{_pos} . "\t"
975
. $self->{_id}{$id}{_length};
977
print $INDEX sprintf( "%-${recordlength}s", $record );
967
994
sub write_secondary_indices {
970
# These are the different
971
my @names = keys (%{$self->{_secondary_id}});
997
# These are the different
998
my @names = keys( %{ $self->{_secondary_id} } );
974
1000
foreach my $name (@names) {
976
my @seconds = keys %{$self->{_secondary_id}{$name}};
978
# First we need to loop over to get the longest record.
981
foreach my $second (@seconds) {
982
my $tmplen = length($second) + 1;
983
my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
985
foreach my $prim (@prims) {
986
my $recordlen = $tmplen + length($prim);
988
if ($recordlen > $length) {
989
$length = $recordlen;
994
# Now we can print the index
996
my $fh = $self->new_secondary_filehandle($name);
998
print $fh sprintf("%04d",$length);
999
@seconds = sort @seconds;
1001
foreach my $second (@seconds) {
1003
my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
1006
foreach my $prim (@prims) {
1007
my $record = $tmp . "\t" . $prim;
1008
if (length($record) > $length) {
1009
$self->throw("Something has gone horribly wrong - length of record is more than we thought [$length]\n");
1011
print $fh sprintf("%-${length}s",$record);
1012
print $fh sprintf("%-${length}s",$record);
1002
my @seconds = keys %{ $self->{_secondary_id}{$name} };
1004
# First we need to loop over to get the longest record.
1007
foreach my $second (@seconds) {
1008
my $tmplen = length($second) + 1;
1009
my @prims = keys %{ $self->{_secondary_id}{$name}{$second} };
1011
foreach my $prim (@prims) {
1012
my $recordlen = $tmplen + length($prim);
1014
if ( $recordlen > $length ) {
1015
$length = $recordlen;
1020
# Now we can print the index
1022
my $fh = $self->new_secondary_filehandle($name);
1024
print $fh sprintf( "%04d", $length );
1025
@seconds = sort @seconds;
1027
foreach my $second (@seconds) {
1029
my @prims = keys %{ $self->{_secondary_id}{$name}{$second} };
1032
foreach my $prim (@prims) {
1033
my $record = $tmp . "\t" . $prim;
1034
if ( length($record) > $length ) {
1036
"Something has gone horribly wrong - length of record is more than we thought [$length]\n"
1040
print $fh sprintf( "%-${length}s", $record );
1091
1119
sub _add_id_position {
1092
my ($self,$id,$pos,$fileid,$length,$secondary_id) = @_;
1094
if (!defined($id)) {
1095
$self->throw("No id defined. Can't add id position");
1097
if (!defined($pos)) {
1098
$self->throw("No position defined. Can't add id position");
1100
if ( ! defined($fileid)) {
1101
$self->throw("No fileid defined. Can't add id position");
1103
if (! defined($length) || $length <= 0) {
1104
$self->throw("No length defined or <= 0 [$length]. Can't add id position");
1107
$self->{_id}{$id}{_pos} = $pos;
1108
$self->{_id}{$id}{_length} = $length;
1109
$self->{_id}{$id}{_fileid} = $fileid;
1111
# Now the secondary ids
1113
foreach my $sec (keys (%$secondary_id)) {
1114
my $value = $secondary_id->{$sec};
1115
$self->{_secondary_id}{$sec}{$value}{$id} = 1;
1118
$self->{_maxidlength} = length($id)
1119
if !exists $self->{_maxidlength} or
1120
length($id) >= $self->{_maxidlength};
1122
$self->{_maxfileidlength} = length($fileid)
1123
if !exists $self->{_maxfileidlength} or
1124
length($fileid) >= $self->{_maxfileidlength};
1126
$self->{_maxposlength} = length($pos)
1127
if !exists $self->{_maxposlength} or
1128
length($pos) >= $self->{_maxposlength};
1130
$self->{_maxlengthlength} = length($length)
1131
if !exists $self->{_maxlengthlength} or
1132
length($length) >= $self->{_maxlengthlength};
1120
my ( $self, $id, $pos, $fileid, $length, $secondary_id ) = @_;
1122
if ( !defined($id) ) {
1123
$self->throw("No id defined. Can't add id position");
1125
if ( !defined($pos) ) {
1126
$self->throw("No position defined. Can't add id position");
1128
if ( !defined($fileid) ) {
1129
$self->throw("No fileid defined. Can't add id position");
1131
if ( !defined($length) || $length <= 0 ) {
1133
"No length defined or <= 0 [$length]. Can't add id position");
1136
$self->{_id}{$id}{_pos} = $pos;
1137
$self->{_id}{$id}{_length} = $length;
1138
$self->{_id}{$id}{_fileid} = $fileid;
1140
# Now the secondary ids
1142
foreach my $sec ( keys(%$secondary_id) ) {
1143
my $value = $secondary_id->{$sec};
1144
$self->{_secondary_id}{$sec}{$value}{$id} = 1;
1147
$self->{_maxidlength} = length($id)
1148
if !exists $self->{_maxidlength}
1149
or length($id) >= $self->{_maxidlength};
1151
$self->{_maxfileidlength} = length($fileid)
1152
if !exists $self->{_maxfileidlength}
1153
or length($fileid) >= $self->{_maxfileidlength};
1155
$self->{_maxposlength} = length($pos)
1156
if !exists $self->{_maxposlength}
1157
or length($pos) >= $self->{_maxposlength};
1159
$self->{_maxlengthlength} = length($length)
1160
if !exists $self->{_maxlengthlength}
1161
or length($length) >= $self->{_maxlengthlength};
1135
1164
=head2 make_config_file
1137
1166
Title : make_config_file
1138
1167
Usage : $obj->make_config_file($newval)
1141
1170
Returns : value of make_config_file
1142
1171
Args : newvalue (optional)
1146
1175
sub make_config_file {
1147
my ($self,$files) = @_;
1176
my ( $self, $files ) = @_;
1149
1178
my @files = @$files;
1151
1180
my $configfile = $self->_config_file;
1153
open(my $CON,">", $configfile) || $self->throw("Can't create config file [$configfile]");
1182
open( my $CON, ">", $configfile )
1183
|| $self->throw("Can't create config file [$configfile]");
1155
1185
# First line must be the type of index - in this case flat
1156
1186
print $CON "index\tflat/1\n";
1162
1192
foreach my $file (@files) {
1164
my $size = -s $file;
1166
print $CON "fileid_$count\t$file\t$size\n";
1169
open($fh,"<", $file) || $self->throw($!);
1170
$self->{_file} {$count} = $file;
1171
$self->{_dbfile}{$file} = $count;
1172
$self->{_size}{$count} = $size;
1194
my $size = -s $file;
1196
print $CON "fileid_$count\t$file\t$size\n";
1199
open( $fh, "<", $file ) || $self->throw($!);
1200
$self->{_file}{$count} = $file;
1201
$self->{_dbfile}{$file} = $count;
1202
$self->{_size}{$count} = $size;
1176
1206
# Now the namespaces
1178
print $CON "primary_namespace\t" .$self->primary_namespace. "\n";
1208
print $CON "primary_namespace\t" . $self->primary_namespace . "\n";
1180
1210
# Needs fixing for the secondary stuff
1182
1212
my $second_patterns = $self->secondary_patterns;
1184
1214
my @second = keys %$second_patterns;
1187
print $CON "secondary_namespaces";
1217
print $CON "secondary_namespaces";
1189
foreach my $second (@second) {
1190
print $CON "\t$second";
1219
foreach my $second (@second) {
1220
print $CON "\t$second";
1192
1222
print $CON "\n";
1195
1225
# Now the config format
1197
unless (defined ($self->format) ) {
1198
$self->throw("Format does not exist in module - can't write config file");
1200
my $format = $self->format;
1201
my $alphabet = $self->alphabet;
1202
my $alpha = $alphabet ? "/$alphabet" : '';
1203
print $CON "format\t" . "$format\n";
1227
unless ( defined( $self->format ) ) {
1229
"Format does not exist in module - can't write config file");
1232
my $format = $self->format;
1233
my $alphabet = $self->alphabet;
1234
my $alpha = $alphabet ? "/$alphabet" : '';
1235
print $CON "format\t" . "$format\n";
1210
1242
Title : read_config_file
1211
1243
Usage : $obj->read_config_file($newval)
1214
1246
Returns : value of read_config_file
1215
1247
Args : newvalue (optional)
1219
1251
sub read_config_file {
1221
my $configfile = $self->_config_file;
1222
return unless -e $configfile;
1224
open(my $CON,"<", $configfile) || $self->throw("Can't open configfile [$configfile]");
1226
# First line must be type
1231
# This is hard coded as we only index flatfiles here
1232
if ($line =~ m{index\tflat/(\d+)}) {
1235
$self->throw("First line not compatible with flat file index. Should be something like\n\nindex\tflat/1");
1238
$self->index_type("flat");
1239
$self->index_version($version);
1244
# Look for fileid lines
1245
if ($_ =~ /^fileid_(\d+)\t(.+)\t(\d+)/) {
1250
if (! -e $filename) {
1251
$self->throw("File [$filename] does not exist!");
1253
if (-s $filename != $filesize) {
1254
$self->throw("Flatfile size for $filename differs from what the index thinks it is. Real size [" . (-s $filename) . "] Index thinks it is [" . $filesize . "]");
1258
open($fh,"<", $filename) || $self->throw($!);
1259
$self->{_file} {$fileid} = $filename;
1260
$self->{_dbfile}{$filename} = $fileid;
1261
$self->{_size} {$fileid} = $filesize;
1264
# Look for namespace lines
1265
if ( /(.*)_namespaces?\t(.+)/ ) {
1266
if ($1 eq "primary") {
1267
$self->primary_namespace($2);
1268
} elsif ($1 eq "secondary") {
1269
$self->secondary_namespaces(split "\t",$2);
1271
$self->throw("Unknown namespace name in config file [$1");
1275
# Look for format lines
1276
if ($_ =~ /format\t(\S+)/) {
1277
# Check the format here?
1280
# handle LSID format
1281
if ($format =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))?/) {
1283
$self->alphabet($2);
1284
} else { # compatibility with older versions
1253
my $configfile = $self->_config_file;
1254
return unless -e $configfile;
1256
open( my $CON, "<", $configfile )
1257
|| $self->throw("Can't open configfile [$configfile]");
1259
# First line must be type
1264
# This is hard coded as we only index flatfiles here
1265
if ( $line =~ m{index\tflat/(\d+)} ) {
1270
"First line not compatible with flat file index. Should be something like\n\nindex\tflat/1"
1274
$self->index_type("flat");
1275
$self->index_version($version);
1280
# Look for fileid lines
1281
if ( $_ =~ /^fileid_(\d+)\t(.+)\t(\d+)/ ) {
1286
if ( !-e $filename ) {
1287
$self->throw("File [$filename] does not exist!");
1289
if ( -s $filename != $filesize ) {
1291
"Flatfile size for $filename differs from what the index thinks it is. Real size ["
1293
. "] Index thinks it is ["
1299
open( $fh, "<", $filename ) || $self->throw($!);
1300
$self->{_file}{$fileid} = $filename;
1301
$self->{_dbfile}{$filename} = $fileid;
1302
$self->{_size}{$fileid} = $filesize;
1305
# Look for namespace lines
1306
if (/(.*)_namespaces?\t(.+)/) {
1307
if ( $1 eq "primary" ) {
1308
$self->primary_namespace($2);
1310
elsif ( $1 eq "secondary" ) {
1311
$self->secondary_namespaces( split "\t", $2 );
1314
$self->throw("Unknown namespace name in config file [$1");
1318
# Look for format lines
1319
if ( $_ =~ /format\t(\S+)/ ) {
1321
# Check the format here?
1324
# handle LSID format
1325
if ( $format =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))?/ ) {
1327
$self->alphabet($2);
1329
else { # compatibility with older versions
1292
1337
# Now check we have all that we need
1294
my @fileid_keys = keys (%{$self->{_file}});
1296
if (!(@fileid_keys)) {
1297
$self->throw("No flatfile fileid files in config - check the index has been made correctly");
1300
if (!defined($self->primary_namespace)) {
1301
$self->throw("No primary namespace exists");
1304
if (! -e $self->primary_index_file) {
1305
$self->throw("Primary index file [" . $self->primary_index_file . "] doesn't exist");
1339
my @fileid_keys = keys( %{ $self->{_file} } );
1341
if ( !(@fileid_keys) ) {
1343
"No flatfile fileid files in config - check the index has been made correctly"
1347
if ( !defined( $self->primary_namespace ) ) {
1348
$self->throw("No primary namespace exists");
1351
if ( !-e $self->primary_index_file ) {
1352
$self->throw( "Primary index file ["
1353
. $self->primary_index_file
1354
. "] doesn't exist" );
1647
1700
sub secondary_namespaces {
1648
my ($obj,@values) = @_;
1701
my ( $obj, @values ) = @_;
1651
push(@{$obj->{'secondary_namespaces'}},@values);
1704
push( @{ $obj->{'secondary_namespaces'} }, @values );
1653
return @{$obj->{'secondary_namespaces'} || []};
1706
return @{ $obj->{'secondary_namespaces'} || [] };
1658
1709
## These are indexing routines to index commonly used format - fasta
1659
1710
## swissprot and embl
1661
1712
sub new_SWISSPROT_index {
1662
my ($self,$index_dir,@files) = @_;
1713
my ( $self, $index_dir, @files ) = @_;
1664
1715
my %secondary_patterns;
1666
my $start_pattern = "^ID (\\S+)";
1717
my $start_pattern = "^ID (\\S+)";
1667
1718
my $primary_pattern = "^AC (\\S+)\\;";
1669
1720
$secondary_patterns{"ID"} = $start_pattern;
1671
my $index = Bio::DB::Flat::BinarySearch->new
1672
(-index_dir => $index_dir,
1673
-format => 'swissprot',
1674
-primary_pattern => $primary_pattern,
1675
-primary_namespace => "ACC",
1676
-start_pattern => $start_pattern,
1677
-secondary_patterns => \%secondary_patterns);
1722
my $index = Bio::DB::Flat::BinarySearch->new(
1723
-index_dir => $index_dir,
1724
-format => 'swissprot',
1725
-primary_pattern => $primary_pattern,
1726
-primary_namespace => "ACC",
1727
-start_pattern => $start_pattern,
1728
-secondary_patterns => \%secondary_patterns
1679
1731
$index->build_index(@files);
1682
1734
sub new_EMBL_index {
1683
my ($self,$index_dir,@files) = @_;
1685
my %secondary_patterns;
1687
my $start_pattern = "^ID (\\S+)";
1688
my $primary_pattern = "^AC (\\S+)\\;";
1689
my $primary_namespace = "ACC";
1691
$secondary_patterns{"ID"} = $start_pattern;
1693
my $index = Bio::DB::Flat::BinarySearch->new
1694
(-index_dir => $index_dir,
1696
-primary_pattern => $primary_pattern,
1697
-primary_namespace => "ACC",
1698
-start_pattern => $start_pattern,
1699
-secondary_patterns => \%secondary_patterns);
1735
my ( $self, $index_dir, @files ) = @_;
1737
my %secondary_patterns;
1739
my $start_pattern = "^ID (\\S+)";
1740
my $primary_pattern = "^AC (\\S+)\\;";
1741
my $primary_namespace = "ACC";
1743
$secondary_patterns{"ID"} = $start_pattern;
1745
my $index = Bio::DB::Flat::BinarySearch->new(
1746
-index_dir => $index_dir,
1748
-primary_pattern => $primary_pattern,
1749
-primary_namespace => "ACC",
1750
-start_pattern => $start_pattern,
1751
-secondary_patterns => \%secondary_patterns
1701
1754
$index->build_index(@files);
1706
1759
sub new_FASTA_index {
1707
my ($self,$index_dir,@files) = @_;
1709
my %secondary_patterns;
1711
my $start_pattern = "^>";
1712
my $primary_pattern = "^>(\\S+)";
1713
my $primary_namespace = "ACC";
1715
$secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
1717
my $index = Bio::DB::Flat::BinarySearch->new
1718
(-index_dir => $index_dir,
1720
-primary_pattern => $primary_pattern,
1721
-primary_namespace => "ACC",
1722
-start_pattern => $start_pattern,
1723
-secondary_patterns => \%secondary_patterns);
1725
$index->build_index(@files);
1760
my ( $self, $index_dir, @files ) = @_;
1762
my %secondary_patterns;
1764
my $start_pattern = "^>";
1765
my $primary_pattern = "^>(\\S+)";
1766
my $primary_namespace = "ACC";
1768
$secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
1770
my $index = Bio::DB::Flat::BinarySearch->new(
1771
-index_dir => $index_dir,
1773
-primary_pattern => $primary_pattern,
1774
-primary_namespace => "ACC",
1775
-start_pattern => $start_pattern,
1776
-secondary_patterns => \%secondary_patterns
1779
$index->build_index(@files);
1730
1784
# EVERYTHING THAT FOLLOWS THIS
1731
1785
# is an awful hack - in reality Michele's code needs to be rewritten
1732
1786
# to use Bio::SeqIO, but I have too little time to do this -- LS
1733
1787
sub guess_alphabet {
1737
my $format = $self->format;
1738
return 'protein' if $format eq 'swissprot';
1740
if ($format eq 'genbank') {
1741
return unless $line =~ /^LOCUS/;
1742
return 'dna' if $line =~ /\s+\d+\s+bp/i;
1746
if ($format eq 'embl') {
1747
return unless $line =~ /^ID/;
1748
return 'dna' if $line =~ / DNA;/i;
1749
return 'rna' if $line =~ / RNA;/i;
1791
my $format = $self->format;
1792
return 'protein' if $format eq 'swissprot';
1794
if ( $format eq 'genbank' ) {
1795
return unless $line =~ /^LOCUS/;
1796
return 'dna' if $line =~ /\s+\d+\s+bp/i;
1800
if ( $format eq 'embl' ) {
1801
return unless $line =~ /^ID/;
1802
return 'dna' if $line =~ / DNA;/i;
1803
return 'rna' if $line =~ / RNA;/i;
1756
1810
# return (namespace,primary_pattern,start_pattern,secondary_pattern)
1757
1811
sub _guess_patterns {
1760
if ($format =~ /swiss(prot)?/i) {
1765
ACC => "^AC (\\S+);"
1769
if ($format =~ /embl/i) {
1774
ACC => q/^AC (\S+);/,
1775
VERSION => q/^SV\s+(\S+)/
1779
if ($format =~ /genbank/i) {
1784
ACC => q/^ACCESSION\s+(\S+)/,
1785
VERSION => q/^VERSION\s+(\S+)/
1789
if ($format =~ /fasta/i) {
1796
$self->throw("I can't handle format $format");
1814
if ( $format =~ /swiss(prot)?/i ) {
1815
return ( 'ID', "^ID (\\S+)", "^ID (\\S+)",
1816
{ ACC => "^AC (\\S+);" } );
1819
if ($format =~ /embl/i) {
1824
ACC => q/^AC (\S+);/,
1825
VERSION => q/^SV\s+(\S+)/
1829
if ( $format =~ /genbank/i ) {
1835
ACC => q/^ACCESSION\s+(\S+)/,
1836
VERSION => q/^VERSION\s+(\S+)/
1841
if ( $format =~ /fasta/i ) {
1842
return ( 'ACC', '^>(\S+)', '^>(\S+)', );
1845
$self->throw("I can't handle format $format");