1
#$Id: MapTileUtils.pm 16123 2009-09-17 12:57:27Z cjfields $
2
package Bio::Search::Tiling::MapTileUtils;
8
our @ISA = qw( Exporter );
9
our @EXPORT = qw( get_intervals_from_hsps
18
# assumed: intervals are [$a0, $a1], with $a0 <= $a1
21
Bio::Search::Tiling::MapTileUtils - utilities for manipulating closed intervals for an HSP tiling algorithm
33
An "interval" in this module is defined as an arrayref C<[$a0, $a1]>, where
34
C<$a0, $a1> are scalar numbers satisfying C<$a0 E<lt>= $a1>.
38
Mark A. Jensen - maj -at- fortinbras -dot- us
42
=head2 interval_tiling
44
Title : interval_tiling()
45
Usage : @tiling = interval_tiling( \@array_of_intervals )
46
Function: Find minimal set of intervals covering the input set
47
Returns : array of arrayrefs of the form
48
( [$interval => [ @indices_of_collapsed_input_intervals ]], ...)
49
Args : arrayref of intervals
54
return unless $_[0]; # no input
55
my $n = scalar @{$_[0]};
57
@input{(0..$n-1)} = @{$_[0]};
58
my @active = (0..$n-1);
63
my $tgt = $input{my $tgt_i = shift @active};
64
push @tiled_ints, $tgt_i;
65
my $tgt_not_disjoint = 1;
66
while ($tgt_not_disjoint) {
67
$tgt_not_disjoint = 0;
68
while (my $try_i = shift @active) {
69
my $try = $input{$try_i};
70
if ( !are_disjoint($tgt, $try) ) {
71
$tgt = min_covering_interval($tgt,$try);
72
push @tiled_ints, $try_i;
73
$tgt_not_disjoint = 1;
79
if (!$tgt_not_disjoint) {
80
push @ret, [ $tgt => [@tiled_ints] ];
90
=head2 decompose_interval
92
Title : decompose_interval
93
Usage : @decomposition = decompose_interval( \@overlappers )
94
Function: Calculate the disjoint decomposition of a set of
95
overlapping intervals, each annotated with a list of
97
Returns : array of arrayrefs of the form
98
( [[@interval] => [@indices_of_coverers]], ... )
99
Args : arrayref of intervals (arrayrefs like [$a0, $a1], with
100
Note : Each returned interval is associated with a list of indices of the
101
original intervals that cover that decomposition component
102
(scalar size of this list could be called the 'coverage coefficient')
103
Note : Coverage: each component of the decomp is completely contained
104
in the input intervals that overlap it, by construction.
105
Caveat : This routine expects the members of @overlappers to overlap,
106
but doesn't check this.
110
### what if the input intervals don't overlap?? They MUST overlap; that's
111
### what interval_tiling() is for.
113
sub decompose_interval {
114
return unless $_[0]; # no input
117
### this is ok, but need to handle the case where a lh and rh endpoint
121
# every lh endpoint generates (lh-1, lh)
122
# every rh endpoint generates (rh, rh+)
129
# sort, create singletons if nec.
131
@a = sort {$a<=>$b} keys %flat;
132
# throw out first and last (meeting a boundary condition)
134
# look for singletons
135
@flat = (shift @a, shift @a);
136
if ( $flat[1]-$flat[0] == 1 ) {
137
@flat = ($flat[0],$flat[0], $flat[1]);
139
while (my $a = shift @a) {
140
if ($a-$flat[-2]==2) {
141
push @flat, $flat[-1]; # create singleton interval
145
if ($flat[-1]-$flat[-2]==1 and @flat % 2) {
146
push @flat, $flat[-1];
148
# component intervals are consecutive pairs
150
while (my $a = shift @flat) {
151
push @decomp, [$a, shift @flat];
154
# for each component, return a list of the indices of the input intervals
155
# that cover the component.
157
foreach my $i (0..$#decomp) {
158
foreach my $j (0..$#ints) {
159
unless (are_disjoint($decomp[$i], $ints[$j])) {
160
if (defined $coverage[$i]) {
161
push @{$coverage[$i]}, $j;
164
$coverage[$i] = [$j];
169
return map { [$decomp[$_] => $coverage[$_]] } (0..$#decomp);
175
Usage : are_disjoint( [$a0, $a1], [$b0, $b1] )
176
Function: Determine if two intervals are disjoint
177
Returns : True if the intervals are disjoint, false if they overlap
178
Args : array of two intervals
183
my ($int1, $int2) = @_;
184
return 1 if ( $$int1[1] < $$int2[0] ) || ( $$int2[1] < $$int1[0]);
188
=head2 min_covering_interval
190
Title : min_covering_interval
191
Usage : $interval = min_covering_interval( [$a0,$a1],[$b0,$b1] )
192
Function: Determine the minimal covering interval for two intervals
193
Returns : an interval
198
sub min_covering_interval {
199
my ($int1, $int2) = @_;
200
my @a = sort {$a <=> $b} (@$int1, @$int2);
201
return [$a[0],$a[-1]];
204
=head2 get_intervals_from_hsps
206
Title : get_intervals_from_hsps
207
Usage : @intervals = get_intervals_from_hsps($type, @hsp_objects)
208
Function: Return array of intervals of the form [ $start, $end ],
209
from an array of hsp objects
210
Returns : an array of intervals
211
Args : scalar $type, array of HSPI objects; where $type is one of 'hit',
216
sub get_intervals_from_hsps {
219
if (ref($type) =~ /HSP/) {
223
elsif ( !grep /^$type$/,qw( hit subject query ) ) {
224
die "Unknown HSP type '$type'";
226
$type = 'hit' if $type eq 'subject';
230
# push @ret, [ $_->start($type), $_->end($type)];
231
push @ret, [ $_->range($type) ];
236
# fast, clear, nasty, brutish and short.
237
# for _allowable_filters(), _set_mapping()
238
# covers BLAST, FAST families
239
# FASTA is ambiguous (nt or aa) based on alg name only
242
'N' => { 'mapping' => [1,1],
243
'def_context' => ['p_','p_'],
244
'has_strand' => [1, 1],
245
'has_frame' => [0, 0]},
246
'P' => { 'mapping' => [1,1],
247
'def_context' => ['all','all'],
248
'has_strand' => [0, 0],
249
'has_frame' => [0, 0]},
250
'X' => { 'mapping' => [3, 1],
251
'def_context' => [undef,'all'],
252
'has_strand' => [1, 0],
253
'has_frame' => [1, 0]},
254
'Y' => { 'mapping' => [3, 1],
255
'def_context' => [undef,'all'],
256
'has_strand' => [1, 0],
257
'has_frame' => [1, 0]},
258
'TA' => { 'mapping' => [1, 3],
259
'def_context' => ['all',undef],
260
'has_strand' => [0, 1],
261
'has_frame' => [0, 1]},
262
'TN' => { 'mapping' => [1, 3],
263
'def_context' => ['p_',undef],
264
'has_strand' => [1,1],
265
'has_frame' => [0, 1]},
266
'TX' => { 'mapping' => [3, 3],
267
'def_context' => [undef,undef],
268
'has_strand' => [1, 1],
269
'has_frame' => [1, 1]},
270
'TY' => { 'mapping' => [3, 3],
271
'def_context' => [undef,undef],
272
'has_strand' => [1, 1],
273
'has_frame' => [1, 1]}
276
=head2 _allowable_filters
278
Title : _allowable_filters
279
Usage : _allowable_filters($Bio_Search_Hit_HitI, $type)
280
Function: Return the HSP filters (strand, frame) allowed,
281
based on the reported algorithm
282
Returns : String encoding allowable filters:
283
s = strand, f = frame
284
Empty string if no filters allowed
285
undef if algorithm unrecognized
286
Args : A Bio::Search::Hit::HitI object,
287
scalar $type, one of 'hit', 'subject', 'query';
292
sub _allowable_filters {
296
unless (grep /^$type$/, qw( h q s ) ) {
297
warn("Unknown type '$type'; returning ''");
300
$type = 'h' if $type eq 's';
301
my $alg = $hit->algorithm;
303
# pretreat (i.e., kludge it)
304
$alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
306
for ($hit->algorithm) {
310
/(.?)BLAST(.?)/i && do {
311
return $$alg_lookup{$1.$2}{$type};
313
/(.?)FAST(.?)/ && do {
314
return $$alg_lookup{$1.$2}{$type};
324
=head2 _set_attributes
326
Title : _set_attributes
327
Usage : $tiling->_set_attributes()
328
Function: Sets attributes for invocant
329
that depend on algorithm name
330
Returns : True on success
332
Note : setting based on the configuration table
337
sub _set_attributes {
339
my $alg = $self->hit->algorithm;
341
# pretreat (i.e., kludge it)
342
$alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
346
($self->{_mapping_query},$self->{_mapping_hit}) = (1,1);
347
($self->{_def_context_query},$self->{_def_context_hit}) =
349
($self->{_has_frame_query},$self->{_has_frame_hit}) =
351
($self->{_has_strand_query},$self->{_has_strand_hit}) =
355
/(.?)BLAST(.?)/i && do {
356
($self->{_mapping_query},$self->{_mapping_hit}) =
357
@{$$alg_lookup{$1.$2}{mapping}};
358
($self->{_def_context_query},$self->{_def_context_hit}) =
359
@{$$alg_lookup{$1.$2}{def_context}};
360
($self->{_has_frame_query},$self->{_has_frame_hit}) =
361
@{$$alg_lookup{$1.$2}{has_frame}};
362
($self->{_has_strand_query},$self->{_has_strand_hit}) =
363
@{$$alg_lookup{$1.$2}{has_strand}};
366
/(.?)FAST(.?)/ && do {
367
($self->{_mapping_query},$self->{_mapping_hit}) =
368
@{$$alg_lookup{$1.$2}{mapping}};
369
($self->{_def_context_query},$self->{_def_context_hit}) =
370
@{$$alg_lookup{$1.$2}{def_context}};
371
($self->{_has_frame_query},$self->{_has_frame_hit}) =
372
@{$$alg_lookup{$1.$2}{has_frame}};
373
($self->{_has_strand_query},$self->{_has_strand_hit}) =
374
@{$$alg_lookup{$1.$2}{has_strand}};
378
$self->warn("Unrecognized algorithm '$alg'; defaults may not work");
379
($self->{_mapping_query},$self->{_mapping_hit}) = (1,1);
380
($self->{_def_context_query},$self->{_def_context_hit}) =
382
($self->{_has_frame_query},$self->{_has_frame_hit}) =
384
($self->{_has_strand_query},$self->{_has_strand_hit}) =
396
my %type_i = ( 'query' => 0, 'hit' => 1 );
397
unless ( ref($obj) && $obj->can('algorithm') ) {
398
$obj->warn("Object type unrecognized");
402
unless ( grep(/^$type$/, qw( query hit subject ) ) ) {
403
$obj->warn("Sequence type unrecognized");
406
$type = 'hit' if $type eq 'subject';
407
my $alg = $obj->algorithm;
409
# pretreat (i.e., kludge it)
410
$alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
416
/(.?)BLAST(.?)/i && do {
417
return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
419
/(.?)FAST(.?)/ && do {
420
return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
430
# need our own subsequencer for hsps.
432
package Bio::Search::HSP::HSPI;
440
Usage : $hsp->matches($type, $action, $start, $end)
441
Purpose : Get the total number of identical or conserved matches
442
in the query or sbjct sequence for the given HSP. Optionally can
443
report data within a defined interval along the seq.
446
Comments : Relies on seq_str('match') to get the string of alignment symbols
447
between the query and sbjct lines which are used for determining
448
the number of identical and conservative matches.
449
Note : Modeled on Bio::Search::HSP::HSPI::matches
454
my( $self, @args ) = @_;
455
my($type, $action, $beg, $end) = $self->_rearrange( [qw(TYPE ACTION START END)], @args);
456
my @actions = qw( identities conserved searchutils );
459
$self->throw("Type not specified") if !defined $type;
460
$self->throw("Type '$type' unrecognized") unless grep(/^$type$/,qw(query hit subject));
461
$type = 'hit' if $type eq 'subject';
464
$self->throw("Action not specified") if !defined $action;
465
$self->throw("Action '$action' unrecognized") unless grep(/^$action$/, @actions);
467
my ($len_id, $len_cons);
468
my $c = Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type);
469
if ((defined $beg && !defined $end) || (!defined $beg && defined $end)) {
470
$self->throw("Both start and end are required");
472
elsif ( (!defined($beg) && !defined($end)) || !$self->seq_str('match') ) {
473
## Get data for the whole alignment.
474
# the reported values x mapping
475
$self->debug("Sequence data not present in report; returning data for entire HSP") unless $self->seq_str('match');
476
($len_id, $len_cons) = map { $c*$_ } ($self->num_identical, $self->num_conserved);
478
$_ eq 'identities' && do {
481
$_ eq 'conserved' && do {
484
$_ eq 'searchutils' && do {
485
return ($len_id, $len_cons);
488
$self->throw("What are YOU doing here?");
493
## Get the substring representing the desired sub-section of aln.
494
my($start,$stop) = $self->range($type);
495
if ( $beg < $start or $stop < $end ) {
496
$self->throw("Start/stop out of range [$start, $stop]");
500
my $match_str = $self->seq_str('match');
502
# strip the homology string of gap positions relative
504
$match_str = $self->seq_str('match');
505
my $tgt = $self->seq_str($type);
506
my $encode = $match_str ^ $tgt;
508
$encode =~ s/$zap//g;
510
$match_str = $tgt ^ $encode;
511
# match string is now the correct length for substr'ing below,
512
# given that start and end are gapless coordinates in the
517
$seq = substr( $match_str,
518
int( ($beg-$start)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) ),
519
int( 1+($end-$beg)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) )
522
if(!CORE::length $seq) {
523
$self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop");
526
$seq =~ s/ //g; # remove space (no info).
527
$len_cons = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type));
528
$seq =~ s/\+//g; # remove '+' characters (conservative substitutions)
529
$len_id = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type));
531
$_ eq 'identities' && do {
534
$_ eq 'conserved' && do {
537
$_ eq 'searchutils' && do {
538
return ($len_id, $len_cons);
541
$self->throw("What are YOU doing here?");
547
package Bio::Search::Tiling::MapTileUtils;
549
# a graphical depiction of a set of intervals
559
my @pos = sort {$a<=>$b} keys %pos;
560
@pos = map {sprintf("%03d",$_)} @pos;
563
$max = (length > $max) ? length : $max for (@pos);
564
for my $j (0..$max-1) {
566
my @line = map { substr($_, $j, 1) || '0' } @pos;
567
print join('', @line), "\n";
569
print '-' x @pos, "\n";
571
@pos{map {sprintf("%d",$_)} @pos} = (0..@pos);
573
print ' ' x $pos{$$_[0]}, '[', ' ' x ($pos{$$_[1]}-$pos{$$_[0]}-1), ']', ' ' x (@pos-$pos{$$_[1]}), "\n";