39
35
# inherits off RangeI, so range operations possible
39
The LocatableSeq sequence object was developed mainly because the SimpleAlign
40
object requires this functionality, and in the rewrite of the Sequence object we
41
had to decide what to do with this.
43
It is, to be honest, not well integrated with the rest of bioperl. For example,
44
the trunc() function does not return a LocatableSeq object, as some might have
45
thought. Also, the sequence is not a Bio::SeqI, so the location is simply
46
inherited from Bio::RangeI and is not stored in a Bio::Location.
48
There are all sorts of nasty gotcha's about interactions between coordinate
49
systems when these sort of objects are used. Some mapping now occurs to deal
50
with HSP data, however it can probably be integrated in better and most methods
51
do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(),
52
trunc(), etc.) do not behave as expected and must be used with care.
54
Due to this, LocatableSeq functionality is to be refactored in a future BioPerl
55
release. However, for alignment functionality it works adequately for the time
44
60
=head2 Mailing Lists
46
62
User feedback is an integral part of the evolution of this and other
50
66
bioperl-l@bioperl.org - General discussion
51
67
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53
The locatable sequence object was developed mainly because the
54
SimpleAlign object requires this functionality, and in the rewrite
55
of the Sequence object we had to decide what to do with this.
57
It is, to be honest, not well integrated with the rest of bioperl, for
58
example, the trunc() function does not return a LocatableSeq object,
59
as some might have thought. There are all sorts of nasty gotcha's
60
about interactions between coordinate systems when these sort of
64
69
=head2 Reporting Bugs
66
71
Report bugs to the Bioperl bug tracking system to help us keep track
110
136
Usage : $obj->start($newval)
137
Function: Get/set the 1-based start position of this sequence in the original
138
sequence. '0' means before the original sequence starts.
112
139
Returns : value of start
113
140
Args : newvalue (optional)
121
$self->{'start'} = $value;
123
return $self->{'start'} if defined $self->{'start'};
124
return 1 if $self->seq;
148
$self->{'start'} = $value;
150
return $self->{'start'} if defined $self->{'start'};
151
return 1 if $self->seq;
131
158
Usage : $obj->end($newval)
159
Function: Get/set the 1-based end position of this sequence in the original
160
sequence. '0' means before the original sequence starts.
133
161
Returns : value of end
134
162
Args : newvalue (optional)
163
Note : although this is a get/set, it checks passed values against the
164
calculated end point ( derived from the sequence and based on
165
$GAP_SYMBOLS and possible frameshifts() ). If there is no match,
166
it will warn and set the proper value. Probably best used for
167
debugging proper sequence calculations.
142
my $string = $self->seq;
144
my $len = $self->_ungapped_len;
146
$self->warn("In sequence $id residue count gives end value $len.
147
Overriding value [$value] with value $len for Bio::LocatableSeq::end().")
148
and $value = $len if $len != $value and $self->verbose > 0;
151
$self->{'end'} = $value;
154
return $self->{'end'} || $self->_ungapped_len;
175
my $st = $self->start;
176
# start of 0 usually means the sequence is all gaps but maps to
177
# other sequences in an alignment
178
if ($self->seq && $st != 0 ) {
179
my $len = $self->_ungapped_len;
180
my $calend = $st + $len - 1;
181
my $id = $self->id || 'unknown';
182
if ($calend != $value) {
183
$self->warn("In sequence $id residue count gives end value ".
184
"$calend. \nOverriding value [$value] with value $calend for ".
185
"Bio::LocatableSeq::end().\n".$self->seq);
189
$self->{'end'} = $value;
192
if (defined $self->{'end'}) {
193
return $self->{'end'}
194
} elsif ( my $len = $self->_ungapped_len) {
195
return $len + $self->start - 1;
201
# changed 08.10.26 to return ungapped length, not the calculated end
157
203
sub _ungapped_len {
158
204
my $self = shift;
159
my $string = $self->seq || '';
160
$string =~ s/[\.\-]+//g;
161
$self->seq ? (return $self->start + CORE::length($string) - 1 ) : undef;
205
return unless my $string = $self->seq;
206
my ($map_res, $map_coord) = $self->mapping;
208
if (my %data = $self->frameshifts) {
209
map {$offset += $_} values %data;
211
$string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
212
return CORE::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res);
167
218
Usage : $obj->strand($newval)
169
Returns : value of strand
170
Args : newvalue (optional)
219
Function: return or set the strandedness
220
Returns : the value of the strandedness (-1, 0 or 1)
221
Args : the value of the strandedness (-1, 0 or 1)
175
226
my $self = shift;
178
$self->{'strand'} = $value;
229
$self->{'strand'} = $value;
180
231
return $self->{'strand'};
237
Usage : $obj->mapping($newval)
238
Function: return or set the mapping indices (indicates # symbols/positions in
239
the source string mapping to # of coordinate positions)
240
Returns : two-element array (# symbols => # coordinate pos)
241
Args : two elements (# symbols => # coordinate pos); this can also be
242
passed in as an array reference of the two elements (as might be
243
passed upon Bio::LocatableSeq instantiation, for instance).
250
my @mapping = (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_;
251
$self->throw("Must pass two values (# residues mapped to # positions)")
253
if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) {
254
$self->throw("Mapping values other than 1 or 3 are not currently supported")
256
$self->{'_mapping'} = \@mapping;
258
$self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'};
259
return @{ $self->{'_mapping'} };
265
Usage : $obj->frameshifts($newval)
266
Function: get/set the frameshift hash, which contains sequence positions as
267
keys and the shift (-2, -1, 1, 2) as the value
269
Args : hash or hash reference
276
if (ref $_[0] eq 'HASH') {
277
$self->{_frameshifts} = $_[0];
279
# assume this is a full list to be converted to a hash
280
$self->{_frameshifts} = \%{@_} # coerce into hash ref
283
(defined $self->{_frameshifts} && ref $self->{_frameshifts} eq 'HASH') ?
284
return %{$self->{_frameshifts}} : return ();
200
$self->throw("Attribute id not set") unless defined($self->id());
201
$self->throw("Attribute start not set") unless defined($self->start());
202
$self->throw("Attribute end not set") unless defined($self->end());
204
return $self->id() . $char1 . $self->start . $char2 . $self->end ;
304
my ($id, $st, $end) = ($self->id(), $self->start(), $self->end());
306
if ($self->force_nse) {
312
$self->throw("Attribute id not set") unless defined($id);
313
$self->throw("Attribute start not set") unless defined($st);
314
$self->throw("Attribute end not set") unless defined($end);
316
#Stockholm Rfam includes version if present so it is optional
317
my $v = $self->version ? '.'.$self->version : '';
318
return $id . $v. $char1 . $st . $char2 . $end ;
324
Usage : $ls->force_nse()
325
Function: Boolean which forces get_nse() to build an NSE, regardless
326
of whether id(), start(), or end() is set
327
Returns : Boolean value
328
Args : (optional) Boolean (1 or 0)
329
Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0
335
my ($self, $flag) = @_;
337
$flag ? (return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0);
339
return $self->{'_force_nse'};
212
345
Usage :$self->no_gaps('.')
215
Gets number of gaps in the sequence. The count excludes
346
Function:Gets number of gaps in the sequence. The count excludes
216
347
leading or trailing gap characters.
218
349
Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
219
350
these, '.' and '-' are counted as gap characters unless an
220
351
optional argument specifies one of them.
222
Returns : number of internal gaps in the sequnce.
353
Returns : number of internal gaps in the sequence.
223
354
Args : a gap character (optional)
275
407
my ($self, $resnumber) = @_;
277
409
$self->throw("Residue number has to be a positive integer, not [$resnumber]")
278
unless $resnumber =~ /^\d+$/ and $resnumber > 0;
410
unless $resnumber =~ /^\d+$/ and $resnumber > 0;
280
412
if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
281
my @residues = split //, $self->seq;
282
my $count = $self->start();
284
my ($start,$end,$inc,$test);
413
my @residues = split //, $self->seq;
414
my $count = $self->start();
416
my ($start,$end,$inc,$test);
285
417
my $strand = $self->strand || 0;
286
# the following bit of "magic" allows the main loop logic to be the
287
# same regardless of the strand of the sequence
288
($start,$end,$inc,$test)= ($strand == -1)?
418
# the following bit of "magic" allows the main loop logic to be the
419
# same regardless of the strand of the sequence
420
($start,$end,$inc,$test)= ($strand == -1)?
289
421
(scalar(@residues-1),0,-1,sub{$i >= $end}) :
290
422
(0,scalar(@residues-1),1,sub{$i <= $end});
292
for ($i=$start; $test->(); $i+= $inc) {
293
if ($residues[$i] ne '.' and $residues[$i] ne '-') {
294
$count == $resnumber and last;
298
# $i now holds the index of the column.
424
for ($i=$start; $test->(); $i+= $inc) {
425
if ($residues[$i] ne '.' and $residues[$i] ne '-') {
426
$count == $resnumber and last;
430
# $i now holds the index of the column.
299
431
# The actual column number is this index + 1
304
436
$self->throw("Could not find residue number $resnumber");
309
440
=head2 location_from_column
311
442
Title : location_from_column
363
494
my $strand = $self->strand() || 1;
364
495
my $relative_pos = ($strand == -1)
365
496
? ($self->end - $pos + 1)
366
: ($pos + $start - 1);
497
: ($pos + $start - 1);
367
498
if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
368
$loc = new Bio::Location::Simple
369
(-start => $relative_pos,
370
-end => $relative_pos,
499
$loc = Bio::Location::Simple->new
500
(-start => $relative_pos,
501
-end => $relative_pos,
373
504
} elsif ($pos == 0 and $self->start == 1) {
375
506
my ($start,$end) = ($relative_pos, $relative_pos + $strand);
376
507
if ($strand == -1) {
377
($start,$end) = ($end,$start);
508
($start,$end) = ($end,$start);
379
$loc = new Bio::Location::Simple
383
-location_type => 'IN-BETWEEN'
510
$loc = Bio::Location::Simple->new
514
-location_type => 'IN-BETWEEN'
581
Usage : if(! $seq->validate_seq($seq_str) ) {
582
print "sequence $seq_str is not valid for an object of
583
alphabet ",$seq->alphabet, "\n";
585
Function: Validates a given sequence string. A validating sequence string
586
must be accepted by seq(). A string that does not validate will
587
lead to an exception if passed to seq().
589
The implementation provided here does not take alphabet() into
590
account. Allowed are all letters (A-Z), numbers [0-9]
591
and common symbols used for gaps, stop codons, unknown residues,
592
and frameshifts, including '-','.','*','?','=',and '~'.
595
Returns : 1 if the supplied sequence string is valid for the object, and
597
Args : The sequence string to be validated.
602
my ($self,$seqstr) = @_;
603
if( ! defined $seqstr ){ $seqstr = $self->seq(); }
604
return 0 unless( defined $seqstr);
606
if((CORE::length($seqstr) > 0) &&
607
($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
608
$self->warn("seq doesn't validate with [$MATCHPATTERN], mismatch is " .
609
join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));