1
# $Id: PrimarySeq.pm,v 1.79 2003/07/19 22:20:30 jason Exp $
1
# $Id: PrimarySeq.pm,v 1.95.4.4 2006/10/02 23:10:12 sendu Exp $
3
3
# bioperl module for Bio::PrimarySeq
5
# Cared for by Ewan Birney <birney@sanger.ac.uk>
5
# Cared for by Ewan Birney <birney@ebi.ac.uk>
7
7
# Copyright Ewan Birney
24
24
use Bio::DB::GenBank;
27
28
$seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
28
-id => 'GeneFragment-12',
29
-accession_number => 'X78121',
33
print "Sequence ", $seqobj->id(), " with accession ",
29
-id => 'GeneFragment-12',
30
-accession_number => 'X78121',
33
print "Sequence ", $seqobj->id(), " with accession ",
34
34
$seqobj->accession_number, "\n";
37
$inputstream = Bio::SeqIO->new(-file => "myseq.fa",-format => 'Fasta');
38
$inputstream = Bio::SeqIO->new(-file => "myseq.fa",
38
40
$seqobj = $inputstream->next_seq();
39
41
print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
42
43
# to get out parts of the sequence.
44
print "Sequence ", $seqobj->id(), " with accession ",
45
print "Sequence ", $seqobj->id(), " with accession ",
45
46
$seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
47
48
$string = $seqobj->seq();
48
49
$string2 = $seqobj->subseq(1,40);
53
53
PrimarySeq is a lightweight Sequence object, storing the sequence, its
83
83
Bioperl modules. Send your comments and suggestions preferably to one
84
84
of the Bioperl mailing lists. Your participation is much appreciated.
86
bioperl-l@bioperl.org - General discussion
87
http://bio.perl.org/MailList.html - About the mailing lists
86
bioperl-l@bioperl.org - General discussion
87
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
89
89
=head2 Reporting Bugs
91
91
Report bugs to the Bioperl bug tracking system to help us keep track
92
the bugs and their resolution. Bug reports can be submitted via email
92
the bugs and their resolution. Bug reports can be submitted via the
95
bioperl-bugs@bio.perl.org
96
http://bugzilla.bioperl.org/
95
http://bugzilla.open-bio.org/
98
97
=head1 AUTHOR - Ewan Birney
100
Email birney@sanger.ac.uk
102
Describe contact details here
99
Email birney@ebi.ac.uk
115
112
package Bio::PrimarySeq;
113
use vars qw($MATCHPATTERN);
120
use Bio::PrimarySeqI;
121
use Bio::IdentifiableI;
122
use Bio::DescribableI;
116
$MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
124
@ISA = qw(Bio::Root::Root Bio::PrimarySeqI
125
Bio::IdentifiableI Bio::DescribableI);
118
use base qw(Bio::Root::Root Bio::PrimarySeqI Bio::IdentifiableI Bio::DescribableI);
128
121
# setup the allowed values for alphabet()
149
142
Returns : a new Bio::PrimarySeq object
150
143
Args : -seq => sequence string
151
-display_id => display id of the sequence (locus name)
144
-display_id => display id of the sequence (locus name)
152
145
-accession_number => accession number
153
146
-primary_id => primary id (Genbank id)
154
147
-namespace => the namespace for the accession
190
183
if( defined $id && defined $given_id ) {
191
if( $id ne $given_id ) {
192
$self->throw("Provided both id and display_id constructor ".
193
"functions. [$id] [$given_id]");
184
if( $id ne $given_id ) {
185
$self->throw("Provided both id and display_id constructor ".
186
"functions. [$id] [$given_id]");
196
189
if( defined $given_id ) { $id = $given_id; }
202
195
# if alphabet is provided we set it first, so that it won't be guessed
203
196
# when the sequence is set
204
197
$alphabet && $self->alphabet($alphabet);
206
199
# if there is an alphabet, and direct is passed in, assumme the alphabet
209
202
if( $direct && $ref_to_seq) {
210
$self->{'seq'} = $$ref_to_seq;
212
$self->_guess_alphabet();
213
} # else it has been set already above
203
$self->{'seq'} = $$ref_to_seq;
205
$self->_guess_alphabet();
206
} # else it has been set already above
215
# print STDERR "DEBUG: setting sequence to [$seq]\n";
216
# note: the sequence string may be empty
217
$self->seq($seq) if defined($seq);
208
# print STDERR "DEBUG: setting sequence to [$seq]\n";
209
# note: the sequence string may be empty
210
$self->seq($seq) if defined($seq);
220
213
$id && $self->display_id($id);
221
214
$acc && $self->accession_number($acc);
249
242
Returns : A scalar
250
243
Args : Optionally on set the new value (a string). An optional second
251
244
argument presets the alphabet (otherwise it will be guessed).
252
Both parameters may also be given in named paramater style
253
with -seq and -alphabet being the names.
264
255
my ($value,$alphabet) = @args;
268
258
if(defined($value) && (! $obj->validate_seq($value))) {
269
259
$obj->throw("Attempting to set the sequence to [$value] ".
270
"which does not look healthy");
260
"which does not look healthy");
272
262
# if a sequence was already set we make sure that we re-adjust the
273
263
# alphabet, otherwise we skip guessing if alphabet is already set
274
264
# note: if the new seq is empty or undef, we don't consider that a
275
265
# change (we wouldn't have anything to guess on anyway)
277
exists($obj->{'seq'}) && (CORE::length($value || '') > 0);
278
$obj->{'seq'} = $value;
267
exists($obj->{'seq'}) && (CORE::length($value || '') > 0);
268
$obj->{'seq'} = $value;
279
269
# new alphabet overridden by arguments?
281
271
# yes, set it no matter what
282
$obj->alphabet($alphabet);
283
} elsif( # if we changed a previous sequence to a new one
285
# or if there is no alphabet yet at all
286
(! defined($obj->alphabet()))) {
287
# we need to guess the (possibly new) alphabet
288
$obj->_guess_alphabet();
289
} # else (seq not changed and alphabet was defined) do nothing
290
# if the seq is changed, make sure we unset a possibly set length
291
$obj->length(undef) if $is_changed_seq || $obj->{'seq'};
272
$obj->alphabet($alphabet);
273
} elsif( # if we changed a previous sequence to a new one
275
# or if there is no alphabet yet at all
276
(! defined($obj->alphabet()))) {
277
# we need to guess the (possibly new) alphabet
278
$obj->_guess_alphabet();
279
} # else (seq not changed and alphabet was defined) do nothing
280
# if the seq is changed, make sure we unset a possibly set length
281
$obj->length(undef) if $is_changed_seq || $obj->{'seq'};
293
283
return $obj->{'seq'};
298
288
Title : validate_seq
299
289
Usage : if(! $seq->validate_seq($seq_str) ) {
300
print "sequence $seq_str is not valid for an object of
290
print "sequence $seq_str is not valid for an object of
301
291
alphabet ",$seq->alphabet, "\n";
303
293
Function: Validates a given sequence string. A validating sequence string
305
295
lead to an exception if passed to seq().
307
297
The implementation provided here does not take alphabet() into
308
account. Allowed are all letters (A-Z) and '-','.', '*' and '?'.
298
account. Allowed are all letters (A-Z) and '-','.','*','?','=',
311
302
Returns : 1 if the supplied sequence string is valid for the object, and
318
309
sub validate_seq {
319
my ($self,$seqstr) = @_;
320
if( ! defined $seqstr ){ $seqstr = $self->seq(); }
321
return 0 unless( defined $seqstr);
322
if((CORE::length($seqstr) > 0) && ($seqstr !~ /^([A-Za-z\-\.\*\?]+)$/)) {
323
$self->warn("seq doesn't validate, mismatch is " .
324
($seqstr =~ /([^A-Za-z\-\.\*\?]+)/g));
310
my ($self,$seqstr) = @_;
311
if( ! defined $seqstr ){ $seqstr = $self->seq(); }
312
return 0 unless( defined $seqstr);
313
if((CORE::length($seqstr) > 0) &&
314
($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
315
$self->warn("seq doesn't validate, mismatch is " .
316
join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
378
370
return substr( $self->seq(), $start, ($end-$start));
381
$self->warn("Incorrect parameters to subseq - must be two integers ".
382
"or a Bio::LocationI object not ($start,$end)");
373
$self->warn("Incorrect parameters to subseq - must be two integers or a Bio::LocationI object");
414
406
my $self = shift;
415
407
my $len = CORE::length($self->seq() || '');
419
if(defined($val) && $len && ($len != $val)) {
420
$self->throw("You're trying to lie about the length: ".
421
"is $len but you say ".$val);
423
$self->{'_seq_length'} = $val;
411
if(defined($val) && $len && ($len != $val)) {
412
$self->throw("You're trying to lie about the length: ".
413
"is $len but you say ".$val);
415
$self->{'_seq_length'} = $val;
424
416
} elsif(defined($self->{'_seq_length'})) {
425
return $self->{'_seq_length'};
417
return $self->{'_seq_length'};
490
481
my( $obj, $acc ) = @_;
492
483
if (defined $acc) {
493
$obj->{'accession_number'} = $acc;
484
$obj->{'accession_number'} = $acc;
495
$acc = $obj->{'accession_number'};
496
$acc = 'unknown' unless defined $acc;
486
$acc = $obj->{'accession_number'};
487
$acc = 'unknown' unless defined $acc;
519
my ($obj,$value) = @_;
520
if( defined $value) {
521
$obj->{'primary_id'} = $value;
523
if( ! exists $obj->{'primary_id'} ) {
526
return $obj->{'primary_id'};
513
$obj->{'primary_id'} = shift;
515
if( ! defined($obj->{'primary_id'}) ) {
518
return $obj->{'primary_id'};
534
525
Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
535
Function: Returns the alphabet of sequence, one of
526
Function: Get/Set the alphabet of sequence, one of
536
527
'dna', 'rna' or 'protein'. This is case sensitive.
538
529
This is not called <type> because this would cause
550
541
my ($obj,$value) = @_;
551
542
if (defined $value) {
553
unless ( $valid_type{$value} ) {
554
$obj->throw("Alphabet '$value' is not a valid alphabet (".
555
join(',', map "'$_'", sort keys %valid_type) .
558
$obj->{'alphabet'} = $value;
544
unless ( $valid_type{$value} ) {
545
$obj->throw("Alphabet '$value' is not a valid alphabet (".
546
join(',', map "'$_'", sort keys %valid_type) .
549
$obj->{'alphabet'} = $value;
560
551
return $obj->{'alphabet'};
684
674
Title : authority
685
675
Usage : $authority = $obj->authority()
686
676
Function: A string which represents the organisation which
687
granted the namespace, written as the DNS name for
677
granted the namespace, written as the DNS name for
688
678
organisation (eg, wormbase.org).
690
680
Returns : A scalar
733
723
Function: A string which is what should be displayed to the user.
734
724
The string should have no spaces (ideally, though a cautious
735
725
user of this interface would not assumme this) and should be
736
less than thirty characters (though again, double checking
726
less than thirty characters (though again, double checking
737
727
this is a good idea).
739
729
This is aliased to display_id().
750
740
Title : description
751
741
Usage : $string = $obj->description()
752
Function: A text string suitable for displaying to the user a
742
Function: A text string suitable for displaying to the user a
753
743
description. This string is likely to have spaces, but
754
744
should not have any newlines or formatting - just plain
755
745
text. The string should not be greater than 255 characters
821
811
Title : _guess_alphabet
813
Function: Determines (and sets) the type of sequence: dna, rna, protein
815
Returns : one of strings 'dna', 'rna' or 'protein'.
831
821
sub _guess_alphabet {
833
my ($str,$str2,$total,$atgc,$u,$type);
838
$total = CORE::length($str);
825
#return if $self->alphabet;
827
my $str = $self->seq();
828
# Remove char's that clearly denote ambiguity
829
$str =~ s/[-.?x]//gi;
831
my $total = CORE::length($str);
839
832
if( $total == 0 ) {
840
$self->throw("Got a sequence with no letters in - ".
841
"cannot guess alphabet [$str]");
833
$self->warn("Got a sequence with no letters in it ".
834
"cannot guess alphabet [$str]");
844
$u = ($str =~ tr/Uu//);
845
$atgc = ($str =~ tr/ATGCNatgcn//);
838
my $u = ($str =~ tr/Uu//);
839
# The assumption here is that most of sequences comprised of mainly
840
# ATGC, with some N, will be 'dna' despite the fact that N could
842
my $atgc = ($str =~ tr/ATGCNatgcn//);
847
844
if( ($atgc / $total) > 0.85 ) {
849
846
} elsif( (($atgc + $u) / $total) > 0.85 ) {