1
# $Id: Sopma.pm,v 1.0 2003/07/ 11
3
# BioPerl module for Bio::Tools::Analysis::Protein::Sopma
5
# Copyright Richard Adams
7
# You may distribute this module under the same terms as perl itself
9
# POD documentation - main docs before the code
14
Bio::Tools::Analysis::Protein::Sopma - a wrapper around
15
Sopma protein secondary structure prediction server
19
use Bio::Tools::Analysis::Protein::Sopma;
20
#get a Bio::Seq or Bio::PrimarySeq
23
my $sopma = Bio::Tools::Analysis::Protein::Sopma->new
24
(-seq=>$seq, states=>4);
26
print $sopma->result;# #raw text to standard error
30
A module to remotely retrieve predictions of protein secondary
31
structure. Each residue in the protein receives a score representing
32
the likelihood of existing in each of four different states (helix,
33
coil, turn or sheet), e.g.,
35
my $analysis_object = Bio::Tools::SimpleAnalysis::Protein::Sopma->new
41
creates a new object. Compulsory arguments -seq. Optional arguments
42
-states, -window_width,-similarity_threshold. These values can also be
43
set by direct methods , e.g.,
45
$analysis_object->states(4);
46
$analysis_object->run ;
48
submits the query to the server and obtains raw text output Given an
49
amino acid sequence the results can be obtained in 4 formats,
50
determined by the argument to the result method
56
The raw text of the program output
58
my $rawdata = $analysis_object->result;
62
A reference to an array of hashes of scores for each state and the
65
my $data_ref = $analysis_object->result('parsed');
66
print "score for helix at residue 2 is $data_ref->[1]{'helix'}\n";
67
print "predicted struc at residue 2 is $data_ref->[1]{'struc}\n";
68
Hash keys are 'helix', 'struc', 'sheet', 'coil', 'turn'.
73
An array of Bio::SeqFeature::Generic objects where each feature is a
74
predicted unit of secondary structure. Only stretches of helix/sheet
75
predictions for longer than 4 residues are defined as helices/sheets.
77
my @fts = $analysis_object->result(Bio::SeqFeatureI);
79
print " From ", $ft->start, " to ",$ft->end, " struc: " ,
80
($ft->each_tag_value('type'))[0] ,"\n";
85
A Bio::Seq::Meta::Array implementing sequence.
87
This is a Bio::Seq object that can also hold data about each residue
88
in the sequence. In this case, the sequence can be associated with a
89
arrays of Sopma prediction scores. e.g.,
91
my $meta_sequence = $analysis_object->result('meta');
92
print "scores from residues 10 -20 are ",
93
$meta_sequence->named_submeta_text("Sopma_helix",10,20), "\n";
95
Meta sequence names are : Sopma_helix, Sopma_sheet, Sopma_turn,
96
Sopma_coil, Sopma_struc, representing the scores for each residue.
98
Many methods common to all analyses are inherited from
99
Bio::Tools::Analysis::SimpleAnalysisBase.
105
L<Bio::SimpleAnalysisI>,
106
L<Bio::Tools::Analysis::SimpleAnalysisBase>
107
L<Bio::Seq::Meta::Array>,
114
User feedback is an integral part of the evolution of this and other
115
Bioperl modules. Send your comments and suggestions preferably to one
116
of the Bioperl mailing lists. Your participation is much appreciated.
118
bioperl-l@bioperl.org - General discussion
119
http://bio.perl.org/MailList.html - About the mailing lists
121
=head2 Reporting Bugs
123
Report bugs to the Bioperl bug tracking system to help us keep track
124
the bugs and their resolution. Bug reports can be submitted via email
127
bioperl-bugs@bio.perl.org
128
http://bugzilla.bioperl.org/
132
Richard Adams, Richard.Adams@ed.ac.uk,
140
package Bio::Tools::Analysis::Protein::Sopma;
145
use HTTP::Request::Common qw (POST);
146
use Bio::SeqFeature::Generic;
147
use Bio::Tools::Analysis::SimpleAnalysisBase;
148
use Bio::Seq::Meta::Array;
151
@ISA = qw(Bio::Tools::Analysis::SimpleAnalysisBase);
153
#extends array for 2struc.
154
my $URL = 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_sopma.pl';
155
my $ANALYSIS_NAME= 'Sopma';
156
my $ANALYSIS_SPEC= {name => 'Sopma', type => 'Protein'};
159
type => 'Bio::PrimarySeqI',
162
{mandatory =>'false',
164
name => 'similarity_threshold',
167
{mandatory =>'false',
169
name => 'window_width',
172
{mandatory =>'false',
180
'' => 'bulk', # same as undef
181
raw => '[{struc=>, helix=>, turn=>, coil=>, sheet=>}]',
182
meta => 'Bio::Seq::Meta::Array object',
183
'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
185
use constant MIN_STRUC_LEN => 3;
188
=head2 similarity_threshold
190
Useage : $job->similarity_threshold(...)
191
Returns : The similarity threshold used in the analysis
192
Args : None (retrieves value) or an integer (default = 8)
193
that sets the similarity threshold .
195
This method gets/sets the similarity threshold for the prediction.
199
sub similarity_threshold {
200
my ($self, $value) = @_;
202
$self->throw ("similarity_threshold must be integer")
203
unless $value =~ /^\d+$/;
204
$self->{'_similarity_threshold'} = $value;
206
$self->{'_similarity_threshold'} ||= $self->input_spec->[1]{'default'};
207
return $self->{'_similarity_threshold'};
212
Useage : $job->window_width(...)
213
Returns : The window width used in the analysis
214
Args : None (retrieves value) or an integer (default = 17)
215
that sets the window width.
217
This method gets/sets the window width for the prediction, . If
218
attempted to set longer than the sequence, warns of error.
223
my ($self, $value) = @_;
225
$self->throw ("window_width must be integer")
226
unless $value =~ /^\d+$/;
227
$self->{'_window_width'} = $value;
229
$self->{'_window_width'} ||= $self->input_spec->[2]{'default'};
230
$self->warn ("window width longer than sequence!")
231
unless $self->{'_window_width'} < $self->seq->length;
232
return $self->{'_window_width'};
237
Useage : $job->states(...)
238
Returns : The number of secondary structure prediction states
239
Args : None (retrieves value) or either '3' or '4' to set
240
prior to running analysis.
242
This method gets/sets the number of states for the prediction, either
243
3 or 4 (includes turns).
248
my ($self, $value) = @_;
250
$self->throw ("number of states must be 3 or 4")
251
unless $value == 3 or $value ==4;
252
$self->{'_states'} = $value;
254
$self->{'_states'} ||= $self->input_spec->[3]{'default'};
255
return $self->{'_states'};
260
Usage : $job->result (...)
261
Returns : a result created by running an analysis
264
The method returns a result of an executed job. If the job was
265
terminated by an error the result may contain an error message instead
268
This implementation returns differently processed data depending on
275
Returns the raw ASCII data stream but without HTML tags
277
=item 'Bio::SeqFeatureI'
279
The argument string defines the type of bioperl objects returned in an
280
array. The objects are L<Bio::SeqFeature::Generic>. Feature primary
281
tag is "2ary". Feature tags are "type" (which can be helix, sheet
282
coil, or turn if 4 state prediction requested) "method" (Sopma)
286
Array of hash references of scores/structure assignations { helix =E<gt>,
287
sheet =E<gt> , coil =E<gt> , struc=E<gt>}.
291
A Bio::Seq::Meta::Array object. Scores can be accessed using methods
292
from this class. Meta sequence names are Sopma_helix, Sopma_sheet,
293
Sopma_coil, Sopma_turn (if defined) Sopma_struc.
302
my ($self,$value, $run_id) = @_;
308
if (!exists($self->{'_parsed'} )) {
309
my $result = IO::String->new($self->{'_result'});
310
while (my $line = <$result>) {
311
next unless $line =~ /^[HCET]\s/; # or for sopma/hnn /^[A-Z]\s/
312
$line =~/^([A-Z])\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/; # or for so
313
push @score, { struc => $1,
318
#include turn if 4states are requested
319
$score[$#score]{'turn'} = $4 if $self->states == 4;
320
#can optimize by duplicating code here
322
$self->{'_parsed'} = \@score;
324
if ($value eq 'Bio::SeqFeatureI') {
325
$self->_get_2ary_coords();
326
for my $type (keys %{$self->{'_parsed_coords'}} ) {
327
next if $type =~ /\w{2,}/; #if not H,C,E or T
329
## these 2 are added to distinguish features on same
330
## sequence run with different params
333
method => $self->analysis_name,
335
$self->_add_params_to_result($tag_hash);
337
## now make feature object
338
for my $loc (@{$self->{'_parsed_coords'}{$type}} ) {
339
push @fts, Bio::SeqFeature::Generic->new
340
(-start => $loc->{'start'},
341
-end => $loc->{'end'},
343
-primary => 'Domain',
346
} #end of array of strucs of type
347
} # end of all 2nd struc elements
348
delete $self->{'_parsed_coords'}; #remove temp data
350
} #endif BioSeqFeature
352
elsif ($value eq 'meta') {
353
#1st of all make 3 or 4 arrays of scores for each type from column data
355
for my $aa (@{$self->{'_parsed'}}) {
356
for my $type (qw(struc helix sheet coil)) {
357
push @{$type_scores{$type}}, $aa->{$type};
359
push @{$type_scores{'turn'}}, $aa->{'turn'} if exists $aa->{'turn'};
362
## convert to meta sequence array ##
363
if (!$self->seq->isa("Bio::Seq::Meta::Array")) {
364
bless ($self->seq, "Bio::Seq::Meta::Array");
366
$self->seq->isa("Bio::Seq::MetaI")
367
|| $self->throw("$self is not a Bio::Seq::MetaI");
370
$Bio::Seq::Meta::Array::DEFAULT_NAME = 'Sopma_struc';
371
for my $struc_type (keys %type_scores) {
372
my $meta_name = "Sopma". "_" . "$struc_type";
374
$meta_name .= "|$run_id";
376
my @meta = map{$_->{$struc_type}} @{$self->{'_parsed'}};
377
if (grep{$_ eq $meta_name}$self->seq->meta_names >0) {
378
$self->warn ("$meta_name already exists , not overwriting!");
381
$self->seq->named_meta($meta_name,\@meta );
383
# return seq array object implementing meta sequence #
387
## else return parsed data if $value is defined
389
return $self->{'_parsed'};
393
#return raw result if no return format stated
394
return $self->{'_result'};
400
$self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
401
$self->{'_INPUT_SPEC'} = $INPUT_SPEC;
402
$self->{'_RESULT_SPEC'} = $RESULT_SPEC;
403
$self->{'_ANALYSIS_NAME'} = $ANALYSIS_NAME;
407
sub _get_2ary_coords {
408
#helper sub for result;
409
##extracts runs of structure > MIN_STRUC_LENresidues or less if Turn:
410
#i.e., helical prediction for 1 residue isn't very meaningful...
411
## and poulates array of hashes with start/end values.
412
##keys of $Result are 'H' 'T' 'C' 'E'.
414
my @prot = @{$self->{'_parsed'}};
416
for (my $index = 0; $index <= $#prot; $index++) {
418
my $type = $prot[$index]{'struc'};
419
next unless $type && $type =~ /[HTCE]/;
421
for (my $j = $index + 1; $j <= $#prot; $j++) {
422
my $test = $prot[$j];
423
if ($test->{'struc'} eq $type) {
425
} elsif ( $length > MIN_STRUC_LEN ||
426
($length <= MIN_STRUC_LEN && $type eq 'T') ) {
427
push @{$Result{$type}}, {start => $index + 1 , end => $j};
428
$index += $length -1;
431
$index += $length - 1;
436
$self->{'_parsed_coords'} = \%Result; #temp assignment
442
# delay repeated calls by default by 3 sec, set delay() to change
444
$self->status('TERMINATED_BY_ERROR');
445
my $request = POST 'http://npsa-pbil.ibcp.fr/cgi-bin/secpred_sopma.pl',
446
Content_Type => 'form-data',
447
Content => [title => "",
448
notice => $self->seq->seq,
450
states => $self->states,
451
threshold => $self->similarity_threshold ,
452
width => $self->window_width,
455
my $text = $self->request($request)->content;
456
return $self unless $text;
458
#### get text only version of results ##
459
my ($next) = $text =~ /Prediction.*?=(.*?)>/;
460
my $out = "http://npsa-pbil.ibcp.fr/". "$next";
461
my $req2 = HTTP::Request->new(GET=>$out);
462
my $resp2 = $self->request ($req2);
463
$self->{'_result'} = $resp2->content;
464
$self->status('COMPLETED') if $resp2 ne '';
468
sub _add_params_to_result{
469
## called when making Seqfeature objects
470
my ($self, $tag_hash) = @_;
472
## adds input parameter values to SeqFeatureI results where multiple
473
## parameter values are possible. Only adds value if not default.
474
map{$hash->{$_->{'name'}} = $_}@{$self->input_spec()};
476
for my $p (keys %$hash) {
477
if (!ref($self->$p) && $self->$p ne $hash->{$p}{'default'}) {
478
$tag_hash->{$p} = $self->$p;