1
# $Id: Fgenesh.pm,v 1.10.4.2 2006/10/02 23:10:32 sendu Exp $
3
# BioPerl module for Bio::Tools::Fgenesh
5
# Cared for by Christopher Dwan (chris@dwan.org)
7
# Copied, lock stock & barrel from Genscan.pm
9
# You may distribute this module under the same terms as perl itself
11
# POD documentation - main docs before the code
15
Bio::Tools::Fgenesh - parse results of one Fgenesh run
19
use Bio::Tools::Fgenesh;
21
$fgenesh = Bio::Tools::Fgenesh->new(-file => 'result.fgenesh');
23
$fgenesh = Bio::Tools::Fgenesh->new( -fh => \*INPUT );
26
# note: this class is-a Bio::Tools::AnalysisResult which implements
27
# Bio::SeqAnalysisParserI, i.e., $fgensh->next_feature() is the same
28
while($gene = $fgenesh->next_prediction()) {
29
# $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
30
# off Bio::SeqFeature::Gene::Transcript.
32
# $gene->exons() returns an array of
33
# Bio::Tools::Prediction::Exon objects
35
@exon_arr = $gene->exons();
38
@init_exons = $gene->exons('Initial');
40
@intrl_exons = $gene->exons('Internal');
42
@term_exons = $gene->exons('Terminal');
44
($single_exon) = $gene->exons();
47
# essential if you gave a filename at initialization (otherwise the file
53
The Fgenesh module provides a parser for Fgenesh (version 2) gene structure
54
prediction output. It parses one gene prediction into a
55
Bio::SeqFeature::Gene::Transcript- derived object.
57
This module also implements the L<Bio::SeqAnalysisParserI> interface, and thus
58
can be used wherever such an object fits.
64
User feedback is an integral part of the evolution of this and other
65
Bioperl modules. Send your comments and suggestions preferably to one
66
of the Bioperl mailing lists. Your participation is much appreciated.
68
bioperl-l@bioperl.org - General discussion
69
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73
Report bugs to the Bioperl bug tracking system to help us keep track
74
the bugs and their resolution. Bug reports can be submitted via the
77
http://bugzilla.open-bio.org/
79
=head1 AUTHOR - Chris Dwan
81
Email chris-at-dwan.org
85
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
90
# Let the code begin...
93
package Bio::Tools::Fgenesh;
98
use Bio::Tools::Prediction::Gene;
99
use Bio::Tools::Prediction::Exon;
101
use base qw(Bio::Tools::AnalysisResult);
103
my %ExonTags = ('CDSf' => 'Initial',
104
'CDSi' => 'Internal',
105
'CDSl' => 'Terminal',
106
'CDSo' => 'Singleton');
108
sub _initialize_state {
109
my ($self,@args) = @_;
111
# first call the inherited method!
112
$self->SUPER::_initialize_state(@args);
114
# our private state variables
115
$self->{'_preds_parsed'} = 0;
116
$self->{'_has_cds'} = 0;
117
# array of pre-parsed predictions
118
$self->{'_preds'} = [];
120
$self->{'_seqstack'} = [];
123
=head2 analysis_method
125
Usage : $genscan->analysis_method();
126
Purpose : Inherited method. Overridden to ensure that the name matches
134
sub analysis_method {
136
my ($self, $method) = @_;
137
if($method && ($method !~ /fgenesh/i)) {
138
$self->throw("method $method not supported in " . ref($self));
140
return $self->SUPER::analysis_method($method);
146
Usage : while($gene = $fgenesh->next_feature()) {
149
Function: Returns the next gene structure prediction of the Fgenesh result
150
file. Call this method repeatedly until FALSE is returned.
152
The returned object is actually a SeqFeatureI implementing object.
153
This method is required for classes implementing the
154
SeqAnalysisParserI interface, and is merely an alias for
155
next_prediction() at present.
158
Returns : A Bio::Tools::Prediction::Gene object.
164
my ($self,@args) = @_;
165
# even though next_prediction doesn't expect any args (and this method
166
# does neither), we pass on args in order to be prepared if this changes
168
return $self->next_prediction(@args);
171
=head2 next_prediction
173
Title : next_prediction
174
Usage : while($gene = $fgenesh->next_prediction()) { ... }
175
Function: Returns the next gene structure prediction of the Genscan result
176
file. Call this method repeatedly until FALSE is returned.
178
Returns : A Bio::Tools::Prediction::Gene object.
183
sub next_prediction {
187
# if the prediction section hasn't been parsed yet, we do this now
188
$self->_parse_predictions() unless $self->_predictions_parsed();
190
# get next gene structure
191
$gene = $self->_prediction();
194
# fill in predicted protein, and if available the predicted CDS
197
# use the seq stack if there's a seq on it
198
my $seqobj = pop(@{$self->{'_seqstack'}});
201
($id, $seq) = $self->_read_fasta_seq();
203
if (($id =~ /mrna/) || ($id =~ /cds/)) { $alphabet = 'dna'; }
204
else { $alphabet = 'protein'; }
205
$seqobj = Bio::PrimarySeq->new('-seq' => $seq,
206
'-display_id' => $id,
207
'-alphabet' => $alphabet);
211
# check that prediction number matches the prediction number
212
# indicated in the sequence id (there may be incomplete gene
213
# predictions that contain only signals with no associated protein
216
$gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
218
if ($id !~ /_predicted_(\w+)_$prednr/) {
219
# this is not our sequence, so push back for next prediction
220
push(@{$self->{'_seqstack'}}, $seqobj);
222
if ($1 eq "protein") {
223
$gene->predicted_protein($seqobj);
224
} elsif (($1 eq "mrna") || ($1 eq "cds")) {
226
$gene->predicted_cds($seqobj);
228
# Have to go back in and get the protein...
229
($id, $seq) = $self->_read_fasta_seq();
230
if ($id =~ /_cds_/) {
231
($id, $seq) = $self->_read_fasta_seq();
234
$seqobj = Bio::PrimarySeq->new('-seq' => $seq,
235
'-display_id' => $id,
236
'-alphabet' => "protein");
237
$gene->predicted_protein($seqobj);
246
=head2 _parse_predictions
248
Title : _parse_predictions()
249
Usage : $obj->_parse_predictions()
250
Function: Parses the prediction section. Automatically called by
251
next_prediction() if not yet done.
257
sub _parse_predictions {
262
while(defined($_ = $self->_readline())) {
264
if(/^\s*(\d+)\s+([+\-])/) {
269
my $strand = ($2 eq '+') ? 1 : -1;
271
if(! defined($gene)) {
272
$gene = Bio::Tools::Prediction::Gene->new(
273
'-primary' => "GenePrediction$prednr",
274
'-source' => 'Fgenesh');
278
my @flds = split(/\s+/, $line);
280
# create the feature object depending on the type of signal
282
my $is_exon = grep {$line =~ $_} keys(%ExonTags);
285
$predobj = Bio::Tools::Prediction::Exon->new();
286
$predobj->score($flds[8]);
291
$predobj = Bio::SeqFeature::Generic->new();
292
$predobj->score($flds[5]);
297
$predobj->source_tag('Fgenesh');
298
$predobj->strand($strand);
300
# Following tactical commenting-out made by
301
# malcolm.cook@stowers-institute.org since coordinate reversal is
302
# apparently vestigial copy/paste detritus from Genscan.pm origins of
303
# this module and this is NOT needed for fgenesh (at least in v
306
# if($predobj->strand() == 1) {
307
$predobj->start($start);
310
# $predobj->end($start);
311
# $predobj->start($end);
314
# print STDERR "start $start end $end\n";
315
# add to gene structure (should be done only when start and end
316
# are set, in order to allow for proper expansion of the range)
318
# first, set fields unique to exons
319
$predobj->primary_tag($ExonTags{$flds[3]} . 'Exon');
320
$predobj->is_coding(1);
322
if($predobj->strand() == 1) {
323
$cod_offset = ($flds[9] - $predobj->start()) % 3;
324
# Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
325
# to offsets 2 and 1, resp. Offset 3 is the same as 0.
326
$cod_offset += 3 if($cod_offset < 1);
328
# On the reverse strand the Genscan frame also refers to
329
# the first base of the first complete codon, but viewed
330
# from forward, which is the third base viewed from
332
$cod_offset = ($flds[11] - $predobj->end()) % 3;
333
# Possible values are -2, -1, 0, 1, 2. Due to the reverse
334
# situation, {2,-1} and {1,-2} correspond to offsets
335
# 1 and 2, resp. Offset 3 is the same as 0.
336
$cod_offset -= 3 if($cod_offset >= 0);
337
$cod_offset = -$cod_offset;
339
# Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
340
# is the frame of the first base relative to the exon, or the
341
# number of bases the first codon is missing).
342
$predobj->frame(3 - $cod_offset);
343
# print STDERR " frame is " . $predobj->frame() . "\n";
344
# then add to gene structure object
345
$gene->add_exon($predobj, $ExonTags{$flds[1]});
346
} elsif($flds[3] eq 'PolA') {
347
$predobj->primary_tag("PolyAsite");
348
$gene->poly_A_site($predobj);
349
} elsif($flds[3] eq 'TSS') {
350
$predobj->primary_tag("Promoter"); # (hey! a TSS is NOT a promoter... what's going on here?...
351
$gene->add_promoter($predobj);
352
#I'd like to do this (for now):
353
#$predobj->primary_tag("TSS"); #this is not the right model, but, it IS a feature at least.
354
#but the followg errs out
355
#$gene->add_SeqFeature($predobj); #err: MSG: start is undefined when bounds at Bio::SeqFeature::Generic::add_SeqFeature 671 check since gene has no start yet
358
$self->throw("unrecognized prediction line: " . $line);
363
if(/^\s*$/ && defined($gene)) {
364
# current gene is completed
365
$gene->seq_id($seqname);
366
$self->_add_prediction($gene);
371
if(/^(FGENESH)\s+([\d\.]+)/) {
372
$self->analysis_method($1);
373
$self->analysis_method_version($2);
374
if (/\s(\S+)\sgenomic DNA/) {
375
$self->analysis_subject($1);
380
if(/^\s*Seq name:\s+(\S+)/) {
385
/^Predicted protein/ && do {
386
# section of predicted sequences
387
$self->_pushback($_);
391
$self->_predictions_parsed(1);
396
Title : _prediction()
397
Usage : $gene = $obj->_prediction()
407
return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
408
return shift(@{$self->{'_preds'}});
411
=head2 _add_prediction
413
Title : _add_prediction()
414
Usage : $obj->_add_prediction($gene)
421
sub _add_prediction {
422
my ($self, $gene) = @_;
424
if(! exists($self->{'_preds'})) {
425
$self->{'_preds'} = [];
427
push(@{$self->{'_preds'}}, $gene);
430
=head2 _predictions_parsed
432
Title : _predictions_parsed
433
Usage : $obj->_predictions_parsed
436
Returns : TRUE or FALSE
440
sub _predictions_parsed {
441
my ($self, $val) = @_;
443
$self->{'_preds_parsed'} = $val if $val;
444
if(! exists($self->{'_preds_parsed'})) {
445
$self->{'_preds_parsed'} = 0;
447
return $self->{'_preds_parsed'};
453
Usage : $obj->_has_cds()
454
Function: Whether or not the result contains the predicted CDSs, too.
456
Returns : TRUE or FALSE
461
my ($self, $val) = @_;
463
$self->{'_has_cds'} = $val if $val;
464
if(! exists($self->{'_has_cds'})) {
465
$self->{'_has_cds'} = 0;
467
return $self->{'_has_cds'};
470
=head2 _read_fasta_seq
472
Title : _read_fasta_seq()
473
Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
474
Function: Simple but specialised FASTA format sequence reader. Uses
475
$self->_readline() to retrieve input, and is able to strip off
476
the traling description lines.
478
Returns : An array of two elements: fasta_id & sequence
482
sub _read_fasta_seq {
487
my $entry = $self->_readline();
488
# print " ^^ $entry\n";
489
return unless ($entry);
490
$entry = $self->_readline() if ($entry =~ /^Predicted protein/);
491
# print " ^^ $entry\n";
493
# Pick up the header / id.
494
if ($entry =~ /^>FGENESH:/) {
495
if ($entry =~ /^>FGENESH:\s+(\d+)/) {
496
# print STDERR " this is a predicted gene\n";
497
$id = "_predicted_protein_" . $1;
498
} elsif ($entry =~ /^>FGENESH:\[mRNA\]\s+(\d+)/) {
499
# print STDERR " this is an mRNA\n";
500
$id = "_predicted_mrna_" . $1;
501
} elsif ($entry =~ /^>FGENESH:\[exon\]\s+Gene:\s+(\d+)/) {
502
$id = "_predicted_cds_" . $1;
505
$entry = $self->_readline();
510
# print "*** $entry\n";
511
if (($entry =~ /^>FGENESH:\[exon\]/) && ($id =~ /^_predicted_cds_/)) {
512
# print STDERR " -- informed about an exon header...\n";
513
$entry = $self->_readline();
516
# print STDERR " Added $entry\n";
519
last unless $entry = $self->_readline();
520
if (($entry =~ /^>/) &&
521
(!(($entry =~ /^>FGENESH:\[exon\]/) && ($id =~ /^_predicted_cds_/)))) {
522
$self->_pushback($entry); last;
527
$seq =~ s/\s//g; # Remove whitespace