47
# This template file is in the Public Domain.
48
# You may do anything you want with this file.
51
61
package Bio::SeqIO::game::seqHandler;
52
use vars qw{ $AUTOLOAD @ISA };
54
use XML::Handler::Subs;
57
@ISA = qw(XML::Handler::Subs);
63
use Bio::SeqIO::game::featHandler;
64
use Bio::SeqIO::game::gameSubs;
65
use Bio::SeqFeature::Generic;
66
use Bio::Seq::RichSeq;
72
@ISA = qw{ Bio::SeqIO::game::gameSubs };
77
Usage : my $seqHandler = Bio::SeqIO::game::seqHandler->new($seq, $ann, $comp, $map, $src )
78
Function: constructor method to create a sequence handler
79
Returns : a sequence handler object
80
Args : $seq -- an XML sequence element
81
$ann -- a ref. to a list of <annotation> elements
82
$comp -- a ref. to a list of <computational_analysis> elements (not used yet)
83
$map -- a <map_position> element
84
$src -- a flag to indicate that the sequence already has a source feature
60
my ($caller,$seq) = @_;
89
my ($caller, $seq, $ann, $comp, $map, $src ) = @_;
61
91
my $class = ref($caller) || $caller;
71
Title : start_document
72
Usage : $obj->start_document
73
Function: PerlSAX method called when a new document is initialized
81
my ($self, $document) = @_;
82
$self->{'in_current_seq'} = 'false';
83
$self->{'Names'} = [];
84
$self->{'string'} = '';
90
Usage : $obj->end_document
91
Function: PerlSAX method called when a document is finished for cleaning up
92
Returns : list of sequences seen
98
my ($self, $document) = @_;
99
delete $self->{'Names'};
100
return new Bio::PrimarySeq( -seq => $self->{'residues'},
101
-alphabet => $self->{'alphabet'},
102
-id => $self->{'seq'},
103
-accession => $self->{'accession'},
104
-desc => $self->{'desc'},
105
-length => $self->{'length'},
113
Title : start_element
114
Usage : $obj->start_element
115
Function: PerlSAX method called when a new element is reached
117
Args : element object
122
my ($self, $element) = @_;
124
push @{$self->{'Names'}}, $element->{'Name'};
125
$self->{'string'} = '';
127
if ($element->{'Name'} eq 'bx-seq:seq') {
128
if ($element->{'Attributes'}->{'bx-seq:id'} eq $self->{'seq'}) {
129
$self->{'in_current_seq'} = 'true';
130
$self->{'alphabet'} = $element->{'Attributes'}->{'bx-seq:type'};
131
$self->{'length'} = $element->{'Attributes'}->{'bx-seq:length'};
133
#This is not the sequence we want to import, but that's ok
142
Usage : $obj->end_element
143
Function: PerlSAX method called when an element is finished
145
Args : element object
150
my ($self, $element) = @_;
152
if ($self->{'in_current_seq'} eq 'true') {
153
if ($self->in_element('bx-seq:residues')) {
154
while ($self->{'string'} =~ s/\s+//) {};
155
$self->{'residues'} = $self->{'string'};
159
if ($self->in_element('bx-seq:name')) {
160
$self->{'string'} =~ s/^\s+//g;
161
$self->{'string'} =~ s/\s+$//;
162
$self->{'string'} =~ s/\n//g;
163
$self->{'name'} = $self->{'string'};
167
if ($self->in_element('bx-link:id') && $self->within_element('bx-link:dbxref')) {
168
$self->{'string'} =~ s/^\s+//g;
169
$self->{'string'} =~ s/\s+$//;
170
$self->{'string'} =~ s/\n//g;
171
$self->{'accession'} = $self->{'string'};
174
if ($self->in_element('bx-seq:description')) {
175
$self->{'desc'} = $self->{'string'};
178
if ($self->in_element('bx-seq:seq')) {
179
$self->{'in_current_seq'} = 'false';
183
pop @{$self->{'Names'}};
190
Usage : $obj->end_element
191
Function: PerlSAX method called when text between XML tags is reached
198
my ($self, $text) = @_;
199
$self->{'string'} .= $text->{'Data'};
205
Usage : $obj->in_element
206
Function: PerlSAX method called to test if state is in a specific element
208
Args : name of element
213
my ($self, $name) = @_;
215
return ($self->{'Names'}[-1] eq $name);
218
=head2 within_element
220
Title : within_element
221
Usage : $obj->within_element
222
Function: PerlSAX method called to list depth within specific element
224
Args : name of element
229
my ($self, $name) = @_;
232
foreach my $el_name (@{$self->{'Names'}}) {
233
$count ++ if ($el_name eq $name);
242
Usage : do not use directly
243
Function: autoload handling of missing DESTROY method
253
my $method = $AUTOLOAD;
255
return if $method eq 'DESTROY';
257
print "UNRECOGNIZED $method\n";
109
Usage : @seqs = $seqHandler->convert
110
Function: converts the main XML sequence element and associated annotations to Bio::
111
Returns : a ref. to a an array containing the sequence object and a ref. to a list of features
114
Note : The features and sequence are kept apart to facilitate downstream filtering of features
120
my @ann = @{$self->{anns}};
121
my @seq = @{$self->{seqs}};
125
if ( $self->{comps} ) {
126
@comp = @{$self->{comps}}
129
# process the sequence elements
131
$self->_add_seq( $_ );
134
# process the annotation elements
136
$self->_annotation( $_ );
139
return $self->_order_feats( $self->{seq_h} );
145
Usage : $self->_order_feats( $self->{seq_h} )
146
Function: an internal method to ensure the source feature comes first
147
Returns : a ref. to a an array containing the sequence object and a ref. to a list of features
148
Args : a ref. to a hash of sequences
153
my ($self, $seqs) = @_;
154
my $seq = $self->{main_seq};
156
my $ann = $self->{ann_l};
158
# make sure source comes first
159
my @src = grep { $_->primary_tag =~ /source|origin|\bregion\b/ } @$ann;
160
my @other = grep { $_->primary_tag !~ /source|origin|\bregion\b/ } @$ann;
162
return [$seq, [@src, @other]];
168
Usage : $self->_add_seq($seq_element)
169
Function: an internal method to process the sequence elements
171
Args : a sequence element
176
my ($self, $el) = @_;
179
if ($el->{_residues}) {
180
$residues = $el->{_residues}->{Characters};
181
$residues =~ s/[ \n\r]//g;
182
$residues =~ s/\!//g;
183
$residues =~ tr/a-z/A-Z/;
189
my $id = $el->{Attributes}->{id};
190
my $ver = $el->{Attributes}->{version};
191
my $name = $el->{_name}->{Characters};
193
if ($name && $name ne $id) {
194
$self->complain("The sequence name and unique ID do not match. Using ID");
197
# get/set the sequence object
198
my $seq = $self->_seq($id);
200
# get/set the feature handler
201
my $featHandler = $self->_feat_handler;
203
# populate the sequence object
204
$seq->seq($residues);
205
$seq->seq_version($ver) if $ver;
207
# assume the id is the accession number
208
if ( $id =~ /^\w+$/ ) {
209
$seq->accession($id);
212
# If the focus attribute is set to "true", this is the main
215
if ( defined $el->{Attributes}->{focus} ) {
216
$self->{main_seq} = $seq;
220
# make sure real and annotated lengths match
221
my $length = $el->{Attributes}->{'length'};
222
$length && $seq->length(int($length));
223
if ( $seq->seq && defined($length) && $seq->length != int($length) ) {
224
$self->complain("The specified sequence has length ", $seq->length(),
225
" but the length attribute= ", $length);
227
$seq->length( int($length) );
230
# deal with top-level annotations
232
if ( $el->{Attributes}->{md5checksum} ) {
233
$tags->{md5checksum} = [$el->{Attributes}->{md5checksum}];
235
if ($el->{_dbxref}) {
236
$tags->{dbxref} ||= [];
237
push @{$tags->{dbxref}}, $self->dbxref( $el->{_dbxref} );
239
if ($el->{_description}) {
240
my $desc = $el->{_description}->{Characters};
241
$seq->description( $desc );
243
if ($el->{_organism}) {
244
my @organism = split /\s+/, $el->{_organism}->{Characters};
246
$self->complain("Species name should have at least two words");
249
my $species = Bio::Species->new( -classification => [reverse @organism] );
250
$seq->species($species);
253
if ( defined($seq->species) ) {
254
$tags->{organism} = [$seq->species->binomial];
256
elsif ($seq eq $self->{main_seq}) {
257
$self->warn("The source organism for this sequence was\n" .
258
"not specified. I will guess Drosophila melanogaster.\n" .
259
"Otherwise, add <organism>Genus species</organism>\n" .
260
"to the main sequence element");
261
my @class = qw/ Eukaryota Metazoa Arthropoda Insecta Pterygota
262
Neoptera Endopterygota Diptera Brachycera
263
Muscomorpha Ephydroidea Drosophilidae Drosophila melanogaster/;
264
my $species = Bio::Species->new( -classification => [ reverse @class ],
265
-common_name => 'fruit fly' );
266
$seq->species( $species );
269
# convert GAME to bioperl molecule types
270
my $alphabet = $el->{Attributes}->{type};
272
$alphabet =~ s/aa/protein/;
273
$alphabet =~ s/cdna/rna/;
274
$seq->alphabet($alphabet);
277
# add a source feature if req'd
278
if ( !$self->{has_source} && $focus ) {
279
$self->{source} = $featHandler->add_source($seq->length, $tags);
283
# add the map position
284
$self->_map_position( $self->{map_pos}, $seq );
285
$featHandler->{offset} = $self->{offset};
288
# prune the sequence from the parse tree
294
Title : _map_position
295
Usage : $self->_map_position($map_posn_element)
296
Function: an internal method to process the <map_position> element
298
Args : a map_position element
303
my ($self, $el) = @_;
305
# chromosome and coordinates
306
my $arm = $el->{_arm}->{Characters};
307
my $type = $el->{Attributes}->{type};
308
my $loc = $el->{_span};
309
my $start = $loc->{_start}->{Characters};
310
my $end = $loc->{_end}->{Characters};
312
# define the offset (may be a partial sequence)
313
# The coordinates will be relative but the CDS description
314
# coordinates may be absolute if the game-XML comes from apollo
316
$self->{offset} = $start - 1;
318
my $seq_id = $el->{Attributes}->{seq};
319
my $seq = $self->{seq_h}->{$seq_id};
322
$self->throw("Map position with no corresponding sequence object");
324
unless ($seq eq $self->{main_seq}){
325
$self->throw("Map position does not correspond to the main sequence");
330
# create/update the top-level sequence feature if req'd
331
if ( $self->{source} ) {
332
my $feat = $self->{source};
334
unless ($feat->has_tag('organism')) {
335
$species = eval {$seq->species->binomial} || 'unknown species';
336
$feat->add_tag_value( organism => $species );
339
my %tags = ( mol_type => "genomic dna",
341
location => "$start..$end",
346
$feat->add_tag_value( $_ => $tags{$_} );
349
$seq->add_SeqFeature($feat);
352
# come up with a description if there is none
353
my $desc = $seq->description;
354
if ( $species && $arm && $start && $end && !$desc) {
355
$seq->description("$species chromosome $arm $start..$end " .
356
"segment of complete sequence");
365
Usage : $self->_annotation($annotation_element)
366
Function: an internal method to process <annotation> elements
368
Args : an annotation element
373
my ($self, $el) = @_;
375
my $id = $el->{Attributes}->{id};
376
my $type = $el->{_type}->{Characters};
378
my $gname = $el->{_name}->{Characters} eq $id ? '' : $el->{_name}->{Characters};
380
# 'transposable element' is too long (breaks Bio::SeqIO::GenBank)
381
$type =~ s/transposable_element/repeat_region/;
383
# annotations must be on the main sequence
384
my $seqid = $self->{main_seq}->id;
385
my $featHandler = $self->_feat_handler;
389
for my $child ( @{$el->{Children}} ) {
390
my $name = $child->{Name};
392
# these elements require special handling
393
if ( $name eq 'dbxref' ) {
394
$tags->{dbxref} ||= [];
395
push @{$tags->{dbxref}}, $self->dbxref( $child );
397
elsif ( $name eq 'aspect' ) {
398
$tags->{dbxref} ||= [];
399
push @{$tags->{dbxref}}, $self->dbxref( $child->{_dbxref} );
401
elsif ( $name eq 'feature_set' ) {
402
push @feats, $featHandler->feature_set( $id, $gname, $child, $type );
404
elsif ( $name eq 'comment' ) {
405
$tags->{comment} = [$self->comment( $child )];
407
elsif ( $name eq 'property' ) {
408
$self->property( $child, $tags );
410
elsif ( $name eq 'gene' ) {
411
# we may be dealing with an annotation that is not
412
# a gene, so we have to nest the gene inside it
413
$featHandler->has_gene( $child, $gname, $id )
416
# otherwise, tag/value pairs
417
# -- mild dtd enforcement
418
# synonym is not in the dtd but shows up in gadfly
420
elsif ( $name =~ /type|synonym/ ) {
421
$tags->{$name} = [$child->{Characters}];
423
elsif ( $name ne 'name' ) {
424
$self->complain("Unrecognized element '$name'. I don't " .
425
"know what to do with $name elements in " .
426
"top-level sequence annotations." );
431
if ( $tags->{symbol} ) {
432
if ( !$tags->{gene} ) {
433
$tags->{gene} = $tags->{symbol};
435
delete $tags->{symbol};
439
$featHandler->add_annotation( $self->{main_seq}, $type, $id, $tags, \@feats );
443
# get/set the sequence object
447
Usage : my $seq = $self->_seq
448
Function: an internal sequence getter/setter
449
Returns : a Bio::RichSeq object
455
my ($self, $id) = @_;
456
$id || $self->throw("A unique id must be provided for the sequence");
460
if ( defined $self->{seq_h}->{$id}) {
461
$seq = $self->{seq_h}->{$id};
463
$seq = Bio::Seq::RichSeq->new( -id => $id );
464
$self->{seq_h}->{$id} = $seq; # store it
470
#get/set the feature handler
473
Title : _feat_handler
474
Usage : my $featHandler = $self->_featHandler
475
Function: an internal getter/setter for feature handling objects
476
Returns : a Bio::SeqIO::game::featHandler object
485
my $seq = $self->{main_seq};
487
if ( defined $self->{feat_handler} ) {
488
$handler = $self->{feat_handler};
491
my @args = ( $seq, $self->{seq_h}, $self->{ann_l} );
492
$handler = Bio::SeqIO::game::featHandler->new( @args );
493
$self->{feat_handler} = $handler;