1
# $Id: chaos.pm,v 1.10.4.1 2006/10/02 23:10:28 sendu Exp $
2
# $Date: 2006/10/02 23:10:28 $
4
# BioPerl module for Bio::SeqIO::chaos
6
# Chris Mungall <cjm@fruitfly.org>
8
# You may distribute this module under the same terms as perl itself
10
# POD documentation - main docs before the code
14
Bio::SeqIO::chaos - chaos sequence input/output stream
18
#In general you will not want to use this module directly;
19
#use the chaosxml format via SeqIO
21
$outstream = Bio::SeqIO->new(-file => $filename,
22
-format => 'chaosxml');
24
while ( my $seq = $instream->next_seq() ) {
25
$outstream->write_seq($seq);
30
This is the guts of L<Bio::SeqIO::chaosxml> - please refer to the
31
documentation for this module
33
B<CURRENTLY WRITE ONLY>
35
ChaosXML is an XML mapping of the chado relational database; for more
36
information, see http://www.fruitfly.org/chaos-xml
38
chaos can be represented in various syntaxes - XML, S-Expressions or
39
indented text. You should see the relevant SeqIO file. You will
40
probably want to use L<Bio::SeqIO::chaosxml>, which is a wrapper to
43
=head2 USING STAG OBJECTS
45
B<non-standard bioperl stuff you dont necessarily need to know follows>
47
This module (in write mode) is an B<event producer> - it generates XML
48
events via the L<Data::Stag> module. If you only care about the final
49
end-product xml, use L<Bio::SeqIO::chaosxml>
51
You can treat the resulting chaos-xml stream as stag XML objects;
53
$outstream = Bio::SeqIO->new(-file => $filename, -format => 'chaos');
55
while ( my $seq = $instream->next_seq() ) {
56
$outstream->write_seq($seq);
58
my $chaos = $outstream->handler->stag;
59
# stag provides get/set methods for xml elements
60
# (these are chaos objects, not bioperl objects)
61
my @features = $chaos->get_feature;
62
my @feature_relationships = $chaos->get_feature_relationships;
63
# stag objects can be queried with functional-programming
65
my @features_in_range =
66
$chaos->where('feature',
68
my $featureloc = shift->get_featureloc;
69
$featureloc->strand == 1 &&
70
$featureloc->nbeg > 10000 &&
71
$featureloc->nend < 20000;
73
foreach my $feature (@features_in_range) {
74
my $featureloc = $feature->get_featureloc;
75
printf "%s [%d->%d on %s]\n",
77
$featureloc->sget_nbeg,
78
$featureloc->sget_end,
79
$featureloc->sget_srcfeature_id;
82
=head1 MODULES REQUIRED
86
Downloadable from CPAN; see also http://stag.sourceforge.net
92
User feedback is an integral part of the evolution of this and other
93
Bioperl modules. Send your comments and suggestions preferably to one
94
of the Bioperl mailing lists. Your participation is much appreciated.
96
bioperl-l@bioperl.org - General discussion
97
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
101
Report bugs to the Bioperl bug tracking system to help us keep track
102
the bugs and their resolution.
103
Bug reports can be submitted via the web:
105
http://bugzilla.bioperl.org
107
=head1 AUTHOR - Chris Mungall
109
Email cjm@fruitfly.org
113
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
117
# Let the code begin...
119
package Bio::SeqIO::chaos;
122
use Bio::SeqFeature::Generic;
124
use Bio::Seq::SeqFactory;
125
use Bio::Annotation::Collection;
126
use Bio::Annotation::Comment;
127
use Bio::Annotation::Reference;
128
use Bio::Annotation::DBLink;
129
use Bio::SeqFeature::Tools::TypeMapper;
130
use Bio::SeqFeature::Tools::FeatureNamer;
131
use Bio::SeqFeature::Tools::IDHandler;
132
use Data::Stag qw(:all);
134
use base qw(Bio::SeqIO);
136
our $TM = 'Bio::SeqFeature::Tools::TypeMapper';
137
our $FNAMER = 'Bio::SeqFeature::Tools::FeatureNamer';
138
our $IDH = 'Bio::SeqFeature::Tools::IDHandler';
141
my($self,@args) = @_;
143
$self->SUPER::_initialize(@args);
144
if( ! defined $self->sequence_factory ) {
145
$self->sequence_factory(new Bio::Seq::SeqFactory
146
(-verbose => $self->verbose(),
147
-type => 'Bio::Seq::RichSeq'));
149
my $wclass = $self->default_handler_class;
150
$self->handler($wclass);
152
$self->handler->fh($self->_fh);
154
$self->{_end_of_data} = 0;
155
$self->_type_by_id_h({});
157
my $ppt = localtime $t;
158
$self->handler->S("chaos");
159
$self->handler->ev(chaos_metadata=>[
161
[chaos_flavour=>'bioperl'],
162
[feature_unique_key=>'feature_id'],
163
[equiv_chado_release=>'chado_1_01'],
164
[export_unixtime=>$t],
165
[export_localtime=>$ppt],
166
[export_host=>$ENV{HOST}],
167
[export_user=>$ENV{USER}],
168
[export_perl5lib=>$ENV{PERL5LIB}],
169
[export_program=>$0],
170
[export_module=>'Bio::SeqIO::chaos'],
171
[export_module_cvs_id=>'$Id: chaos.pm,v 1.10.4.1 2006/10/02 23:10:28 sendu Exp $'],
179
$self->end_of_data();
180
$self->SUPER::DESTROY();
185
return if $self->{_end_of_data};
186
$self->{_end_of_data} = 1;
187
$self->handler->E("chaos");
190
sub default_handler_class {
191
return Data::Stag->makehandler;
194
=head2 context_namespace
196
Title : context_namespace
197
Usage : $obj->context_namespace($newval)
200
Returns : value of context_namespace (a scalar)
201
Args : on set, new value (a scalar or undef, optional)
203
IDs will be preceeded with the context namespace
207
sub context_namespace{
210
return $self->{'context_namespace'} = shift if @_;
211
return $self->{'context_namespace'};
218
Usage : $seq = $stream->next_seq()
219
Function: returns the next sequence in the stream
220
Returns : Bio::Seq object
226
my ($self,@args) = @_;
227
my $seq = $self->sequence_factory->create
229
# '-verbose' =>$self->verbose(),
232
# -annotation => $annotation,
233
# -features => \@features
240
$self->{_handler} = shift if @_;
241
return $self->{_handler};
248
Usage : $stream->write_seq($seq)
249
Function: writes the $seq object (must be seq) to the stream
250
Returns : 1 for success and 0 for error
257
my ($self,$seq) = @_;
259
if( !defined $seq ) {
260
$self->throw("Attempting to write with no seq!");
263
if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
264
$self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
267
# get a handler - must inherit from Data::Stag::BaseHandler;
268
my $w = $self->handler;
271
### $w->S("chaos_block");
273
my $seq_chaos_feature_id;
275
# different seq objects have different version accessors -
277
my $version = $seq->can('seq_version') ? $seq->seq_version : $seq->version;
279
my $accversion = $seq->accession_number;
281
$accversion .= ".$version";
285
$seq_chaos_feature_id = $accversion;
288
$seq_chaos_feature_id = $self->get_chaos_feature_id($seq);
289
$accversion = $seq_chaos_feature_id;
292
# All ids must have a namespace prefix
293
if ($seq_chaos_feature_id !~ /:/) {
294
$seq_chaos_feature_id = "GenericSeqDB:$seq_chaos_feature_id";
297
# if ($seq->accession_number eq 'unknown') {
298
# $seq_chaos_feature_id = $self->get_chaos_feature_id('contig', $seq);
302
if ($seq->desc =~ /haplotype(.*)/i) {
303
# yikes, no consistent way to specify haplotype in gb
305
$haplotype =~ s/\s+/_/g;
306
$haplotype =~ s/\W+//g;
311
if (my $spec = $seq->species) {
312
my ($species, $genus, @class) = $spec->classification();
313
$OS = "$genus $species";
314
if (my $ssp = $spec->sub_species) {
317
$self->genus_species($OS);
318
if( $spec->common_name ) {
319
my $common = $spec->common_name;
320
# genbank parser sets species->common_name to
321
# be "Genus Species (common name)" which is wrong;
322
# we will correct for this; if common_name is set
323
# correctly then carry on
324
if ($common =~ /\((.*)\)/) {
327
$OS .= " (".$common.")";
331
$self->organismstr($OS);
334
# genus_species is part of uniquename - add haplotype
335
# to make it genuinely unique
336
$self->genus_species($self->genus_species .= " $haplotype");
339
my $uname = $self->make_uniquename($self->genus_species, $accversion);
341
# data structure representing the core sequence for this record
343
Data::Stag->new(feature=>[
344
[feature_id=>$seq_chaos_feature_id],
345
[dbxrefstr=>'SEQDB:'.$accversion],
346
[name=>$seq->display_name],
347
[uniquename=>$uname],
348
[residues=>$seq->seq],
354
$seqnode->set_type('databank_entry');
357
$prop{$_} = $seq->$_() if $seq->can($_);
358
} qw(desc keywords division molecule is_circular);
359
$prop{dates} = join("; ", $seq->get_dates) if $seq->can("get_dates");
361
local($^W) = 0; # supressing warnings about uninitialized fields.
365
foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
370
$seqnode->add_featureprop([[type=>'haplotype'],[value=>$haplotype]])
372
foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
373
$seqnode->add_featureprop([[type=>'comment'],[value=>$comment->text]]);
376
$seqnode->set_organismstr($OS);
379
my @sfs = $seq->get_SeqFeatures;
381
# genbank usually includes a 'source' feature - we just
382
# migrate the data from this to the actual source feature
383
my @sources = grep {$_->primary_tag eq 'source'} @sfs;
384
@sfs = grep {$_->primary_tag ne 'source'} @sfs;
385
$self->throw(">1 source types") if @sources > 1;
386
my $source = shift @sources;
389
my $tempw = Data::Stag->makehandler;
390
$self->write_sf($source, $seq_chaos_feature_id, $tempw);
391
my $snode = $tempw->stag;
392
$seqnode->add($_->name, $_->data)
393
foreach ($snode->get_featureprop,
394
$snode->get_feature_dbxref);
399
# throw the writer an event
402
$seqnode = undef; # free memory
404
# make events for all the features within the record
405
foreach my $sf ( @sfs ) {
406
$FNAMER->name_feature($sf);
407
$FNAMER->name_contained_features($sf);
408
$self->write_sf($sf, $seq_chaos_feature_id);
412
### $w->E("chaos_block");
420
return $self->{'organismstr'} = shift if @_;
421
return $self->{'organismstr'};
428
return $self->{'genus_species'} = shift if @_;
429
return $self->{'genus_species'};
436
$self->{_type_by_id_h} = shift if @_;
437
return $self->{_type_by_id_h};
443
# writes a seq feature
449
my $seq_chaos_feature_id = shift;
450
my $w = shift || $self->handler;
454
lc($_)=>[$sf->each_tag_value($_)]
457
my $loc = $sf->location;
458
my $name = $FNAMER->generate_feature_name($sf);
459
my $type = $sf->primary_tag;
461
# The CDS (eg in a genbank feature) implicitly represents
463
$type =~ s/CDS/polypeptide/;
465
my @subsfs = $sf->sub_SeqFeature;
467
my $sid = $loc->is_remote ? $loc->seq_id : $seq_chaos_feature_id;
469
my $CREATE_SPLIT_SFS = 0;
471
if($CREATE_SPLIT_SFS &&
472
$loc->isa("Bio::Location::SplitLocationI") ) {
473
# turn splitlocs into subfeatures
478
Bio::SeqFeature::Generic->new(
483
-primary=>$self->subpartof($type),
486
$ssf->location->is_remote(1);
487
$ssf->location->seq_id($_->seq_id);
490
} $loc->each_Location);
492
elsif( $loc->isa("Bio::Location::RemoteLocationI") ) {
493
# turn splitlocs into subfeatures
497
Bio::SeqFeature::Generic->new(
498
# -name=>$name.'.'.$n++,
502
-primary=>$self->subpartof($type),
504
} $loc->each_Location);
507
my ($beg, $end, $strand) = $self->bp2ib($loc);
510
print Dumper $sf, $loc;
511
$self->throw("($beg, $end, $strand) - no strand\n");
518
[srcfeature_id=>$sid],
525
my $feature_id = $self->get_chaos_feature_id($sf);
527
delete $props{id} if $props{id};
528
# do something with genbank stuff
529
my $pid = $props{'protein_id'};
530
my $tn = $props{'translation'};
531
my @xrefs = @{$props{'db_xref'} || []};
533
push(@xrefs, "protein:$pid->[0]");
536
my $org = $props{organism} ? $props{organism}->[0] : undef;
537
if (!$org && $self->organismstr) {
538
$org = $self->organismstr;
540
my $uname = $name ? $name.'/'.$feature_id : $feature_id;
541
if ($self->genus_species && $name) {
542
$uname = $self->make_uniquename($self->genus_species, $name);
545
$self->throw("cannot make uniquename for $feature_id $name");
547
$self->_type_by_id_h->{$feature_id} = $type;
550
[feature_id=>$feature_id],
551
$name ? ([name=>$name]) : (),
552
[uniquename=>$uname],
554
$tn ? ([residues=>$tn->[0]],
555
[seqlen=>length($tn->[0])],
556
#####[md5checksum=>md5checksum($tn->[0])],
558
$org ? ([organismstr=>$org]) : (),
569
map { [featureprop=>[[type=>$k],[value=>$_],[rank=>$rank++]]] } @{$props{$k}}
576
# strand is always determined by FIRST feature listed
577
# (see genbank entry for trans-spliced mod(mdg4) AE003734)
578
my $strand = $subsfs[0];
580
# almost all the time, all features are on same strand
581
my @sfs_on_main_strand = grep {$_->strand == $strand} @subsfs;
582
my @sfs_on_other_strand = grep {$_->strand != $strand} @subsfs;
584
sort_by_strand($strand, \@sfs_on_main_strand);
585
sort_by_strand(0-$strand, \@sfs_on_other_strand);
586
@subsfs = (@sfs_on_main_strand, @sfs_on_other_strand);
588
foreach my $ssf (@subsfs) {
589
my $ssfid = $self->write_sf($ssf, $sid);
590
#my $rtype = 'part_of';
592
$TM->get_relationship_type_by_parent_child($sf,$ssf);
593
if ($ssf->primary_tag eq 'CDS') {
594
$rtype = 'derives_from';
596
$w->ev(feature_relationship=>[
597
[subject_id=>$ssfid],
598
[object_id=>$feature_id],
606
# parents not stored as bioperl containment hierarchy
607
my @parent_ids = @{$props{parent} || []};
608
foreach my $parent_id (@parent_ids) {
610
$self->_type_by_id_h->{$parent_id} || 'unknown';
612
$TM->get_relationship_type_by_parent_child($ptype,$type);
613
$w->ev(feature_relationship=>[
614
[subject_id=>$feature_id],
615
[object_id=>$parent_id],
626
my $strand = shift || 1;
628
@$sfs = sort { ($a->start <=> $b->start) * $strand } @$sfs;
632
sub make_uniquename {
648
sub get_chaos_feature_id {
653
if ($ob->isa("Bio::SeqI")) {
654
$id = $ob->accession_number . '.' . ($ob->can('seq_version') ? $ob->seq_version : $ob->version);
657
$ob->isa("Bio::SeqFeatureI") || $self->throw("$ob must be either SeqI or SeqFeatureI");
659
if ($ob->primary_id) {
660
$id = $ob->primary_id;
664
$id = $IDH->generate_unique_persistent_id($ob);
668
$id = "$ob"; # last resort - use memory pointer ref
669
# will not be persistent, but will be unique
674
if ($ob->isa("Bio::SeqFeatureI")) {
675
$id = $IDH->generate_unique_persistent_id($ob);
678
$self->throw("Cannot generate a unique persistent ID for a Seq without either primary_id or accession");
682
$id = $self->context_namespace ? $self->context_namespace . ":" . $id : $id;
688
# interbase and directional semantics
693
ref($loc) eq "ARRAY" ? (@$loc) : ($loc->start, $loc->end, $loc->strand);
698
return ($s, $e, $str || 1);
703
my $type = 'partof_'.shift;
704
$type =~ s/partof_CDS/CDS_exon/;
705
$type =~ s/partof_protein/CDS_exon/;
706
$type =~ s/partof_polypeptide/CDS_exon/;
707
$type =~ s/partof_\w*RNA/exon/;