153
153
# and the tagname for, well, the Annotation tagname. A few are treated differently
154
154
# based on the type of data stored (Reference data in particular).
157
'AC' => 'Method/accession',
159
'DE' => 'Method/description',
160
'AU' => 'SimpleValue/-value/record_authors',
161
'SE' => 'SimpleValue/-value/seed_source',
162
'GA' => 'SimpleValue/-value/gathering_threshold',
163
'NC' => 'SimpleValue/-value/noise_cutoff',
164
'TC' => 'SimpleValue/-value/trusted_cutoff',
165
'TP' => 'SimpleValue/-value/entry_type',
166
'SQ' => 'SimpleValue/-value/num_sequences',
167
'PI' => 'SimpleValue/-value/previous_ids',
168
'DC' => 'Comment/-text/database_comment',
169
'CC' => 'Comment/-text/alignment_comment',
170
# DBLink, treated differently
171
'DR' => 'DBLink/-value/aln_dblink',
173
'AM' => 'SimpleValue/-value/build_method',
174
'NE' => 'SimpleValue/-value/pfam_family_accession',
175
'NL' => 'SimpleValue/-value/sequence_start_stop',
176
# Rfam-specific GF lines
177
'SS' => 'SimpleValue/-value/sec_structure_source',
178
# Reference objects mapped differently
179
'RN' => '-number', # reference number is dumped
185
# Build model mapped differently
159
'DE' => ['DESCRIPTION' => 'DESCRIPTION'],
160
'AU' => ['RECORD_AUTHORS' => 'RECORD_AUTHORS'],
161
'SE' => 'SEED_SOURCE',
162
'BM' => 'BUILD_COMMAND',
163
'GA' => 'GATHERING_THRESHOLD',
164
'NC' => 'NOISE_CUTOFF',
165
'TC' => 'TRUSTED_CUTOFF',
166
'TP' => 'ENTRY_TYPE',
167
'SQ' => 'NUM_SEQUENCES',
168
'PI' => 'PREVIOUS_IDS',
169
'DC' => ['DATABASE_COMMENT' => 'DATABASE_COMMENT'],
171
'RN' => ['REFERENCE' => 'REFERENCE'],
172
'RC' => ['REFERENCE' => 'COMMENT'],
173
'RM' => ['REFERENCE' => 'PUBMED'],
174
'RT' => ['REFERENCE' => 'TITLE'],
175
'RA' => ['REFERENCE' => 'AUTHORS'],
176
'RL' => ['REFERENCE' => 'JOURNAL'],
177
'CC' => ['ALIGNMENT_COMMENT' => 'ALIGNMENT_COMMENT'],
179
'AM' => 'BUILD_METHOD',
180
'NE' => 'PFAM_FAMILY_ACCESSION',
181
'NL' => 'SEQ_START_STOP',
182
# Rfam-specific GF lines
183
#'SS' => 'SEC_STRUCTURE_SOURCE',
184
'SEQUENCE' => 'SEQUENCE'
187
my %CONCATENATE = map {$_ => 1} qw(DE AU DC RC RT RA RL CC);
189
189
# this is the order that annotations are written
190
190
our @WRITEORDER = qw(accession
267
299
my $self = shift;
270
my ($start, $end, $id, $name, $seqname, $seq, $count, $tag, $data);
272
my ($refct, $bct, $lnkct) = (0,0,0);
274
my (%align, %accession, %desc, %seq_meta, %aln_meta, %annotation);
276
# in stockholm format, every non-blank line that does not start
277
# with '#=' is an alignment segment; the '#=' lines are mark up lines.
278
# Of particular interest are the '#=GF <name/st-ed> AC <accession>'
279
# lines, which give accession numbers for each segment
281
my $aln = Bio::SimpleAlign->new(-source => 'stockholm');
282
while( defined($line = $self->_readline) ) {
283
next unless $line =~ /\w+/;
284
if ($line =~ /^#\s*STOCKHOLM\s+/) {
301
my $handler = $self->alignhandler;
302
# advance to alignment header
303
while( defined(my $line = $self->_readline) ) {
304
if ($line =~ m{^#\s*STOCKHOLM\s+}xmso) {
287
$self->throw("Not Stockholm format: Expecting \"# STOCKHOLM 1.0\"; Found \"$_\"");
292
while( defined($line = $self->_readline) ) {
294
next if $line =~ /^\s+$/;
296
# Double slash (//) signals end of file.
297
last if $line =~ m{^//};
299
# GF/GS lines, by convention, should be at the top of the alignment
300
if ($line =~ m{^\#=GF\s+(\S+?)\s+([^\n]*)$}xms) {
302
# alignment annotation
303
($tag, $data) = ($1, $2);
304
if (exists $READMAP{$tag}) {
306
# reference data (multi line)
307
if (index($tag, 'R') == 0) {
308
# comments come before numbering, tricky
309
$refct++ if ( ($tag eq 'RN' && !$seen_rc) || $tag eq 'RC');
310
$seen_rc = 1 if $tag eq 'RC';
312
next READLINE if $tag eq 'RN';
314
$annotation{ 'reference' }->[$refct-1]->{ $READMAP{$tag} } .= $data.' ';
316
# Build commands (single line)
317
} elsif ($tag eq 'BM') {
318
# # build cmd parameter
319
$annotation{ 'build_command' }->[$bct]->{ $READMAP{$tag} } = $data;
322
# DBLinks (single line)
323
} elsif ($tag eq 'DR') {
324
my ($dbase, $uid, $extra) = split /\s*;\s*/ , $data, 3;
326
$ref->{'-database'} = $dbase;
327
$ref->{'-primary_id'} = ($dbase eq 'URL') ? $uid : uc $uid;
328
$ref->{'-comment'} = $extra if $extra;
329
# # dblink parameter list
330
$annotation{ 'aln_dblink' }->[$lnkct] = $ref;
333
# Everything else (single and multi line)
335
# # param/-value/tagname
336
$annotation{ $READMAP{$tag} } .= $data.' ';
309
$self->{block_line} = 0;
310
# go into main body of alignment
311
my ($data_chunk, $isa_primary, $name, $alphabet);
313
while( defined(my $line = $self->_readline) ) {
314
# only blank lines are in between blocks, so reset block line
315
my ($primary_tag, $secondary_tag, $data, $nse, $feat, $align, $concat);
316
if ($line =~ m{^\s*$}xmso) {
317
$self->{block_line} &&= 0;
322
if (index($line, '//') == 0) {
324
$handler->data_handler($data_chunk);
326
$handler->data_handler({ALIGNMENT => 1,
328
DATA => $self->alphabet})
332
elsif ($line =~ m{^\#=([A-Z]{2})\s+([^\n]+?)\s*$}xmso) {
333
($primary_tag, $data) = ($1, $2);
334
if ($primary_tag eq 'GS' || $primary_tag eq 'GR') {
335
($nse, $feat, $data) = split(/\s+/, $data, 3);
337
($feat, $data) = split(/\s+/, $data, 2);
339
$align = ($primary_tag eq 'GF' || $primary_tag eq 'GR') ? 1 : 0;
341
elsif ($line =~ m{^([^\#]\S+)\s+([A-Za-z.\-\*]+)\s*}) {
342
$self->{block_line}++;
343
($feat, $nse, $data) = ('SEQUENCE', $1, $2);
346
$self->debug("Missed line : $line\n");
348
$primary_tag ||= ''; # when no #= line is present
351
# array refs where the two values are equal indicate the start of a
352
# primary chunk of data, otherwise it is to be folded into the last
353
# data chunk under a secondary tag. These are also concatenated
354
# to previous values if the
356
if (exists($MAPPING{$feat}) && ref $MAPPING{$feat} eq 'ARRAY') {
357
($name, $secondary_tag, $isa_primary) = ( $MAPPING{$feat}->[0] eq $MAPPING{$feat}->[1] ) ?
358
($MAPPING{$feat}->[0], 'DATA', 1) :
359
(@{ $MAPPING{$feat} }, 0) ;
360
$concat = $last_feat eq $feat ? 1 : 0;
361
} elsif (exists($MAPPING{$feat})) {
362
($name, $secondary_tag, $isa_primary) = ($MAPPING{$feat}, 'DATA', 1);
363
# catch alphabet here if possible
364
if ($align && $name eq 'ACCESSION' && !$self->alphabet) {
365
if ($data =~ m{^(P|R)F}) {
366
$self->alphabet($1 eq 'R' ? 'rna' : $1 eq 'P' ? 'protein' : undef );
340
# unknown or custom data treated with simplevalue objects
341
#$self->debug("Unknown tag: $tag:\t$data\n");
342
$annotation{ 'custom' }->{ $tag } .= $data.' ';
345
} elsif( $line =~ m{^\#=GS\s+(\S+)\s+(\w{2})\s+(\S+)}xms ) {
346
# sequence annotation and data
347
($id, $tag, $data) = ($1, $2, $3);
349
$accession{$id} .= $data;
350
} elsif ($tag eq 'DE') {
353
# Bio::Seq::Meta is not AnnotationI, so can't add seq-based
354
# Annotations yet; uncomment to see what is passed by
356
# $self->debug("Missed data: $entry");
358
} elsif( $line =~ m{^\#=GR\s+(\S+)\s+(\S+)\s+([^\n]+)} ) {
359
# meta strings per sequence
360
($name, $tag, $data) = ($1, $2, $3);
361
$seq_meta{$name}->{$tag} .= $data;
362
} elsif( $line =~ m{^\#=GC\s+(\S+)\s+([^\n]+)}xms ) {
363
# meta strings per alignment
364
($tag, $data) = ($1, $2);
365
$aln_meta{$tag} .= $data;
366
} elsif( $line =~ m{^([^\#]\S+)\s+([A-Za-z.\-\*]+)\s*}xms ) {
367
($name,$seq) = ($1,$2);
368
if( ! exists $align{$name} ) {
371
$align{$name} .= $seq;
373
# debugging to catch missed data; uncomment to turn on
374
#$self->debug("Missed Data: $line");
378
# ok... now we can make the sequences
380
for my $name ( @c2name ) {
381
if( $name =~ m{(\S+)/(\d+)-(\d+)}xms ) {
382
($seqname, $start, $end) = ($1, $2, $3);
386
$end = length($align{$name});
388
$seq = Bio::Seq::Meta->new
389
('-seq' => $align{$name},
390
'-display_id' => $seqname,
393
'-description' => $desc{$name},
394
'-accession_number' => $accession{$name}
396
if (exists $seq_meta{$name}) {
397
for my $tag (sort keys %{ $seq_meta{$name} }) {
398
$seq->named_meta($tag, $seq_meta{$name}->{$tag});
404
# add meta strings w/o sequence for consensus meta data
405
my $ameta = Bio::Seq::Meta->new();
406
for my $tag (sort keys %aln_meta) {
407
$ameta->named_meta($tag, $aln_meta{$tag});
410
$aln->consensus_meta($ameta);
412
# Make the annotation collection...
414
my $coll = Bio::Annotation::Collection->new();
416
for my $tag (sort keys %annotation) {
419
if (!ref($annotation{$tag})) {
420
my ($atype, $aparam, $tagname) = split q(/), $tag;
421
# remove trailing newline, convert internal newlines to spaces
422
$annotation{$tag} =~ s{\s+$}{}g;
423
# split the READTYPE map to determine Annotation type, parameters, etc.
424
if ($atype eq 'Method') {
425
$aln->$aparam($annotation{$tag});
427
my $factory = Bio::Annotation::AnnotationFactory->new(
428
-type => "Bio::Annotation::$atype");
429
$coll->add_Annotation
430
($tagname, $factory->create_object($aparam => $annotation{$tag}));
433
} elsif ($tag eq 'custom') {
434
my $factory = Bio::Annotation::AnnotationFactory->new(
435
-type => "Bio::Annotation::SimpleValue");
436
for my $key (sort keys %{ $annotation{$tag} }) {
437
$coll->add_Annotation(
438
$tag, $factory->create_object(-tagname => $key,
439
-value => $annotation{$tag}->{$key}));
442
# more complex annotations
445
my $atype = #($tag eq 'custom') ? 'SimpleValue' :
446
($tag eq 'reference') ? 'Reference' :
447
($tag eq 'aln_dblink') ? 'DBLink' :
448
($tag eq 'build_command') ? 'SimpleValue' :
449
'BadValue'; # this will cause the factory to choke
450
$self->throw("Bad tag value : $tag.") if $atype eq 'BadValue';
451
my $factory = Bio::Annotation::AnnotationFactory->new(
452
-type => "Bio::Annotation::$atype");
453
while (my $data = shift @{ $annotation{$tag} }) {
455
# remove trailing spaces for concatenated data
456
my %clean_data = map {
457
$data->{$_} =~ s{\s+$}{}g;
460
my $ann = $factory->create_object(%clean_data);
461
$coll->add_Annotation($tag, $ann);
467
#$self->debug(Dumper($coll));
470
$aln->annotation($coll);
472
# If $end <= 0, we have either reached the end of
473
# file in <fh> or we have encountered some other error
474
return if ($end <= 0);
370
$name = ($primary_tag eq 'GR') ? 'NAMED_META' :
371
($primary_tag eq 'GC') ? 'CONSENSUS_META' :
373
($secondary_tag, $isa_primary) = ('DATA', 1);
376
# Since we can't determine whether data should be passed into the
377
# Handler until the next round (due to concatenation and combining
378
# data), we always check for the presence of the last chunk when the
379
# occasion calls for it (i.e. when the current data string needs to go
380
# into a new data chunk). If the data needs to be concatenated it is
381
# flagged above and checked below (and passed by if the conditions
384
# We run into a bit of a fencepost problem, (one chunk left over at
385
# the end); that is taken care of above when the end of the record is
388
if ($isa_primary && defined $data_chunk && !$concat) {
389
$handler->data_handler($data_chunk);
392
$data_chunk->{NAME} = $name; # used for the handler
393
$data_chunk->{ALIGNMENT} = $align; # flag that determines chunk destination
394
$data_chunk->{$secondary_tag} .= (defined($data_chunk->{$secondary_tag})) ?
396
$data_chunk->{NSE} = $nse if $nse;
397
if ($name eq 'SEQUENCE' || $name eq 'NAMED_META' || $name eq 'CONSENSUS_META') {
398
$data_chunk->{BLOCK_LINE} = $self->{block_line};
399
$data_chunk->{META_TAG} = $feat if ($name ne 'SEQUENCE');
404
my $aln = $handler->build_alignment;
405
$handler->reset_parameters;
481
411
Title : write_aln
538
483
next ANNOTATIONS;
539
} elsif ($tag eq 'XX') { # custom
485
elsif ($tag eq 'XX') { # custom
540
486
my $newtag = $ann->tagname;
541
487
$alntag = sprintf('%-10s',$aln_ann.$newtag);
543
} elsif ($tag eq 'SQ') {
488
$data = $ann->display_text;
490
elsif ($tag eq 'SQ') {
544
491
# use the actual number, not the stored Annotation data
545
492
$alntag = sprintf('%-10s',$aln_ann.$tag);
546
493
$data = $aln->no_sequences;
548
$alntag = sprintf('%-10s',$aln_ann.$tag);
495
elsif ($tag eq 'DR') {
496
$alntag = sprintf('%-10s',$aln_ann.$tag);
497
my $db = uc $ann->database;
498
my $cb = exists $LINK_CB{$db} ? $LINK_CB{$db} : $LINK_CB{_DEFAULT_};
499
$data = $ann->display_text($cb);
502
$alntag = sprintf('%-10s',$aln_ann.$tag);
503
$data = ref $ann ? $ann->display_text : $ann;
551
506
$text = wrap($alntag, $alntag, $data);
552
$self->_print("$text\n") or return 0;
507
$self->_print("$text\n") || return 0;
511
#=GS <seq-id> AC xxxxxx
513
for my $seq ($aln->each_seq) {
514
if (my $acc = $seq->accession_number) {
515
my $text = sprintf("%-4s%-22s%-3s%s\n",$seq_ann, $seq->get_nse, $tag, $acc);
516
$self->_print($text) || return 0;
520
#=GS <seq-id> DR xxxxxx
522
for my $sf ($aln->get_SeqFeatures) {
523
if (my @links = $sf->annotation->get_Annotations('dblink')) {
524
for my $link (@links) {
525
my $db = uc $link->database;
526
my $cb = exists $LINK_CB{$db} ? $LINK_CB{$db} : $LINK_CB{_DEFAULT_};
527
my $text = sprintf("%-4s%-22s%-3s%s\n",$seq_ann,
528
$sf->entire_seq->get_nse,
530
$link->display_text($cb));
531
$self->_print($text) || return 0;
536
$self->spaces && $self->_print("\n");
558
537
# now the sequences...
539
my $blocklen = $self->line_length;
540
my $maxlen = $aln->maxdisplayname_length() + 3;
541
my $metalen = $aln->max_metaname_length() || 0;
544
my $alnlen = $aln->length;
545
while ($blockstart < $alnlen) {
546
my $subaln = $aln->slice($blockstart, $blockstart+$blocklen-1 ,1);
547
$self->_print_seqs($subaln,$maxlen,$metalen);
548
$blockstart += $blocklen;
549
$self->_print("\n") unless $blockstart >= $alnlen;
552
$self->_print_seqs($aln,$maxlen,$metalen);
555
$self->_print("//\n") || return 0;
557
$self->flush() if $self->_flush_on_write && defined $self->_fh;
567
Usage : $obj->line_length($newval)
568
Function: Set the alignment output line length
569
Returns : value of line_length
570
Args : newvalue (optional)
575
my ( $self, $value ) = @_;
576
if ( defined $value ) {
577
$self->{'_line_length'} = $value;
579
return $self->{'_line_length'};
585
Usage : $obj->alphabet('dna')
586
Function: Set the sequence data alphabet
587
Returns : sequence data type
588
Args : newvalue (optional)
593
my ( $self, $value ) = @_;
594
if ( defined $value ) {
595
$self->throw("Invalid alphabet $value") unless $value eq 'rna' || $value eq 'protein' || $value eq 'dna';
596
$self->{'_alphabet'} = $value;
598
return $self->{'_alphabet'};
604
Usage : $obj->spaces(1)
605
Function: Set the 'spaces' flag, which prints extra newlines between the
606
header and the annotation and the annotation and the alignment
607
Returns : sequence data type
608
Args : newvalue (optional)
614
return $self->{'_spaces'} = shift if @_;
615
return $self->{'_spaces'};
621
Usage : $stream->alignhandler($handler)
622
Function: Get/Set the Bio::HandlerBaseI object
623
Returns : Bio::HandlerBaseI
624
Args : Bio::HandlerBaseI
629
my ($self, $handler) = @_;
631
$self->throw("Not a Bio::HandlerBaseI") unless
632
ref($handler) && $handler->isa("Bio::HandlerBaseI");
633
$self->{'_alignhandler'} = $handler;
635
return $self->{'_alignhandler'};
641
Usage : $stream->alignwriter($writer)
642
Function: Get/Set the writer object
649
shift->throw_not_implemented;
652
############# PRIVATE INIT/HANDLER METHODS #############
655
my ($self, $aln, $maxlen, $metalen) = @_;
657
my ($seq_meta, $aln_meta) = ('#=GR','#=GC');
560
658
# modified (significantly) from AlignIO::pfam
562
660
my ($namestr,$seq,$add);
564
662
# pad extra for meta lines
565
my $maxlen = $aln->maxdisplayname_length() + 5;
566
my $metalen = $aln->max_metaname_length() || 0;
568
664
for $seq ( $aln->each_seq() ) {
569
$namestr = $aln->displayname($seq->get_nse());
570
$self->_print(sprintf("%-*s %s\n",$maxlen+$metalen, $namestr, $seq->seq())) or return 0;
665
my ($s, $e, $str) = ($seq->start, $seq->end, $seq->strand);
666
$namestr = $seq->get_nse();
667
$self->_print(sprintf("%-*s%s\n",$maxlen+$metalen,
669
$seq->seq())) || return 0;
571
670
if ($seq->isa('Bio::Seq::MetaI')) {
572
671
for my $mname ($seq->meta_names) {
573
$self->_print(sprintf("%-*s%*s %s\n",$maxlen, $seq_meta.' '.$namestr, $metalen,
574
$mname, $seq->named_meta($mname))) or return 0;
672
$self->_print(sprintf("%-*s%s\n",$maxlen+$metalen,
673
$seq_meta.' '.$namestr.' '.$mname,
674
$seq->named_meta($mname))) || return 0;