169
my ($self,@args) = @_;
170
my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
171
$date, $comment, @date_arr);
173
my ($annotation, %params, @features) = ( new Bio::Annotation::Collection);
175
$line = $self->_readline; # This needs to be before the first eof() test
177
if( !defined $line ) {
178
return undef; # no throws - end of file
181
if( $line =~ /^\s+$/ ) {
182
while( defined ($line = $self->_readline) ) {
183
$line =~/^\S/ && last;
186
if( !defined $line ) {
187
return undef; # end of file
189
$line =~ /^ID\s+\S+/ || $self->throw("EMBL stream with no ID. Not embl in my book");
190
$line =~ /^ID\s+(\S+)\s+\S+\;\s+([^;]+)\;\s+(\S+)\;/;
195
$name = "unknown id";
199
# this is important to have the id for display in e.g. FTHelper, otherwise
200
# you won't know which entry caused an error
202
if ( $mol =~ /circular/ ) {
203
$params{'-is_circular'} = 1;
204
$mol =~ s|circular ||;
210
elsif ($mol =~ /RNA/) {
213
elsif ($mol =~ /AA/) {
220
# $self->warn("not parsing upper annotation in EMBL file yet!");
165
my ($self,@args) = @_;
166
my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
167
$date, $comment, @date_arr);
169
my ($annotation, %params, @features) =
170
new Bio::Annotation::Collection;
172
$line = $self->_readline;
173
# This needs to be before the first eof() test
175
if( !defined $line ) {
176
return; # no throws - end of file
179
if( $line =~ /^\s+$/ ) {
180
while( defined ($line = $self->_readline) ) {
181
$line =~/^\S/ && last;
183
# return without error if the whole next sequence was just a single
184
# blank line and then eof
188
# no ID as 1st non-blank line, need short circuit and exit routine
189
$self->throw("EMBL stream with no ID. Not embl in my book")
190
unless $line =~ /^ID\s+\S+/;
192
# At this point we are sure that $line contains an ID header line
194
if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
196
# New style header (EMBL Release >= 87, after June 2006)
200
# ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
201
# This regexp comes from the new2old.pl conversion script, from EBI
202
$line =~ m/^ID (\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./;
203
($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
206
$params{'-seq_version'} = $sv;
207
$params{'-version'} = $sv;
210
if ($topology eq "circular") {
211
$params{'-is_circular'} = 1;
218
elsif ($mol =~ /RNA/) {
221
elsif ($mol =~ /AA/) {
228
# Old style header (EMBL Release < 87, before June 2006)
229
($name, $mol, $div) = ($line =~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/);
232
if ( $mol =~ /circular/ ) {
233
$params{'-is_circular'} = 1;
234
$mol =~ s|circular ||;
240
elsif ($mol =~ /RNA/) {
243
elsif ($mol =~ /AA/) {
250
unless( defined $name && length($name) ) {
251
$name = "unknown_id";
254
# $self->warn("not parsing upper annotation in EMBL file yet!");
221
255
my $buffer = $line;
225
BEFORE_FEATURE_TABLE :
226
until( !defined $buffer ) {
229
# Exit at start of Feature table
232
# Description line(s)
233
if (/^DE\s+(\S.*\S)/) {
234
$desc .= $desc ? " $1" : $1;
238
if( /^AC\s+(.*)?/ ) {
239
my @accs = split(/[; ]+/, $1); # allow space in addition
240
$params{'-accession_number'} = shift @accs
241
unless defined $params{'-accession_number'};
242
push @{$params{'-secondary_accessions'}}, @accs;
246
if( /^SV\s+\S+\.(\d+);?/ ) {
249
$params{'-seq_version'} = $sv;
250
$params{'-version'} = $sv;
253
#date (NOTE: takes last date line)
254
if( /^DT\s+(.+)$/ ) {
256
push @{$params{'-dates'}},$date;
260
if( /^KW (.*)\S*$/ ) {
261
my @kw = split(/\s*\;\s*/,$1);
263
push @{$params{'-keywords'}}, @kw;
266
# Organism name and phylogenetic information
268
my $species = $self->_read_EMBL_Species(\$buffer);
269
$params{'-species'}= $species;
274
my @refs = $self->_read_EMBL_References(\$buffer);
275
foreach my $ref ( @refs ) {
276
$annotation->add_Annotation('reference',$ref);
282
my @links = $self->_read_EMBL_DBLink(\$buffer);
283
foreach my $dblink ( @links ) {
284
$annotation->add_Annotation('dblink',$dblink);
289
elsif (/^CC\s+(.*)/) {
292
while (defined ($_ = $self->_readline) ) {
301
my $commobj = Bio::Annotation::Comment->new();
302
$commobj->text($comment);
303
$annotation->add_Annotation('comment',$commobj);
308
$buffer = $self->_readline;
257
BEFORE_FEATURE_TABLE :
258
until( !defined $buffer ) {
260
# Exit at start of Feature table
261
if( /^(F[HT]|SQ)/ ) {
262
$self->_pushback($_) if( $1 eq 'SQ' || $1 eq 'FT');
265
# Description line(s)
266
if (/^DE\s+(\S.*\S)/) {
267
$desc .= $desc ? " $1" : $1;
271
if( /^AC\s+(.*)?/ ) {
272
my @accs = split(/[; ]+/, $1); # allow space in addition
273
$params{'-accession_number'} = shift @accs
274
unless defined $params{'-accession_number'};
275
push @{$params{'-secondary_accessions'}}, @accs;
279
if( /^SV\s+\S+\.(\d+);?/ ) {
282
$params{'-seq_version'} = $sv;
283
$params{'-version'} = $sv;
286
#date (NOTE: takes last date line)
287
if( /^DT\s+(.+)$/ ) {
289
my ($date, $version) = split(' ', $line, 2);
290
$date =~ tr/,//d; # remove comma if new version
291
if ($version =~ /\(Rel\. (\d+), Created\)/xms ) {
292
my $release = Bio::Annotation::SimpleValue->new(
293
-tagname => 'creation_release',
296
$annotation->add_Annotation($release);
297
} elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/xms ) {
298
my $release = Bio::Annotation::SimpleValue->new(
299
-tagname => 'update_release',
302
$annotation->add_Annotation($release);
304
my $update = Bio::Annotation::SimpleValue->new(
305
-tagname => 'update_version',
308
$annotation->add_Annotation($update);
310
push @{$params{'-dates'}}, $date;
314
if( /^KW (.*)\S*$/ ) {
315
my @kw = split(/\s*\;\s*/,$1);
316
push @{$params{'-keywords'}}, @kw;
319
# Organism name and phylogenetic information
321
# pass the accession number so we can give an informative throw message if necessary
322
my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
323
$params{'-species'}= $species;
328
my @refs = $self->_read_EMBL_References(\$buffer);
329
foreach my $ref ( @refs ) {
330
$annotation->add_Annotation('reference',$ref);
336
my @links = $self->_read_EMBL_DBLink(\$buffer);
337
foreach my $dblink ( @links ) {
338
$annotation->add_Annotation('dblink',$dblink);
343
elsif (/^CC\s+(.*)/) {
346
while (defined ($_ = $self->_readline) ) {
355
my $commobj = Bio::Annotation::Comment->new();
356
$commobj->text($comment);
357
$annotation->add_Annotation('comment',$commobj);
362
$buffer = $self->_readline;
311
365
while( defined ($_ = $self->_readline) ) {
366
/^FT\s{3}\w/ && last;
318
372
if (defined($buffer) && $buffer =~ /^FT /) {
319
until( !defined ($buffer) ) {
320
my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
324
$ftunit->_generic_seqfeature($self->location_factory(), $name));
326
if( $buffer !~ /^FT/ ) {
373
until( !defined ($buffer) ) {
374
my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
378
$ftunit->_generic_seqfeature($self->location_factory(), $name);
380
# add taxon_id from source if available
381
if($params{'-species'} && ($feat->primary_tag eq 'source')
382
&& $feat->has_tag('db_xref')
383
&& (! $params{'-species'}->ncbi_taxid())) {
384
foreach my $tagval ($feat->get_tag_values('db_xref')) {
385
if(index($tagval,"taxon:") == 0) {
386
$params{'-species'}->ncbi_taxid(substr($tagval,6));
392
# add feature to list of features
393
push(@features, $feat);
395
if( $buffer !~ /^FT/ ) {
334
while( defined ($buffer) && $buffer =~ /^XX/ ) {
401
while( defined ($buffer) && $buffer =~ /^XX/ ) {
335
402
$buffer = $self->_readline();
338
if( $buffer =~ /^CO/ ) {
339
until( !defined ($buffer) ) {
340
my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
343
$ftunit->_generic_seqfeature($self->location_factory(),
346
if( $buffer !~ /^CO/ ) {
405
if( $buffer =~ /^CO/ ) {
406
until( !defined ($buffer) ) {
407
my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
410
$ftunit->_generic_seqfeature($self->location_factory(),
413
if( $buffer !~ /^CO/ ) {
351
418
if( $buffer !~ /^SQ/ ) {
352
while( defined ($_ = $self->_readline) ) {
419
while( defined ($_ = $self->_readline) ) {
358
424
while( defined ($_ = $self->_readline) ) {
364
430
my $seq = $self->sequence_factory->create
365
(-verbose => $self->verbose(),
369
-display_id => $name,
370
-annotation => $annotation,
372
-alphabet => $alphabet,
373
-features => \@features,
431
(-verbose => $self->verbose(),
435
-display_id => $name,
436
-annotation => $annotation,
438
-alphabet => $alphabet,
439
-features => \@features,
446
=head2 _write_ID_line
448
Title : _write_ID_line
449
Usage : $self->_write_ID_line($seq);
450
Function: Writes the EMBL Release 87 format ID line to the stream, unless
451
: there is a user-supplied ID line generation function in which
452
: case that is used instead.
453
: ( See Bio::SeqIO::embl::_id_generation_function(). )
455
Args : Bio::Seq object
461
my ($self, $seq) = @_;
464
# If there is a user-supplied ID generation function, use it.
465
if( $self->_id_generation_func ) {
466
$id_line = "ID " . &{$self->_id_generation_func}($seq) . "\nXX\n";
468
# Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
471
# The sequence name is supposed to be the primary accession number,
472
my $name = $seq->accession_number();
474
# but if it is not present, use the sequence ID.
478
$self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
480
# Use the sequence version, or default to 1.
481
my $version = $seq->version() || 1;
483
my $len = $seq->length();
485
# Taxonomic division.
487
if ( $seq->can('division') && defined($seq->division) && $self->_is_valid_division($seq->division) ) {
488
$div = $seq->division();
491
$div ||= 'UNC'; # 'UNC' is the EMBL division code for 'unclassified'.
495
# If the molecule type is a valid EMBL type, use it.
496
if ( $seq->can('molecule')
497
&& defined($seq->molecule)
498
&& $self->_is_valid_molecule_type($seq->molecule)
501
$mol = $seq->molecule();
503
# Otherwise, choose unassigned DNA or RNA based on the alphabet.
504
elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
505
my $alphabet =$seq->primary_seq->alphabet;
506
if ($alphabet eq 'dna') {
507
$mol ='unassigned DNA';
509
elsif ($alphabet eq 'rna') {
510
$mol='unassigned RNA';
512
elsif ($alphabet eq 'protein') {
513
$self->warn("Protein sequence found; EMBL is a nucleotide format.");
514
$mol='AA'; # AA is not a valid EMBL molecule type.
518
my $topology = 'linear';
519
if ($seq->is_circular) {
520
$topology = 'circular';
523
$mol ||= '';# 'unassigned'; ?
524
$id_line = "ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
525
$self->_print($id_line);
529
=head2 _is_valid_division
531
Title : _is_valid_division
532
Usage : $self->_is_valid_division($div)
533
Function: tests division code for validity
534
Returns : true if $div is a valid EMBL release 87 taxonomic division.
535
Args : taxonomic division code string
539
sub _is_valid_division {
540
my ($self, $division) = @_;
542
my %EMBL_divisions = (
543
"PHG" => 1, # Bacteriophage
544
"ENV" => 1, # Environmental Sample
547
"INV" => 1, # Invertebrate
548
"MAM" => 1, # Other Mammal
549
"VRT" => 1, # Other Vertebrate
550
"MUS" => 1, # Mus musculus
552
"PRO" => 1, # Prokaryote
553
"ROD" => 1, # Other Rodent
554
"SYN" => 1, # Synthetic
555
"UNC" => 1, # Unclassified
559
return exists($EMBL_divisions{$division});
562
=head2 _is_valid_molecule_type
564
Title : _is_valid_molecule_type
565
Usage : $self->_is_valid_molecule_type($mol)
566
Function: tests molecule type for validity
567
Returns : true if $mol is a valid EMBL release 87 molecule type.
568
Args : molecule type string
572
sub _is_valid_molecule_type {
573
my ($self, $moltype) = @_;
575
my %EMBL_molecule_types = (
587
"unassigned DNA" => 1,
588
"unassigned RNA" => 1
591
return exists($EMBL_molecule_types{$moltype});
381
596
Title : write_seq
382
597
Usage : $stream->write_seq($seq)
383
598
Function: writes the $seq object (must be seq) to the stream
384
Returns : 1 for success and 0 for error
599
Returns : 1 for success and undef for error
385
600
Args : array of 1 to n Bio::SeqI objects
391
my ($self,@seqs) = @_;
393
foreach my $seq ( @seqs ) {
394
$self->throw("Attempting to write with no seq!") unless defined $seq;
395
unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
396
$self->warn("$seq is not a SeqI compliant sequence object!")
397
if $self->verbose >= 0;
398
unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
399
$self->throw("$seq is not a PrimarySeqI compliant sequence object!");
402
my $str = $seq->seq || '';
406
my $len = $seq->length();
408
if ($seq->can('division') && defined $seq->division) {
409
$div = $seq->division();
413
if ($seq->can('molecule')) {
414
$mol = $seq->molecule();
415
$mol = 'RNA' if defined $mol && $mol =~ /RNA/; # no 'mRNA'
417
elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
418
my $alphabet =$seq->primary_seq->alphabet;
419
if ($alphabet eq 'dna') {
422
elsif ($alphabet eq 'rna') {
425
elsif ($alphabet eq 'protein') {
430
$mol = "circular $mol" if $seq->is_circular;
433
if( $self->_id_generation_func ) {
434
$temp_line = &{$self->_id_generation_func}($seq);
437
$self->warn("No whitespace allowed in EMBL display id [". $seq->display_id. "]")
438
if $seq->display_id =~ /\s/;
440
$temp_line = sprintf("%-11sstandard; $mol; $div; %d BP.", $seq->id(), $len);
443
$self->_print( "ID $temp_line\n","XX\n");
445
# Write the accession line if present
448
if( my $func = $self->_ac_generation_func ) {
449
$acc = &{$func}($seq);
450
} elsif( $seq->isa('Bio::Seq::RichSeqI') &&
451
defined($seq->accession_number) ) {
452
$acc = $seq->accession_number;
453
$acc = join("; ", $acc, $seq->get_secondary_accessions);
454
} elsif ( $seq->can('accession_number') ) {
455
$acc = $seq->accession_number;
459
$self->_print("AC $acc;\n",
464
# Write the sv line if present
467
if (my $func = $self->_sv_generation_func) {
468
$sv = &{$func}($seq);
469
} elsif($seq->isa('Bio::Seq::RichSeqI') &&
470
defined($seq->seq_version)) {
471
my ($prim_acc) = $acc =~ /([^;]+)/;
472
$sv = "$prim_acc.". $seq->seq_version();
475
$self->_print( "SV $sv\n",
482
if( $seq->can('get_dates') ) {
483
foreach my $dt ( $seq->get_dates() ) {
484
$self->_write_line_EMBL_regex("DT ","DT ",$dt,'\s+|$',80);#'
488
$self->_print("XX\n");
493
$self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80); #'
494
$self->_print( "XX\n");
496
# if there, write the kw line
499
if( my $func = $self->_kw_generation_func ) {
500
$kw = &{$func}($seq);
501
} elsif( $seq->can('keywords') ) {
502
$kw = $seq->keywords;
505
$self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+|$', 80); #'
506
$self->_print( "XX\n");
512
if ($seq->can('species') && (my $spec = $seq->species)) {
513
my($species, @class) = $spec->classification();
514
my $genus = $class[0];
515
my $OS = "$genus $species";
516
if (my $ssp = $spec->sub_species) {
519
if (my $common = $spec->common_name) {
522
$self->_print("OS $OS\n");
523
my $OC = join('; ', reverse(@class)) .'.';
524
$self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80); #'
525
if ($spec->organelle) {
526
$self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80); #'
528
$self->_print("XX\n");
533
if ( $seq->can('annotation') && defined $seq->annotation ) {
534
foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
535
$self->_print( "RN [$t]\n");
537
# Having no RP line is legal, but we need both
538
# start and end for a valid location.
539
my $start = $ref->start;
541
if ($start and $end) {
542
$self->_print( "RP $start-$end\n");
543
} elsif ($start or $end) {
544
$self->throw("Both start and end are needed for a valid RP line. Got: start='$start' end='$end'");
547
if (my $med = $ref->medline) {
548
$self->_print( "RX MEDLINE; $med.\n");
550
if (my $pm = $ref->pubmed) {
551
$self->_print( "RX PUBMED; $pm.\n");
553
$self->_write_line_EMBL_regex("RA ", "RA ",
557
# If there is no title to the reference, it appears
558
# as a single semi-colon. All titles must end in
560
my $ref_title = $ref->title || '';
561
$ref_title =~ s/[\s;]*$/;/;
562
$self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80); #'
563
$self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80); #'
565
$self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80); #'
567
$self->_print("XX\n");
573
if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
574
foreach my $dr (@db_xref) {
575
my $db_name = $dr->database;
576
my $prim = $dr->primary_id;
577
my $opt = $dr->optional_id || '';
579
my $line = "$db_name; $prim; $opt.";
580
$self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80); #'
582
$self->_print("XX\n");
586
foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
587
$self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80); #'
588
$self->_print("XX\n");
595
$self->_print("FH Key Location/Qualifiers\n");
596
$self->_print("FH\n");
598
my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
600
if( defined $self->_post_sort ) {
601
# we need to read things into an array.
602
# Process. Sort them. Print 'em
604
my $post_sort_func = $self->_post_sort();
607
foreach my $sf ( @feats ) {
608
push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
611
@fth = sort { &$post_sort_func($a,$b) } @fth;
613
foreach my $fth ( @fth ) {
614
$self->_print_EMBL_FTHelper($fth);
617
# not post sorted. And so we can print as we get them.
618
# lower memory load...
620
foreach my $sf ( @feats ) {
621
my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
622
foreach my $fth ( @fth ) {
623
if( $fth->key eq 'CONTIG') {
626
$self->_print_EMBL_FTHelper($fth);
632
if( $self->_show_dna() == 0 ) {
633
$self->_print( "//\n");
636
$self->_print( "XX\n");
638
# finished printing features.
642
# Count each nucleotide
643
my $alen = $str =~ tr/a/a/;
644
my $clen = $str =~ tr/c/c/;
645
my $glen = $str =~ tr/g/g/;
646
my $tlen = $str =~ tr/t/t/;
648
my $olen = $len - ($alen + $tlen + $clen + $glen);
650
$self->warn("Weird. More atgc than bases. Problem!");
653
$self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n");
655
my $nuc = 60; # Number of nucleotides per line
656
my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
657
my $out_pat = 'A11' x 6; # Pattern for packing a line
658
my $length = length($str);
660
# Calculate the number of nucleotides which fit on whole lines
661
my $whole = int($length / $nuc) * $nuc;
663
# Print the whole lines
665
for ($i = 0; $i < $whole; $i += $nuc) {
666
my $blocks = pack $out_pat,
668
substr($str, $i, $nuc);
669
$self->_print(sprintf(" $blocks%9d\n", $i + $nuc));
672
# Print the last line
673
if (my $last = substr($str, $i)) {
674
my $last_len = length($last);
675
my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
676
my $blocks = pack $out_pat,
677
unpack($last_pat, $last);
678
$self->_print(sprintf(" $blocks%9d\n", $length)); # Add the length to the end
681
$self->_print( "//\n");
683
$self->flush if $self->_flush_on_write && defined $self->_fh;
606
my ($self,@seqs) = @_;
608
foreach my $seq ( @seqs ) {
609
$self->throw("Attempting to write with no seq!") unless defined $seq;
610
unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
611
$self->warn("$seq is not a SeqI compliant sequence object!")
612
if $self->verbose >= 0;
613
unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
614
$self->throw("$seq is not a PrimarySeqI compliant sequence object!");
617
my $str = $seq->seq || '';
620
$self->_write_ID_line($seq);
623
# Write the accession line if present
626
if( my $func = $self->_ac_generation_func ) {
627
$acc = &{$func}($seq);
628
} elsif( $seq->isa('Bio::Seq::RichSeqI') &&
629
defined($seq->accession_number) ) {
630
$acc = $seq->accession_number;
631
$acc = join("; ", $acc, $seq->get_secondary_accessions);
632
} elsif ( $seq->can('accession_number') ) {
633
$acc = $seq->accession_number;
637
$self->_print("AC $acc;\n",
644
if( $seq->can('get_dates') ) {
645
my @dates = $seq->get_dates();
648
my ($cr) = $seq->get_Annotations("creation_release");
649
my ($ur) = $seq->get_Annotations("update_release");
650
my ($uv) = $seq->get_Annotations("update_version");
652
unless ($cr && $ur && $ur) {
656
foreach my $dt (@dates){
658
$self->_write_line_EMBL_regex("DT ","DT ",
659
$dt." (Rel. $cr, Created)",
660
'\s+|$',80) if $ct == 1;
661
$self->_write_line_EMBL_regex("DT ","DT ",
662
$dt." (Rel. $ur, Last updated, Version $uv)",
663
'\s+|$',80) if $ct == 2;
664
} else { # other formats?
665
$self->_write_line_EMBL_regex("DT ","DT ",
672
$self->_print("XX\n") || return;
677
$self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80) || return; #'
678
$self->_print( "XX\n") || return;
680
# if there, write the kw line
683
if( my $func = $self->_kw_generation_func ) {
684
$kw = &{$func}($seq);
685
} elsif( $seq->can('keywords') ) {
686
$kw = $seq->keywords;
689
$self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+|$', 80) || return; #'
690
$self->_print( "XX\n") || return;
696
if ($seq->can('species') && (my $spec = $seq->species)) {
697
my @class = $spec->classification();
698
shift @class; # get rid of species name. Some embl files include
699
# the species name in the OC lines, but this seems
700
# more like an error than something we need to
702
my $OS = $spec->scientific_name;
703
if ($spec->common_name) {
704
$OS .= ' ('.$spec->common_name.')';
706
$self->_print("OS $OS\n") || return;
707
my $OC = join('; ', reverse(@class)) .'.';
708
$self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80) || return;
709
if ($spec->organelle) {
710
$self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80) || return;
712
$self->_print("XX\n") || return;
717
if ( $seq->can('annotation') && defined $seq->annotation ) {
718
foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
719
$self->_print( "RN [$t]\n") || return;
721
# Having no RP line is legal, but we need both
722
# start and end for a valid location.
723
my $start = $ref->start;
725
if ($start and $end) {
726
$self->_print( "RP $start-$end\n") || return;
727
} elsif ($start or $end) {
728
$self->throw("Both start and end are needed for a valid RP line. Got: start='$start' end='$end'");
731
if (my $med = $ref->medline) {
732
$self->_print( "RX MEDLINE; $med.\n") || return;
734
if (my $pm = $ref->pubmed) {
735
$self->_print( "RX PUBMED; $pm.\n") || return;
737
$self->_write_line_EMBL_regex("RA ", "RA ",
739
'\s+|$', 80) || return; #'
741
# If there is no title to the reference, it appears
742
# as a single semi-colon. All titles must end in
744
my $ref_title = $ref->title || '';
745
$ref_title =~ s/[\s;]*$/;/;
746
$self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80) || return; #'
747
$self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80) || return; #'
749
$self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80) || return; #'
751
$self->_print("XX\n") || return;
756
if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
757
for my $dr (@db_xref) {
758
my $db_name = $dr->database;
759
my $prim = $dr->primary_id;
761
my $opt = $dr->optional_id || '';
762
my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
763
$self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80) || return; #'
765
$self->_print("XX\n") || return;
769
foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
770
$self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80) || return; #'
771
$self->_print("XX\n") || return;
778
$self->_print("FH Key Location/Qualifiers\n") || return;
779
$self->_print("FH\n") || return;
781
my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
783
if( defined $self->_post_sort ) {
784
# we need to read things into an array.
785
# Process. Sort them. Print 'em
787
my $post_sort_func = $self->_post_sort();
790
foreach my $sf ( @feats ) {
791
push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
794
@fth = sort { &$post_sort_func($a,$b) } @fth;
796
foreach my $fth ( @fth ) {
797
$self->_print_EMBL_FTHelper($fth) || return;
800
# not post sorted. And so we can print as we get them.
801
# lower memory load...
803
foreach my $sf ( @feats ) {
804
my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
805
foreach my $fth ( @fth ) {
806
if( $fth->key eq 'CONTIG') {
809
$self->_print_EMBL_FTHelper($fth) || return;
815
if( $self->_show_dna() == 0 ) {
816
$self->_print( "//\n") || return;
819
$self->_print( "XX\n") || return;
821
# finished printing features.
825
# Count each nucleotide
826
my $alen = $str =~ tr/a/a/;
827
my $clen = $str =~ tr/c/c/;
828
my $glen = $str =~ tr/g/g/;
829
my $tlen = $str =~ tr/t/t/;
831
my $len = $seq->length();
832
my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
834
$self->warn("Weird. More atgc than bases. Problem!");
837
$self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") || return;
839
my $nuc = 60; # Number of nucleotides per line
840
my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
841
my $out_pat = 'A11' x 6; # Pattern for packing a line
842
my $length = length($str);
844
# Calculate the number of nucleotides which fit on whole lines
845
my $whole = int($length / $nuc) * $nuc;
847
# Print the whole lines
849
for ($i = 0; $i < $whole; $i += $nuc) {
850
my $blocks = pack $out_pat,
852
substr($str, $i, $nuc);
853
$self->_print(sprintf(" $blocks%9d\n", $i + $nuc)) || return;
856
# Print the last line
857
if (my $last = substr($str, $i)) {
858
my $last_len = length($last);
859
my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
860
my $blocks = pack $out_pat,
861
unpack($last_pat, $last);
862
$self->_print(sprintf(" $blocks%9d\n", $length)) ||
863
return; # Add the length to the end
866
$self->_print( "//\n") || return;
868
$self->flush if $self->_flush_on_write && defined $self->_fh;
688
873
=head2 _print_EMBL_FTHelper