1
# $Id: tigr.pm 14941 2008-10-21 01:44:12Z fangly $
3
# BioPerl module for Bio::Assembly::IO::tigr
5
# Copyright by Florent Angly
7
# You may distribute this module under the same terms as Perl itself
9
# POD documentation - main docs before the code
13
Bio::Assembly::IO::tigr - Driver to read and write assembly files in the TIGR
14
Assembler v2 default format.
18
# Building an input stream
19
use Bio::Assembly::IO;
21
# Assembly loading methods
22
my $asmio = Bio::Assembly::IO->new( -file => 'SGC0-424.tasm',
24
my $scaffold = $asmio->next_assembly;
26
# Do some things on contigs...
28
# Assembly writing methods
29
my $outasm = Bio::Assembly::IO->new( -file => ">SGC0-modified.tasm",
31
$outasm->write_assembly( -scaffold => $assembly,
36
This package loads and writes assembly information in/from files in the default
37
TIGR Assembler v2 format. The files are lassie-formatted and often have the
38
.tasm extension. This module was written to be used as a driver module for
39
Bio::Assembly::IO input/output.
43
Assemblies are loaded into Bio::Assembly::Scaffold objects composed of
44
Bio::Assembly::Contig and Bio::Assembly::Singlet objects. Since aligned reads
45
and contig gapped consensus can be obtained in the tasm files, only
46
aligned/gapped sequences are added to the different BioPerl objects.
48
Additional assembly information is stored as features. Contig objects have
49
SeqFeature information associated with the primary_tag:
51
_main_contig_feature:$contig_id -> misc contig information
52
_quality_clipping:$read_id -> quality clipping position
54
Read objects have sub_seqFeature information associated with the
57
_main_read_feature:$read_id -> misc read information
59
Singlets are considered by TIGR Assembler as contigs of one sequence and are
60
represented here with features having these primary_tag:
62
_main_contig_feature:$contig_id
63
_quality_clipping:$read_primary_id
64
_main_read_feature:$read_primary_id
65
_aligned_coord:$read_primary_id
67
=head1 THE TIGR TASM LASSIEFORMAT
71
In the TIGR tasm lassie format, contigs are separated by a line containing a single
72
pipe character "|", whereas the reads in a contig are separated by a blank line.
73
Singlets can be present in the file and are represented as a contig
74
composed of a single sequence.
76
Other than the two above-mentioned separators, each line has an attribute name,
77
followed a tab and then an attribute value.
79
The tasm format is used by more TIGR applications than just TIGR Assembler.
80
Some of the attributes are not used by TIGR Assembler or have constant values.
81
They are indicated by an asterisk *
83
Contigs have the following attributes:
86
sequence -> contig ungapped consensus sequence (ambiguities are lowercase)
87
lsequence -> gapped consensus sequence (lowercase ambiguities)
88
quality -> gapped consensus quality score (in hexadecimal)
92
method -> always 'asmg' *
94
redundancy -> fold coverage of the contig consensus
95
perc_N -> percent of ambiguities in the contig consensus
96
seq# -> number of sequences in the contig
98
cds_start -> start of coding sequence *
99
cds_end -> end of coding sequence *
100
ed_pn -> name of editor (always 'GRA') *
101
ed_date -> date and time of edition
102
comment -> some comments *
105
Each read has the following attributes:
107
seq_name -> read name
108
asm_lend -> position of first base on contig ungapped consensus sequence
109
asm_rend -> position of last base on contig ungapped consensus sequence
110
seq_lend -> start of quality-trimmed sequence (aligned read coordinates)
111
seq_rend -> end of quality-trimmed sequence (aligned read coordinates)
113
comment -> some comments *
114
db -> database name associated with the sequence (e.g. >my_db|seq1234)
115
offset -> offset of the sequence (gapped consensus coordinates)
116
lsequence -> aligned read sequence (ambiguities are uppercase)
118
When asm_rend E<lt> asm_lend, the sequence was on the complementary DNA strand but
119
its reverse complement is shown in the aligned sequence of the assembly file,
120
not the original read.
122
Ambiguities are reflected in the contig consensus sequence as
123
lowercase IUPAC characters: a c g t u m r w s y k x n . In the read
124
sequences, however, ambiguities are uppercase: M R W S Y K X N
128
Example of a contig containing three sequences:
130
sequence CGATGCTGTACGGCTGTTGCGACAGATTGCGCTGGGTCGATACCGCGTTGGTGATCGGCTTGTTCAGCGGGCTCTGGTTCGGCGACAGCGCGGCGATCTTGGCGGCTGCGAAGGTTGCCGGCGCAATCATGCGCTGCTGACCGTTGACCTGGTCCTGCCAGTACACCCAGTCGCCCACCATGACCTTCAGCGCGTAGCTGTCACAGCCGGCTGTGGTCAGCGCAGTGGCGACGGTGGTGTAGGAGGCGCCAGCAACACCTTGGGTGATCATGTAGCAGCCTTCTGACAGGCCGTAGGTCAGCATGGTCGGCCACTGGGTACCAGTCAGTCGGGTCAACCGAGATTCGCAsCTGAGCGCCACTGCCGCGCAGAGCGTACATGCCCTTGCGGGTCGCGCCGGTAACACCATCCACGCCGATCAGAACTGCGTCGGTGATGGTGGTGTTACCCGAGGTGCCAGTGGTGAAGGCGACGGTCTGGGTGCTGGCCACAGGCGCCAGAGTGGTCGCGCCAACGGTGGCGATGACCAGTTGCGATGGGCCACGGATACCTGACTGCCCGTTGTTCACGGCGCTGACGATGTTCTGCCACAGCGCCAGGCCAGAGCCGGTGATGTTGTCGAACACTTCGGGCGCAACGCCAGGGAGCGAGACGGTCAGCTTCCAGCTCGAAGCAGCGGAGCCAGTAGCCAGGGCGGCGCTGAGCGAGTTGCCGAGCGTGCCGGTGTAGAACGCGGTCAGCGTGGCGCCGGTGGCGGCGGCAGTGTCCTTCAGCGCACTGGTCGCGGCGGTGTCGGTGCCGTCAGTGACGCGCACGGCGCGGATGTTCGAGGCGCCGCCCTGGATTGATACCGCCAGCGCGGTGCACAGGTCGTACTTGCGCACGGTCyGAGTGCCGAACTTCTGCGATGCGTCACCTGGCGAGCCGATAaGCGTGGCGCTGTTCACCGGCCCCCAGTCAGCAATGCCGACGATGCCGAGAATGTCAGTCGGGACGCCATTGATGTAGCGGGTCTTGGGCGCCACTATTTGTATGTACAAATCTGGCGCAGATAAAGCCGCCGTATTCAAATAACCAGCAGGATAGATAGGCATCACGCCTCCAGAATGAAAAAGGCCACCGATTAGGTGGCCTTTGTTGTGTTCGGCTGGCTGTTAGAGCAGCAGCCCGTTTTCCCGCGCAAACGCGAATGGGTCCTTGTCATGCTTCCTGCAATTGCAGGTAGGACAAAGAATTTGCAGGTTGGATTTGTCGTTCGATCCGCCCTTTGCAAGCGGGAACACGTGGTCAACGTGATACCCATCCCTTATGGATATAGTGCACATGGCGCATTTCCAGCGCTGAGCAGCCAGCAAAAATTTTATGTCGTCGCCGGTGTGTGAGCCGACAGCATTTTTCTTGCGAGCCTTGTATGTCCGCGAGAGTGAACGAACTTGCTCCTTGTTGGCTGTCTTCCAGAGCTTTTGAGTAAGCGCACAGAGATCCTTGTTTCTTGATCTCCACTCTCTGGTTGCGGAAAT
131
lsequence CGATGCTGTACGGCTGTTGCGACAGATTGCGCTGGGTCGATACCGCGTTGGTGATCGGCTTGTTCAGCGGGCTCTGGTTCGGCGACAGCGCGGCGATCTTGGCGGCTGCGAAGGTTGCCGGCGCAATCATGCGCTGCTGACCGTTGACCTGGTCCTGCCAGTACACCCAGTCGCCCACCATGACCTTCAGCGCGTAGCTGTCACAGCCGGCTGTGGTCAGCGCAGTGGCGACGGTGGTGTAGGAGGCGCCAGCAACACCTTGGGTGATCATGTAGCAGCCTTCTGACAGGCCGTAGGTCAGCATGGTCGGCCACTGGGTACCAGTCAGTCGGGTCAACCGAGATTCG-CAsCTGAGCGCCACTGCCGCGCAGAGCGTACATGCCCTTGCGGGTCGCGCCGGTAACACCATCCACGCCGATCAGAACTGCGTCGGTGATGGTGGTGTTACCCGAGGTGCCAGTGGTGAAGGCGACGGTCTGGGTGCTGGCCACAGGCGCCAGAGTGGTCGCGCCAACGGTGGCGATGACCAGTTGCGATGGGCCACGGATACCTGACTGCCCGTTGTTCACGGCGCTGACGATGTTCTGCCACAGCGCCAGGCCAGAGCCGGTGATGTTGTCGAACACTTCGGGCGCAACGCCAGGGAGCGAGACGGTCAGCTTCCAGCTCGAAGCAGCGGAGCCAGTAGCCAGGGCGGCGCTGAGCGAGTTGCCGAGCGTGCCGGTGTAGAACGCGGTCAGCGTGGCGCCGGTGGCGGCGGCAGTGTCCTTCAGCGCACTGGTCGCGGCGGTGTCGGTGCCGTCAGTGACGCGCACGGCGCGGATGTTCGAGGCGCCGCCCTGGATTGATACCGCCAGCGCGGTGCACAGGTCGTACTTGCGCACGGTCyGAGTGCCGAACTTCTGCGATGCGTCACCTGGCGAGCCGATAaGCGTGGCGCTGTTCACCGGCCCCCAGTCAGCAATGCCGACGATGCCGAGAATGTCAGTCGGGACGCCATTGATGTAGCGGGTCTTGGGCGCCACTATTTGTATGTACAAATCTGGCGCAGATAAAGCCGCCGTATTCAAATAACCAGCAGGATAGATAGGCATCACGCCTCCAGAATGAAAAAGGCCACCGATTAGGTGGCCTTTGTTGTGTTCGGCTGGCTGTTAGAGCAGCAGCCCGTTTTCCCGCGCAAACGCGAATGGGTCCTTGTCATGCTTCCTGCAATTGCAGGTAGGACAAAGAATTTGCAGGTTGGATTTGTCGTTCGATCCGCCCTTTGCAAGCGGGAACACGTGGTCAACGTGATACCCATCCCTTATGGATATAGTGCACATGGCGCATTTCCAGCGCTGAGCAGCCAGCAAAAATTTTATGTCGTCGCCGGTGTGTGAGCCGACAGCATTTTTCTTGCGAGCCTTGTATGTCCGCGAGAGTGAACGAACTTGCTCCTTGTTGGCTGTCTTCCAGAGCTTTTGAGTAAGCGCACAGAGATCCTTGTTTCTTGATCTCCACTCTCTGGTTGCGGAAAT
132
quality 0x0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0505050505050505050E0505160505050505050505050505050505050505050505050505050505050303030303030303030303030303030303030303030303030303030303030303030303030303030303030303030303030303030303030303030303030303030303090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0404040404040404041604040404040404040404040404040404040404040404040404040404040404040404040404040404040E0404040404040404040B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B0B
146
ed_date 08/16/07 17:10:12
150
seq_name SDSU_RFPERU_010_C09.x01.phd.1
159
lsequence CGATGCTGTACGGCTGTTGCGACAGATTGCGCTGGGTCGATACCGCGTTGGTGATCGGCTTGTTCAGCGGGCTCTGGTTCGGCGACAGCGCGGCGATCTTGGCGGCTGCGAAGGTTGCCGGCGCAATCATGCGCTGCTGACCGTTGACCTGGTCCTGCCAGTACACCCAGTCGCCCACCATGACCTTCAGCGCGTAGCTGTCACAGCCGGCTGTGGTCAGCGCAGTGGCGACGGTGGTGTAGGAGGCGCCAGCAACACCTTGGGTGATCATGTAGCAGCCTTCTGACAGGCCGTAGGTCAGCATGGTCGGCCACTGGGTACCAGTCAGTCGGGTCAACCGAGATTCG-CAGCTGAGCGCCACTGCCGCGCAGAGCGTACATGCCCTTGCGGGTCGCGCCGGTAACACCATCCACGCCGATCAGAACTGCGTCGGTGATGGTGG
161
seq_name SDSU_RFPERU_002_H12.x01.phd.1
170
lsequence CGAGATTCGCCACCTGAGCGCCACTGCCGCGCAGAGCGTACATGCCCTTGCGGGTCGCGCCGGTAACACCATCCACGCCGATCAGAACTGCGTCGGTGATGGTGGTGTTACCCGAGGTGCCAGTGGTGAAGGCGACGGTCTGGGTGCTGGCCACAGGCGCCAGAGTGGTCGCGCCAACGGTGGCGATGACCAGTTGCGATGGGCCACGGATACCTGACTGCCCGTTGTTCACGGCGCTGACGATGTTCTGCCACAGCGCCAGGCCAGAGCCGGTGATGTTGTCGAACACTTCGGGCGCAACGCCAGGGAGCGAGACGGTCAGCTTCCAGCTCGAAGCAGCGGAGCCAGTAGCCAGGGCGGCGCTGAGCGAGTTGCCGAGCGTGCCGGTGTAGAACGCGGTCAGCGTGGCGCCGGTGGCGGCGGCAGTGTCCTTCAGCGCACTGGTCGCGGCGGTGTCGGTGCCGTCAGTGACGCGCACGGCGCGGATGTTCGAGGCGCCGCCCTGGATTGATACCGCCAGCGCGGTGCACAGGTCGTACTTGCGCACGGTCCGAGTGCCGAACTTCTGCGATGCGTCACCTGGCGAGCCGATA-GCGTGGCGC
172
seq_name SDSU_RFPERU_009_E07.x01.phd.1
181
lsequence CGCACGGTCTGAGTGCCGAACTTCTGCGATGCGTCACCTGGCGAGCCGATAAGCGTGGCGCTGTTCACCGGCCCCCAGTCAGCAATGCCGACGATGCCGAGAATGTCAGTCGGGACGCCATTGATGTAGCGGGTCTTGGGCGCCACTATTTGTATGTACAAATCTGGCGCAGATAAAGCCGCCGTATTCAAATAACCAGCAGGATAGATAGGCATCACGCCTCCAGAATGAAAAAGGCCACCGATTAGGTGGCCTTTGTTGTGTTCGGCTGGCTGTTAGAGCAGCAGCCCGTTTTCCCGCGCAAACGCGAATGGGTCCTTGTCATGCTTCCTGCAATTGCAGGTAGGACAAAGAATTTGCAGGTTGGATTTGTCGTTCGATCCGCCCTTTGCAAGCGGGAACACGTGGTCAACGTGATACCCATCCCTTATGGATATAGTGCACATGGCGCATTTCCAGCGCTGAGCAGCCAGCAAAAATTTTATGTCGTCGCCGGTGTGTGAGCCGACAGCATTTTTCTTGCGAGCCTTGTATGTCCGCGAGAGTGAACGAACTTGCTCCTTGTTGGCTGTCTTCCAGAGCTTTTGAGTAAGCGCACAGAGATCCTTGTTTCTTGATCTCCACTCTCTGGTTGCGGAAAT
190
User feedback is an integral part of the evolution of this and other
191
BioPerl modules. Send your comments and suggestions preferably to the
192
BioPerl mailing lists. Your participation is much appreciated.
194
bioperl-l@bioperl.org - General discussion
195
http://bio.perl.org/MailList.html - About the mailing lists
197
=head2 Reporting Bugs
199
Report bugs to the BioPerl bug tracking system to help us keep track
200
the bugs and their resolution. Bug reports can be submitted via email
203
bioperl-bugs@bio.perl.org
204
http://bugzilla.bioperl.org/
206
=head1 AUTHOR - Florent E Angly
208
Email florent dot angly at gmail dot com
212
The rest of the documentation details each of the object
213
methods. Internal methods are usually preceded with a "_".
217
package Bio::Assembly::IO::tigr;
220
use Bio::Seq::Quality;
221
use Bio::LocatableSeq;
222
use Bio::Assembly::IO;
223
use Bio::Assembly::Scaffold;
224
use Bio::Assembly::Contig;
225
use Bio::Assembly::Singlet;
227
use base qw(Bio::Assembly::IO);
229
my $progname = 'TIGR Assembler';
233
Title : next_assembly
234
Usage : my $scaffold = $asmio->next_assembly()
235
Function: return the next assembly in the tasm-formatted stream
236
Returns : Bio::Assembly::Scaffold object
242
my $self = shift; # object reference
244
# Create a new scaffold to hold the contigs
245
my $scaffoldobj = Bio::Assembly::Scaffold->new(-source => $progname);
247
# Contig and read related
254
# Loop over all assembly file lines
255
while ($_ = $self->_readline) {
257
if ( /^\|/ ) { # a line with a single pipe |
258
# The end of a read from a contig, the start of a new contig
262
if ($contiginfo{'seqnum'} > 1) {
263
# This is a read in a contig
264
my $readobj = $self->_store_read(\%readinfo, $contigobj);
265
} elsif ($contiginfo{'seqnum'} == 1) {
267
my $singletobj = $self->_store_singlet(\%readinfo, \%contiginfo,
270
# That should not happen
271
$self->throw("Unhandled exception");
278
} elsif ( /^$/ ) { # a blank line
280
# The end of a contig, the start of a read in that contig
284
$contigobj = $self->_store_contig( \%contiginfo, $contigobj,
285
$scaffoldobj ) if $contiginfo{'seqnum'} > 1;
287
# The end of read in a contig, the start of a new one in
292
if ($contiginfo{'seqnum'} > 1) {
293
# This is a read in a contig
294
my $readobj = $self->_store_read(\%readinfo, $contigobj);
295
} elsif ($contiginfo{'seqnum'} == 1) {
297
my $singletobj = $self->_store_singlet(\%readinfo,
298
\%contiginfo, $scaffoldobj);
300
# That should not happen
301
$self->throw("Unhandled exception");
306
# That should not happen
307
$self->throw("Unhandled exception");
312
if (/^sequence\t(.*)/) {$contiginfo{'sequence'} = $1; next}
313
elsif (/^lsequence\t(.*)/) {$contiginfo{'lsequence'} = $1; next}
314
elsif (/^quality\t(.*)/) {$contiginfo{'quality'} = $1; next}
315
elsif (/^asmbl_id\t(.*)/) {$contiginfo{'asmbl_id'} = $1; next}
316
elsif (/^seq_id\t(.*)/) {$contiginfo{'seq_id'} = $1; next}
317
elsif (/^com_name\t(.*)/) {$contiginfo{'com_name'} = $1; next}
318
elsif (/^type\t(.*)/) {$contiginfo{'type'} = $1; next}
319
elsif (/^method\t(.*)/) {$contiginfo{'method'} = $1; next}
320
elsif (/^ed_status\t(.*)/) {$contiginfo{'ed_status'} = $1; next}
321
elsif (/^redundancy\t(.*)/) {$contiginfo{'redundancy'} = $1; next}
322
elsif (/^perc_N\t(.*)/) {$contiginfo{'perc_N'} = $1; next}
323
elsif (/^seq\#\t(.*)/) {$contiginfo{'seqnum'} = $1; next}
324
elsif (/^full_cds\t(.*)/) {$contiginfo{'full_cds'} = $1; next}
325
elsif (/^cds_start\t(.*)/) {$contiginfo{'cds_start'} = $1; next}
326
elsif (/^cds_end\t(.*)/) {$contiginfo{'cds_end'} = $1; next}
327
elsif (/^ed_pn\t(.*)/) {$contiginfo{'ed_pn'} = $1; next}
328
elsif (/^ed_date\t(.*\s.*)/) {$contiginfo{'ed_date'} = $1; next}
329
elsif (/^comment\t(.*)/) {$contiginfo{'comment'} = $1; next}
330
elsif (/^frameshift\t(.*)/) {$contiginfo{'frameshift'} = $1; next}
332
$self->throw("Format unknown at line $.:\n$_\nIs your file".
333
" really a TIGR Assembler tasm-formatted file?");
337
if (/^seq_name\t(.*)/) {$readinfo{'seq_name'} = $1; next}
338
elsif (/^asm_lend\t(.*)/) {$readinfo{'asm_lend'} = $1; next}
339
elsif (/^asm_rend\t(.*)/) {$readinfo{'asm_rend'} = $1; next}
340
elsif (/^seq_lend\t(.*)/) {$readinfo{'seq_lend'} = $1; next}
341
elsif (/^seq_rend\t(.*)/) {$readinfo{'seq_rend'} = $1; next}
342
elsif (/^best\t(.*)/) {$readinfo{'best'} = $1; next}
343
elsif (/^comment\t(.*)/) {$readinfo{'comment'} = $1; next}
344
elsif (/^db\t(.*)/) {$readinfo{'db'} = $1; next}
345
elsif (/^offset\t(.*)/) {$readinfo{'offset'} = $1; next}
346
elsif (/^lsequence\t(.*)/) {$readinfo{'lsequence'} = $1; next}
348
$self->throw("Format unknown at line $.:\n$_\nIs your file".
349
" really a TIGR Assembler tasm-formatted file?");
352
# That shouldn't happen
353
$self->throw("Unhandled exception");
357
# Store read info for last read
358
if (defined $contiginfo{'seqnum'}) {
359
if ($contiginfo{'seqnum'} > 1) {
360
# This is a read in a contig
361
my $readobj = $self->_store_read(\%readinfo, $contigobj);
362
} elsif ($contiginfo{'seqnum'} == 1) {
364
my $singletobj = $self->_store_singlet(\%readinfo, \%contiginfo,
367
# That should not happen
368
$self->throw("Unhandled exception");
371
# Clear read info for last read
373
# Clear contig info for last contig
377
$scaffoldobj->update_seq_list();
384
Title : _qual_hex2dec
385
Usage : my dec_quality = $self->_qual_hex2dec($hex_quality);
386
Function: convert an hexadecimal quality score into a decimal quality score
393
my ($self, $qual) = @_;
394
$qual =~ s/^0x(.*)$/$1/;
395
$qual =~ s/(..)/hex($1).' '/eg;
401
Title : _qual_dec2hex
402
Usage : my hex_quality = $self->_qual_dec2hex($dec_quality);
403
Function: convert a decimal quality score into an hexadecimal quality score
410
my ($self, $qual) = @_;
411
$qual =~ s/(\d+)\s*/sprintf('%02X', $1)/eg;
418
Title : _store_contig
419
Usage : my $contigobj; $contigobj = $self->_store_contig(
420
\%contiginfo, $contigobj, $scaffoldobj);
421
Function: store information of a contig belonging to a scaffold in the
423
Returns : Bio::Assembly::Contig object
424
Args : hash, Bio::Assembly::Contig, Bio::Assembly::Scaffold
429
my ($self, $contiginfo, $contigobj, $scaffoldobj) = @_;
431
# Create a contig and attach it to scaffold
432
$contigobj = Bio::Assembly::Contig->new(
433
-id => $$contiginfo{'asmbl_id'},
434
-source => $progname,
437
$scaffoldobj->add_contig($contigobj);
439
# Create a gapped consensus sequence and attach it to contig
440
#$$contiginfo{'llength'} = length($$contiginfo{'lsequence'});
441
my $consensus = Bio::LocatableSeq->new(
442
-id => $$contiginfo{'asmbl_id'},
443
-seq => $$contiginfo{'lsequence'},
446
$contigobj->set_consensus_sequence($consensus);
448
# Create an gapped consensus quality score and attach it to contig
449
$$contiginfo{'quality'} = $self->_qual_hex2dec($$contiginfo{'quality'});
450
my $qual = Bio::Seq::Quality->new(
451
-id => $$contiginfo{'asmbl_id'},
452
-qual => $$contiginfo{'quality'}
454
$contigobj->set_consensus_quality($qual);
456
# Add other misc contig information as features of the contig
457
my $contigtags = Bio::SeqFeature::Generic->new(
458
-primary_tag => "_main_contig_feature:$$contiginfo{'asmbl_id'}",
460
-end => $contigobj->get_consensus_length(),
462
-tag => { 'seq_id' => $$contiginfo{'seq_id'},
463
'com_name' => $$contiginfo{'com_name'},
464
'type' => $$contiginfo{'type'},
465
'method' => $$contiginfo{'method'},
466
'ed_status' => $$contiginfo{'ed_status'},
467
'full_cds' => $$contiginfo{'full_cds'},
468
'cds_start' => $$contiginfo{'cds_start'},
469
'cds_end' => $$contiginfo{'cds_end'},
470
'ed_pn' => $$contiginfo{'ed_pn'},
471
'ed_date' => $$contiginfo{'ed_date'},
472
'comment' => $$contiginfo{'comment'},
473
'frameshift' => $$contiginfo{'frameshift'} }
475
$contigobj->add_features([ $contigtags ], 1);
483
Usage : my $readobj = $self->_store_read(\%readinfo, $contigobj);
484
Function: store information of a read belonging to a contig in the appropriate object
485
Returns : Bio::LocatableSeq
486
Args : hash, Bio::Assembly::Contig
491
my ($self, $readinfo, $contigobj) = @_;
493
# Create an aligned read object
494
#$$readinfo{'llength'} = length($$readinfo{'lsequence'});
495
$$readinfo{'strand'} = ($$readinfo{'seq_rend'} > $$readinfo{'seq_lend'} ? 1 : -1);
496
my $readobj = Bio::LocatableSeq->new(
497
# the ids of sequence objects are supposed to include the db name in it, i.e. "big_db|seq1234"
498
# that's how sequence ids coming from the fasta parser are at least
499
-display_id => $self->_merge_seq_name_and_db($$readinfo{'seq_name'}, $$readinfo{'db'}),
500
-primary_id => $self->_merge_seq_name_and_db($$readinfo{'seq_name'}, $$readinfo{'db'}),
501
-seq => $$readinfo{'lsequence'},
503
-strand => $$readinfo{'strand'},
507
# Add read location and sequence to contig (in 'gapped consensus' coordinates)
508
$$readinfo{'aln_start'} = $$readinfo{'offset'} + 1; # seq offset is in gapped coordinates
509
$$readinfo{'aln_end'} = $$readinfo{'aln_start'} + length($$readinfo{'lsequence'}) - 1; # lsequence is aligned seq
510
my $alncoord = Bio::SeqFeature::Generic->new(
511
-primary_tag => $readobj->id,
512
-start => $$readinfo{'aln_start'},
513
-end => $$readinfo{'aln_end'},
514
-strand => $$readinfo{'strand'},
515
-tag => { 'contig' => $contigobj->id() }
517
$contigobj->set_seq_coord($alncoord, $readobj);
519
# Add quality clipping read information in contig features
520
# (from 'aligned read' to 'gapped consensus' coordinates)
521
$$readinfo{'clip_start'} = $contigobj->change_coord('aligned '.$readobj->id, 'gapped consensus', $$readinfo{'seq_lend'});
522
$$readinfo{'clip_end'} = $contigobj->change_coord('aligned '.$readobj->id, 'gapped consensus', $$readinfo{'seq_rend'});
523
my $clipcoord = Bio::SeqFeature::Generic->new(
524
-primary_tag => '_quality_clipping:'.$readobj->id,
525
-start => $$readinfo{'clip_start'},
526
-end => $$readinfo{'clip_end'},
527
-strand => $$readinfo{'strand'}
529
$clipcoord->attach_seq($readobj);
530
$contigobj->add_features([ $clipcoord ], 0);
532
# Add other misc read information as subsequence feature
533
my $readtags = Bio::SeqFeature::Generic->new(
534
-primary_tag => '_main_read_feature:'.$readobj->id,
535
-start => $$readinfo{'aln_start'},
536
-end => $$readinfo{'aln_end'},
537
-strand => $$readinfo{'strand'},
538
-tag => { 'best' => $$readinfo{'best'},
539
'comment' => $$readinfo{'comment'} }
541
$alncoord->add_sub_SeqFeature($readtags);
546
=head2 _store_singlet
548
Title : _store_singlet
549
Usage : my $singletobj = $self->_store_read(\%readinfo, \%contiginfo,
551
Function: store information of a singlet belonging to a scaffold in the appropriate object
552
Returns : Bio::Assembly::Singlet
553
Args : hash, hash, Bio::Assembly::Scaffold
558
my ($self, $readinfo, $contiginfo, $scaffoldobj) = @_;
559
# Singlets in TIGR_Assembler are represented as a contig of one sequence
560
# We try to simulate this duality by playing around with the Singlet object
562
my $contigid = $$contiginfo{'asmbl_id'};
563
my $readid = $self->_merge_seq_name_and_db($$readinfo{'seq_name'}, $$readinfo{'db'});
565
# Create a sequence object
566
#$$contiginfo{'llength'} = length($$contiginfo{'lsequence'});
567
my $seqobj = Bio::Seq::Quality->new(
568
-primary_id => $contigid, # unique id in assembly (contig name)
569
-display_id => $readid,
570
-seq => $$contiginfo{'lsequence'}, # do not use $$readinfo as ambiguities are uppercase
572
-strand => $$readinfo{'strand'},
574
-qual => $self->_qual_hex2dec($$contiginfo{'quality'})
577
# Create singlet from sequence and add it to scaffold
578
my $singletobj = Bio::Assembly::Singlet->new( -seqref => $seqobj );
579
$scaffoldobj->add_singlet($singletobj);
581
# Add other misc contig information as features of the singlet
582
my $contigtags = Bio::SeqFeature::Generic->new(
583
-primary_tag => "_main_contig_feature:$contigid",
585
-end => $singletobj->get_consensus_length(),
587
-tag => { 'seq_id' => $$contiginfo{'seq_id'},
588
'com_name' => $$contiginfo{'com_name'},
589
'type' => $$contiginfo{'type'},
590
'method' => $$contiginfo{'method'},
591
'ed_status' => $$contiginfo{'ed_status'},
592
'full_cds' => $$contiginfo{'full_cds'},
593
'cds_start' => $$contiginfo{'cds_start'},
594
'cds_end' => $$contiginfo{'cds_end'},
595
'ed_pn' => $$contiginfo{'ed_pn'},
596
'ed_date' => $$contiginfo{'ed_date'},
597
'comment' => $$contiginfo{'comment'},
598
'frameshift' => $$contiginfo{'frameshift'} }
600
$singletobj->add_features([ $contigtags ], 1);
602
# Add read location and sequence to singlet features (in 'gapped consensus' coordinates)
603
$$readinfo{'aln_start'} = $$readinfo{'offset'} + 1; # seq offset is in gapped coordinates
604
$$readinfo{'aln_end'} = $$readinfo{'aln_start'} + length($$readinfo{'lsequence'}) - 1; # lsequence is aligned seq
606
my $alncoord = Bio::SeqFeature::Generic->new(
607
-primary_tag => "_aligned_coord:$readid",
608
-start => $$readinfo{'aln_start'},
609
-end => $$readinfo{'aln_end'},
610
-strand => $$readinfo{'strand'},
611
-tag => { 'contig' => $contigid }
613
$alncoord->attach_seq($singletobj->seqref);
614
$singletobj->add_features([ $alncoord ], 0);
616
# Add quality clipping read information in singlet features
617
# (from 'aligned read' to 'gapped consensus' coordinates)
618
$$readinfo{'clip_start'} = $$readinfo{'seq_lend'};
619
$$readinfo{'clip_end'} = $$readinfo{'seq_rend'};
620
my $clipcoord = Bio::SeqFeature::Generic->new(
621
-primary_tag => "_quality_clipping:$readid",
622
-start => $$readinfo{'clip_start'},
623
-end => $$readinfo{'clip_end'},
624
-strand => $$readinfo{'strand'},
625
-tag => { 'contig' => $contigid }
627
$clipcoord->attach_seq($singletobj->seqref);
628
$singletobj->add_features([ $clipcoord ], 0);
630
# Add other misc read information as subsequence feature
631
my $readtags = Bio::SeqFeature::Generic->new(
632
-primary_tag => "_main_read_feature:$readid",
633
-start => $$readinfo{'aln_start'},
634
-end => $$readinfo{'aln_end'},
635
-strand => $$readinfo{'strand'},
636
-tag => { 'best' => $$readinfo{'best'},
637
'comment' => $$readinfo{'comment'} }
639
$alncoord->add_sub_SeqFeature($readtags);
644
=head2 write_assembly
646
Title : write_assembly
647
Usage : $ass_io->write_assembly($assembly)
648
Function: Write the assembly object in TIGR Assembler compatible tasm lassie
650
Returns : 1 on success, 0 for error
651
Args : A Bio::Assembly::Scaffold object
656
my ($self,@args) = @_;
657
my ($scaffoldobj, $singlets) = $self->_rearrange([qw(SCAFFOLD SINGLETS)], @args);
660
if ( !$scaffoldobj || !$scaffoldobj->isa('Bio::Assembly::Scaffold') ) {
661
$self->warn("Must provide a Bio::Align::AlignI object when calling
666
# Get list of objects - contigs and singlets
667
my @cont_ids = $scaffoldobj->get_contig_ids;
668
my @sing_ids = $scaffoldobj->get_singlet_ids;
670
my $decimal_format = '%.2f';
671
for (my $i = 0; $i < scalar @sing_ids ; $i++) {
672
# singlet display id (string)
673
my $display_id = $sing_ids[$i];
674
# singlet primary id (unique, numerical)
675
my $primary_id = $scaffoldobj->get_singlet_by_id($display_id)->seqref->primary_id;
676
$sing_ids[$i] = $primary_id;
677
$did{$primary_id} = $display_id;
679
my @ids = (@cont_ids, @sing_ids);
680
@ids = sort { $a <=> $b } @ids; # list with contig ids and singlet primary id
681
my $numobj = scalar @ids;
683
# Output all contigs and singlets (sorted by increasing id number)
684
for (my $i = 0 ; $i < $numobj ; $i++) {
686
my $objid = $ids[$i];
688
if (defined $did{$objid}) {
690
next unless ($singlets);
692
my $contigid = $objid;
693
my $readid = $did{$objid};
694
my $singletobj = $scaffoldobj->get_singlet_by_id($readid);
696
# Get contig information
698
{ $_->primary_tag eq "_main_contig_feature:$contigid" }
699
$singletobj->get_features_collection->get_all_features
702
$contiginfo{'sequence'} = $singletobj->seqref->seq;
703
$contiginfo{'lsequence'} = $contiginfo{'sequence'};
704
$contiginfo{'quality'} = $self->_qual_dec2hex(
705
join ' ', @{$singletobj->seqref->qual});
706
$contiginfo{'asmbl_id'} = $contigid;
707
$contiginfo{'seq_id'} = ($contanno->get_tag_values('seq_id'))[0];
708
$contiginfo{'com_name'} = ($contanno->get_tag_values('com_name'))[0];
709
$contiginfo{'type'} = ($contanno->get_tag_values('type'))[0];
710
$contiginfo{'method'} = ($contanno->get_tag_values('method'))[0];
711
$contiginfo{'ed_status'} = ($contanno->get_tag_values('ed_status'))[0];
712
$contiginfo{'redundancy'} = sprintf($decimal_format, 1);
713
$contiginfo{'perc_N'} = sprintf(
714
$decimal_format, $self->_perc_N($contiginfo{'sequence'}));
715
$contiginfo{'seqnum'} = 1;
716
$contiginfo{'full_cds'} = ($contanno->get_tag_values('full_cds'))[0];
717
$contiginfo{'cds_start'} = ($contanno->get_tag_values('cds_start'))[0];
718
$contiginfo{'cds_end'} = ($contanno->get_tag_values('cds_end'))[0];
719
$contiginfo{'ed_pn'} = ($contanno->get_tag_values('ed_pn'))[0];
720
$contiginfo{'ed_date'} = $self->_date_time;
721
$contiginfo{'comment'} = ($contanno->get_tag_values('comment'))[0];
722
$contiginfo{'frameshift'} = ($contanno->get_tag_values('frameshift'))[0];
724
# Check that no tag value is undef
725
$contiginfo{'seq_id'} = '' unless defined $contiginfo{'seq_id'};
726
$contiginfo{'com_name'} = '' unless defined $contiginfo{'com_name'};
727
$contiginfo{'type'} = '' unless defined $contiginfo{'type'};
728
$contiginfo{'method'} = '' unless defined $contiginfo{'method'};
729
$contiginfo{'ed_status'} = '' unless defined $contiginfo{'ed_status'};
730
$contiginfo{'full_cds'} = '' unless defined $contiginfo{'full_cds'};
731
$contiginfo{'cds_start'} = '' unless defined $contiginfo{'cds_start'};
732
$contiginfo{'cds_end'} = '' unless defined $contiginfo{'cds_end'};
733
$contiginfo{'ed_pn'} = '' unless defined $contiginfo{'ed_pn'};
734
$contiginfo{'comment'} = '' unless defined $contiginfo{'comment'};
735
$contiginfo{'frameshift'} = '' unless defined $contiginfo{'frameshift'};
737
# Print contig information
739
"sequence\t$contiginfo{'sequence'}\n".
740
"lsequence\t$contiginfo{'lsequence'}\n".
741
"quality\t$contiginfo{'quality'}\n".
742
"asmbl_id\t$contiginfo{'asmbl_id'}\n".
743
"seq_id\t$contiginfo{'seq_id'}\n".
744
"com_name\t$contiginfo{'com_name'}\n".
745
"type\t$contiginfo{'type'}\n".
746
"method\t$contiginfo{'method'}\n".
747
"ed_status\t$contiginfo{'ed_status'}\n".
748
"redundancy\t$contiginfo{'redundancy'}\n".
749
"perc_N\t$contiginfo{'perc_N'}\n".
750
"seq#\t$contiginfo{'seqnum'}\n".
751
"full_cds\t$contiginfo{'full_cds'}\n".
752
"cds_start\t$contiginfo{'cds_start'}\n".
753
"cds_end\t$contiginfo{'cds_end'}\n".
754
"ed_pn\t$contiginfo{'ed_pn'}\n".
755
"ed_date\t$contiginfo{'ed_date'}\n".
756
"comment\t$contiginfo{'comment'}\n".
757
"frameshift\t$contiginfo{'frameshift'}\n".
761
# Get read information
762
my ($seq_name, $db) = $self->_split_seq_name_and_db($readid);
763
my $clipcoord = (grep
764
{ $_->primary_tag eq "_quality_clipping:$readid"}
765
$singletobj->get_features_collection->get_all_features
768
{ $_->primary_tag eq "_aligned_coord:$readid"}
769
$singletobj->get_features_collection->get_all_features
772
{ $_->primary_tag eq "_main_read_feature:$readid" }
773
$singletobj->get_seq_coord($singletobj->seqref)->get_SeqFeatures
776
$readinfo{'seq_name'} = $seq_name;
777
$readinfo{'asm_lend'} = $alncoord->location->start;
778
$readinfo{'asm_rend'} = $alncoord->location->end;
779
$readinfo{'seq_lend'} = $clipcoord->location->start;
780
$readinfo{'seq_rend'} = $clipcoord->location->end;
781
$readinfo{'best'} = ($readanno->get_tag_values('best'))[0];
782
$readinfo{'comment'} = ($readanno->get_tag_values('comment'))[0];
783
$readinfo{'db'} = $db;
784
$readinfo{'offset'} = 0;
785
# ambiguities in read sequence are uppercase
786
$readinfo{'lsequence'} = uc($contiginfo{'lsequence'});
788
# Check that no tag value is undef
789
$readinfo{'best'} = '' unless defined $readinfo{'best'};
790
$readinfo{'comment'} = '' unless defined $readinfo{'comment'};
792
# Print read information
794
"seq_name\t$readinfo{'seq_name'}\n".
795
"asm_lend\t$readinfo{'asm_lend'}\n".
796
"asm_rend\t$readinfo{'asm_rend'}\n".
797
"seq_lend\t$readinfo{'seq_lend'}\n".
798
"seq_rend\t$readinfo{'seq_rend'}\n".
799
"best\t$readinfo{'best'}\n".
800
"comment\t$readinfo{'comment'}\n".
801
"db\t$readinfo{'db'}\n".
802
"offset\t$readinfo{'offset'}\n".
803
"lsequence\t$readinfo{'lsequence'}\n"
805
if ($i+1 < $numobj) {
806
$self->_print("|\n");
810
my $contigid = $objid;
811
my $contigobj = $scaffoldobj->get_contig_by_id($contigid);
813
# Skip contigs of 1 sequence (singlets) if needed
814
next if ($contigobj->no_sequences == 1) && (!$singlets);
816
# Get contig information
818
{ $_->primary_tag eq "_main_contig_feature:$contigid" }
819
$contigobj->get_features_collection->get_all_features
822
$contiginfo{'sequence'} = $self->_ungap(
823
$contigobj->get_consensus_sequence->seq);
824
$contiginfo{'lsequence'} = $contigobj->get_consensus_sequence->seq;
825
$contiginfo{'quality'} = $self->_qual_dec2hex(
826
join ' ', @{$contigobj->get_consensus_quality->qual});
827
$contiginfo{'asmbl_id'} = $contigid;
828
$contiginfo{'seq_id'} = ($contanno->get_tag_values('seq_id'))[0];
829
$contiginfo{'com_name'} = ($contanno->get_tag_values('com_name'))[0];
830
$contiginfo{'type'} = ($contanno->get_tag_values('type'))[0];
831
$contiginfo{'method'} = ($contanno->get_tag_values('method'))[0];
832
$contiginfo{'ed_status'} = ($contanno->get_tag_values('ed_status'))[0];
833
$contiginfo{'redundancy'} = sprintf(
834
$decimal_format, $self->_redundancy($contigobj));
835
$contiginfo{'perc_N'} = sprintf(
836
$decimal_format, $self->_perc_N($contiginfo{'sequence'}));
837
$contiginfo{'seqnum'} = $contigobj->no_sequences;
838
$contiginfo{'full_cds'} = ($contanno->get_tag_values('full_cds'))[0];
839
$contiginfo{'cds_start'} = ($contanno->get_tag_values('cds_start'))[0];
840
$contiginfo{'cds_end'} = ($contanno->get_tag_values('cds_end'))[0];
841
$contiginfo{'ed_pn'} = ($contanno->get_tag_values('ed_pn'))[0];
842
$contiginfo{'ed_date'} = $self->_date_time;
843
$contiginfo{'comment'} = ($contanno->get_tag_values('comment'))[0];
844
$contiginfo{'frameshift'} = ($contanno->get_tag_values('frameshift'))[0];
846
# Check that no tag value is undef
847
$contiginfo{'seq_id'} = '' unless defined $contiginfo{'seq_id'};
848
$contiginfo{'com_name'} = '' unless defined $contiginfo{'com_name'};
849
$contiginfo{'type'} = '' unless defined $contiginfo{'type'};
850
$contiginfo{'method'} = '' unless defined $contiginfo{'method'};
851
$contiginfo{'ed_status'} = '' unless defined $contiginfo{'ed_status'};
852
$contiginfo{'full_cds'} = '' unless defined $contiginfo{'full_cds'};
853
$contiginfo{'cds_start'} = '' unless defined $contiginfo{'cds_start'};
854
$contiginfo{'cds_end'} = '' unless defined $contiginfo{'cds_end'};
855
$contiginfo{'ed_pn'} = '' unless defined $contiginfo{'ed_pn'};
856
$contiginfo{'comment'} = '' unless defined $contiginfo{'comment'};
857
$contiginfo{'frameshift'} = '' unless defined $contiginfo{'frameshift'};
859
# Print contig information
861
"sequence\t$contiginfo{'sequence'}\n".
862
"lsequence\t$contiginfo{'lsequence'}\n".
863
"quality\t$contiginfo{'quality'}\n".
864
"asmbl_id\t$contiginfo{'asmbl_id'}\n".
865
"seq_id\t$contiginfo{'seq_id'}\n".
866
"com_name\t$contiginfo{'com_name'}\n".
867
"type\t$contiginfo{'type'}\n".
868
"method\t$contiginfo{'method'}\n".
869
"ed_status\t$contiginfo{'ed_status'}\n".
870
"redundancy\t$contiginfo{'redundancy'}\n".
871
"perc_N\t$contiginfo{'perc_N'}\n".
872
"seq#\t$contiginfo{'seqnum'}\n".
873
"full_cds\t$contiginfo{'full_cds'}\n".
874
"cds_start\t$contiginfo{'cds_start'}\n".
875
"cds_end\t$contiginfo{'cds_end'}\n".
876
"ed_pn\t$contiginfo{'ed_pn'}\n".
877
"ed_date\t$contiginfo{'ed_date'}\n".
878
"comment\t$contiginfo{'comment'}\n".
879
"frameshift\t$contiginfo{'frameshift'}\n".
883
for my $readobj ( $contigobj->each_seq() ) {
886
# Get read information
887
my ($seq_name, $db) = $self->_split_seq_name_and_db($readobj->id);
888
my ($asm_lend, $asm_rend, $seq_lend, $seq_rend, $offset)
889
= $self->_coord($readobj, $contigobj);
890
my $readanno = ( grep
891
{ $_->primary_tag eq '_main_read_feature:'.$readobj->primary_id }
892
$contigobj->get_seq_coord($readobj)->get_SeqFeatures
895
$readinfo{'seq_name'} = $seq_name;
896
$readinfo{'asm_lend'} = $asm_lend;
897
$readinfo{'asm_rend'} = $asm_rend;
898
$readinfo{'seq_lend'} = $seq_lend;
899
$readinfo{'seq_rend'} = $seq_rend;
900
$readinfo{'best'} = ($readanno->get_tag_values('best'))[0];
901
$readinfo{'comment'} = ($readanno->get_tag_values('comment'))[0];
902
$readinfo{'db'} = $db;
903
$readinfo{'offset'} = $offset;
904
$readinfo{'lsequence'} = $readobj->seq();
906
# Check that no tag value is undef
907
$readinfo{'best'} = '' unless defined $readinfo{'best'};
908
$readinfo{'comment'} = '' unless defined $readinfo{'comment'};
910
# Print read information
912
"seq_name\t$readinfo{'seq_name'}\n".
913
"asm_lend\t$readinfo{'asm_lend'}\n".
914
"asm_rend\t$readinfo{'asm_rend'}\n".
915
"seq_lend\t$readinfo{'seq_lend'}\n".
916
"seq_rend\t$readinfo{'seq_rend'}\n".
917
"best\t$readinfo{'best'}\n".
918
"comment\t$readinfo{'comment'}\n".
919
"db\t$readinfo{'db'}\n".
920
"offset\t$readinfo{'offset'}\n".
921
"lsequence\t$readinfo{'lsequence'}\n"
923
if ($seqno < $contiginfo{'seqnum'}) {
925
} elsif (($seqno == $contiginfo{'seqnum'}) && ($i+1 < $numobj)) {
926
$self->_print("|\n");
937
Usage : my $perc_N = $ass_io->_perc_N($sequence_string)
938
Function: Calculate the percent of ambiguities in a sequence.
939
M R W S Y K X N are regarded as ambiguites in an aligned read
940
sequence by TIGR Assembler. In the case of a gapped contig
941
consensus sequence, all lowercase symbols are ambiguities, i.e.:
942
a c g t u m r w s y k x n.
943
Returns : decimal number
949
my ($self, $seq_string) = @_;
950
$self->throw("Cannot accept an empty sequence") if length($seq_string) == 0;
952
for my $base ( split //, $seq_string ) {
953
# individual base matches an ambiguity?
954
if (( $base =~ m/[x|n|m|r|w|s|y|k]/i ) || ( $base =~ m/[a|c|g|t|u]/ ) ) {
958
$perc_N = $perc_N * 100 / length $seq_string;
965
Usage : my $ref = $ass_io->_redundancy($contigobj)
966
Function: Calculate the fold coverage (redundancy) of a contig consensus
967
(average number of read base pairs covering the consensus)
968
Returns : decimal number
969
Args : Bio::Assembly::Contig
974
# redundancy = (sum of all aligned read lengths - ( number of gaps in gapped
975
# consensus + number of gaps in aligned reads that are also in the consensus ) )
976
# / length of ungapped consensus
977
my ($self, $contigobj) = @_;
980
# sum of all aligned read lengths
982
for my $readobj ( $contigobj->each_seq ) {
983
my $read_length = length($readobj->seq);
984
$read_tot += $read_length;
986
$redundancy += $read_tot;
989
my $consensus_sequence = $contigobj->get_consensus_sequence->seq;
990
my @consensus_gaps = ();
991
$contigobj->_register_gaps($consensus_sequence, \@consensus_gaps);
992
my $respected_gaps = scalar(@consensus_gaps);
993
if ($respected_gaps > 0) {
994
my @cons_arr = split //, $consensus_sequence;
995
for my $gap_pos_cons ( @consensus_gaps ) {
996
for my $readobj ( $contigobj->each_seq ) {
997
my $readid = $readobj->id;
998
my $read_start = $contigobj->change_coord(
999
"aligned $readid", 'gapped consensus', $readobj->start);
1000
my $read_end = $contigobj->change_coord(
1001
"aligned $readid", 'gapped consensus', $readobj->end );
1002
# skip this if consensus gap position not within in the read boundaries
1003
next if ( ($gap_pos_cons < $read_start)
1004
|| ($gap_pos_cons > $read_end) );
1005
# does the read position have read have a gap?
1006
my @read_arr = split //, $readobj->seq;
1007
my $gap_pos_read = $contigobj->change_coord(
1008
'gapped consensus', "aligned $readid", $gap_pos_cons);
1009
if ($read_arr[$gap_pos_read-1] eq $cons_arr[$gap_pos_cons-1]) {
1015
$redundancy -= $respected_gaps;
1017
# / length of ungapped consensus
1018
my $contig_length = length($self->_ungap($contigobj->get_consensus_sequence->seq));
1019
$redundancy /= $contig_length;
1027
Usage : my $ungapped = $ass_io->_ungap($gapped)
1028
Function: Remove the gaps from a sequence. Gaps are - in TIGR Assembler
1035
my ($self, $seq_string) = @_;
1036
$seq_string =~ s/-//g;
1043
Usage : my $timepoint = $ass_io->date_time
1044
Function: Get date and time (MM//DD/YY HH:MM:SS)
1052
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
1053
my $formatted_date_time =
1054
sprintf('%02d', $mon+1).'/'.
1055
sprintf('%02d', $mday).'/'.
1056
sprintf('%02d', $year % 100).
1058
sprintf('%02d', $hour).':'.
1059
sprintf('%02d', $min).':'.
1060
sprintf('%02d',$sec)
1062
return $formatted_date_time;
1065
=head2 _split_seq_name_and_db
1067
Title : _split_seq_name_and_db
1068
Usage : my ($seqname, $db) = $ass_io->_split_seq_name_and_db($id)
1069
Function: Extract seq_name and db from sequence id
1070
Returns : seq_name, db
1075
sub _split_seq_name_and_db {
1076
my ($self, $id) = @_;
1079
if ($id =~ m/(\S+)\|(\S+)/) {
1085
return ($seq_name, $db);
1088
=head2 _merge_seq_name_and_db
1090
Title : _merge_seq_name_and_db
1091
Usage : my $id = $ass_io->_merge_seq_name_and_db($seq_name, $db)
1092
Function: Construct id from seq_name and db
1098
sub _merge_seq_name_and_db {
1099
my ($self, $seq_name, $db) = @_;
1102
$id = $db.'|'.$seq_name;
1112
Usage : my $id = $ass_io->__coord($readobj, $contigobj)
1113
Function: Get different coordinates for the read
1114
Returns : number, number, number, number, number
1115
Args : Bio::Assembly::Seq, Bio::Assembly::Contig
1120
my ($self, $readobj, $contigobj) = @_;
1121
my ($asm_lend, $asm_rend, $seq_lend, $seq_rend, $offset) = (0, 0, 0, 0, 0);
1123
# Get read gapped consensus coordinates from contig and calculate
1124
# asm_lend and asm_rend in ungapped consensus
1125
my $aln_lend = $contigobj->get_seq_coord($readobj)->location->start;
1126
my $aln_rend = $contigobj->get_seq_coord($readobj)->location->end;
1127
$asm_lend = $contigobj->change_coord(
1128
'gapped consensus', 'ungapped consensus', $aln_lend);
1129
$asm_rend = $contigobj->change_coord(
1130
'gapped consensus', 'ungapped consensus', $aln_rend);
1132
# Get gapped consensus coordinates for quality-clipped reads from contig
1133
# annotation and determine seq_lend and seq_rend in unaligned sequence coord
1134
my $readclip = (grep
1135
{ $_->primary_tag eq '_quality_clipping:'.$readobj->primary_id }
1136
$contigobj->get_features_collection->get_all_features
1138
my $clip_lend = $readclip->location->start;
1139
my $clip_rend = $readclip->location->end;
1140
$seq_lend = $contigobj->change_coord(
1141
'gapped consensus', 'aligned '.$readobj->id, $clip_lend);
1142
$seq_rend = $contigobj->change_coord(
1143
'gapped consensus', 'aligned '.$readobj->id, $clip_rend);
1146
$offset = $aln_lend - 1;
1148
return ($asm_lend, $asm_rend, $seq_lend, $seq_rend, $offset);