1
# $Id: ace.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $
3
## BioPerl module for Bio::Assembly::IO::ace
5
# Copyright by Robson F. de Souza
7
# You may distribute this module under the same terms as perl itself
9
# POD documentation - main docs before the code
13
Bio::Assembly::IO::ace - module to load phrap ACE files.
17
# Building an input stream
18
use Bio::Assembly::IO;
20
# Assembly loading methods
21
$io = new Bio::Assembly::IO(-file=>"SGC0-424.fasta.screen.ace.1",
24
$assembly = $io->next_assembly;
28
This package loads the ACE files from the (phred/phrap/consed) package
29
by Phill Green. It was written to be used as a driver module for
30
Bio::Assembly::IO input/output.
34
Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
35
Bio::Assembly::Contig objects. In addition to default
36
"_aligned_coord:$seqID" feature class from Bio::Assembly::Contig, contig
37
objects loaded by this module will have the following special feature
38
classes in their feature collection:
40
"_align_clipping:$seqID" : location of subsequence in sequence $seqID
41
which is aligned to the contig
43
"_quality_clipping:$seqID" : location of good quality subsequence for
46
"consensus tags", as they are called in Consed's documentation, is
47
equivalent to a bioperl sequence feature and, therefore, are added to
48
the feature collection using their type field (see Consed's README.txt
51
"read tags" are also sequence features and are stored as
52
sub_SeqFeatures of the sequence's coordinate feature (the
53
corresponding "_aligned_coord:$seqID" feature, easily accessed through
54
get_seq_coord() method).
56
"whole assembly tags" have no start and end, as they are not
57
associated to any particular sequence in the assembly, and are added
58
to the assembly's annotation collection using phrap as tag.
64
User feedback is an integral part of the evolution of this and other
65
Bioperl modules. Send your comments and suggestions preferably to the
66
Bioperl mailing lists Your participation is much appreciated.
68
bioperl-l@bioperl.org - General discussion
69
http://bio.perl.org/MailList.html - About the mailing lists
73
Report bugs to the Bioperl bug tracking system to help us keep track
74
the bugs and their resolution. Bug reports can be submitted via email
77
bioperl-bugs@bio.perl.org
78
http://bugzilla.bioperl.org/
80
=head1 AUTHOR - Robson Francisco de Souza
82
Email rfsouza@citri.iq.usp.br
86
The rest of the documentation details each of the object
87
methods. Internal methods are usually preceded with a _
91
package Bio::Assembly::IO::ace;
96
use Bio::Assembly::IO;
97
use Bio::Assembly::Scaffold;
98
use Bio::Assembly::Contig;
99
use Bio::LocatableSeq;
100
use Bio::Annotation::SimpleValue;
101
use Bio::Seq::PrimaryQual;
102
use Bio::SeqFeature::Generic;
104
@ISA = qw(Bio::Assembly::IO);
106
=head1 Parser methods
110
Title : next_assembly
111
Usage : $unigene = $stream->next_assembly()
112
Function: returns the next assembly in the stream
113
Returns : Bio::Assembly::Scaffold object
119
my $self = shift; # Object reference
122
# Resetting assembly data structure
123
my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
125
# Looping over all ACE file lines
126
my ($contigOBJ,$read_name);
127
my $read_data = {}; # Temporary holder for read data
128
while ($_ = $self->_readline) { # By now, ACE files hold a single assembly
131
# Loading assembly information (ASsembly field)
132
# (/^AS (\d+) (\d+)/) && do {
133
# $assembly->_set_nof_contigs($1);
134
# $assembly->_set_nof_sequences_in_contigs($2);
137
# Loading contig sequence (COntig sequence field)
138
(/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig found!
140
$contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap', -id=>$contigID);
141
# $contigOBJ->set_nof_bases($2); # Contig length in base pairs
142
# $contigOBJ->set_nof_reads($3); # Number of reads in this contig
143
# $contigOBJ->set_nof_segments($4); # Number of read segments selected for consensus assembly
144
my $ori = ($5 eq 'U' ? 1 : -1); # 'C' if contig was complemented (using consed) or U if not (default)
145
$contigOBJ->strand($ori);
146
my $consensus_sequence = undef;
147
while ($_ = $self->_readline) { # Looping over contig lines
148
chomp; # Drop <ENTER> (\n) on current line
149
last if (/^$/); # Stop if empty line (contig end) is found
150
s/\*/-/g; # Forcing '-' as gap symbol
151
$consensus_sequence .= $_;
154
my $consensus_length = length($consensus_sequence);
155
$consensus_sequence = Bio::LocatableSeq->new(-seq=>$consensus_sequence,
157
-end=>$consensus_length,
159
$contigOBJ->set_consensus_sequence($consensus_sequence);
160
$assembly->add_contig($contigOBJ);
163
# Loading contig qualities... (Base Quality field)
165
my $consensus = $contigOBJ->get_consensus_sequence()->seq();
169
while ($_ = $self->_readline) {
172
@tmp = grep { /^\d+$/ } split(/\s+/);
177
# IF base is a gap, quality is the average for neighbouring sites
178
if (substr($consensus,$j,1) eq '-') {
179
$previous = $tmp[$i-1] unless ($i == 0);
185
push(@quality,int(($previous+$next)/2));
187
push(@quality,$tmp[$i]);
194
my $qual = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality),
195
-id=>$contigOBJ->id());
196
$contigOBJ->set_consensus_quality($qual);
199
# Loading read info... (Assembled From field)
200
/^AF (\S+) (C|U) (-*\d+)/ && do {
201
$read_name = $1; my $ori = $2;
202
$read_data->{$read_name}{'padded_start'} = $3; # aligned start
203
$ori = ( $ori eq 'U' ? 1 : -1);
204
$read_data->{$read_name}{'strand'} = $ori;
207
# Loading base segments definitions (Base Segment field)
208
# /^BS (\d+) (\d+) (\S+)/ && do {
209
# if (exists($self->{'contigs'}[$contig]{'reads'}{$3}{'segments'})) {
210
# $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} .= " " . $1 . " " . $2;
211
# } else { $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} = $1 . " " . $2 }
214
# Loading reads... (ReaD sequence field
215
/^RD (\S+) (-*\d+) (\d+) (\d+)/ && do {
217
$read_data->{$read_name}{'length'} = $2; # number_of_padded_bases
218
$read_data->{$read_name}{'contig'} = $contigOBJ;
219
# $read_data->{$read_name}{'number_of_read_info_items'} = $3;
220
# $read_data->{$read_name}{'number_of_tags'} = $4;
222
while ($_ = $self->_readline) {
225
s/\*/-/g; # Forcing '-' as gap symbol
226
$read_sequence .= $_; # aligned read sequence
228
my $read = Bio::LocatableSeq->new(-seq=>$read_sequence,
230
-end=>$read_data->{$read_name}{'length'},
231
-strand=>$read_data->{$read_name}{'strand'},
233
-primary_id=>$read_name,
236
# Adding read location and sequence to contig ("gapped consensus" coordinates)
237
my $padded_start = $read_data->{$read_name}{'padded_start'};
238
my $padded_end = $padded_start + $read_data->{$read_name}{'length'} - 1;
239
my $coord = Bio::SeqFeature::Generic->new(-start=>$padded_start,
241
-strand=>$read_data->{$read_name}{'strand'},
242
-tag => { 'contig' => $contigOBJ->id }
244
$contigOBJ->set_seq_coord($coord,$read);
247
# Loading read trimming and alignment ranges...
248
/^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do {
249
my $qual_start = $1; my $qual_end = $2;
250
my $align_start = $3; my $align_end = $4;
252
unless ($align_start == -1 && $align_end == -1) {
253
$align_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_start);
254
$align_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_end);
255
my $align_feat = Bio::SeqFeature::Generic->new(-start=>$align_start,
257
-strand=>$read_data->{$read_name}{'strand'},
258
-primary=>"_align_clipping:$read_name");
259
$align_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
260
$contigOBJ->add_features([ $align_feat ], 0);
263
unless ($qual_start == -1 && $qual_end == -1) {
264
$qual_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start);
265
$qual_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end);
266
my $qual_feat = Bio::SeqFeature::Generic->new(-start=>$qual_start,
268
-strand=>$read_data->{$read_name}{'strand'},
269
-primary=>"_quality_clipping:$read_name");
270
$qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
271
$contigOBJ->add_features([ $qual_feat ], 0);
275
# Loading read description (DeScription fields)
277
# /CHEM: (\S+)/ && do {
278
# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chemistry'} = $1;
280
# /CHROMAT_FILE: (\S+)/ && do {
281
# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chromat_file'} = $1;
283
# /DIRECTION: (\w+)/ && do {
285
# if ($ori eq 'rev') { $ori = 'C' }
286
# elsif ($ori eq 'fwd') { $ori = 'U' }
287
# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'strand'} = $ori;
289
# /DYE: (\S+)/ && do {
290
# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'dye'} = $1;
292
# /PHD_FILE: (\S+)/ && do {
293
# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_file'} = $1;
295
# /TEMPLATE: (\S+)/ && do {
296
# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'template'} = $1;
298
# /TIME: (\S+ \S+ \d+ \d+\:\d+\:\d+ \d+)/ && do {
299
# $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_time'} = $1;
303
# Loading contig tags ('tags' in phrap terminology, but Bioperl calls them features)
305
my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
306
$contigID =~ s/^Contig//i;
307
my $extra_info = undef;
308
while ($_ = $self->_readline) {
312
my $contig_tag = Bio::SeqFeature::Generic->new(-start=>$start,
315
-tag=>{ 'source' => $source,
316
'creation_date' => $date,
317
'extra_info' => $extra_info
319
$assembly->get_contig_by_id($contigID)->add_features([ $contig_tag ],1);
324
my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
325
my $extra_info = undef;
326
while ($_ = $self->_readline) {
330
$start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$start);
331
$end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$end);
332
my $read_tag = Bio::SeqFeature::Generic->new(-start=>$start,
335
-tag=>{ 'source' => $source,
336
'creation_date' => $date,
337
'extra_info' => $extra_info
339
my $contig = $read_data->{$readID}{'contig'};
340
my $coord = $contig->get_seq_coord( $contig->get_seq_by_name($readID) );
341
$coord->add_sub_SeqFeature($read_tag);
346
my ($type,$source,$date) = split(' ',$self->_readline);
347
my $extra_info = undef;
348
while ($_ = $self->_readline) {
352
#? push(@line,\@extra_info);
353
my $assembly_tag = join(" ","TYPE: ",$type,"PROGRAM:",$source,
354
"DATE:",$date,"DATA:",$extra_info);
355
$assembly_tag = Bio::Annotation::SimpleValue->new(-value=>$assembly_tag);
356
$assembly->annotation->add_Annotation('phrap',$assembly_tag);
359
} # while ($_ = $self->_readline)
361
$assembly->update_seq_list();
365
=head2 write_assembly
367
Title : write_assembly
368
Usage : $ass_io->write_assembly($assembly)
369
Function: Write the assembly object in Phrap compatible ACE format
370
Returns : 1 on success, 0 for error
371
Args : A Bio::Assembly::Scaffold object
375
sub write_assemebly {
378
$self->throw("Writing phrap ACE files is not implemented yet! Sorry...");