~ubuntu-branches/ubuntu/lucid/bioperl/lucid

« back to all changes in this revision

Viewing changes to Bio/Assembly/IO/ace.pm

  • Committer: Bazaar Package Importer
  • Author(s): Matt Hope
  • Date: 2004-04-18 14:24:11 UTC
  • mfrom: (1.2.1 upstream) (2.1.1 warty)
  • Revision ID: james.westby@ubuntu.com-20040418142411-gr92uexquw4w8liq
Tags: 1.4-1
* New upstream release
* Examples and working code are installed by default to usr/bin,
  this has been moved to usr/share/doc/bioperl/bin

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
# $Id: ace.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $
 
2
#
 
3
## BioPerl module for Bio::Assembly::IO::ace
 
4
#
 
5
# Copyright by Robson F. de Souza
 
6
#
 
7
# You may distribute this module under the same terms as perl itself
 
8
 
 
9
# POD documentation - main docs before the code
 
10
 
 
11
=head1 NAME
 
12
 
 
13
Bio::Assembly::IO::ace -  module to load phrap ACE files.
 
14
 
 
15
=head1 SYNOPSYS
 
16
 
 
17
    # Building an input stream
 
18
    use Bio::Assembly::IO;
 
19
 
 
20
    # Assembly loading methods
 
21
    $io = new Bio::Assembly::IO(-file=>"SGC0-424.fasta.screen.ace.1",
 
22
                         -format=>"ace");
 
23
 
 
24
    $assembly = $io->next_assembly;
 
25
 
 
26
=head1 DESCRIPTION
 
27
 
 
28
This package loads the ACE files from the (phred/phrap/consed) package
 
29
by Phill Green.  It was written to be used as a driver module for
 
30
Bio::Assembly::IO input/output.
 
31
 
 
32
=head2 Implemention
 
33
 
 
34
Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
 
35
Bio::Assembly::Contig objects. In addition to default
 
36
"_aligned_coord:$seqID" feature class from Bio::Assembly::Contig, contig
 
37
objects loaded by this module will have the following special feature
 
38
classes in their feature collection:
 
39
 
 
40
"_align_clipping:$seqID" : location of subsequence in sequence $seqID
 
41
                           which is aligned to the contig
 
42
 
 
43
"_quality_clipping:$seqID" : location of good quality subsequence for
 
44
                             sequence $seqID
 
45
 
 
46
"consensus tags", as they are called in Consed's documentation, is
 
47
equivalent to a bioperl sequence feature and, therefore, are added to
 
48
the feature collection using their type field (see Consed's README.txt
 
49
file) as primary tag.
 
50
 
 
51
"read tags" are also sequence features and are stored as
 
52
sub_SeqFeatures of the sequence's coordinate feature (the
 
53
corresponding "_aligned_coord:$seqID" feature, easily accessed through
 
54
get_seq_coord() method).
 
55
 
 
56
"whole assembly tags" have no start and end, as they are not
 
57
associated to any particular sequence in the assembly, and are added
 
58
to the assembly's annotation collection using phrap as tag.
 
59
 
 
60
=head1 FEEDBACK
 
61
 
 
62
=head2 Mailing Lists
 
63
 
 
64
User feedback is an integral part of the evolution of this and other
 
65
Bioperl modules. Send your comments and suggestions preferably to the
 
66
Bioperl mailing lists  Your participation is much appreciated.
 
67
 
 
68
  bioperl-l@bioperl.org                 - General discussion
 
69
  http://bio.perl.org/MailList.html     - About the mailing lists
 
70
 
 
71
=head2 Reporting Bugs
 
72
 
 
73
Report bugs to the Bioperl bug tracking system to help us keep track
 
74
the bugs and their resolution.  Bug reports can be submitted via email
 
75
or the web:
 
76
 
 
77
  bioperl-bugs@bio.perl.org
 
78
  http://bugzilla.bioperl.org/
 
79
 
 
80
=head1 AUTHOR - Robson Francisco de Souza
 
81
 
 
82
Email rfsouza@citri.iq.usp.br
 
83
 
 
84
=head1 APPENDIX
 
85
 
 
86
The rest of the documentation details each of the object
 
87
methods. Internal methods are usually preceded with a _
 
88
 
 
89
=cut
 
90
 
 
91
package Bio::Assembly::IO::ace;
 
92
 
 
93
use strict;
 
94
use vars qw(@ISA);
 
95
 
 
96
use Bio::Assembly::IO;
 
97
use Bio::Assembly::Scaffold;
 
98
use Bio::Assembly::Contig;
 
99
use Bio::LocatableSeq;
 
100
use Bio::Annotation::SimpleValue;
 
101
use Bio::Seq::PrimaryQual;
 
102
use Bio::SeqFeature::Generic;
 
103
 
 
104
@ISA = qw(Bio::Assembly::IO);
 
105
 
 
106
=head1 Parser methods
 
107
 
 
108
=head2 next_assembly
 
109
 
 
110
 Title   : next_assembly
 
111
 Usage   : $unigene = $stream->next_assembly()
 
112
 Function: returns the next assembly in the stream
 
113
 Returns : Bio::Assembly::Scaffold object
 
114
 Args    : NONE
 
115
 
 
116
=cut
 
117
 
 
118
sub next_assembly {
 
119
    my $self = shift; # Object reference
 
120
    local $/="\n";
 
121
 
 
122
    # Resetting assembly data structure
 
123
    my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
 
124
 
 
125
    # Looping over all ACE file lines
 
126
    my ($contigOBJ,$read_name);
 
127
    my $read_data = {}; # Temporary holder for read data
 
128
    while ($_ = $self->_readline) { # By now, ACE files hold a single assembly
 
129
        chomp;
 
130
 
 
131
        # Loading assembly information (ASsembly field)
 
132
#       (/^AS (\d+) (\d+)/) && do {
 
133
#           $assembly->_set_nof_contigs($1);
 
134
#           $assembly->_set_nof_sequences_in_contigs($2);
 
135
#       };
 
136
 
 
137
        # Loading contig sequence (COntig sequence field)
 
138
        (/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig found!
 
139
            my $contigID = $1;
 
140
            $contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap', -id=>$contigID);
 
141
#           $contigOBJ->set_nof_bases($2); # Contig length in base pairs
 
142
#           $contigOBJ->set_nof_reads($3); # Number of reads in this contig
 
143
#           $contigOBJ->set_nof_segments($4); # Number of read segments selected for consensus assembly
 
144
            my $ori = ($5 eq 'U' ? 1 : -1); # 'C' if contig was complemented (using consed) or U if not (default)
 
145
            $contigOBJ->strand($ori);
 
146
            my $consensus_sequence = undef;
 
147
            while ($_ = $self->_readline) { # Looping over contig lines
 
148
                chomp;                   # Drop <ENTER> (\n) on current line
 
149
                last if (/^$/);          # Stop if empty line (contig end) is found
 
150
                s/\*/-/g; # Forcing '-' as gap symbol
 
151
                $consensus_sequence .= $_;
 
152
            }
 
153
 
 
154
            my $consensus_length = length($consensus_sequence);
 
155
            $consensus_sequence = Bio::LocatableSeq->new(-seq=>$consensus_sequence,
 
156
                                                              -start=>1,
 
157
                                                              -end=>$consensus_length,
 
158
                                                              -id=>$contigID);
 
159
            $contigOBJ->set_consensus_sequence($consensus_sequence);
 
160
            $assembly->add_contig($contigOBJ);
 
161
        };
 
162
 
 
163
        # Loading contig qualities... (Base Quality field)
 
164
        /^BQ/ && do {
 
165
            my $consensus = $contigOBJ->get_consensus_sequence()->seq();
 
166
            my ($i,$j,@tmp);
 
167
            my @quality = ();
 
168
            $j = 0;
 
169
            while ($_ = $self->_readline) {
 
170
                chomp;
 
171
                last if (/^$/);
 
172
                @tmp = grep { /^\d+$/ } split(/\s+/);
 
173
                $i = 0;
 
174
                my $previous = 0;
 
175
                my $next     = 0;
 
176
                while ($i<=$#tmp) {
 
177
                    # IF base is a gap, quality is the average for neighbouring sites
 
178
                    if (substr($consensus,$j,1) eq '-') {
 
179
                        $previous = $tmp[$i-1] unless ($i == 0);
 
180
                        if ($i < $#tmp) {
 
181
                            $next = $tmp[$i+1];
 
182
                        } else {
 
183
                            $next = 0;
 
184
                        }
 
185
                        push(@quality,int(($previous+$next)/2));
 
186
                    } else {
 
187
                        push(@quality,$tmp[$i]);
 
188
                        $i++;
 
189
                    }
 
190
                    $j++;
 
191
                }
 
192
            }
 
193
 
 
194
            my $qual = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality),
 
195
                                                  -id=>$contigOBJ->id());
 
196
            $contigOBJ->set_consensus_quality($qual);
 
197
        };
 
198
 
 
199
        # Loading read info... (Assembled From field)
 
200
        /^AF (\S+) (C|U) (-*\d+)/ && do {
 
201
            $read_name = $1; my $ori = $2;
 
202
            $read_data->{$read_name}{'padded_start'} = $3; # aligned start
 
203
            $ori = ( $ori eq 'U' ? 1 : -1);
 
204
            $read_data->{$read_name}{'strand'}  = $ori;
 
205
        };
 
206
 
 
207
        # Loading base segments definitions (Base Segment field)
 
208
#       /^BS (\d+) (\d+) (\S+)/ && do {
 
209
#           if (exists($self->{'contigs'}[$contig]{'reads'}{$3}{'segments'})) {
 
210
#               $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} .= " " . $1 . " " . $2;
 
211
#           } else { $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} = $1 . " " . $2 }
 
212
#       };
 
213
 
 
214
        # Loading reads... (ReaD sequence field
 
215
        /^RD (\S+) (-*\d+) (\d+) (\d+)/ && do {
 
216
            $read_name = $1;
 
217
            $read_data->{$read_name}{'length'} = $2; # number_of_padded_bases
 
218
            $read_data->{$read_name}{'contig'} = $contigOBJ;
 
219
#           $read_data->{$read_name}{'number_of_read_info_items'} = $3;
 
220
#           $read_data->{$read_name}{'number_of_tags'}            = $4;
 
221
            my $read_sequence;
 
222
            while ($_ = $self->_readline) {
 
223
                chomp;
 
224
                last if (/^$/);
 
225
                s/\*/-/g; # Forcing '-' as gap symbol
 
226
                $read_sequence .= $_; # aligned read sequence
 
227
            }
 
228
            my $read = Bio::LocatableSeq->new(-seq=>$read_sequence,
 
229
                                              -start=>1,
 
230
                                              -end=>$read_data->{$read_name}{'length'},
 
231
                                              -strand=>$read_data->{$read_name}{'strand'},
 
232
                                              -id=>$read_name,
 
233
                                              -primary_id=>$read_name,
 
234
                                              -alphabet=>'dna');
 
235
 
 
236
            # Adding read location and sequence to contig ("gapped consensus" coordinates)
 
237
            my $padded_start = $read_data->{$read_name}{'padded_start'};
 
238
            my $padded_end   = $padded_start + $read_data->{$read_name}{'length'} - 1;
 
239
            my $coord = Bio::SeqFeature::Generic->new(-start=>$padded_start,
 
240
                                                      -end=>$padded_end,
 
241
                                                      -strand=>$read_data->{$read_name}{'strand'},
 
242
                                                      -tag => { 'contig' => $contigOBJ->id }
 
243
                                                      );
 
244
            $contigOBJ->set_seq_coord($coord,$read);
 
245
        };
 
246
 
 
247
        # Loading read trimming and alignment ranges...
 
248
        /^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do {
 
249
            my $qual_start  = $1; my $qual_end  = $2;
 
250
            my $align_start = $3; my $align_end = $4;
 
251
 
 
252
            unless ($align_start == -1 && $align_end == -1) {
 
253
                $align_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_start);
 
254
                $align_end   = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_end);
 
255
                my $align_feat = Bio::SeqFeature::Generic->new(-start=>$align_start,
 
256
                                                               -end=>$align_end,
 
257
                                                               -strand=>$read_data->{$read_name}{'strand'},
 
258
                                                               -primary=>"_align_clipping:$read_name");
 
259
                $align_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
 
260
                $contigOBJ->add_features([ $align_feat ], 0);
 
261
            }
 
262
 
 
263
            unless ($qual_start == -1 && $qual_end == -1) {
 
264
                $qual_start  = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start);
 
265
                $qual_end    = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end);
 
266
                my $qual_feat = Bio::SeqFeature::Generic->new(-start=>$qual_start,
 
267
                                                              -end=>$qual_end,
 
268
                                                              -strand=>$read_data->{$read_name}{'strand'},
 
269
                                                              -primary=>"_quality_clipping:$read_name");
 
270
                $qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
 
271
                $contigOBJ->add_features([ $qual_feat ], 0);
 
272
            }
 
273
        };
 
274
 
 
275
        # Loading read description (DeScription fields)
 
276
#       /^DS / && do {
 
277
#           /CHEM: (\S+)/ && do {
 
278
#               $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chemistry'} = $1;
 
279
#           };
 
280
#           /CHROMAT_FILE: (\S+)/ && do {
 
281
#               $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chromat_file'} = $1;
 
282
#           };
 
283
#           /DIRECTION: (\w+)/ && do {
 
284
#               my $ori = $1;
 
285
#               if    ($ori eq 'rev') { $ori = 'C' }
 
286
#               elsif ($ori eq 'fwd') { $ori = 'U' }
 
287
#               $self->{'contigs'}[$contig]{'reads'}{$read_name}{'strand'} = $ori;
 
288
#           };
 
289
#           /DYE: (\S+)/ && do {
 
290
#               $self->{'contigs'}[$contig]{'reads'}{$read_name}{'dye'} = $1;
 
291
#           };
 
292
#           /PHD_FILE: (\S+)/ && do {
 
293
#               $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_file'} = $1;
 
294
#           };
 
295
#           /TEMPLATE: (\S+)/ && do {
 
296
#               $self->{'contigs'}[$contig]{'reads'}{$read_name}{'template'} = $1;
 
297
#           };
 
298
#           /TIME: (\S+ \S+ \d+ \d+\:\d+\:\d+ \d+)/ && do {
 
299
#               $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_time'} = $1;
 
300
#           };
 
301
#       };
 
302
 
 
303
        # Loading contig tags ('tags' in phrap terminology, but Bioperl calls them features)
 
304
        /^CT\s*\{/ && do {
 
305
            my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
 
306
            $contigID =~ s/^Contig//i;
 
307
            my $extra_info = undef;
 
308
            while ($_ = $self->_readline) {
 
309
                last if (/\}/);
 
310
                $extra_info .= $_;
 
311
            }
 
312
            my $contig_tag = Bio::SeqFeature::Generic->new(-start=>$start,
 
313
                                                           -end=>$end,
 
314
                                                           -primary=>$type,
 
315
                                                           -tag=>{ 'source' => $source,
 
316
                                                                   'creation_date' => $date,
 
317
                                                                   'extra_info' => $extra_info
 
318
                                                               });
 
319
            $assembly->get_contig_by_id($contigID)->add_features([ $contig_tag ],1);
 
320
        };
 
321
 
 
322
        # Loading read tags
 
323
        /^RT\s*\{/ && do {
 
324
            my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
 
325
            my $extra_info = undef;
 
326
            while ($_ = $self->_readline) {
 
327
                last if (/\}/);
 
328
                $extra_info .= $_;
 
329
            }
 
330
            $start  = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$start);
 
331
            $end    = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$end);
 
332
            my $read_tag = Bio::SeqFeature::Generic->new(-start=>$start,
 
333
                                                         -end=>$end,
 
334
                                                         -primary=>$type,
 
335
                                                         -tag=>{ 'source' => $source,
 
336
                                                                 'creation_date' => $date,
 
337
                                                                 'extra_info' => $extra_info
 
338
                                                                 });
 
339
            my $contig = $read_data->{$readID}{'contig'};
 
340
            my $coord  = $contig->get_seq_coord( $contig->get_seq_by_name($readID) );
 
341
            $coord->add_sub_SeqFeature($read_tag);
 
342
        };
 
343
 
 
344
        # Loading read tags
 
345
        /^WA\s*\{/ && do {
 
346
            my ($type,$source,$date) = split(' ',$self->_readline);
 
347
            my $extra_info = undef;
 
348
            while ($_ = $self->_readline) {
 
349
                last if (/\}/);
 
350
                $extra_info = $_;
 
351
            }
 
352
#?          push(@line,\@extra_info);
 
353
            my $assembly_tag = join(" ","TYPE: ",$type,"PROGRAM:",$source,
 
354
                                    "DATE:",$date,"DATA:",$extra_info);
 
355
            $assembly_tag = Bio::Annotation::SimpleValue->new(-value=>$assembly_tag);
 
356
            $assembly->annotation->add_Annotation('phrap',$assembly_tag);
 
357
        };
 
358
 
 
359
    } # while ($_ = $self->_readline)
 
360
 
 
361
    $assembly->update_seq_list();
 
362
    return $assembly;
 
363
}
 
364
 
 
365
=head2 write_assembly
 
366
 
 
367
    Title   : write_assembly
 
368
    Usage   : $ass_io->write_assembly($assembly)
 
369
    Function: Write the assembly object in Phrap compatible ACE format
 
370
    Returns : 1 on success, 0 for error
 
371
    Args    : A Bio::Assembly::Scaffold object
 
372
 
 
373
=cut
 
374
 
 
375
sub write_assemebly {
 
376
    my $self = shift;
 
377
 
 
378
    $self->throw("Writing phrap ACE files is not implemented yet! Sorry...");
 
379
}
 
380
 
 
381
1;
 
382
 
 
383
__END__