188
188
# check requirements
189
189
unless ( eval "require Bio::DB::Sam; 1" ) {
190
Bio::Root::Root->throw("__PACKAGE__ requires installation of samtools (libbam) and Bio::DB::Sam (available on CPAN; not part of BioPerl)");
190
Bio::Root::Root->throw(__PACKAGE__.' requires installation of samtools (libbam) and Bio::DB::Sam (available on CPAN; not part of BioPerl)');
192
192
unless ( eval "require IO::Uncompress::Gunzip; \$HAVE_IO_UNCOMPRESS = 1") {
193
Bio::Root::Root->warn("IO::Uncompress::Gunzip is not available; you'll have to do your decompression by hand.");
193
Bio::Root::Root->warn("IO::Uncompress::Gunzip is not available; you'll have to do your decompression by hand.");
231
231
my $assembly = Bio::Assembly::Scaffold->new( -progname => $progname );
233
233
foreach my $id (@refseq_ids) {
234
#### this is choice 1: all refseqs into one assy...###
235
$self->_current_refseq_id( $id );
236
# Load contigs and singlets in the scaffold
237
while ( my $obj = $self->next_contig()) {
238
# Add contig /singlet to assembly
239
if ($obj->isa('Bio::Assembly::Singlet')) { # a singlet
240
$assembly->add_singlet($obj);
242
$assembly->add_contig($obj);
234
#### this is choice 1: all refseqs into one assy...###
235
$self->_current_refseq_id( $id );
236
# Load contigs and singlets in the scaffold
237
while ( my $obj = $self->next_contig()) {
238
# Add contig /singlet to assembly
239
if ($obj->isa('Bio::Assembly::Singlet')) { # a singlet
240
$assembly->add_singlet($obj);
242
$assembly->add_contig($obj);
246
246
return $assembly;
259
259
sub next_contig {
260
260
my $self = shift;
261
261
if (!defined $self->_current_contig_seg_idx) {
262
$self->_current_contig_seg_idx(0);
262
$self->_current_contig_seg_idx(0);
265
$self->_current_contig_seg_idx( 1+$self->_current_contig_seg_idx );
265
$self->_current_contig_seg_idx( 1+$self->_current_contig_seg_idx );
267
267
unless ($self->_current_refseq_id) {
268
croak("No current refseq id set");
268
croak("No current refseq id set");
270
270
my $contig_segs = $self->_segset($self->_current_refseq_id);
271
271
unless ($contig_segs && @$contig_segs) {
272
croak("No contig segset for current id '".$self->_current_refseq_id."'")
272
croak("No contig segset for current id '".$self->_current_refseq_id."'")
274
274
# each segment in @$contig_segs represents a contig within the
287
287
while ( my $read = $seqio->next_seq ) {
288
if ($self->{_assigned}->{$read->name}) {
292
$self->{_assigned}->{$read->name}=1;
293
$self->_store_read($read, $contigobj);
288
if ($self->{_assigned}->{$read->name}) {
291
$self->{_assigned}->{$read->name}=1;
292
$self->_store_read($read, $contigobj);
296
295
if ($numseq == 1) { # ooh! a singlet!
297
$contigobj = $self->_store_singlet($contigobj);
296
$contigobj = $self->_store_singlet($contigobj);
299
298
return $contigobj;
381
380
my $self = shift;
382
381
my ($read, $contigobj) = @_;
383
382
my $readseq = Bio::LocatableSeq->new(
384
-display_id => $read->name,
385
-primary_id => $read->name,
388
-strand => $read->strand,
383
-display_id => $read->name,
384
-primary_id => $read->name,
387
-strand => $read->strand,
391
390
my $qual = Bio::Seq::PrimaryQual->new(
392
-id => $read->name."_qual",
393
-qual => [$read->qscore]
391
-id => $read->name."_qual",
392
-qual => [$read->qscore]
396
395
# add pair information
398
397
if ($read->proper_pair) { # mate also aligned
400
mate_start => $read->mate_start,
401
mate_len => $read->mate_len,
402
mate_strand => $read->mstrand,
403
insert_size => $read->isize
399
mate_start => $read->mate_start,
400
mate_len => $read->mate_len,
401
mate_strand => $read->mstrand,
402
insert_size => $read->isize
407
406
my $alncoord = Bio::SeqFeature::Generic->new(
408
-primary => $read->name,
409
-start => $read->start,
411
-strand => $read->strand,
412
-qual => join(' ',$read->qscore),
413
-tag => { 'contig' => $contigobj->id,
414
'cigar' => $read->cigar_str,
407
-primary => $read->name,
408
-start => $read->start,
410
-strand => $read->strand,
411
-qual => join(' ',$read->qscore),
412
-tag => { 'contig' => $contigobj->id,
413
'cigar' => $read->cigar_str,
418
417
$contigobj->set_seq_coord($alncoord, $readseq);
419
418
$contigobj->set_seq_qual( $readseq, $qual );
488
487
$file =~ s/^[<>+]*//; # byebye parasitic mode chars
489
488
my ($fname, $dir, $suf) = fileparse($file, ".sam", ".bam");
490
489
$self->_set_from_args({ '_basename' => $fname },
491
-methods => [qw( _basename)],
490
-methods => [qw( _basename)],
493
492
if (!defined $fasfile) {
494
for (qw( fas fa fasta fas.gz fa.gz fasta.gz )) {
495
$fasfile = File::Spec->catdir($dir, $self->_basename.$_);
493
for (qw( fas fa fasta fas.gz fa.gz fasta.gz )) {
494
$fasfile = File::Spec->catdir($dir, $self->_basename.$_);
500
499
unless (-e $fasfile) {
501
croak( "Can't find associated reference fasta db" );
500
croak( "Can't find associated reference fasta db" );
503
502
!$self->refdb && $self->refdb($fasfile);
505
504
if ($fasfile =~ /\.gz[^.]*$/) {
506
unless ($HAVE_IO_UNCOMPRESS) {
507
croak( "IO::Uncompress::Gunzip not available; can't decompress on the fly");
509
my ($tfh, $tf) = tempfile( UNLINK => 1);
510
my $z = IO::Uncompress::Gunzip->new($fasfile) or croak("Can't expand: $@");
511
while (<$z>) { print $tfh $_ }
505
unless ($HAVE_IO_UNCOMPRESS) {
506
croak( "IO::Uncompress::Gunzip not available; can't decompress on the fly");
508
my ($tfh, $tf) = tempfile( UNLINK => 1);
509
my $z = IO::Uncompress::Gunzip->new($fasfile) or croak("Can't expand: $@");
510
while (<$z>) { print $tfh $_ }
515
514
if ($file =~ /\.gz[^.]*$/) {
516
unless ($HAVE_IO_UNCOMPRESS) {
517
croak( "IO::Uncompress::Gunzip not available; can't decompress on the fly");
519
my ($tfh, $tf) = tempfile( UNLINK => 1);
520
my $z = IO::Uncompress::Gunzip->new($file) or croak("Can't expand: $@");
515
unless ($HAVE_IO_UNCOMPRESS) {
516
croak( "IO::Uncompress::Gunzip not available; can't decompress on the fly");
518
my ($tfh, $tf) = tempfile( UNLINK => 1);
519
my $z = IO::Uncompress::Gunzip->new($file) or croak("Can't expand: $@");
528
526
# sam conversion : just barf for now
531
$bam =~ s/\.sam/\.bam/;
532
croak( "'$file' looks like a text file.\n\tTo convert to the required .bam (binary SAM) format, run\n\t\$ samtools view -Sb $file > $bam\n");
529
$bam =~ s/\.sam/\.bam/;
530
croak( "'$file' looks like a text file.\n\tTo convert to the required .bam (binary SAM) format, run\n\t\$ samtools view -Sb $file > $bam\n");
535
533
$sam = Bio::DB::Sam->new( -bam => $file,
539
537
unless (defined $sam) {
540
croak( "Couldn't create the Bio::DB::Sam object" );
538
croak( "Couldn't create the Bio::DB::Sam object" );
542
540
$self->{sam} = $sam;
543
541
# now produce the contig segments for each seq_id...
544
542
for ($sam->seq_ids) {
545
my $seg = $sam->segment(-seq_id=>$_, -start=>1, -end=>$sam->length($_));
546
${$self->{_segset}}{$_} = [$self->_get_contig_segs_from_coverage($seg)];
543
my $seg = $sam->segment(-seq_id=>$_, -start=>1, -end=>$sam->length($_));
544
${$self->{_segset}}{$_} = [$self->_get_contig_segs_from_coverage($seg)];
566
564
my $self = shift;
567
565
my $segment = shift;
568
566
unless ($self->sam) {
569
croak("Sam object not yet initialized (call _init_sam)");
567
croak("Sam object not yet initialized (call _init_sam)");
571
569
unless ( ref($segment) =~ /Bio::DB::Sam::Segment/ ) {
572
croak("Requires Bio::DB::Sam::Segment object at arg 1");
570
croak("Requires Bio::DB::Sam::Segment object at arg 1");
574
572
my ($cov, @covdata, @rngs, @segs);
575
573
($cov) = $segment->features('coverage');
577
croak("No coverage data!");
575
croak("No coverage data!");
579
577
@covdata = $cov->coverage;
583
581
my ($lf_end,$rt_end);
584
582
for (0..$#covdata) {
592
# push previous range
593
if (defined $lf_end && defined $rt_end) {
594
push @rngs, [$lf_end, $rt_end];
590
# push previous range
591
if (defined $lf_end && defined $rt_end) {
592
push @rngs, [$lf_end, $rt_end];
605
602
push @rngs, [$lf_end, $rt_end] if (defined $lf_end and
609
carp ("No coverage for this segment!");
606
carp ("No coverage for this segment!");
613
push @segs, $self->sam->segment(-seq_id=>$segment->seq_id,
610
push @segs, $self->sam->segment(-seq_id=>$segment->seq_id,
638
635
my $region = $seg->seq_id.':'.$seg->start.'..'.$seg->end;
639
636
my $qual_averager = sub {
640
my ($seqid, $pos, $p) = @_;
637
my ($seqid, $pos, $p) = @_;
641
638
return unless ($seg->start <= $pos and $pos <= $seg->end);
644
for my $pileup (@$p) {
645
my $b = $pileup->alignment;
646
$acc += $b->qscore->[$pileup->qpos];
649
push @quals, int($acc/$n);
641
for my $pileup (@$p) {
642
my $b = $pileup->alignment;
643
$acc += $b->qscore->[$pileup->qpos];
646
push @quals, int($acc/$n);
651
648
$self->sam->pileup($region, $qual_averager);
673
670
my $region = $seg->seq_id.':'.$seg->start.'-'.$seg->end;
675
672
my $weighted_consensus = sub {
676
my ($seqid, $pos, $p) = @_;
673
my ($seqid, $pos, $p) = @_;
677
674
return unless ($seg->start <= $pos and $pos <= $seg->end);
680
for my $pileup (@$p) {
681
my $b = $pileup->alignment;
682
my $res = substr($b->qseq,$pileup->qpos,1);
683
$wt_tbl{$res} += $b->qscore->[$pileup->qpos] || 0;
688
my $c = (sort { $wt_tbl{$b}<=>$wt_tbl{$a} } keys %wt_tbl)[0];
690
push @quals, int($wt_tbl{$c}/$n{$c});
677
for my $pileup (@$p) {
678
my $b = $pileup->alignment;
679
my $res = substr($b->qseq,$pileup->qpos,1);
680
$wt_tbl{$res} += $b->qscore->[$pileup->qpos] || 0;
685
my $c = (sort { $wt_tbl{$b}<=>$wt_tbl{$a} } keys %wt_tbl)[0];
687
push @quals, int($wt_tbl{$c}/$n{$c});
694
690
$self->sam->pileup($region, $weighted_consensus);
695
691
return Bio::Seq::Quality->new(
696
-qual => join(' ', @quals ),
692
-qual => join(' ', @quals ),