~ubuntu-branches/ubuntu/vivid/cctools/vivid

« back to all changes in this revision

Viewing changes to sand/test/filter_verification/verify_candidates.pl

  • Committer: Bazaar Package Importer
  • Author(s): Michael Hanke
  • Date: 2011-05-07 09:05:00 UTC
  • Revision ID: james.westby@ubuntu.com-20110507090500-lqpmdtwndor6e7os
Tags: upstream-3.3.2
ImportĀ upstreamĀ versionĀ 3.3.2

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/perl
 
2
# Verify the correctness of the generated "random.cand" according to the
 
3
# original sequence file "random.seq" and "random_revcom.seq".
 
4
use strict;
 
5
use warnings;
 
6
 
 
7
my $seq_file = "random.seq";
 
8
my $seq_revcom_file = "random_revcom.seq";
 
9
my $cand_file = "random.cand";
 
10
 
 
11
my $read_size = 100;
 
12
my $kmer_size = 22;
 
13
my $count = 0;
 
14
my $seq;
 
15
my $seq_revcom;
 
16
my $line;
 
17
my $kmer1;
 
18
my $kmer2;
 
19
my $tmp_read;
 
20
 
 
21
# Read in the two sequences
 
22
open SEQFILE, "$seq_file" or die $!;
 
23
$seq = <SEQFILE>;
 
24
print length($seq) . " bases are read from $seq_file.\n";
 
25
close(SEQFILE);
 
26
 
 
27
open SEQFILE, "$seq_revcom_file" or die $!;
 
28
$seq_revcom = <SEQFILE>;
 
29
print length($seq) . " bases are read from $seq_revcom_file.\n";
 
30
close(SEQFILE);
 
31
 
 
32
# Verify the correctness of .cand file
 
33
open CANDFILE, "$cand_file" or die $!;
 
34
while($line = <CANDFILE>) {
 
35
        chomp($line);
 
36
        # The format of each line in the .cand file is: 
 
37
        # read1_name   read2_name   direction   start_position_in_read1   start_position_in_read2
 
38
        if($line =~ /(\S+)\s+(\S+)\s+(1|-1)\s+(\d+)\s+(\d+)/) {
 
39
                $count++;
 
40
        
 
41
                # "read_name" denotes the position of this read in its original
 
42
                # sequence. Since we have two original sequence here, in order to
 
43
                # locate the reads from the correct original sequence, we added the
 
44
                # prefix "rc" for reads from one of the sequence.
 
45
                if(substr($1, 0, 2) ne "rc") {
 
46
                        $kmer1 = substr $seq, $1+$4, $kmer_size;
 
47
                } else {
 
48
                        $kmer1 = substr $seq_revcom, substr($1, 2)+$4, $kmer_size;
 
49
                }
 
50
 
 
51
                if($3 eq "1") {
 
52
                        # When direction equals 1
 
53
                        if(substr($2, 0, 2) ne "rc") {
 
54
                                $kmer2 = substr $seq, $2+$5, $kmer_size;
 
55
                        } else {
 
56
                                $kmer2 = substr $seq_revcom, substr($2, 2)+$5, $kmer_size;
 
57
                        }
 
58
 
 
59
                        if($kmer1 ne $kmer2) {
 
60
                                print "Incorrect match found at line $count\n";
 
61
                                goto failure;
 
62
                        }
 
63
                } elsif ($3 eq "-1") {
 
64
                        # When direction equals -1
 
65
                        if(substr($2, 0, 2) ne "rc") {
 
66
                                $tmp_read = substr $seq, $2, 100;
 
67
                                $kmer2 = substr $tmp_read, -$5-$kmer_size, $kmer_size;
 
68
                        } else {
 
69
                                $tmp_read = substr $seq_revcom, substr($2,2), 100;
 
70
                                $kmer2 = substr $tmp_read, -$5-$kmer_size, $kmer_size;
 
71
                        }
 
72
 
 
73
                        my $revcom = reverse $kmer2;
 
74
                        $revcom =~ tr/ACGT/TGCA/;
 
75
                        if($kmer1 ne $revcom) {
 
76
                                print "Incorrect match found at line $count\n";
 
77
                                goto failure;
 
78
                        }
 
79
                } else {
 
80
                        print "Invalid direction value at line $count\n";
 
81
                        goto failure;
 
82
                }
 
83
        } elsif ($line eq "EOF") {
 
84
                goto success;
 
85
        } else {
 
86
                $count++;
 
87
                print "Invalid candidate format at line $count\n";
 
88
                goto failure;
 
89
        }
 
90
}       
 
91
 
 
92
success:
 
93
        close(CANDFILE);
 
94
        print "$count candidate pairs have been verified as correct candidates.\n";
 
95
        exit(0);
 
96
 
 
97
failure:
 
98
        close(CANDFILE);
 
99
        exit(1);
 
100