2
# Verify the correctness of the generated "random.cand" according to the
3
# original sequence file "random.seq" and "random_revcom.seq".
7
my $seq_file = "random.seq";
8
my $seq_revcom_file = "random_revcom.seq";
9
my $cand_file = "random.cand";
21
# Read in the two sequences
22
open SEQFILE, "$seq_file" or die $!;
24
print length($seq) . " bases are read from $seq_file.\n";
27
open SEQFILE, "$seq_revcom_file" or die $!;
28
$seq_revcom = <SEQFILE>;
29
print length($seq) . " bases are read from $seq_revcom_file.\n";
32
# Verify the correctness of .cand file
33
open CANDFILE, "$cand_file" or die $!;
34
while($line = <CANDFILE>) {
36
# The format of each line in the .cand file is:
37
# read1_name read2_name direction start_position_in_read1 start_position_in_read2
38
if($line =~ /(\S+)\s+(\S+)\s+(1|-1)\s+(\d+)\s+(\d+)/) {
41
# "read_name" denotes the position of this read in its original
42
# sequence. Since we have two original sequence here, in order to
43
# locate the reads from the correct original sequence, we added the
44
# prefix "rc" for reads from one of the sequence.
45
if(substr($1, 0, 2) ne "rc") {
46
$kmer1 = substr $seq, $1+$4, $kmer_size;
48
$kmer1 = substr $seq_revcom, substr($1, 2)+$4, $kmer_size;
52
# When direction equals 1
53
if(substr($2, 0, 2) ne "rc") {
54
$kmer2 = substr $seq, $2+$5, $kmer_size;
56
$kmer2 = substr $seq_revcom, substr($2, 2)+$5, $kmer_size;
59
if($kmer1 ne $kmer2) {
60
print "Incorrect match found at line $count\n";
63
} elsif ($3 eq "-1") {
64
# When direction equals -1
65
if(substr($2, 0, 2) ne "rc") {
66
$tmp_read = substr $seq, $2, 100;
67
$kmer2 = substr $tmp_read, -$5-$kmer_size, $kmer_size;
69
$tmp_read = substr $seq_revcom, substr($2,2), 100;
70
$kmer2 = substr $tmp_read, -$5-$kmer_size, $kmer_size;
73
my $revcom = reverse $kmer2;
74
$revcom =~ tr/ACGT/TGCA/;
75
if($kmer1 ne $revcom) {
76
print "Incorrect match found at line $count\n";
80
print "Invalid direction value at line $count\n";
83
} elsif ($line eq "EOF") {
87
print "Invalid candidate format at line $count\n";
94
print "$count candidate pairs have been verified as correct candidates.\n";