4
# Author Jason Stajich <jason-at-bioperl-dot-org>
8
bp_pairwise_kaks - script to calculate pairwise Ka,Ks for a set of sequences
12
bp_pairwise_kaks.PLS -i t/data/worm_fam_2785.cdna [-f fasta/genbank/embl...] [-msa tcoffee/clustal] [-kaks yn00/codeml]
16
This script will take as input a dataset of cDNA sequences verify
17
that they contain no stop codons, align them in protein space,
18
project the alignment back into cDNA and estimate the Ka
19
(non-synonymous) and Ks (synonymous) substitutions based on the ML
20
method of Yang with the PAML package.
24
* PAML program codeml or yn00
25
* Multiple sequence alignment programs Clustalw OR T-Coffee
27
Often there are specific specific parameters you want to run when you
28
a computing Ka/Ks ratios so consider this script a starting point and
29
do not rely it on for every situation.
35
User feedback is an integral part of the evolution of this and other
36
Bioperl modules. Send your comments and suggestions preferably to
37
the Bioperl mailing list. Your participation is much appreciated.
39
bioperl-l@bioperl.org - General discussion
40
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
44
Report bugs to the Bioperl bug tracking system to help us keep track
45
of the bugs and their resolution. Bug reports can be submitted via the
48
https://redmine.open-bio.org/projects/bioperl/
52
Jason Stajich jason-at-bioperl-dot-org
58
require Bio::Tools::Run::Phylo::PAML::Codeml;
59
require Bio::Tools::Run::Phylo::PAML::Yn00;
61
# Multiple Sequence Alignment programs
62
require Bio::Tools::Run::Alignment::Clustalw;
63
require Bio::Tools::Run::Alignment::TCoffee;
66
die("Must have bioperl-run pkg installed to run this script");
68
# for projecting alignments from protein to R/DNA space
69
use Bio::Align::Utilities qw(aa_to_dna_aln);
71
# for input of the sequence data
75
# for the command line argument parsing
78
my ($aln_prog, $kaks_prog,$format, $output,
79
$cdna,$verbose,$help) = qw(clustalw codeml fasta);
82
'i|input:s' => \$cdna,
83
'o|output:s' => \$output,
84
'f|format:s' => \$format,
85
'msa:s' => \$aln_prog,
86
'kaks:s' => \$kaks_prog,
87
'v|verbose' => \$verbose,
95
$verbose = -1 unless $verbose;
96
my ($aln_factory,$kaks_factory);
97
if( $aln_prog =~ /clus/i ) {
98
$aln_factory = Bio::Tools::Run::Alignment::Clustalw->new(-verbose => $verbose);
99
} elsif( $aln_prog =~ /t\_?cof/i ) {
100
$aln_factory = Bio::Tools::Run::Alignment::TCoffee->new(-verbose => $verbose);
102
warn("Did not provide either 'clustalw' or 'tcoffee' as alignment program names");
105
unless( $aln_factory->executable ) {
106
warn("Could not find the executable for $aln_prog, make sure you have installed it and have either set ".uc($aln_prog)."DIR or it is in your PATH");
111
if( $kaks_prog =~ /yn00/i ) {
112
$kaks_factory = Bio::Tools::Run::Phylo::PAML::Yn00->new(-verbose => $verbose);
113
} elsif( $kaks_prog =~ /codeml/i ) {
114
# change the parameters here if you want to tweak your Codeml running!
115
$kaks_factory = Bio::Tools::Run::Phylo::PAML::Codeml->new
116
(-verbose => $verbose,
117
-params => { 'runmode' => -2,
122
unless ( $kaks_factory->executable ) {
123
warn("Could not find the executable for $kaks_prog, make sure you have installed it and you have defined PAMLDIR or it is in your PATH");
127
unless ( $cdna && -f $cdna && -r $cdna && ! -z $cdna ) {
128
warn("Did not specify a valid cDNA sequence file as input");
132
my $seqin = new Bio::SeqIO(-file => $cdna,
137
while( my $seq = $seqin->next_seq ) {
138
$seqs{$seq->display_id} = $seq;
139
my $protein = $seq->translate();
140
my $pseq = $protein->seq();
143
if( $pseq =~ /\*/ ) {
144
warn("provided a cDNA (".$seq->display_id.") sequence with a stop codon, PAML will choke!");
147
# Tcoffee can't handle '*'
149
$protein->seq($pseq);
150
push @prots, $protein;
153
warn("Need at least 2 cDNA sequences to proceed");
159
open(OUT, ">$output") || die("cannot open output $output for writing");
164
my $aa_aln = $aln_factory->align(\@prots);
165
my $dna_aln = &aa_to_dna_aln($aa_aln, \%seqs);
167
my @each = $dna_aln->each_seq();
170
$kaks_factory->alignment($dna_aln);
172
my ($rc,$parser) = $kaks_factory->run();
174
warn($kaks_factory->error_string,"\n");
177
my $result = $parser->next_result;
179
if ($result->version =~ m/3\.15/) {
180
warn("This script does not work with v3.15 of PAML! Please use 3.14 instead.");
184
my $MLmatrix = $result->get_MLmatrix();
186
my @otus = $result->get_seqs();
190
foreach my $s ( @each ) {
191
last if( $s->display_id eq $_->display_id );
197
print OUT join("\t", qw(SEQ1 SEQ2 Ka Ks Ka/Ks PROT_PERCENTID CDNA_PERCENTID)), "\n";
198
for( my $i = 0; $i < (scalar @otus -1) ; $i++) {
199
for( my $j = $i+1; $j < (scalar @otus); $j++ ) {
200
my $sub_aa_aln = $aa_aln->select_noncont($pos[$i],$pos[$j]);
201
my $sub_dna_aln = $dna_aln->select_noncont($pos[$i],$pos[$j]);
203
$otus[$i]->display_id,
204
$otus[$j]->display_id,$MLmatrix->[$i]->[$j]->{'dN'},
205
$MLmatrix->[$i]->[$j]->{'dS'},
206
$MLmatrix->[$i]->[$j]->{'omega'},
207
sprintf("%.2f",$sub_aa_aln->percentage_identity),
208
sprintf("%.2f",$sub_dna_aln->percentage_identity),