1
# Copyright 2013 by Zheng Ruan (zruan1991@gmai.com).
3
# This code is part of the Biopython distribution and governed by its
4
# license. Please see the LICENSE file that should have been included
5
# as part of this package.
6
"""Code for dealing with Codon Seq.
8
CodonSeq class is interited from Seq class. This is the core class to
9
deal with sequences in CodonAlignment in biopython.
12
from __future__ import division, print_function
13
from itertools import permutations
16
from Bio.Seq import Seq
17
from Bio.SeqRecord import SeqRecord
18
from Bio.Alphabet import IUPAC, Gapped, HasStopCodon, Alphabet
19
from Bio.Alphabet import generic_dna, _ungap
20
from Bio.Data.CodonTable import generic_by_id
22
from Bio.CodonAlign.CodonAlphabet import default_codon_alphabet, default_codon_table
24
__docformat__ = "epytext en" # Don't just use plain text in epydoc API pages!
27
"""CodonSeq is designed to be within the SeqRecords of a
30
CodonSeq is useful as it allows the user to specify
31
reading frame when translate CodonSeq
33
CodonSeq also accepts codon style slice by calling
36
Important: Ungapped CodonSeq can be any length if you
37
specify the rf_table. Gapped CodonSeq should be a
40
>>> codonseq = CodonSeq("AAATTTGGGCCAAATTT", rf_table=(0,3,6,8,11,14))
41
>>> print(codonseq.translate())
44
test get_full_rf_table method
46
>>> p = CodonSeq('AAATTTCCCGG-TGGGTTTAA', rf_table=(0, 3, 6, 9, 11, 14, 17))
47
>>> full_rf_table = p.get_full_rf_table()
48
>>> print(full_rf_table)
49
[0, 3, 6, 9, 12, 15, 18]
50
>>> print(p.translate(rf_table=full_rf_table, ungap_seq=False))
52
>>> p = CodonSeq('AAATTTCCCGGGAA-TTTTAA', rf_table=(0, 3, 6, 9, 14, 17))
53
>>> print(p.get_full_rf_table())
54
[0, 3, 6, 9, 12.0, 15, 18]
55
>>> p = CodonSeq('AAA------------TAA', rf_table=(0, 3))
56
>>> print(p.get_full_rf_table())
57
[0, 3.0, 6.0, 9.0, 12.0, 15]
60
def __init__(self, data='', alphabet=default_codon_alphabet, \
61
gap_char="-", rf_table=None):
62
# rf_table should be a tuple or list indicating the every
63
# codon position along the sequence. For example:
64
# sequence = 'AAATTTGGGCCAAATTT'
65
# rf_table = (0, 3, 6, 8, 11, 14)
66
# the translated protein sequences will be
67
# AAA TTT GGG GCC AAA TTT
69
# Notice: rf_table applies to ungapped sequence. If there
70
# are gaps in the sequence, they will be discarded. This
71
# feature ensures the rf_table is independent of where the
72
# codon sequence appears in the alignment
74
Seq.__init__(self, data.upper(), alphabet=alphabet)
75
self.gap_char = gap_char
77
# check the length of the alignment to be a triple
79
seq_ungapped = self._data.replace(gap_char, "")
80
assert len(self) % 3 == 0, "Sequence length is not a triple number"
81
self.rf_table = list(filter(lambda x: x%3 == 0,
82
range(len(seq_ungapped))))
84
# Not use Alphabet._verify_alphabet function because it
85
# only works for single alphabet
86
for i in self.rf_table:
87
if self._data[i:i+3] not in alphabet.letters:
88
raise ValueError("Sequence contain undefined letters from"
90
"({0})! ".format(self._data[i:i+3]))
92
#if gap_char in self._data:
93
# assert len(self) % 3 == 0, \
94
# "Gapped sequence length is not a triple number"
95
assert isinstance(rf_table, (tuple, list)), \
96
"rf_table should be a tuple or list object"
97
assert all(isinstance(i, int) for i in rf_table), \
98
"elements in rf_table should be int that specify " \
99
+ "the codon positions of the sequence"
100
seq_ungapped = self._data.replace(gap_char, "")
102
if seq_ungapped[i:i+3] not in alphabet.letters:
103
raise ValueError("Sequence contain undefined letters "
105
"({0})!".format(seq_ungapped[i:i+3]))
106
self.rf_table = rf_table
108
def __getitem__(self, index):
109
# TODO: handle alphabet elegantly
110
return Seq(self._data[index], alphabet=generic_dna)
112
def get_codon(self, index):
113
"""get the `index`-th codon from the self.seq
115
if len(set([i % 3 for i in self.rf_table])) != 1:
116
raise RuntimeError("frameshift detected. "
117
"CodonSeq object is not able to deal "
118
"with codon sequence with frameshift. "
119
"Plase use normal slice option.")
120
if isinstance(index, int):
122
return self._data[index*3:(index+1)*3]
124
return self._data[index*3:]
126
# This slice ensures that codon will always be the unit
127
# in slicing (it won't change to other codon if you are
128
# using reverse slicing such as [::-1]).
129
# The idea of the code below is to first map the slice
130
# to amino acid sequence and then transform it into
132
aa_index = range(len(self)//3)
134
aa_slice = aa_index[p]
137
codon_slice += self._data[i*3:i*3+3]
139
codon_slice = cslice(index)
140
return CodonSeq(codon_slice, alphabet=self.alphabet)
142
def get_codon_num(self):
143
"""Return the number of codons in the CodonSeq"""
144
return len(self.rf_table)
146
def translate(self, codon_table=default_codon_table,
147
stop_symbol="*", rf_table=None, ungap_seq=True):
148
"""Translate the CodonSeq based on the reading frame
149
in rf_table. It is possible for the user to specify
150
a rf_table at this point. If you want to include
151
gaps in the translated sequence, this is the only
152
way. ungap_seq should be set to true for this
156
if ungap_seq is True:
157
tr_seq = self._data.replace(self.gap_char, "")
161
rf_table = self.rf_table
164
if isinstance(i, float):
165
amino_acids.append('-')
167
#elif '---' == tr_seq[i:i+3]:
168
# amino_acids.append('-')
170
elif '-' in tr_seq[i:i+3]:
171
# considering two types of frameshift
172
if p == -1 or p - i == 3:
174
codon = tr_seq[i:i+6].replace('-', '')[:3]
176
codon = tr_seq[i:i+3]
179
# normal condition without gaps
180
codon = tr_seq[i:i+3]
182
if codon in codon_table.stop_codons:
183
amino_acids.append(stop_symbol)
186
amino_acids.append(codon_table.forward_table[codon])
188
raise RuntimeError("Unknown codon detected ({0}). Do you "
189
"forget to speficy ungap_seq "
190
"argument?".format(codon))
191
return "".join(amino_acids)
193
def toSeq(self, alphabet=generic_dna):
194
return Seq(self._data, generic_dna)
196
def get_full_rf_table(self):
197
"""This function returns a full rf_table of the given
198
CodonSeq records. A full rf_table is different from
199
normal rf_table in that it translate gaps in CodonSeq.
200
It is helpful to construct alignment containing
203
ungap_seq = self._data.replace("-", "")
204
codon_lst = [ungap_seq[i:i+3] for i in self.rf_table]
205
relative_pos = [self.rf_table[0]]
206
for i in range(1, len(self.rf_table[1:])+1):
207
relative_pos.append(self.rf_table[i]-self.rf_table[i-1])
210
for i in filter(lambda x: x%3==0, range(len(self._data))):
211
if self._data[i:i+3] == self.gap_char*3:
212
full_rf_table.append(i+0.0)
213
elif relative_pos[codon_num] == 0:
214
full_rf_table.append(i)
216
elif relative_pos[codon_num] in (-1, -2):
217
# check the gap status of previous codon
218
gap_stat = len(self._data[i-3:i].replace("-", ""))
220
full_rf_table.append(i+relative_pos[codon_num])
222
full_rf_table.append(i+1+relative_pos[codon_num])
224
full_rf_table.append(i+2+relative_pos[codon_num])
226
elif relative_pos[codon_num] > 0:
227
full_rf_table.append(i+0.0)
229
this_len = len(self._data[i:i+3].replace("-", ""))
230
relative_pos[codon_num] -= this_len
232
# we probably reached the last codon
236
def full_translate(self, codon_table=default_codon_table, stop_symbol="*"):
237
"""Apply full translation with gaps considered.
239
full_rf_table = self.get_full_rf_table()
240
return self.translate(codon_table=codon_table, stop_symbol=stop_symbol,
241
rf_table=full_rf_table, ungap_seq=False)
243
def ungap(self, gap=None):
244
if hasattr(self.alphabet, "gap_char"):
246
gap = self.alphabet.gap_char
247
elif gap != self.alphabet.gap_char:
248
raise ValueError("Gap %s does not match %s from alphabet"
249
% (repr(gap), repr(self.alphabet.alphabet.gap_char)))
250
alpha = _ungap(self.alphabet)
252
raise ValueError("Gap character not given and not defined in "
255
alpha = self.alphabet # modify!
256
if len(gap) != 1 or not isinstance(gap, str):
257
raise ValueError("Unexpected gap character, %s" % repr(gap))
258
return CodonSeq(str(self._data).replace(gap, ""), alpha,
259
rf_table=self.rf_table)
262
def from_seq(cls, seq, alphabet=default_codon_alphabet, rf_table=None):
264
return cls(seq._data, alphabet=alphabet)
266
return cls(seq._data, alphabet=alphabet, rf_table=rf_table)
269
def _get_codon_list(codonseq):
270
"""get a list of codons according to full_rf_table for counting
273
#if not isinstance(codonseq, CodonSeq):
274
# raise TypeError("_get_codon_list accept a CodonSeq object "
275
# "({0} detected)".format(type(codonseq)))
276
full_rf_table = codonseq.get_full_rf_table()
278
for i, k in enumerate(full_rf_table):
279
if isinstance(k, int):
282
end = int(full_rf_table[i+1])
285
this_codon = str(codonseq[start:end])
286
if len(this_codon) == 3:
287
codon_lst.append(this_codon)
289
codon_lst.append(str(this_codon.ungap()))
290
elif str(codonseq[int(k):int(k)+3]) == "---":
291
codon_lst.append("---")
293
# this may be problematic, as normally no codon shoud
294
# fall into this condition
295
codon_lst.append(codonseq[int(k):int(k)+3])
299
def cal_dn_ds(codon_seq1, codon_seq2, method="NG86",
300
codon_table=default_codon_table, k=1, cfreq=None):
301
"""Function to calculate the dN and dS of the given two CodonSeq
302
or SeqRecord that contain CodonSeq objects.
305
- NG86 - PMID: 3444411
306
- LWL85 - PMID: 3916709
308
- YN00 - PMID: 10666704
311
- w - transition/transvertion ratio
312
- cfreq - Current codon frequency vector can only be specified
313
when you are using ML method. Possible ways of
314
getting cfreq are: F1x4, F3x4 and F61.
316
if all([isinstance(codon_seq1, CodonSeq),
317
isinstance(codon_seq2, CodonSeq)]):
319
elif all([isinstance(codon_seq1, SeqRecord),
320
isinstance(codon_seq2, SeqRecord)]):
321
codon_seq1 = codon_seq1.seq
322
codon_seq2 = codon_seq2.seq
324
raise TypeError("cal_dn_ds accepts two CodonSeq objects or SeqRecord "
325
"that contains CodonSeq as its seq!")
326
if len(codon_seq1.get_full_rf_table()) != \
327
len(codon_seq2.get_full_rf_table()):
328
raise RuntimeError("full_rf_table length of seq1 ({0}) and seq2 ({1}) "
329
"are not the same".format(
330
len(codon_seq1.get_full_rf_table()),
331
len(codon_seq2.get_full_rf_table()))
335
elif cfreq is not None and method != 'ML':
336
raise RuntimeError("cfreq can only be specified when you "
337
"are using ML method")
338
if cfreq not in ('F1x4', 'F3x4', 'F61'):
340
warnings.warn("Unknown cfreq ({0}). Only F1x4, F3x4 and F61 are "
341
"acceptable. Use F3x4 in the following.".format(cfreq))
343
seq1_codon_lst = _get_codon_list(codon_seq1)
344
seq2_codon_lst = _get_codon_list(codon_seq2)
345
# remove gaps in seq_codon_lst
348
for i, j in zip(seq1_codon_lst, seq2_codon_lst):
349
if (not '-' in i) and (not '-' in j):
352
dnds_func = {'ML': _ml, 'NG86': _ng86, 'LWL85': _lwl85, 'YN00': _yn00}
354
return dnds_func[method](seq1, seq2, cfreq, codon_table)
356
return dnds_func[method](seq1, seq2, k, codon_table)
359
#################################################################
360
# private functions for NG86 method
361
#################################################################
363
def _ng86(seq1, seq2, k, codon_table):
364
"""Main function for NG86 method (PRIVATE).
366
S_sites1, N_sites1 = _count_site_NG86(seq1,
367
codon_table=codon_table, k=k)
368
S_sites2, N_sites2 = _count_site_NG86(seq2,
369
codon_table=codon_table, k=k)
370
S_sites = (S_sites1 + S_sites2) / 2.0
371
N_sites = (N_sites1 + N_sites2) / 2.0
373
for i, j in zip(seq1, seq2):
374
SN = [m+n for m,n in zip(SN, _count_diff_NG86(
376
codon_table=codon_table)
382
dS = abs(-3.0/4*log(1-4.0/3*ps))
386
dN = abs(-3.0/4*log(1-4.0/3*pn))
392
def _count_site_NG86(codon_lst, k=1, codon_table=default_codon_table):
393
"""count synonymous and non-synonymous sites of a list of codons
396
- codon_lst - A three letter codon list from a CodonSeq object.
397
This can be returned from _get_codon_list method.
398
- k - transition/transversion rate ratio
400
S_site = 0 # synonymous sites
401
N_site = 0 # non-synonymous sites
403
pyrimidine = ('T', 'C')
404
base_tuple = ('A', 'T', 'C', 'G')
405
for codon in codon_lst:
406
neighbor_codon = {'transition': [], 'transversion': []}
407
# classify neighbor codons
408
codon = codon.replace('U', 'T')
409
if codon == '---': continue
410
for n, i in enumerate(codon):
414
elif i in purine and j in purine:
415
codon_chars = [c for c in codon]
417
this_codon = ''.join(codon_chars)
418
neighbor_codon['transition'].append(this_codon)
419
elif i in pyrimidine and j in pyrimidine:
420
codon_chars = [c for c in codon]
422
this_codon = ''.join(codon_chars)
423
neighbor_codon['transition'].append(this_codon)
425
codon_chars = [c for c in codon]
427
this_codon = ''.join(codon_chars)
428
neighbor_codon['transversion'].append(this_codon)
429
# count synonymous and non-synonymous sites
430
aa = codon_table.forward_table[codon]
431
this_codon_N_site = this_codon_S_site = 0
432
for neighbor in neighbor_codon['transition']:
433
if neighbor in codon_table.stop_codons:
434
this_codon_N_site += 1
435
elif codon_table.forward_table[neighbor] == aa:
436
this_codon_S_site += 1
438
this_codon_N_site += 1
439
for neighbor in neighbor_codon['transversion']:
440
if neighbor in codon_table.stop_codons:
441
this_codon_N_site += k
442
elif codon_table.forward_table[neighbor] == aa:
443
this_codon_S_site += k
445
this_codon_N_site += k
446
norm_const = (this_codon_N_site + this_codon_S_site)/3
447
S_site += this_codon_S_site / norm_const
448
N_site += this_codon_N_site / norm_const
449
return (S_site, N_site)
452
def _count_diff_NG86(codon1, codon2, codon_table=default_codon_table):
453
"""Count differences between two codons (three-letter string).
454
The function will take multiple pathways from codon1 to codon2
455
into account (PRIVATE).
457
if not all([isinstance(codon1, str), isinstance(codon2, str)]):
458
raise TypeError("_count_diff_NG86 accept string object to represent "
459
"codon ({0}, {1} detected)".format(
463
if len(codon1) != 3 or len(codon2) != 3:
464
raise RuntimeError("codon should be three letter string ({0}, {1} "
465
"detected)".format(len(codon1), len(codon2)))
466
SN = [0, 0] # synonymous and nonsynonymous counts
467
if codon1 == '---' or codon2 == '---':
469
base_tuple = ('A', 'C', 'G', 'T')
470
if not all([i in base_tuple for i in codon1]):
471
raise RuntimeError("Unrecognized character detected in codon1 {0} "
472
"(Codon is consist of "
473
"A, T, C or G)".format(codon1))
474
if not all([i in base_tuple for i in codon2]):
475
raise RuntimeError("Unrecognized character detected in codon2 {0} "
476
"(Codon is consist of "
477
"A, T, C or G)".format(codon2))
482
for i, k in enumerate(zip(codon1, codon2)):
485
def compare_codon(codon1, codon2, codon_table=default_codon_table,
487
"""Method to compare two codon accounting for different
491
if len(set(map(codon_table.forward_table.get,
492
[codon1, codon2]))) == 1:
497
if len(diff_pos) == 1:
498
SN = [i+j for i,j in zip(SN,
499
compare_codon(codon1, codon2, codon_table=codon_table))]
500
elif len(diff_pos) == 2:
501
codon2_aa = codon_table.forward_table[codon2]
503
temp_codon = codon1[:i] + codon2[i] + codon1[i+1:]
504
SN = [i+j for i,j in zip(SN, compare_codon(
506
codon_table=codon_table,
509
SN = [i+j for i,j in zip(SN, compare_codon(
511
codon_table=codon_table,
514
elif len(diff_pos) == 3:
515
codon2_aa = codon_table.forward_table[codon2]
516
paths = list(permutations([0, 1, 2], 3))
519
tmp1 = codon1[:p[0]] + codon2[p[0]] + codon1[p[0]+1:]
520
tmp2 = tmp1[:p[1]] + codon2[p[1]] + tmp1[p[1]+1:]
521
tmp_codon.append((tmp1, tmp2))
522
SN = [i+j for i,j in zip(SN, compare_codon(codon1, tmp1,
526
SN = [i+j for i,j in zip(SN, compare_codon(tmp1, tmp2,
530
SN = [i+j for i,j in zip(SN, compare_codon(tmp2, codon2,
537
#################################################################
538
# private functions for LWL85 method
539
#################################################################
541
def _lwl85(seq1, seq2, k, codon_table):
542
"""Main function fo LWL85 method (PRIVATE).
544
# Nomenclature is according to PMID (3916709)
545
codon_fold_dict = _get_codon_fold(codon_table)
546
# count number of sites in different degenerate classes
550
for codon in seq1 + seq2:
551
fold_num = codon_fold_dict[codon]
559
L = [sum(fold0)/2.0, sum(fold2)/2.0, sum(fold4)/2.0]
560
# count number of differences in different degenerate classes
561
PQ = [0] * 6 # with P0, P2, P4, Q0, Q2, Q4 in each position
562
for codon1, codon2 in zip(seq1, seq2):
563
if (codon1 == "---" or codon2 == "---") or codon1 == codon2:
566
PQ = [i+j for i, j in zip(PQ, _diff_codon(
569
fold_dict=codon_fold_dict)
571
PQ = [i/j for i, j in zip(PQ, L*2)]
574
A = [(1./2)*log(1./(1-2*i-j)) - (1./4)*log(1./(1-2*j)) \
575
for i, j in zip(P, Q)]
576
B = [(1./2)*log(1./(1-2*i)) for i in Q]
577
dS = 3*(L[2]*A[1]+L[2]*(A[2]+B[2]))/(L[1]+3*L[2])
578
dN = 3*(L[2]*B[1]+L[0]*(A[0]+B[0]))/(2*L[1]+3*L[0])
582
def _get_codon_fold(codon_table):
583
"""function to classify different position in a codon into
584
different fold (PRIVATE).
586
def find_fold_class(codon, forward_table):
587
base = set(['A', 'T', 'C', 'G'])
589
codon_base_lst = [i for i in codon]
590
for i, b in enumerate(codon_base_lst):
591
other_base = base - set(b)
594
codon_base_lst[i] = j
596
aa.append(forward_table[''.join(codon_base_lst)])
599
if aa.count(forward_table[codon]) == 0:
601
elif aa.count(forward_table[codon]) in (1,2):
603
elif aa.count(forward_table[codon]) == 3:
606
raise RuntimeError("Unknown Error, cannot assign the "
607
"position to a fold")
608
codon_base_lst[i] = b
611
for codon in codon_table.forward_table:
613
fold_table[codon] = find_fold_class(codon,
614
codon_table.forward_table)
615
fold_table["---"] = '---'
619
def _diff_codon(codon1, codon2, fold_dict):
620
"""function to get the differences of two codon and return
621
number of different types substitutions
623
return (P0, P2, P4, Q0, Q2, Q4)
624
Nomenclature is according to PMID (3916709)
626
P0 = P2 = P4 = Q0 = Q2 = Q4 = 0
627
fold_num = fold_dict[codon1]
629
pyrimidine = ('T', 'C')
630
for n, (i, j) in enumerate(zip(codon1, codon2)):
631
if i!= j and (i in purine and j in purine):
632
if fold_num[n] == '0':
634
elif fold_num[n] == '2':
636
elif fold_num[n] == '4':
639
raise RuntimeError("Unexpected fold_num %d" % fold_num[n])
640
if i!= j and (i in pyrimidine and j in pyrimidine):
641
if fold_num[n] == '0':
643
elif fold_num[n] == '2':
645
elif fold_num[n] == '4':
648
raise RuntimeError("Unexpected fold_num %d" % fold_num[n])
649
if i != j and ((i in purine and j in pyrimidine) \
650
or (i in pyrimidine and j in purine)):
651
if fold_num[n] == '0':
653
elif fold_num[n] == '2':
655
elif fold_num[n] == '4':
658
raise RuntimeError("Unexpected fold_num %d" % fold_num[n])
659
return (P0, P2, P4, Q0, Q2, Q4)
662
#################################################################
663
# private functions for YN00 method
664
#################################################################
666
def _yn00(seq1, seq2, k, codon_table):
667
"""Main function for yn00 method (PRIVATE).
669
# nomenclature is according to PMID: 10666704
670
from collections import defaultdict
671
from scipy.linalg import expm
672
fcodon = [{'A': 0, 'G': 0, 'C': 0, 'T': 0},
673
{'A': 0, 'G': 0, 'C': 0, 'T': 0},
674
{'A': 0, 'G': 0, 'C': 0, 'T': 0}]
675
codon_fold_dict = _get_codon_fold(codon_table)
676
fold0_cnt = defaultdict(int)
677
fold4_cnt = defaultdict(int)
678
for codon in seq1 + seq2:
679
# count sites at different codon position
681
fcodon[0][codon[0]] += 1
682
fcodon[1][codon[1]] += 1
683
fcodon[2][codon[2]] += 1
684
# count sites in different degenerate fold class
685
fold_num = codon_fold_dict[codon]
686
for i, f in enumerate(fold_num):
688
fold0_cnt[codon[i]] += 1
690
fold4_cnt[codon[i]] += 1
691
f0_total = sum(fold0_cnt.values())
692
f4_total = sum(fold4_cnt.values())
693
for i, j in zip(fold0_cnt, fold4_cnt):
694
fold0_cnt[i] = fold0_cnt[i]/f0_total
695
fold4_cnt[i] = fold4_cnt[i]/f4_total
697
# the initial kappa is different from what yn00 gives,
698
# try to find the problem.
699
TV = _get_TV(seq1, seq2, codon_table=codon_table)
700
k04 = (_get_kappa_t(fold0_cnt, TV), _get_kappa_t(fold4_cnt, TV))
701
kappa = (f0_total*k04[0]+f4_total*k04[1])/(f0_total+f4_total)
703
# count synonymous sites and non-synonymous sites
705
tot = sum(fcodon[i].values())
706
fcodon[i] = dict((j, k/tot) for j, k in fcodon[i].items())
707
pi = defaultdict(int)
708
for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons:
711
for i in seq1 + seq2:
713
S_sites1, N_sites1, bfreqSN1 = _count_site_YN00(seq1, seq2, pi,
715
codon_table=codon_table)
716
S_sites2, N_sites2, bfreqSN2 = _count_site_YN00(seq2, seq1, pi,
718
codon_table=codon_table)
719
N_sites = (N_sites1+N_sites2)/2
720
S_sites = (S_sites1+S_sites2)/2
721
bfreqSN = [{'A': 0, 'T': 0, 'C': 0, 'G': 0},
722
{'A': 0, 'T': 0, 'C': 0, 'G': 0}]
724
for b in ('A', 'T', 'C', 'G'):
725
bfreqSN[i][b] = (bfreqSN1[i][b]+bfreqSN2[i][b])/2
726
# use NG86 method to get initial t and w
728
for i, j in zip(seq1, seq2):
729
SN = [m+n for m, n in zip(SN, _count_diff_NG86(
731
codon_table=codon_table)
736
p = sum(SN) / (S_sites+N_sites)
737
w = log(1-4.0/3*pn) / log(1-4.0/3*ps)
738
t = -3/4*log(1-4/3*p)
741
for temp in range(20):
742
# count synonymous and nonsynonymous differences under kappa, w, t
743
codon_lst = [i for i in \
744
list(codon_table.forward_table.keys()) + \
745
codon_table.stop_codons if 'U' not in i]
746
Q = _get_Q(pi, kappa, w, codon_lst, codon_table)
748
TV = [0, 0, 0, 0] # synonymous/nonsynonymous transition/transvertion
751
for i, j in zip(seq1, seq2):
752
if i != '---' and j != '---':
753
codon_npath.setdefault((i, j), 0)
754
codon_npath[(i, j)] += 1
755
for i in codon_npath:
756
tv = _count_diff_YN00(i[0], i[1], P, codon_lst, codon_table)
757
TV = [m+n*codon_npath[i] for m,n in zip(TV, tv)]
758
TV = (TV[0]/S_sites, TV[1]/S_sites), (TV[2]/N_sites, TV[3]/N_sites)
759
# according to the DistanceF84() function of yn00.c in paml,
760
# the t (e.q. 10) appears in PMID: 10666704 is dS and dN
762
for f, tv in zip(bfreqSN, TV):
763
dSdN.append(_get_kappa_t(f, tv, t=True))
764
t = dSdN[0]*3*S_sites/(S_sites+N_sites)+dSdN[1]*3*N_sites/(S_sites+N_sites)
766
if all(map(lambda x: x<tolerance, [abs(i-j) for i,j in zip(dSdN, dSdN_pre)])):
767
return dSdN[1], dSdN[0] # dN, dS
771
def _get_TV(codon_lst1, codon_lst2, codon_table=default_codon_table):
774
- T - proportions of transitional differences
775
- V - proportions of transversional differences
778
pyrimidine = ('C', 'T')
781
for codon1, codon2 in zip(codon_lst1, codon_lst2):
782
if "---" not in (codon1, codon2):
783
for i, j in zip(codon1, codon2):
786
elif i in purine and j in purine:
788
elif i in pyrimidine and j in pyrimidine:
793
return (TV[0]/sites, TV[1]/sites)
794
#return (TV[0], TV[1])
797
def _get_kappa_t(pi, TV, t=False):
798
"""The following formula and variable names are according to
801
pi['Y'] = pi['T'] + pi['C']
802
pi['R'] = pi['A'] + pi['G']
803
A = (2*(pi['T']*pi['C']+pi['A']*pi['G'])+\
804
2*(pi['T']*pi['C']*pi['R']/pi['Y']+pi['A']*pi['G']*pi['Y']/pi['R'])*\
805
(1-TV[1]/(2*pi['Y']*pi['R']))-TV[0])/\
806
(2*(pi['T']*pi['C']/pi['Y']+pi['A']*pi['G']/pi['R']))
807
B = 1 - TV[1]/(2*pi['Y']*pi['R'])
808
a = -0.5*log(A) # this seems to be an error in YANG's original paper
812
kappaHKY85 = 1+(pi['T']*pi['C']/pi['Y']+pi['A']*pi['G']/pi['R'])*\
813
kappaF84/(pi['T']*pi['C']+pi['A']*pi['G'])
816
t = (4*pi['T']*pi['C']*(1+kappaF84/pi['Y'])+\
817
4*pi['A']*pi['G']*(1+kappaF84/pi['R'])+4*pi['Y']*pi['R'])*b
821
def _count_site_YN00(codon_lst1, codon_lst2, pi, k,
822
codon_table=default_codon_table):
823
"""Site counting method from Ina 1995, PMID: 7699723 and modified
824
by Yang, PMID: 10666704. The method will return the total number of
825
synonymous and nonsynonymous sites and base frequencies in each
826
category. The function is equivalent to CountSites() function in
829
if len(codon_lst1) != len(codon_lst2):
830
raise RuntimeError("Length of two codon_lst should be the same "
831
"(%d and %d detected)".format(
836
length = len(codon_lst1)
838
pyrimidine = ('T', 'C')
839
base_tuple = ('A', 'T', 'C', 'G')
840
codon_dict = codon_table.forward_table
841
stop = codon_table.stop_codons
843
for i, j in zip(codon_lst1, codon_lst2):
844
if i != '---' and j != '---':
845
codon_npath.setdefault((i, j), 0)
846
codon_npath[(i, j)] += 1
847
S_sites = N_sites = 0
848
freqSN = [{'A': 0, 'T': 0, 'C': 0, 'G': 0}, # synonymous
849
{'A': 0, 'T': 0, 'C': 0, 'G': 0}] # nonsynonymous
850
for codon_pair, npath in codon_npath.items():
851
codon = codon_pair[0]
854
for base in base_tuple:
855
if codon[pos] == base: continue
856
neighbor_codon = codon[:pos] + base + codon[pos+1:]
857
if neighbor_codon in stop: continue
858
weight = pi[neighbor_codon]
859
if codon[pos] in pyrimidine and base in pyrimidine:
861
elif codon[pos] in purine and base in purine:
863
if codon_dict[codon] == codon_dict[neighbor_codon]:
865
freqSN[0][base] += weight*npath
868
freqSN[1][base] += weight*npath
871
norm_const = 3*length/(S_sites+N_sites)
872
S_sites *= norm_const
873
N_sites *= norm_const
875
norm_const = sum(i.values())
878
return S_sites, N_sites, freqSN
881
def _count_diff_YN00(codon1, codon2, P, codon_lst,
882
codon_table=default_codon_table):
883
"""Count differences between two codons (three-letter string).
884
The function will weighted multiple pathways from codon1 to codon2
885
according to P matrix of codon substitution. The proportion
886
of transition and transvertion (TV) will also be calculated in
887
the function (PRIVATE).
889
if not all([isinstance(codon1, str), isinstance(codon2, str)]):
890
raise TypeError("_count_diff_YN00 accept string object to represent "
891
"codon ({0}, {1} detected)".format(
895
if len(codon1) != 3 or len(codon2) != 3:
896
raise RuntimeError("codon should be three letter string ({0}, {1} "
897
"detected)".format(len(codon1), len(codon2)))
898
TV = [0, 0, 0, 0] # transition and transvertion counts (synonymous and nonsynonymous)
900
if codon1 == '---' or codon2 == '---':
902
base_tuple = ('A', 'C', 'G', 'T')
903
if not all([i in base_tuple for i in codon1]):
904
raise RuntimeError("Unrecognized character detected in codon1 {0} "
905
"(Codon is consist of "
906
"A, T, C or G)".format(codon1))
907
if not all([i in base_tuple for i in codon2]):
908
raise RuntimeError("Unrecognized character detected in codon2 {0} "
909
"(Codon is consist of "
910
"A, T, C or G)".format(codon2))
915
for i, k in enumerate(zip(codon1, codon2)):
918
def count_TV(codon1, codon2, diff, codon_table, weight=1):
920
pyrimidine = ('T', 'C')
921
dic = codon_table.forward_table
922
stop = codon_table.stop_codons
923
if codon1 in stop or codon2 in stop:
924
# stop codon is always considered as nonsynonymous
925
if codon1[diff] in purine and codon2[diff] in purine:
926
return [0, 0, weight, 0]
927
elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine:
928
return [0, 0, weight, 0]
930
return [0, 0, 0, weight]
931
elif dic[codon1] == dic[codon2]:
932
if codon1[diff] in purine and codon2[diff] in purine:
933
return [weight, 0, 0, 0]
934
elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine:
935
return [weight, 0, 0, 0]
937
return [0, weight, 0, 0]
939
if codon1[diff] in purine and codon2[diff] in purine:
940
return [0, 0, weight, 0]
941
elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine:
942
return [0, 0, weight, 0]
944
return [0, 0, 0, weight]
945
if len(diff_pos) == 1:
947
TV = [p+q for p,q in zip(TV,count_TV(codon1, codon2, diff_pos[0], codon_table))]
948
elif len(diff_pos) == 2:
949
codon2_aa = codon_table.forward_table[codon2]
950
tmp_codon = [codon1[:i] + codon2[i] + codon1[i+1:] \
954
codon_idx = list(map(codon_lst.index, [codon1, i, codon2]))
955
prob = (P[codon_idx[0], codon_idx[1]],
956
P[codon_idx[1], codon_idx[2]])
957
path_prob.append(prob[0]*prob[1])
958
path_prob = [2*i/sum(path_prob) for i in path_prob]
959
for n, i in enumerate(diff_pos):
960
temp_codon = codon1[:i] + codon2[i] + codon1[i+1:]
961
TV = [p+q for p,q in zip(TV,count_TV(codon1, temp_codon, i,
963
weight=path_prob[n]/2))
965
TV = [p+q for p,q in zip(TV,count_TV(codon1, temp_codon, i,
967
weight=path_prob[n]/2))
969
elif len(diff_pos) == 3:
970
codon2_aa = codon_table.forward_table[codon2]
971
paths = list(permutations([0, 1, 2], 3))
975
tmp1 = codon1[:p[0]] + codon2[p[0]] + codon1[p[0]+1:]
976
tmp2 = tmp1[:p[1]] + codon2[p[1]] + tmp1[p[1]+1:]
977
tmp_codon.append((tmp1, tmp2))
978
codon_idx = list(map(codon_lst.index, [codon1, tmp1, tmp2, codon2]))
979
prob = (P[codon_idx[0], codon_idx[1]],
980
P[codon_idx[1], codon_idx[2]],
981
P[codon_idx[2], codon_idx[3]])
982
path_prob.append(prob[0]*prob[1]*prob[2])
983
path_prob = [3*i/sum(path_prob) for i in path_prob]
984
for i, j, k in zip(tmp_codon, path_prob, paths):
985
TV = [p+q for p,q in zip(TV, count_TV(codon1, i[0], k[0],
986
codon_table, weight=j/3))
988
TV = [p+q for p,q in zip(TV, count_TV(i[0], i[1], k[1],
989
codon_table, weight=j/3))
991
TV = [p+q for p,q in zip(TV, count_TV(i[1], codon2, k[1],
992
codon_table, weight=j/3))
994
if codon1 in codon_table.stop_codons or codon2 in codon_table.stop_codons:
996
elif codon_table.forward_table[codon1] == codon_table.forward_table[codon2]:
1003
#################################################################
1004
# private functions for Maximum Likelihood method
1005
#################################################################
1007
def _ml(seq1, seq2, cmethod, codon_table):
1008
"""Main function for ML method (PRIVATE).
1010
from collections import Counter
1011
from scipy.optimize import minimize
1012
codon_cnt = Counter()
1013
pi = _get_pi(seq1, seq2, cmethod, codon_table=codon_table)
1014
for i, j in zip(seq1, seq2):
1015
#if i != j and ('---' not in (i, j)):
1016
if '---' not in (i, j):
1017
codon_cnt[(i,j)] += 1
1018
codon_lst = [i for i in \
1019
list(codon_table.forward_table.keys()) + codon_table.stop_codons \
1021
# apply optimization
1022
def func(params, pi=pi, codon_cnt=codon_cnt, codon_lst=codon_lst,
1023
codon_table=codon_table):
1024
"""params = [t, k, w]"""
1025
return -_likelihood_func(
1026
params[0], params[1], params[2], pi,
1027
codon_cnt, codon_lst=codon_lst,
1028
codon_table=codon_table)
1030
opt_res = minimize(func, [1, 0.1, 2], method='L-BFGS-B', \
1031
bounds=((1e-10, 20), (1e-10, 20), (1e-10, 10)),
1034
Q = _get_Q(pi, k, w, codon_lst, codon_table)
1036
for i, c1 in enumerate(codon_lst):
1037
for j, c2 in enumerate(codon_lst):
1040
if codon_table.forward_table[c1] == \
1041
codon_table.forward_table[c2]:
1043
Sd += pi[c1] * Q[i, j]
1045
# nonsynonymous count
1046
Nd += pi[c1] * Q[i, j]
1048
# This is probably due to stop codons
1052
# count differences (with w fixed to 1)
1053
opt_res = minimize(func, [1, 0.1, 2], method='L-BFGS-B',
1054
bounds=((1e-10, 20), (1e-10, 20), (1, 1)),
1057
Q = _get_Q(pi, k, w, codon_lst, codon_table)
1059
for i, c1 in enumerate(codon_lst):
1060
for j, c2 in enumerate(codon_lst):
1063
if codon_table.forward_table[c1] == \
1064
codon_table.forward_table[c2]:
1066
rhoS += pi[c1] * Q[i, j]
1068
# nonsynonymous count
1069
rhoN += pi[c1] * Q[i, j]
1071
# This is probably due to stop codons
1080
def _get_pi(seq1, seq2, cmethod, codon_table=default_codon_table):
1081
"""Obtain codon frequency dict (pi) from two codon list (PRIVATE).
1082
This function is designed for ML method. Available counting methods
1083
(cfreq) are F1x4, F3x4 and F64.
1086
# Stop codon should not be allowed according to Yang.
1087
# Try to modify this!
1089
if cmethod == 'F1x4':
1090
fcodon = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
1091
for i in seq1 + seq2:
1093
for c in i: fcodon[c] += 1
1094
tot = sum(fcodon.values())
1095
fcodon = dict((j, k/tot) for j, k in fcodon.items())
1096
for i in codon_table.forward_table.keys() + codon_table.stop_codons:
1098
pi[i] = fcodon[i[0]]*fcodon[i[1]]*fcodon[i[2]]
1099
elif cmethod == 'F3x4':
1100
# three codon position
1101
fcodon = [{'A': 0, 'G': 0, 'C': 0, 'T': 0},
1102
{'A': 0, 'G': 0, 'C': 0, 'T': 0},
1103
{'A': 0, 'G': 0, 'C': 0, 'T': 0}]
1104
for i in seq1 + seq2:
1106
fcodon[0][i[0]] += 1
1107
fcodon[1][i[1]] += 1
1108
fcodon[2][i[2]] += 1
1110
tot = sum(fcodon[i].values())
1111
fcodon[i] = dict((j, k/tot) for j, k in fcodon[i].items())
1112
for i in list(codon_table.forward_table.keys()) + \
1113
codon_table.stop_codons:
1115
pi[i] = fcodon[0][i[0]]*fcodon[1][i[1]]*fcodon[2][i[2]]
1116
elif cmethod == 'F61':
1117
for i in codon_table.forward_table.keys() + codon_table.stop_codons:
1120
for i in seq1 + seq2:
1121
if i != '---': pi[i] += 1
1122
tot = sum(pi.values())
1123
pi = dict((j, k/tot) for j, k in pi.items())
1127
def _q(i, j, pi, k, w, codon_table=default_codon_table):
1128
"""Q matrix for codon substitution.
1131
- i, j : three letter codon string
1132
- pi : expected codon frequency
1133
- k : transition/transversion ratio
1134
- w : nonsynonymous/synonymous rate ratio
1135
- codon_table: Bio.Data.CodonTable object
1138
# diagonal elements is the sum of all other elements
1140
if i in codon_table.stop_codons or j in codon_table.stop_codons:
1142
if (i not in pi) or (j not in pi):
1145
pyrimidine = ('T', 'C')
1147
for n, (c1, c2) in enumerate(zip(i, j)):
1149
diff.append((n, c1, c2))
1152
if codon_table.forward_table[i] == codon_table.forward_table[j]:
1153
# synonymous substitution
1154
if diff[0][1] in purine and diff[0][2] in purine:
1157
elif diff[0][1] in pyrimidine and diff[0][2] in pyrimidine:
1164
# nonsynonymous substitution
1165
if diff[0][1] in purine and diff[0][2] in purine:
1168
elif diff[0][1] in pyrimidine and diff[0][2] in pyrimidine:
1176
def _get_Q(pi, k, w, codon_lst, codon_table):
1177
"""Q matrix for codon substitution"""
1179
codon_num = len(codon_lst)
1180
Q = np.zeros((codon_num, codon_num))
1181
for i in range(codon_num):
1182
for j in range(codon_num):
1184
Q[i, j] = _q(codon_lst[i], codon_lst[j], pi, k, w,
1185
codon_table=codon_table)
1186
nucl_substitutions = 0
1187
for i in range(codon_num):
1188
Q[i,i] = -sum(Q[i,:])
1190
nucl_substitutions += pi[codon_lst[i]] * (-Q[i, i])
1193
Q = Q / nucl_substitutions
1197
def _likelihood_func(t, k, w, pi, codon_cnt, codon_lst, codon_table):
1198
"""likelihood function for ML method
1200
from scipy.linalg import expm
1201
Q = _get_Q(pi, k, w, codon_lst, codon_table)
1203
l = 0 # likelihood value
1204
for i, c1 in enumerate(codon_lst):
1205
for j, c2 in enumerate(codon_lst):
1206
if (c1, c2) in codon_cnt:
1207
if P[i, j] * pi[c1] <= 0:
1208
l += codon_cnt[(c1, c2)]*0
1210
l += codon_cnt[(c1, c2)]*log(pi[c1]*P[i, j])
1214
if __name__ == "__main__":
1215
from Bio._utils import run_doctest