~ubuntu-branches/debian/sid/python-biopython/sid

« back to all changes in this revision

Viewing changes to Bio/CodonAlign/CodonSeq.py

Committer: Package Import Robot
Author(s): Andreas Tille
Date: 2014-06-08 10:25:35 UTC
mfrom: (1.1.22)
Revision ID: package-import@ubuntu.com-20140608102535-ebgnhcpcrum3i3w8

Tags: 1.64+dfsg-1

http://bugs.debian.org/750845

* New upstream version
* Build-Depends raxml only on those architectures where it is available
Closes: #750845

files added:
Bio/CodonAlign

Bio/CodonAlign/CodonAlignment.py

Bio/CodonAlign/CodonAlphabet.py

Bio/CodonAlign/CodonSeq.py

Bio/CodonAlign/__init__.py

Bio/CodonAlign/chisq.py

Bio/Entrez/DTDs/bookdoc_140101.dtd

Bio/Entrez/DTDs/einfo.dtd

Bio/Entrez/DTDs/esearch.dtd

Bio/Entrez/DTDs/esummary-v1.dtd

Bio/Entrez/DTDs/nlmmedlinecitationset_140101.dtd

Bio/Entrez/DTDs/pubmed_140101.dtd

Bio/Phylo/Consensus.py

Bio/Phylo/TreeConstruction.py

Bio/Sequencing/Applications/_samtools.py

Doc/doc.tex

Doc/examples/opuntia.aln

Doc/examples/opuntia.dnd

Doc/examples/simple.dnd

Tests/Affy

Tests/Affy/affy_v3_example.CEL

Tests/CodonAlign

Tests/CodonAlign/adh.aln

Tests/CodonAlign/drosophilla.fasta

Tests/CodonAlign/egfr_id

Tests/CodonAlign/egfr_nucl.fa

Tests/CodonAlign/egfr_pro.aln

Tests/CodonAlign/nucl1.fa

Tests/CodonAlign/nucl2.fa

Tests/CodonAlign/nucl3.fa

Tests/CodonAlign/nucl4.fa

Tests/CodonAlign/nucl5.fa

Tests/CodonAlign/pro1.aln

Tests/CodonAlign/pro2.aln

Tests/CodonAlign/pro3.aln

Tests/CodonAlign/pro4.aln

Tests/CodonAlign/pro5.aln

Tests/Exonerate/exn_22_m_dna2protein.exn

Tests/GenBank/1MRR_A.gp

Tests/Hmmer/domtab_31b1_hmmscan_001.out

Tests/Hmmer/domtab_31b1_hmmsearch_001.out

Tests/Hmmer/tab_31b1_hmmscan_001.out

Tests/Hmmer/tab_31b1_hmmsearch_001.out

Tests/Hmmer/text_31b1_hmmscan_001.out

Tests/Hmmer/text_31b1_hmmsearch_001.out

Tests/Nexus/ByteOrderMarkFile.nwk

Tests/Quality/zero_length.fastq

Tests/SamBam/bam1.bam

Tests/SamBam/bam2.bam

Tests/SamBam/sam1.sam

Tests/SamBam/sam2.sam

Tests/TreeConstruction

Tests/TreeConstruction/adam_refs.tre

Tests/TreeConstruction/consensus_refs.tre

Tests/TreeConstruction/majority_ref.tre

Tests/TreeConstruction/msa.phy

Tests/TreeConstruction/neighbor_trees.tre

Tests/TreeConstruction/nj.tre

Tests/TreeConstruction/pars3.tre

Tests/TreeConstruction/strict_refs.tre

Tests/TreeConstruction/test.log

Tests/TreeConstruction/trees.tre

Tests/TreeConstruction/upgma.tre

Tests/test_BioSQL_mysql_connector.py

Tests/test_CelFile.py

Tests/test_CodonAlign.py

Tests/test_Consensus.py

Tests/test_PopGen_FastSimCoal.py

Tests/test_TreeConstruction.py

Tests/test_samtools_tool.py

files removed:
.pc/add_missing_test_Fasttree_tool.patch

.pc/add_missing_test_Fasttree_tool.patch/Tests

.pc/add_missing_test_Fasttree_tool.patch/Tests/output

.pc/add_missing_test_Fasttree_tool.patch/Tests/output/test_Fasttree_tool

.pc/biopython-from-pil-import-image.patch

.pc/biopython-from-pil-import-image.patch/Tests

.pc/biopython-from-pil-import-image.patch/Tests/test_GenomeDiagram.py

.pc/biopython-from-pil-import-image.patch/Tests/test_GraphicsBitmaps.py

.pc/fix_cpairwise2_import_issue.patch

.pc/fix_cpairwise2_import_issue.patch/Bio

.pc/fix_cpairwise2_import_issue.patch/Bio/pairwise2.py

.pc/fix_test_emboss.patch

.pc/fix_test_emboss.patch/Tests

.pc/fix_test_emboss.patch/Tests/test_Emboss.py

.pc/use_upstream_patch_for_bwa_test.patch

.pc/use_upstream_patch_for_bwa_test.patch/Tests

.pc/use_upstream_patch_for_bwa_test.patch/Tests/test_BWA_tool.py

Bio/Entrez/DTDs/eLink_020511.dtd

Bio/HotRand.py

Bio/Search.py

Bio/UniGene/UniGene.py

Tests/Entrez/elink9.xml

Tests/test_HotRand.py

Tests/test_NCBIStandalone.py

debian/patches/add_missing_test_Fasttree_tool.patch

debian/patches/biopython-from-pil-import-image.patch

debian/patches/fix_cpairwise2_import_issue.patch

debian/patches/fix_test_emboss.patch

debian/patches/use_upstream_patch_for_bwa_test.patch

files modified:
.pc/applied-patches

Bio/Affy/CelFile.py

Bio/Align/AlignInfo.py

Bio/Align/Generic.py

Bio/AlignIO/Interfaces.py

Bio/Application/__init__.py

Bio/Blast/Applications.py

Bio/Blast/NCBIStandalone.py

Bio/Blast/NCBIWWW.py

Bio/Blast/ParseBlastTable.py

Bio/Blast/Record.py

Bio/Cluster/clustermodule.c

Bio/Data/CodonTable.py

Bio/Data/IUPACData.py

Bio/DocSQL.py

Bio/Emboss/Applications.py

Bio/Entrez/DTDs/NCBI_GBSeq.mod.dtd

Bio/Entrez/DTDs/eInfo_020511.dtd

Bio/Entrez/DTDs/eLink_101123.dtd

Bio/Entrez/DTDs/ePost_020511.dtd

Bio/Entrez/DTDs/eSearch_020511.dtd

Bio/Entrez/DTDs/eSummary_041029.dtd

Bio/Entrez/DTDs/egquery.dtd

Bio/Entrez/DTDs/nlmcommon_080101.dtd

Bio/Entrez/DTDs/pubmed_080101.dtd

Bio/Entrez/DTDs/taxon.dtd

Bio/Entrez/Parser.py

Bio/Entrez/__init__.py

Bio/FSSP/FSSPTools.py

Bio/File.py

Bio/GenBank/Scanner.py

Bio/GenBank/__init__.py

Bio/Geo/Record.py

Bio/Graphics/BasicChromosome.py

Bio/Graphics/GenomeDiagram/_Diagram.py

Bio/KDTree/KDTreemodule.c

Bio/Motif/_Motif.py

Bio/Motif/__init__.py

Bio/Motif/_pwm.c

Bio/NMR/NOEtools.py

Bio/NMR/xpktools.py

Bio/PDB/DSSP.py

Bio/PDB/PDBParser.py

Bio/Pathway/Rep/Graph.py

Bio/Pathway/Rep/MultiGraph.py

Bio/Pathway/__init__.py

Bio/Phylo/BaseTree.py

Bio/Phylo/NewickIO.py

Bio/Phylo/NexusIO.py

Bio/PopGen/GenePop/Controller.py

Bio/PopGen/GenePop/LargeFileParser.py

Bio/PopGen/SimCoal/Async.py

Bio/PopGen/SimCoal/Cache.py

Bio/PopGen/SimCoal/Controller.py

Bio/Restriction/PrintFormat.py

Bio/SCOP/Cla.py

Bio/SCOP/Des.py

Bio/SCOP/Dom.py

Bio/SCOP/Raf.py

Bio/SCOP/Residues.py

Bio/SCOP/__init__.py

Bio/SearchIO/BlastIO/__init__.py

Bio/SearchIO/BlastIO/blast_text.py

Bio/SearchIO/BlastIO/blast_xml.py

Bio/SearchIO/BlatIO.py

Bio/SearchIO/ExonerateIO/__init__.py

Bio/SearchIO/ExonerateIO/_base.py

Bio/SearchIO/ExonerateIO/exonerate_text.py

Bio/SearchIO/FastaIO.py

Bio/SearchIO/HmmerIO/__init__.py

Bio/SearchIO/HmmerIO/hmmer2_text.py

Bio/SearchIO/HmmerIO/hmmer3_domtab.py

Bio/SearchIO/HmmerIO/hmmer3_tab.py

Bio/SearchIO/HmmerIO/hmmer3_text.py

Bio/SearchIO/__init__.py

Bio/SearchIO/_model/__init__.py

Bio/SearchIO/_model/hsp.py

Bio/SearchIO/_model/query.py

Bio/Seq.py

Bio/SeqFeature.py

Bio/SeqIO/FastaIO.py

Bio/SeqIO/Interfaces.py

Bio/SeqIO/PhdIO.py

Bio/SeqIO/QualityIO.py

Bio/SeqIO/SeqXmlIO.py

Bio/SeqIO/SffIO.py

Bio/SeqIO/__init__.py

Bio/SeqIO/_index.py

Bio/SeqRecord.py

Bio/SeqUtils/__init__.py

Bio/Sequencing/Applications/__init__.py

Bio/TogoWS/__init__.py

Bio/__init__.py

Bio/_py3k/__init__.py

Bio/bgzf.py

Bio/motifs/__init__.py

Bio/motifs/_pwm.c

Bio/motifs/jaspar/__init__.py

Bio/motifs/meme.py

Bio/triemodule.c

BioSQL/BioSeq.py

BioSQL/BioSeqDatabase.py

CONTRIB

DEPRECATED

Doc/Tutorial.html

Doc/Tutorial.pdf

Doc/Tutorial.tex

Doc/biopdb_faq.pdf

Doc/cookbook/motif/motif.tex

Doc/install/Installation.html

Doc/install/Installation.pdf

MANIFEST.in

NEWS

PKG-INFO

README

Tests/Entrez/elink1.xml

Tests/Entrez/elink2.xml

Tests/Entrez/elink3.xml

Tests/Entrez/elink4.xml

Tests/Entrez/elink5.xml

Tests/Entrez/elink6.xml

Tests/Entrez/elink7.xml

Tests/Entrez/elink8.xml

Tests/Exonerate/README.txt

Tests/Hmmer/README.txt

Tests/PopGen/simple.par

Tests/common_BioSQL.py

Tests/output/test_Fasttree_tool

Tests/output/test_GenBank

Tests/output/test_SeqIO

Tests/run_tests.py

Tests/search_tests_common.py

Tests/test_Ace.py

Tests/test_AlignIO.py

Tests/test_AlignIO_FastaIO.py

Tests/test_AlignIO_convert.py

Tests/test_BWA_tool.py

Tests/test_BioSQL_sqlite3.py

Tests/test_ClustalOmega_tool.py

Tests/test_Clustalw_tool.py

Tests/test_Cluster.py

Tests/test_ColorSpiral.py

Tests/test_Emboss.py

Tests/test_Entrez.py

Tests/test_Entrez_online.py

Tests/test_Fasttree_tool.py

Tests/test_GenBank.py

Tests/test_GenomeDiagram.py

Tests/test_GraphicsBitmaps.py

Tests/test_GraphicsChromosome.py

Tests/test_MSAProbs_tool.py

Tests/test_Mafft_tool.py

Tests/test_MarkovModel.py

Tests/test_Motif.py

Tests/test_Muscle_tool.py

Tests/test_NCBI_BLAST_tools.py

Tests/test_Nexus.py

Tests/test_PDB.py

Tests/test_Phylo.py

Tests/test_Phylo_CDAO.py

Tests/test_PopGen_DFDist.py

Tests/test_PopGen_FDist.py

Tests/test_PopGen_GenePop.py

Tests/test_PopGen_SimCoal_nodepend.py

Tests/test_ProtParam.py

Tests/test_SearchIO_blast_tab.py

Tests/test_SearchIO_blast_text.py

Tests/test_SearchIO_blast_xml.py

Tests/test_SearchIO_blat_psl.py

Tests/test_SearchIO_exonerate.py

Tests/test_SearchIO_fasta_m10.py

Tests/test_SearchIO_hmmer2_text.py

Tests/test_SearchIO_hmmer3_domtab.py

Tests/test_SearchIO_hmmer3_tab.py

Tests/test_SearchIO_hmmer3_text.py

Tests/test_SearchIO_model.py

Tests/test_SearchIO_write.py

Tests/test_SeqIO.py

Tests/test_SeqIO_PdbIO.py

Tests/test_SeqIO_QualityIO.py

Tests/test_SeqIO_SeqXML.py

Tests/test_SeqIO_convert.py

Tests/test_SeqIO_features.py

Tests/test_SeqIO_index.py

Tests/test_SeqUtils.py

Tests/test_Seq_objs.py

Tests/test_SubsMat.py

Tests/test_Tutorial.py

Tests/test_bgzf.py

Tests/test_prodoc.py

Tests/test_py3k.py

Tests/test_trie.py

debian/changelog

debian/control

debian/patches/remove_mathml-qname.patch

debian/patches/series

debian/rules

setup.py

Show diffs side-by-side

added added

removed removed

Bio/CodonAlign/CodonSeq.py

# This code is part of the Biopython distribution and governed by its

# license. Please see the LICENSE file that should have been included

# as part of this package.

"""Code for dealing with Codon Seq.

CodonSeq class is interited from Seq class. This is the core class to

deal with sequences in CodonAlignment in biopython.

"""

from __future__ import division, print_function

from itertools import permutations

from math import log

from Bio.Seq import Seq

from Bio.SeqRecord import SeqRecord

from Bio.Alphabet import IUPAC, Gapped, HasStopCodon, Alphabet

from Bio.Alphabet import generic_dna, _ungap

from Bio.Data.CodonTable import generic_by_id

from Bio.CodonAlign.CodonAlphabet import default_codon_alphabet, default_codon_table

__docformat__ = "epytext en" # Don't just use plain text in epydoc API pages!

class CodonSeq(Seq):

"""CodonSeq is designed to be within the SeqRecords of a

CodonAlignment class.

CodonSeq is useful as it allows the user to specify

reading frame when translate CodonSeq

CodonSeq also accepts codon style slice by calling

get_codon() method.

Important: Ungapped CodonSeq can be any length if you

specify the rf_table. Gapped CodonSeq should be a

multiple of three.

>>> codonseq = CodonSeq("AAATTTGGGCCAAATTT", rf_table=(0,3,6,8,11,14))

>>> print(codonseq.translate())

KFGAKF

test get_full_rf_table method

>>> p = CodonSeq('AAATTTCCCGG-TGGGTTTAA', rf_table=(0, 3, 6, 9, 11, 14, 17))

>>> full_rf_table = p.get_full_rf_table()

>>> print(full_rf_table)

[0, 3, 6, 9, 12, 15, 18]

>>> print(p.translate(rf_table=full_rf_table, ungap_seq=False))

KFPPWV*

>>> p = CodonSeq('AAATTTCCCGGGAA-TTTTAA', rf_table=(0, 3, 6, 9, 14, 17))

>>> print(p.get_full_rf_table())

[0, 3, 6, 9, 12.0, 15, 18]

>>> p = CodonSeq('AAA------------TAA', rf_table=(0, 3))

>>> print(p.get_full_rf_table())

[0, 3.0, 6.0, 9.0, 12.0, 15]

"""

def __init__(self, data='', alphabet=default_codon_alphabet, \

gap_char="-", rf_table=None):

# rf_table should be a tuple or list indicating the every

# codon position along the sequence. For example:

# sequence = 'AAATTTGGGCCAAATTT'

# rf_table = (0, 3, 6, 8, 11, 14)

# the translated protein sequences will be

# AAA TTT GGG GCC AAA TTT

# K F G A K F

# Notice: rf_table applies to ungapped sequence. If there

# are gaps in the sequence, they will be discarded. This

# feature ensures the rf_table is independent of where the

# codon sequence appears in the alignment

Seq.__init__(self, data.upper(), alphabet=alphabet)

self.gap_char = gap_char

# check the length of the alignment to be a triple

if rf_table is None:

seq_ungapped = self._data.replace(gap_char, "")

assert len(self) % 3 == 0, "Sequence length is not a triple number"

self.rf_table = list(filter(lambda x: x%3 == 0,

range(len(seq_ungapped))))

# check alphabet

# Not use Alphabet._verify_alphabet function because it

# only works for single alphabet

for i in self.rf_table:

if self._data[i:i+3] not in alphabet.letters:

raise ValueError("Sequence contain undefined letters from"

" alphabet "

"({0})! ".format(self._data[i:i+3]))

else:

#if gap_char in self._data:

# assert len(self) % 3 == 0, \

# "Gapped sequence length is not a triple number"

assert isinstance(rf_table, (tuple, list)), \

"rf_table should be a tuple or list object"

assert all(isinstance(i, int) for i in rf_table), \

"elements in rf_table should be int that specify " \

+ "the codon positions of the sequence"

100

seq_ungapped = self._data.replace(gap_char, "")

101

for i in rf_table:

102

if seq_ungapped[i:i+3] not in alphabet.letters:

103

raise ValueError("Sequence contain undefined letters "

104

"from alphabet "

105

"({0})!".format(seq_ungapped[i:i+3]))

106

self.rf_table = rf_table

107

108

def __getitem__(self, index):

109

# TODO: handle alphabet elegantly

110

return Seq(self._data[index], alphabet=generic_dna)

111

112

def get_codon(self, index):

113

"""get the `index`-th codon from the self.seq

114

"""

115

if len(set([i % 3 for i in self.rf_table])) != 1:

116

raise RuntimeError("frameshift detected. "

117

"CodonSeq object is not able to deal "

118

"with codon sequence with frameshift. "

119

"Plase use normal slice option.")

120

if isinstance(index, int):

121

if index != -1:

122

return self._data[index*3:(index+1)*3]

123

else:

124

return self._data[index*3:]

125

else:

126

# This slice ensures that codon will always be the unit

127

# in slicing (it won't change to other codon if you are

128

# using reverse slicing such as [::-1]).

129

# The idea of the code below is to first map the slice

130

# to amino acid sequence and then transform it into

131

# codon sequence.

132

aa_index = range(len(self)//3)

133

def cslice(p):

134

aa_slice = aa_index[p]

135

codon_slice = ''

136

for i in aa_slice:

137

codon_slice += self._data[i*3:i*3+3]

138

return codon_slice

139

codon_slice = cslice(index)

140

return CodonSeq(codon_slice, alphabet=self.alphabet)

141

142

def get_codon_num(self):

143

"""Return the number of codons in the CodonSeq"""

144

return len(self.rf_table)

145

146

def translate(self, codon_table=default_codon_table,

147

stop_symbol="*", rf_table=None, ungap_seq=True):

148

"""Translate the CodonSeq based on the reading frame

149

in rf_table. It is possible for the user to specify

150

a rf_table at this point. If you want to include

151

gaps in the translated sequence, this is the only

152

way. ungap_seq should be set to true for this

153

purpose.

154

"""

155

amino_acids = []

156

if ungap_seq is True:

157

tr_seq = self._data.replace(self.gap_char, "")

158

else:

159

tr_seq = self._data

160

if rf_table is None:

161

rf_table = self.rf_table

162

p = -1 #initiation

163

for i in rf_table:

164

if isinstance(i, float):

165

amino_acids.append('-')

166

continue

167

#elif '---' == tr_seq[i:i+3]:

168

# amino_acids.append('-')

169

# continue

170

elif '-' in tr_seq[i:i+3]:

171

# considering two types of frameshift

172

if p == -1 or p - i == 3:

173

p = i

174

codon = tr_seq[i:i+6].replace('-', '')[:3]

175

elif p - i > 3:

176

codon = tr_seq[i:i+3]

177

p = i

178

else:

179

# normal condition without gaps

180

codon = tr_seq[i:i+3]

181

p = i

182

if codon in codon_table.stop_codons:

183

amino_acids.append(stop_symbol)

184

continue

185

try:

186

amino_acids.append(codon_table.forward_table[codon])

187

except KeyError:

188

raise RuntimeError("Unknown codon detected ({0}). Do you "

189

"forget to speficy ungap_seq "

190

"argument?".format(codon))

191

return "".join(amino_acids)

192

193

def toSeq(self, alphabet=generic_dna):

194

return Seq(self._data, generic_dna)

195

196

def get_full_rf_table(self):

197

"""This function returns a full rf_table of the given

198

CodonSeq records. A full rf_table is different from

199

normal rf_table in that it translate gaps in CodonSeq.

200

It is helpful to construct alignment containing

201

frameshift.

202

"""

203

ungap_seq = self._data.replace("-", "")

204

codon_lst = [ungap_seq[i:i+3] for i in self.rf_table]

205

relative_pos = [self.rf_table[0]]

206

for i in range(1, len(self.rf_table[1:])+1):

207

relative_pos.append(self.rf_table[i]-self.rf_table[i-1])

208

full_rf_table = []

209

codon_num = 0

210

for i in filter(lambda x: x%3==0, range(len(self._data))):

211

if self._data[i:i+3] == self.gap_char*3:

212

full_rf_table.append(i+0.0)

213

elif relative_pos[codon_num] == 0:

214

full_rf_table.append(i)

215

codon_num += 1

216

elif relative_pos[codon_num] in (-1, -2):

217

# check the gap status of previous codon

218

gap_stat = len(self._data[i-3:i].replace("-", ""))

219

if gap_stat == 3:

220

full_rf_table.append(i+relative_pos[codon_num])

221

elif gap_stat == 2:

222

full_rf_table.append(i+1+relative_pos[codon_num])

223

elif gap_stat == 1:

224

full_rf_table.append(i+2+relative_pos[codon_num])

225

codon_num += 1

226

elif relative_pos[codon_num] > 0:

227

full_rf_table.append(i+0.0)

228

try:

229

this_len = len(self._data[i:i+3].replace("-", ""))

230

relative_pos[codon_num] -= this_len

231

except:

232

# we probably reached the last codon

233

pass

234

return full_rf_table

235

236

def full_translate(self, codon_table=default_codon_table, stop_symbol="*"):

237

"""Apply full translation with gaps considered.

238

"""

239

full_rf_table = self.get_full_rf_table()

240

return self.translate(codon_table=codon_table, stop_symbol=stop_symbol,

241

rf_table=full_rf_table, ungap_seq=False)

242

243

def ungap(self, gap=None):

244

if hasattr(self.alphabet, "gap_char"):

245

if not gap:

246

gap = self.alphabet.gap_char

247

elif gap != self.alphabet.gap_char:

248

raise ValueError("Gap %s does not match %s from alphabet"

249

% (repr(gap), repr(self.alphabet.alphabet.gap_char)))

250

alpha = _ungap(self.alphabet)

251

elif not gap:

252

raise ValueError("Gap character not given and not defined in "

253

"alphabet")

254

else:

255

alpha = self.alphabet # modify!

256

if len(gap) != 1 or not isinstance(gap, str):

257

raise ValueError("Unexpected gap character, %s" % repr(gap))

258

return CodonSeq(str(self._data).replace(gap, ""), alpha,

259

rf_table=self.rf_table)

260

261

@classmethod

262

def from_seq(cls, seq, alphabet=default_codon_alphabet, rf_table=None):

263

if rf_table is None:

264

return cls(seq._data, alphabet=alphabet)

265

else:

266

return cls(seq._data, alphabet=alphabet, rf_table=rf_table)

267

268

269

def _get_codon_list(codonseq):

270

"""get a list of codons according to full_rf_table for counting

271

(PRIVATE).

272

"""

273

#if not isinstance(codonseq, CodonSeq):

274

# raise TypeError("_get_codon_list accept a CodonSeq object "

275

# "({0} detected)".format(type(codonseq)))

276

full_rf_table = codonseq.get_full_rf_table()

277

codon_lst = []

278

for i, k in enumerate(full_rf_table):

279

if isinstance(k, int):

280

start = k

281

try:

282

end = int(full_rf_table[i+1])

283

except IndexError:

284

end = start+3

285

this_codon = str(codonseq[start:end])

286

if len(this_codon) == 3:

287

codon_lst.append(this_codon)

288

else:

289

codon_lst.append(str(this_codon.ungap()))

290

elif str(codonseq[int(k):int(k)+3]) == "---":

291

codon_lst.append("---")

292

else:

293

# this may be problematic, as normally no codon shoud

294

# fall into this condition

295

codon_lst.append(codonseq[int(k):int(k)+3])

296

return codon_lst

297

298

299

def cal_dn_ds(codon_seq1, codon_seq2, method="NG86",

300

codon_table=default_codon_table, k=1, cfreq=None):

301

"""Function to calculate the dN and dS of the given two CodonSeq

302

or SeqRecord that contain CodonSeq objects.

303

304

Available methods:

305

- NG86 - PMID: 3444411

306

- LWL85 - PMID: 3916709

307

- ML - PMID: 7968486

308

- YN00 - PMID: 10666704

309

310

Arguments:

311

- w - transition/transvertion ratio

312

- cfreq - Current codon frequency vector can only be specified

313

when you are using ML method. Possible ways of

314

getting cfreq are: F1x4, F3x4 and F61.

315

"""

316

if all([isinstance(codon_seq1, CodonSeq),

317

isinstance(codon_seq2, CodonSeq)]):

318

pass

319

elif all([isinstance(codon_seq1, SeqRecord),

320

isinstance(codon_seq2, SeqRecord)]):

321

codon_seq1 = codon_seq1.seq

322

codon_seq2 = codon_seq2.seq

323

else:

324

raise TypeError("cal_dn_ds accepts two CodonSeq objects or SeqRecord "

325

"that contains CodonSeq as its seq!")

326

if len(codon_seq1.get_full_rf_table()) != \

327

len(codon_seq2.get_full_rf_table()):

328

raise RuntimeError("full_rf_table length of seq1 ({0}) and seq2 ({1}) "

329

"are not the same".format(

330

len(codon_seq1.get_full_rf_table()),

331

len(codon_seq2.get_full_rf_table()))

332

)

333

if cfreq is None:

334

cfreq = 'F3x4'

335

elif cfreq is not None and method != 'ML':

336

raise RuntimeError("cfreq can only be specified when you "

337

"are using ML method")

338

if cfreq not in ('F1x4', 'F3x4', 'F61'):

339

import warnings

340

warnings.warn("Unknown cfreq ({0}). Only F1x4, F3x4 and F61 are "

341

"acceptable. Use F3x4 in the following.".format(cfreq))

342

cfreq = 'F3x4'

343

seq1_codon_lst = _get_codon_list(codon_seq1)

344

seq2_codon_lst = _get_codon_list(codon_seq2)

345

# remove gaps in seq_codon_lst

346

seq1 = []

347

seq2 = []

348

for i, j in zip(seq1_codon_lst, seq2_codon_lst):

349

if (not '-' in i) and (not '-' in j):

350

seq1.append(i)

351

seq2.append(j)

352

dnds_func = {'ML': _ml, 'NG86': _ng86, 'LWL85': _lwl85, 'YN00': _yn00}

353

if method == "ML":

354

return dnds_func[method](seq1, seq2, cfreq, codon_table)

355

else:

356

return dnds_func[method](seq1, seq2, k, codon_table)

357

358

359

#################################################################

360

# private functions for NG86 method

361

#################################################################

362

363

def _ng86(seq1, seq2, k, codon_table):

364

"""Main function for NG86 method (PRIVATE).

365

"""

366

S_sites1, N_sites1 = _count_site_NG86(seq1,

367

codon_table=codon_table, k=k)

368

S_sites2, N_sites2 = _count_site_NG86(seq2,

369

codon_table=codon_table, k=k)

370

S_sites = (S_sites1 + S_sites2) / 2.0

371

N_sites = (N_sites1 + N_sites2) / 2.0

372

SN = [0, 0]

373

for i, j in zip(seq1, seq2):

374

SN = [m+n for m,n in zip(SN, _count_diff_NG86(

375

i, j,

376

codon_table=codon_table)

377

)

378

]

379

ps = SN[0] / S_sites

380

pn = SN[1] / N_sites

381

if ps < 3/4:

382

dS = abs(-3.0/4*log(1-4.0/3*ps))

383

else:

384

dS = -1

385

if pn < 3/4:

386

dN = abs(-3.0/4*log(1-4.0/3*pn))

387

else:

388

dN = -1

389

return dN, dS

390

391

392

def _count_site_NG86(codon_lst, k=1, codon_table=default_codon_table):

393

"""count synonymous and non-synonymous sites of a list of codons

394

(PRIVATE).

395

Argument:

396

- codon_lst - A three letter codon list from a CodonSeq object.

397

This can be returned from _get_codon_list method.

398

- k - transition/transversion rate ratio

399

"""

400

S_site = 0 # synonymous sites

401

N_site = 0 # non-synonymous sites

402

purine = ('A', 'G')

403

pyrimidine = ('T', 'C')

404

base_tuple = ('A', 'T', 'C', 'G')

405

for codon in codon_lst:

406

neighbor_codon = {'transition': [], 'transversion': []}

407

# classify neighbor codons

408

codon = codon.replace('U', 'T')

409

if codon == '---': continue

410

for n, i in enumerate(codon):

411

for j in base_tuple:

412

if i == j:

413

pass

414

elif i in purine and j in purine:

415

codon_chars = [c for c in codon]

416

codon_chars[n] = j

417

this_codon = ''.join(codon_chars)

418

neighbor_codon['transition'].append(this_codon)

419

elif i in pyrimidine and j in pyrimidine:

420

codon_chars = [c for c in codon]

421

codon_chars[n] = j

422

this_codon = ''.join(codon_chars)

423

neighbor_codon['transition'].append(this_codon)

424

else:

425

codon_chars = [c for c in codon]

426

codon_chars[n] = j

427

this_codon = ''.join(codon_chars)

428

neighbor_codon['transversion'].append(this_codon)

429

# count synonymous and non-synonymous sites

430

aa = codon_table.forward_table[codon]

431

this_codon_N_site = this_codon_S_site = 0

432

for neighbor in neighbor_codon['transition']:

433

if neighbor in codon_table.stop_codons:

434

this_codon_N_site += 1

435

elif codon_table.forward_table[neighbor] == aa:

436

this_codon_S_site += 1

437

else:

438

this_codon_N_site += 1

439

for neighbor in neighbor_codon['transversion']:

440

if neighbor in codon_table.stop_codons:

441

this_codon_N_site += k

442

elif codon_table.forward_table[neighbor] == aa:

443

this_codon_S_site += k

444

else:

445

this_codon_N_site += k

446

norm_const = (this_codon_N_site + this_codon_S_site)/3

447

S_site += this_codon_S_site / norm_const

448

N_site += this_codon_N_site / norm_const

449

return (S_site, N_site)

450

451

452

def _count_diff_NG86(codon1, codon2, codon_table=default_codon_table):

453

"""Count differences between two codons (three-letter string).

454

The function will take multiple pathways from codon1 to codon2

455

into account (PRIVATE).

456

"""

457

if not all([isinstance(codon1, str), isinstance(codon2, str)]):

458

raise TypeError("_count_diff_NG86 accept string object to represent "

459

"codon ({0}, {1} detected)".format(

460

type(codon1),

461

type(codon2))

462

)

463

if len(codon1) != 3 or len(codon2) != 3:

464

raise RuntimeError("codon should be three letter string ({0}, {1} "

465

"detected)".format(len(codon1), len(codon2)))

466

SN = [0, 0] # synonymous and nonsynonymous counts

467

if codon1 == '---' or codon2 == '---':

468

return SN

469

base_tuple = ('A', 'C', 'G', 'T')

470

if not all([i in base_tuple for i in codon1]):

471

raise RuntimeError("Unrecognized character detected in codon1 {0} "

472

"(Codon is consist of "

473

"A, T, C or G)".format(codon1))

474

if not all([i in base_tuple for i in codon2]):

475

raise RuntimeError("Unrecognized character detected in codon2 {0} "

476

"(Codon is consist of "

477

"A, T, C or G)".format(codon2))

478

if codon1 == codon2:

479

return SN

480

else:

481

diff_pos = []

482

for i, k in enumerate(zip(codon1, codon2)):

483

if k[0] != k[1]:

484

diff_pos.append(i)

485

def compare_codon(codon1, codon2, codon_table=default_codon_table,

486

weight=1):

487

"""Method to compare two codon accounting for different

488

pathways

489

"""

490

sd = nd = 0

491

if len(set(map(codon_table.forward_table.get,

492

[codon1, codon2]))) == 1:

493

sd += weight

494

else:

495

nd += weight

496

return (sd, nd)

497

if len(diff_pos) == 1:

498

SN = [i+j for i,j in zip(SN,

499

compare_codon(codon1, codon2, codon_table=codon_table))]

500

elif len(diff_pos) == 2:

501

codon2_aa = codon_table.forward_table[codon2]

502

for i in diff_pos:

503

temp_codon = codon1[:i] + codon2[i] + codon1[i+1:]

504

SN = [i+j for i,j in zip(SN, compare_codon(

505

codon1, temp_codon,

506

codon_table=codon_table,

507

weight=0.5))

508

]

509

SN = [i+j for i,j in zip(SN, compare_codon(

510

temp_codon, codon2,

511

codon_table=codon_table,

512

weight=0.5))

513

]

514

elif len(diff_pos) == 3:

515

codon2_aa = codon_table.forward_table[codon2]

516

paths = list(permutations([0, 1, 2], 3))

517

tmp_codon = []

518

for p in paths:

519

tmp1 = codon1[:p[0]] + codon2[p[0]] + codon1[p[0]+1:]

520

tmp2 = tmp1[:p[1]] + codon2[p[1]] + tmp1[p[1]+1:]

521

tmp_codon.append((tmp1, tmp2))

522

SN = [i+j for i,j in zip(SN, compare_codon(codon1, tmp1,

523

codon_table,

524

weight=0.5/3))

525

]

526

SN = [i+j for i,j in zip(SN, compare_codon(tmp1, tmp2,

527

codon_table,

528

weight=0.5/3))

529

]

530

SN = [i+j for i,j in zip(SN, compare_codon(tmp2, codon2,

531

codon_table,

532

weight=0.5/3))

533

]

534

return SN

535

536

537

#################################################################

538

# private functions for LWL85 method

539

#################################################################

540

541

def _lwl85(seq1, seq2, k, codon_table):

542

"""Main function fo LWL85 method (PRIVATE).

543

"""

544

# Nomenclature is according to PMID (3916709)

545

codon_fold_dict = _get_codon_fold(codon_table)

546

# count number of sites in different degenerate classes

547

fold0 = [0, 0]

548

fold2 = [0, 0]

549

fold4 = [0, 0]

550

for codon in seq1 + seq2:

551

fold_num = codon_fold_dict[codon]

552

for f in fold_num:

553

if f == '0':

554

fold0[0] += 1

555

elif f == '2':

556

fold2[0] += 1

557

elif f == '4':

558

fold4[0] += 1

559

L = [sum(fold0)/2.0, sum(fold2)/2.0, sum(fold4)/2.0]

560

# count number of differences in different degenerate classes

561

PQ = [0] * 6 # with P0, P2, P4, Q0, Q2, Q4 in each position

562

for codon1, codon2 in zip(seq1, seq2):

563

if (codon1 == "---" or codon2 == "---") or codon1 == codon2:

564

continue

565

else:

566

PQ = [i+j for i, j in zip(PQ, _diff_codon(

567

codon1,

568

codon2,

569

fold_dict=codon_fold_dict)

570

)]

571

PQ = [i/j for i, j in zip(PQ, L*2)]

572

P = PQ[:3]

573

Q = PQ[3:]

574

A = [(1./2)*log(1./(1-2*i-j)) - (1./4)*log(1./(1-2*j)) \

575

for i, j in zip(P, Q)]

576

B = [(1./2)*log(1./(1-2*i)) for i in Q]

577

dS = 3*(L[2]*A[1]+L[2]*(A[2]+B[2]))/(L[1]+3*L[2])

578

dN = 3*(L[2]*B[1]+L[0]*(A[0]+B[0]))/(2*L[1]+3*L[0])

579

return dN, dS

580

581

582

def _get_codon_fold(codon_table):

583

"""function to classify different position in a codon into

584

different fold (PRIVATE).

585

"""

586

def find_fold_class(codon, forward_table):

587

base = set(['A', 'T', 'C', 'G'])

588

fold = ''

589

codon_base_lst = [i for i in codon]

590

for i, b in enumerate(codon_base_lst):

591

other_base = base - set(b)

592

aa = []

593

for j in other_base:

594

codon_base_lst[i] = j

595

try:

596

aa.append(forward_table[''.join(codon_base_lst)])

597

except KeyError:

598

aa.append('stop')

599

if aa.count(forward_table[codon]) == 0:

600

fold += '0'

601

elif aa.count(forward_table[codon]) in (1,2):

602

fold += '2'

603

elif aa.count(forward_table[codon]) == 3:

604

fold += '4'

605

else:

606

raise RuntimeError("Unknown Error, cannot assign the "

607

"position to a fold")

608

codon_base_lst[i] = b

609

return fold

610

fold_table = {}

611

for codon in codon_table.forward_table:

612

if 'U' not in codon:

613

fold_table[codon] = find_fold_class(codon,

614

codon_table.forward_table)

615

fold_table["---"] = '---'

616

return fold_table

617

618

619

def _diff_codon(codon1, codon2, fold_dict):

620

"""function to get the differences of two codon and return

621

number of different types substitutions

622

623

return (P0, P2, P4, Q0, Q2, Q4)

624

Nomenclature is according to PMID (3916709)

625

"""

626

P0 = P2 = P4 = Q0 = Q2 = Q4 = 0

627

fold_num = fold_dict[codon1]

628

purine = ('A', 'G')

629

pyrimidine = ('T', 'C')

630

for n, (i, j) in enumerate(zip(codon1, codon2)):

631

if i!= j and (i in purine and j in purine):

632

if fold_num[n] == '0':

633

P0 += 1

634

elif fold_num[n] == '2':

635

P2 += 1

636

elif fold_num[n] == '4':

637

P4 += 1

638

else:

639

raise RuntimeError("Unexpected fold_num %d" % fold_num[n])

640

if i!= j and (i in pyrimidine and j in pyrimidine):

641

if fold_num[n] == '0':

642

P0 += 1

643

elif fold_num[n] == '2':

644

P2 += 1

645

elif fold_num[n] == '4':

646

P4 += 1

647

else:

648

raise RuntimeError("Unexpected fold_num %d" % fold_num[n])

649

if i != j and ((i in purine and j in pyrimidine) \

650

or (i in pyrimidine and j in purine)):

651

if fold_num[n] == '0':

652

Q0 += 1

653

elif fold_num[n] == '2':

654

Q2 += 1

655

elif fold_num[n] == '4':

656

Q4 += 1

657

else:

658

raise RuntimeError("Unexpected fold_num %d" % fold_num[n])

659

return (P0, P2, P4, Q0, Q2, Q4)

660

661

662

#################################################################

663

# private functions for YN00 method

664

#################################################################

665

666

def _yn00(seq1, seq2, k, codon_table):

667

"""Main function for yn00 method (PRIVATE).

668

"""

669

# nomenclature is according to PMID: 10666704

670

from collections import defaultdict

671

from scipy.linalg import expm

672

fcodon = [{'A': 0, 'G': 0, 'C': 0, 'T': 0},

673

{'A': 0, 'G': 0, 'C': 0, 'T': 0},

674

{'A': 0, 'G': 0, 'C': 0, 'T': 0}]

675

codon_fold_dict = _get_codon_fold(codon_table)

676

fold0_cnt = defaultdict(int)

677

fold4_cnt = defaultdict(int)

678

for codon in seq1 + seq2:

679

# count sites at different codon position

680

if codon != '---':

681

fcodon[0][codon[0]] += 1

682

fcodon[1][codon[1]] += 1

683

fcodon[2][codon[2]] += 1

684

# count sites in different degenerate fold class

685

fold_num = codon_fold_dict[codon]

686

for i, f in enumerate(fold_num):

687

if f == '0':

688

fold0_cnt[codon[i]] += 1

689

elif f == '4':

690

fold4_cnt[codon[i]] += 1

691

f0_total = sum(fold0_cnt.values())

692

f4_total = sum(fold4_cnt.values())

693

for i, j in zip(fold0_cnt, fold4_cnt):

694

fold0_cnt[i] = fold0_cnt[i]/f0_total

695

fold4_cnt[i] = fold4_cnt[i]/f4_total

696

# TODO:

697

# the initial kappa is different from what yn00 gives,

698

# try to find the problem.

699

TV = _get_TV(seq1, seq2, codon_table=codon_table)

700

k04 = (_get_kappa_t(fold0_cnt, TV), _get_kappa_t(fold4_cnt, TV))

701

kappa = (f0_total*k04[0]+f4_total*k04[1])/(f0_total+f4_total)

702

#kappa = 2.4285

703

# count synonymous sites and non-synonymous sites

704

for i in range(3):

705

tot = sum(fcodon[i].values())

706

fcodon[i] = dict((j, k/tot) for j, k in fcodon[i].items())

707

pi = defaultdict(int)

708

for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons:

709

if 'U' not in i:

710

pi[i] = 0

711

for i in seq1 + seq2:

712

pi[i] += 1

713

S_sites1, N_sites1, bfreqSN1 = _count_site_YN00(seq1, seq2, pi,

714

k=kappa,

715

codon_table=codon_table)

716

S_sites2, N_sites2, bfreqSN2 = _count_site_YN00(seq2, seq1, pi,

717

k=kappa,

718

codon_table=codon_table)

719

N_sites = (N_sites1+N_sites2)/2

720

S_sites = (S_sites1+S_sites2)/2

721

bfreqSN = [{'A': 0, 'T': 0, 'C': 0, 'G': 0},

722

{'A': 0, 'T': 0, 'C': 0, 'G': 0}]

723

for i in range(2):

724

for b in ('A', 'T', 'C', 'G'):

725

bfreqSN[i][b] = (bfreqSN1[i][b]+bfreqSN2[i][b])/2

726

# use NG86 method to get initial t and w

727

SN = [0, 0]

728

for i, j in zip(seq1, seq2):

729

SN = [m+n for m, n in zip(SN, _count_diff_NG86(

730

i, j,

731

codon_table=codon_table)

732

)

733

]

734

ps = SN[0] / S_sites

735

pn = SN[1] / N_sites

736

p = sum(SN) / (S_sites+N_sites)

737

w = log(1-4.0/3*pn) / log(1-4.0/3*ps)

738

t = -3/4*log(1-4/3*p)

739

tolerance = 1e-5

740

dSdN_pre = [0, 0]

741

for temp in range(20):

742

# count synonymous and nonsynonymous differences under kappa, w, t

743

codon_lst = [i for i in \

744

list(codon_table.forward_table.keys()) + \

745

codon_table.stop_codons if 'U' not in i]

746

Q = _get_Q(pi, kappa, w, codon_lst, codon_table)

747

P = expm(Q*t)

748

TV = [0, 0, 0, 0] # synonymous/nonsynonymous transition/transvertion

749

sites = [0, 0]

750

codon_npath = {}

751

for i, j in zip(seq1, seq2):

752

if i != '---' and j != '---':

753

codon_npath.setdefault((i, j), 0)

754

codon_npath[(i, j)] += 1

755

for i in codon_npath:

756

tv = _count_diff_YN00(i[0], i[1], P, codon_lst, codon_table)

757

TV = [m+n*codon_npath[i] for m,n in zip(TV, tv)]

758

TV = (TV[0]/S_sites, TV[1]/S_sites), (TV[2]/N_sites, TV[3]/N_sites)

759

# according to the DistanceF84() function of yn00.c in paml,

760

# the t (e.q. 10) appears in PMID: 10666704 is dS and dN

761

dSdN = []

762

for f, tv in zip(bfreqSN, TV):

763

dSdN.append(_get_kappa_t(f, tv, t=True))

764

t = dSdN[0]*3*S_sites/(S_sites+N_sites)+dSdN[1]*3*N_sites/(S_sites+N_sites)

765

w = dSdN[1]/dSdN[0]

766

if all(map(lambda x: x<tolerance, [abs(i-j) for i,j in zip(dSdN, dSdN_pre)])):

767

return dSdN[1], dSdN[0] # dN, dS

768

dSdN_pre = dSdN

769

770

771

def _get_TV(codon_lst1, codon_lst2, codon_table=default_codon_table):

772

"""

773

Argument:

774

- T - proportions of transitional differences

775

- V - proportions of transversional differences

776

"""

777

purine = ('A', 'G')

778

pyrimidine = ('C', 'T')

779

TV = [0, 0]

780

sites = 0

781

for codon1, codon2 in zip(codon_lst1, codon_lst2):

782

if "---" not in (codon1, codon2):

783

for i, j in zip(codon1, codon2):

784

if i == j:

785

pass

786

elif i in purine and j in purine:

787

TV[0] += 1

788

elif i in pyrimidine and j in pyrimidine:

789

TV[0] += 1

790

else:

791

TV[1] += 1

792

sites += 1

793

return (TV[0]/sites, TV[1]/sites)

794

#return (TV[0], TV[1])

795

796

797

def _get_kappa_t(pi, TV, t=False):

798

"""The following formula and variable names are according to

799

PMID: 10666704

800

"""

801

pi['Y'] = pi['T'] + pi['C']

802

pi['R'] = pi['A'] + pi['G']

803

A = (2*(pi['T']*pi['C']+pi['A']*pi['G'])+\

804

2*(pi['T']*pi['C']*pi['R']/pi['Y']+pi['A']*pi['G']*pi['Y']/pi['R'])*\

805

(1-TV[1]/(2*pi['Y']*pi['R']))-TV[0])/\

806

(2*(pi['T']*pi['C']/pi['Y']+pi['A']*pi['G']/pi['R']))

807

B = 1 - TV[1]/(2*pi['Y']*pi['R'])

808

a = -0.5*log(A) # this seems to be an error in YANG's original paper

809

b = -0.5*log(B)

810

kappaF84 = a/b-1

811

if t is False:

812

kappaHKY85 = 1+(pi['T']*pi['C']/pi['Y']+pi['A']*pi['G']/pi['R'])*\

813

kappaF84/(pi['T']*pi['C']+pi['A']*pi['G'])

814

return kappaHKY85

815

else:

816

t = (4*pi['T']*pi['C']*(1+kappaF84/pi['Y'])+\

817

4*pi['A']*pi['G']*(1+kappaF84/pi['R'])+4*pi['Y']*pi['R'])*b

818

return t

819

820

821

def _count_site_YN00(codon_lst1, codon_lst2, pi, k,

822

codon_table=default_codon_table):

823

"""Site counting method from Ina 1995, PMID: 7699723 and modified

824

by Yang, PMID: 10666704. The method will return the total number of

825

synonymous and nonsynonymous sites and base frequencies in each

826

category. The function is equivalent to CountSites() function in

827

yn00.c of PAML.

828

"""

829

if len(codon_lst1) != len(codon_lst2):

830

raise RuntimeError("Length of two codon_lst should be the same "

831

"(%d and %d detected)".format(

832

len(codon_lst1),

833

len(codon_lst2))

834

)

835

else:

836

length = len(codon_lst1)

837

purine = ('A', 'G')

838

pyrimidine = ('T', 'C')

839

base_tuple = ('A', 'T', 'C', 'G')

840

codon_dict = codon_table.forward_table

841

stop = codon_table.stop_codons

842

codon_npath = {}

843

for i, j in zip(codon_lst1, codon_lst2):

844

if i != '---' and j != '---':

845

codon_npath.setdefault((i, j), 0)

846

codon_npath[(i, j)] += 1

847

S_sites = N_sites = 0

848

freqSN = [{'A': 0, 'T': 0, 'C': 0, 'G': 0}, # synonymous

849

{'A': 0, 'T': 0, 'C': 0, 'G': 0}] # nonsynonymous

850

for codon_pair, npath in codon_npath.items():

851

codon = codon_pair[0]

852

S = N = 0

853

for pos in range(3):

854

for base in base_tuple:

855

if codon[pos] == base: continue

856

neighbor_codon = codon[:pos] + base + codon[pos+1:]

857

if neighbor_codon in stop: continue

858

weight = pi[neighbor_codon]

859

if codon[pos] in pyrimidine and base in pyrimidine:

860

weight *= k

861

elif codon[pos] in purine and base in purine:

862

weight *= k

863

if codon_dict[codon] == codon_dict[neighbor_codon]:

864

S += weight

865

freqSN[0][base] += weight*npath

866

else:

867

N += weight

868

freqSN[1][base] += weight*npath

869

S_sites += S*npath

870

N_sites += N*npath

871

norm_const = 3*length/(S_sites+N_sites)

872

S_sites *= norm_const

873

N_sites *= norm_const

874

for i in freqSN:

875

norm_const = sum(i.values())

876

for b in i:

877

i[b] /= norm_const

878

return S_sites, N_sites, freqSN

879

880

881

def _count_diff_YN00(codon1, codon2, P, codon_lst,

882

codon_table=default_codon_table):

883

"""Count differences between two codons (three-letter string).

884

The function will weighted multiple pathways from codon1 to codon2

885

according to P matrix of codon substitution. The proportion

886

of transition and transvertion (TV) will also be calculated in

887

the function (PRIVATE).

888

"""

889

if not all([isinstance(codon1, str), isinstance(codon2, str)]):

890

raise TypeError("_count_diff_YN00 accept string object to represent "

891

"codon ({0}, {1} detected)".format(

892

type(codon1),

893

type(codon2))

894

)

895

if len(codon1) != 3 or len(codon2) != 3:

896

raise RuntimeError("codon should be three letter string ({0}, {1} "

897

"detected)".format(len(codon1), len(codon2)))

898

TV = [0, 0, 0, 0] # transition and transvertion counts (synonymous and nonsynonymous)

899

site = 0

900

if codon1 == '---' or codon2 == '---':

901

return TV

902

base_tuple = ('A', 'C', 'G', 'T')

903

if not all([i in base_tuple for i in codon1]):

904

raise RuntimeError("Unrecognized character detected in codon1 {0} "

905

"(Codon is consist of "

906

"A, T, C or G)".format(codon1))

907

if not all([i in base_tuple for i in codon2]):

908

raise RuntimeError("Unrecognized character detected in codon2 {0} "

909

"(Codon is consist of "

910

"A, T, C or G)".format(codon2))

911

if codon1 == codon2:

912

return TV

913

else:

914

diff_pos = []

915

for i, k in enumerate(zip(codon1, codon2)):

916

if k[0] != k[1]:

917

diff_pos.append(i)

918

def count_TV(codon1, codon2, diff, codon_table, weight=1):

919

purine = ('A', 'G')

920

pyrimidine = ('T', 'C')

921

dic = codon_table.forward_table

922

stop = codon_table.stop_codons

923

if codon1 in stop or codon2 in stop:

924

# stop codon is always considered as nonsynonymous

925

if codon1[diff] in purine and codon2[diff] in purine:

926

return [0, 0, weight, 0]

927

elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine:

928

return [0, 0, weight, 0]

929

else:

930

return [0, 0, 0, weight]

931

elif dic[codon1] == dic[codon2]:

932

if codon1[diff] in purine and codon2[diff] in purine:

933

return [weight, 0, 0, 0]

934

elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine:

935

return [weight, 0, 0, 0]

936

else:

937

return [0, weight, 0, 0]

938

else:

939

if codon1[diff] in purine and codon2[diff] in purine:

940

return [0, 0, weight, 0]

941

elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine:

942

return [0, 0, weight, 0]

943

else:

944

return [0, 0, 0, weight]

945

if len(diff_pos) == 1:

946

prob = 1

947

TV = [p+q for p,q in zip(TV,count_TV(codon1, codon2, diff_pos[0], codon_table))]

948

elif len(diff_pos) == 2:

949

codon2_aa = codon_table.forward_table[codon2]

950

tmp_codon = [codon1[:i] + codon2[i] + codon1[i+1:] \

951

for i in diff_pos]

952

path_prob = []

953

for i in tmp_codon:

954

codon_idx = list(map(codon_lst.index, [codon1, i, codon2]))

955

prob = (P[codon_idx[0], codon_idx[1]],

956

P[codon_idx[1], codon_idx[2]])

957

path_prob.append(prob[0]*prob[1])

958

path_prob = [2*i/sum(path_prob) for i in path_prob]

959

for n, i in enumerate(diff_pos):

960

temp_codon = codon1[:i] + codon2[i] + codon1[i+1:]

961

TV = [p+q for p,q in zip(TV,count_TV(codon1, temp_codon, i,

962

codon_table,

963

weight=path_prob[n]/2))

964

]

965

TV = [p+q for p,q in zip(TV,count_TV(codon1, temp_codon, i,

966

codon_table,

967

weight=path_prob[n]/2))

968

]

969

elif len(diff_pos) == 3:

970

codon2_aa = codon_table.forward_table[codon2]

971

paths = list(permutations([0, 1, 2], 3))

972

path_prob = []

973

tmp_codon = []

974

for p in paths:

975

tmp1 = codon1[:p[0]] + codon2[p[0]] + codon1[p[0]+1:]

976

tmp2 = tmp1[:p[1]] + codon2[p[1]] + tmp1[p[1]+1:]

977

tmp_codon.append((tmp1, tmp2))

978

codon_idx = list(map(codon_lst.index, [codon1, tmp1, tmp2, codon2]))

979

prob = (P[codon_idx[0], codon_idx[1]],

980

P[codon_idx[1], codon_idx[2]],

981

P[codon_idx[2], codon_idx[3]])

982

path_prob.append(prob[0]*prob[1]*prob[2])

983

path_prob = [3*i/sum(path_prob) for i in path_prob]

984

for i, j, k in zip(tmp_codon, path_prob, paths):

985

TV = [p+q for p,q in zip(TV, count_TV(codon1, i[0], k[0],

986

codon_table, weight=j/3))

987

]

988

TV = [p+q for p,q in zip(TV, count_TV(i[0], i[1], k[1],

989

codon_table, weight=j/3))

990

]

991

TV = [p+q for p,q in zip(TV, count_TV(i[1], codon2, k[1],

992

codon_table, weight=j/3))

993

]

994

if codon1 in codon_table.stop_codons or codon2 in codon_table.stop_codons:

995

site = [0, 3]

996

elif codon_table.forward_table[codon1] == codon_table.forward_table[codon2]:

997

site = [3, 0]

998

else:

999

site = [0, 3]

1000

return TV

1001

1002

1003

#################################################################

1004

# private functions for Maximum Likelihood method

1005

#################################################################

1006

1007

def _ml(seq1, seq2, cmethod, codon_table):

1008

"""Main function for ML method (PRIVATE).

1009

"""

1010

from collections import Counter

1011

from scipy.optimize import minimize

1012

codon_cnt = Counter()

1013

pi = _get_pi(seq1, seq2, cmethod, codon_table=codon_table)

1014

for i, j in zip(seq1, seq2):

1015

#if i != j and ('---' not in (i, j)):

1016

if '---' not in (i, j):

1017

codon_cnt[(i,j)] += 1

1018

codon_lst = [i for i in \

1019

list(codon_table.forward_table.keys()) + codon_table.stop_codons \

1020

if 'U' not in i]

1021

# apply optimization

1022

def func(params, pi=pi, codon_cnt=codon_cnt, codon_lst=codon_lst,

1023

codon_table=codon_table):

1024

"""params = [t, k, w]"""

1025

return -_likelihood_func(

1026

params[0], params[1], params[2], pi,

1027

codon_cnt, codon_lst=codon_lst,

1028

codon_table=codon_table)

1029

# count sites

1030

opt_res = minimize(func, [1, 0.1, 2], method='L-BFGS-B', \

1031

bounds=((1e-10, 20), (1e-10, 20), (1e-10, 10)),

1032

tol=1e-5)

1033

t, k, w = opt_res.x

1034

Q = _get_Q(pi, k, w, codon_lst, codon_table)

1035

Sd = Nd = 0

1036

for i, c1 in enumerate(codon_lst):

1037

for j, c2 in enumerate(codon_lst):

1038

if i != j:

1039

try:

1040

if codon_table.forward_table[c1] == \

1041

codon_table.forward_table[c2]:

1042

# synonymous count

1043

Sd += pi[c1] * Q[i, j]

1044

else:

1045

# nonsynonymous count

1046

Nd += pi[c1] * Q[i, j]

1047

except:

1048

# This is probably due to stop codons

1049

pass

1050

Sd *= t

1051

Nd *= t

1052

# count differences (with w fixed to 1)

1053

opt_res = minimize(func, [1, 0.1, 2], method='L-BFGS-B',

1054

bounds=((1e-10, 20), (1e-10, 20), (1, 1)),

1055

tol=1e-5)

1056

t, k, w = opt_res.x

1057

Q = _get_Q(pi, k, w, codon_lst, codon_table)

1058

rhoS = rhoN = 0

1059

for i, c1 in enumerate(codon_lst):

1060

for j, c2 in enumerate(codon_lst):

1061

if i != j:

1062

try:

1063

if codon_table.forward_table[c1] == \

1064

codon_table.forward_table[c2]:

1065

# synonymous count

1066

rhoS += pi[c1] * Q[i, j]

1067

else:

1068

# nonsynonymous count

1069

rhoN += pi[c1] * Q[i, j]

1070

except:

1071

# This is probably due to stop codons

1072

pass

1073

rhoS *= 3

1074

rhoN *= 3

1075

dN = Nd/rhoN

1076

dS = Sd/rhoS

1077

return dN, dS

1078

1079

1080

def _get_pi(seq1, seq2, cmethod, codon_table=default_codon_table):

1081

"""Obtain codon frequency dict (pi) from two codon list (PRIVATE).

1082

This function is designed for ML method. Available counting methods

1083

(cfreq) are F1x4, F3x4 and F64.

1084

"""

1085

#TODO:

1086

# Stop codon should not be allowed according to Yang.

1087

# Try to modify this!

1088

pi = {}

1089

if cmethod == 'F1x4':

1090

fcodon = {'A': 0, 'G': 0, 'C': 0, 'T': 0}

1091

for i in seq1 + seq2:

1092

if i != '---':

1093

for c in i: fcodon[c] += 1

1094

tot = sum(fcodon.values())

1095

fcodon = dict((j, k/tot) for j, k in fcodon.items())

1096

for i in codon_table.forward_table.keys() + codon_table.stop_codons:

1097

if 'U' not in i:

1098

pi[i] = fcodon[i[0]]*fcodon[i[1]]*fcodon[i[2]]

1099

elif cmethod == 'F3x4':

1100

# three codon position

1101

fcodon = [{'A': 0, 'G': 0, 'C': 0, 'T': 0},

1102

{'A': 0, 'G': 0, 'C': 0, 'T': 0},

1103

{'A': 0, 'G': 0, 'C': 0, 'T': 0}]

1104

for i in seq1 + seq2:

1105

if i != '---':

1106

fcodon[0][i[0]] += 1

1107

fcodon[1][i[1]] += 1

1108

fcodon[2][i[2]] += 1

1109

for i in range(3):

1110

tot = sum(fcodon[i].values())

1111

fcodon[i] = dict((j, k/tot) for j, k in fcodon[i].items())

1112

for i in list(codon_table.forward_table.keys()) + \

1113

codon_table.stop_codons:

1114

if 'U' not in i:

1115

pi[i] = fcodon[0][i[0]]*fcodon[1][i[1]]*fcodon[2][i[2]]

1116

elif cmethod == 'F61':

1117

for i in codon_table.forward_table.keys() + codon_table.stop_codons:

1118

if 'U' not in i:

1119

pi[i] = 0.1

1120

for i in seq1 + seq2:

1121

if i != '---': pi[i] += 1

1122

tot = sum(pi.values())

1123

pi = dict((j, k/tot) for j, k in pi.items())

1124

return pi

1125

1126

1127

def _q(i, j, pi, k, w, codon_table=default_codon_table):

1128

"""Q matrix for codon substitution.

1129

1130

Arguments:

1131

- i, j : three letter codon string

1132

- pi : expected codon frequency

1133

- k : transition/transversion ratio

1134

- w : nonsynonymous/synonymous rate ratio

1135

- codon_table: Bio.Data.CodonTable object

1136

"""

1137

if i == j:

1138

# diagonal elements is the sum of all other elements

1139

return 0

1140

if i in codon_table.stop_codons or j in codon_table.stop_codons:

1141

return 0

1142

if (i not in pi) or (j not in pi):

1143

return 0

1144

purine = ('A', 'G')

1145

pyrimidine = ('T', 'C')

1146

diff = []

1147

for n, (c1, c2) in enumerate(zip(i, j)):

1148

if c1 != c2:

1149

diff.append((n, c1, c2))

1150

if len(diff) >= 2:

1151

return 0

1152

if codon_table.forward_table[i] == codon_table.forward_table[j]:

1153

# synonymous substitution

1154

if diff[0][1] in purine and diff[0][2] in purine:

1155

# transition

1156

return k*pi[j]

1157

elif diff[0][1] in pyrimidine and diff[0][2] in pyrimidine:

1158

# transition

1159

return k*pi[j]

1160

else:

1161

# transversion

1162

return pi[j]

1163

else:

1164

# nonsynonymous substitution

1165

if diff[0][1] in purine and diff[0][2] in purine:

1166

# transition

1167

return w*k*pi[j]

1168

elif diff[0][1] in pyrimidine and diff[0][2] in pyrimidine:

1169

# transition

1170

return w*k*pi[j]

1171

else:

1172

# transversion

1173

return w*pi[j]

1174

1175

1176

def _get_Q(pi, k, w, codon_lst, codon_table):

1177

"""Q matrix for codon substitution"""

1178

import numpy as np

1179

codon_num = len(codon_lst)

1180

Q = np.zeros((codon_num, codon_num))

1181

for i in range(codon_num):

1182

for j in range(codon_num):

1183

if i != j:

1184

Q[i, j] = _q(codon_lst[i], codon_lst[j], pi, k, w,

1185

codon_table=codon_table)

1186

nucl_substitutions = 0

1187

for i in range(codon_num):

1188

Q[i,i] = -sum(Q[i,:])

1189

try:

1190

nucl_substitutions += pi[codon_lst[i]] * (-Q[i, i])

1191

except KeyError:

1192

pass

1193

Q = Q / nucl_substitutions

1194

return Q

1195

1196

1197

def _likelihood_func(t, k, w, pi, codon_cnt, codon_lst, codon_table):

1198

"""likelihood function for ML method

1199

"""

1200

from scipy.linalg import expm

1201

Q = _get_Q(pi, k, w, codon_lst, codon_table)

1202

P = expm(Q*t)

1203

l = 0 # likelihood value

1204

for i, c1 in enumerate(codon_lst):

1205

for j, c2 in enumerate(codon_lst):

1206

if (c1, c2) in codon_cnt:

1207

if P[i, j] * pi[c1] <= 0:

1208

l += codon_cnt[(c1, c2)]*0

1209

else:

1210

l += codon_cnt[(c1, c2)]*log(pi[c1]*P[i, j])

1211

return l

1212

1213

1214

if __name__ == "__main__":

1215

from Bio._utils import run_doctest

1216

run_doctest()

1217

Older »