~ubuntu-branches/ubuntu/natty/python-cogent/natty

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
#!/usr/bin/env python
"""Translates RNA or DNA string to amino acid sequence.

NOTE: * is used to denote termination (as per NCBI standard).
NOTE: Although the genetic code objects convert DNA to RNA and vice
versa, lists of codons that they produce will be provided in DNA format.
"""
from string import maketrans

__author__ = "Greg Caporaso and Rob Knight"
__copyright__ = "Copyright 2007-2009, The Cogent Project"
__credits__ = ["Greg Caporaso", "Rob Knight", "Peter Maxwell"]
__license__ = "GPL"
__version__ = "1.5.0"
__maintainer__ = "Greg Caporaso"
__email__ = "caporaso@colorado.edu"
__status__ = "Production"

class GeneticCodeError(Exception):
    pass

class GeneticCodeInitError(ValueError, GeneticCodeError):
    pass

class InvalidCodonError(KeyError, GeneticCodeError):
    pass

_dna_trans = maketrans('TCAG','AGTC')

def _simple_rc(seq):
    """simple reverse-complement: works only on unambiguous uppercase DNA"""
    return seq.translate(_dna_trans)[::-1]

class GeneticCode(object):
    """Holds codon to amino acid mapping, and vice versa.

    Usage:  gc = GeneticCode(CodeSequence)
            sgc = GeneticCode(
            'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG')
            sgc['UUU'] == 'F'
            sgc['TTT'] == 'F'
            sgc['F'] == ['TTT', 'TTC']          #in arbitrary order
            sgc['*'] == ['TAA', 'TAG', 'TGA']   #in arbitrary order
    
    CodeSequence : 64 character string containing NCBI genetic code translation

    GeneticCode is immutable once created.
    """
    #class data: need the bases, the list of codons in UUU -> GGG order, and
    #a mapping from positions in the list back to codons. These should be the
    #same for all GeneticCode instances, and are immutable (therefore private).
    _nt = "TCAG"
    _codons = [a+b+c for a in _nt for b in _nt for c in _nt]

    def __init__(self, CodeSequence, ID=None, Name=None, StartCodonSequence=None):
        """Returns new GeneticCode object.

        CodeSequence : 64-character string containing NCBI representation 
        of the genetic code. Raises GeneticCodeInitError if length != 64.
        """
        if (len(CodeSequence) != 64):
            raise GeneticCodeInitError,\
                  "CodeSequence: %s has length %d, but expected 64"\
                  % (CodeSequence, len(CodeSequence))
                  
        self.CodeSequence = CodeSequence
        self.ID = ID
        self.Name = Name
        self.StartCodonSequence = StartCodonSequence
        start_codons = {}
        if StartCodonSequence:
            for codon, aa in zip(self._codons, StartCodonSequence):
                if aa != '-':
                    start_codons[codon] = aa
        self.StartCodons = start_codons
        codon_lookup = dict(zip(self._codons, CodeSequence))
        self.Codons = codon_lookup
        #create synonyms for each aa
        aa_lookup = {}
        for codon in self._codons:
            aa = codon_lookup[codon]
            if aa not in aa_lookup:
                aa_lookup[aa] = [codon]
            else:
                aa_lookup[aa].append(codon)
        self.Synonyms = aa_lookup
        sense_codons = codon_lookup.copy()
        #create sense codons
        stop_codons = self['*']
        for c in stop_codons:
            del sense_codons[c]
        self.SenseCodons = sense_codons
        #create anticodons    
        ac = {}
        for aa, codons in self.Synonyms.items():
            ac[aa] = map(_simple_rc, codons)
        self.Anticodons = ac

    def _analyze_quartet(self, codons, aa):
        """Analyzes a quartet of codons and amino acids: returns list of lists.

        Each list contains one block, splitting at R/Y if necessary.

        codons should be a list of 4 codons.
        aa should be a list of 4 amino acid symbols.
        
        Possible states:
            - All amino acids are the same: returns list of one quartet.
            - Two groups of 2 aa: returns list of two doublets.
            - One group of 2 and 2 groups of 1: list of one doublet, 2 singles.
            - 4 groups of 1: four singles.

        Note: codon blocks like Ile in the standard code (AUU, AUC, AUA) will
        be split when they cross the R/Y boundary, so [[AUU, AUC], [AUA]]. This
        would also apply to a block like AUC AUA AUG -> [[AUC],[AUA,AUG]], 
        although this latter pattern is not observed in the standard code.
        """
        if aa[0] == aa[1]:
            first_doublet = True
        else:
            first_doublet = False
        if aa[2] == aa[3]:
            second_doublet = True
        else:
            second_doublet = False
        if first_doublet and second_doublet and aa[1] == aa[2]:
            return [codons]
        else:
            blocks = []
            if first_doublet:
                blocks.append(codons[:2])
            else:
                blocks.extend([[codons[0]],[codons[1]]])
            if second_doublet:
                blocks.append(codons[2:])
            else:
                blocks.extend([[codons[2]],[codons[3]]])
            return blocks
        
    def _get_blocks(self):
        """Returns list of lists of codon blocks in the genetic code.
        
        A codon block can be:
            - a quartet, if all 4 XYn codons have the same amino acid.
            - a doublet, if XYt and XYc or XYa and XYg have the same aa.
            - a singlet, otherwise.

        Returns a list of the quartets, doublets, and singlets in the order
        UUU -> GGG.

        Note that a doublet cannot span the purine/pyrimidine boundary, and 
        a quartet cannot span the boundary between two codon blocks whose first
        two bases differ.
        """
        if hasattr(self, '_blocks'):
            return self._blocks
        else:
            blocks = []
            curr_codons = []
            curr_aa = []
            for index, codon, aa in zip(range(64),self._codons,self.CodeSequence):
                #we're in a new block if it's a new quartet or a different aa
                new_quartet = not index % 4
                if new_quartet and curr_codons:
                    blocks.extend(self._analyze_quartet(curr_codons, curr_aa))
                    curr_codons = []
                    curr_aa = []
                curr_codons.append(codon)
                curr_aa.append(aa)
            #don't forget to append last block
            if curr_codons:
                blocks.extend(self._analyze_quartet(curr_codons, curr_aa))
            self._blocks = blocks
            return self._blocks
           
    Blocks = property(_get_blocks)
    
    def __str__(self):
        """Returns CodeSequence that constructs the GeneticCode."""
        return self.CodeSequence

    def __repr__(self):
        """Returns reconstructable representation of the GeneticCode."""
        return 'GeneticCode(%s)' % str(self)
    
    def __cmp__(self, other):
        """ Allows two GeneticCode objects to be compared to each other.
        Two GeneticCode objects are equal if they have equal CodeSequences.
        """
        return cmp(str(self), str(other))

    def __getitem__(self, item):
        """Returns amino acid corresponding to codon, or codons for an aa.
        
        Returns [] for empty list of codons, 'X' for unknown amino acid.
        """
        if len(item) == 1:  #amino acid
            return self.Synonyms.get(item, [])
        elif len(item) == 3:    #codon
            key = item.upper()
            key = key.replace('U', 'T')
            return self.Codons.get(key, 'X')
        else:
            raise InvalidCodonError, "Codon or aa %s has wrong length" % item
            
    def translate(self, dna, start=0):
        """ Translates DNA to protein with current GeneticCode.
        
        dna         = a string of nucleotides
        start       = position to begin translation (used to implement frames)
        
        Returns string containing amino acid sequence. Translates the entire
        sequence: it is the caller's responsibility to find open reading frames.

        NOTE: should return Protein object when we have a class for it.
        """
        if not dna:
            return ''
        if start + 1 > len(dna):
            raise ValueError, "Translation starts after end of RNA"
        return ''.join([self[dna[i:i+3]] for i in range(start, len(dna)-2, 3)])

    def sixframes(self, dna):
        """Returns six-frame translation as dict containing {frame:translation}
        """
        reverse = dna.rc()
        return [self.translate(dna, start) for start in range(3)] + \
               [self.translate(reverse, start) for start in range(3)]

    def isStart(self, codon):
        """Returns True if codon is a start codon, False otherwise."""
        fixed_codon = codon.upper().replace('U','T')
        return fixed_codon in self.StartCodons

    def isStop(self, codon):
        """Returns True if codon is a stop codon, False otherwise."""
        return self[codon] == '*'

    def changes(self, other):
        """Returns dict of {codon:'XY'} for codons that differ.
        
        X is the string representation of the amino acid in self, Y is the
        string representation of the amino acid in other. Always returns a
        2-character string.
        """
        changes = {}
        try:
            other_code = other.CodeSequence
        except AttributeError:  #try using other directly as sequence
            other_code = other
        for codon, old, new in zip(self._codons, self.CodeSequence, other_code):
            if old != new:
                changes[codon] = old+new
        return changes


NcbiGeneticCodeData = [GeneticCode(*data) for data in [
[
'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1,
'Standard Nuclear',
'---M---------------M---------------M----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG',
2,
'Vertebrate Mitochondrial',
'--------------------------------MMMM---------------M------------',
],
[
'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
3,
'Yeast Mitochondrial',
'----------------------------------MM----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
4,
'Mold, Protozoan, and Coelenterate Mitochondrial, and Mycoplasma/Spiroplasma Nuclear',
'--MM---------------M------------MMMM---------------M------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG',
5,
'Invertebrate Mitochondrial',
'---M----------------------------MMMM---------------M------------',
],
[
'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
6,
'Ciliate, Dasycladacean and Hexamita Nuclear',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
9,
'Echinoderm and Flatworm Mitochondrial',
'-----------------------------------M---------------M------------',
],
[
'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
10,
'Euplotid Nuclear',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
11,
'Bacterial Nuclear and Plant Plastid',
'---M---------------M------------MMMM---------------M------------',
],
[
'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
12,
'Alternative Yeast Nuclear',
'-------------------M---------------M----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG',
13,
'Ascidian Mitochondrial',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
14,
'Alternative Flatworm Mitochondrial',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
15,
'Blepharisma Nuclear',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
16,
'Chlorophycean Mitochondrial',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
20,
'Trematode Mitochondrial',
'-----------------------------------M---------------M------------',
],
[
'FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
22,
'Scenedesmus obliquus Mitochondrial',
'-----------------------------------M----------------------------',
],
[
'FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
23,
'Thraustochytrium Mitochondrial',
],
]]

#build dict of GeneticCodes keyed by ID (as int, not str)
GeneticCodes = dict([(i.ID, i) for i in NcbiGeneticCodeData])
#add str versions for convenience
for key, value in GeneticCodes.items():
    GeneticCodes[str(key)] = value

DEFAULT = GeneticCodes[1]