3
__author__ = "Jesse Zaneveld"
4
__copyright__ = "Copyright 2007-2009, The Cogent Project"
5
__credits__ = ["Jesse Zaneveld"]
8
__maintainer__ = "Jesse Zaneveld"
9
__email__ = "zaneveld@gmail.com"
10
__status__ = "Production"
13
"""Test code for kegg_ko.py in cogent.parse."""
15
from cogent.util.unit_test import TestCase, main
17
from cogent.parse.kegg_ko import kegg_label_fields,\
18
parse_kegg_taxonomy, ko_record_iterator, ko_record_splitter,\
19
ko_default_parser, ko_first_field_parser, delete_comments,\
20
ko_colon_fields, ko_colon_delimited_parser, _is_new_kegg_rec_group,\
21
group_by_end_char, class_lines_to_fields, ko_class_parser, parse_ko,\
22
parse_ko_file, make_tab_delimited_line_parser
26
class ParseKOTests(TestCase):
27
def make_tab_delimited_line_parser(self):
    """make_tab_delimited_line_parser should generate line parser

    Builds a parser for column indexes 0, 2, 3 and 5, applies it to one
    tab-delimited line and compares the result with the expected line.
    """
    # NOTE(review): this method has no ``test`` prefix, so unittest-style
    # discovery will not collect it -- confirm whether that is intentional
    # before renaming.
    # NOTE(review): the expected line differs from the input in fields 1 and
    # 4, while the parser is built for columns 0, 2, 3 and 5 -- confirm the
    # intended column indexes against make_tab_delimited_line_parser.
    line = "good\tbad:good\tgood\tgood\tbad:good\tgood"
    parse_fn = make_tab_delimited_line_parser([0, 2, 3, 5])
    # Run the generated parser; without this call ``obs`` was never defined.
    obs = parse_fn(line)
    exp = "good\tgood\tgood\tgood\tgood\tgood"
    self.assertEqual(obs, exp)
35
def test_kegg_label_fields(self):
    """kegg_label_fields should return fields from line"""
    # Format is species:gene_id [optional gene_name]; description.
    # Note that the '>' should already be stripped by the Fasta Parser
    # The two label strings must be bound to names: they are the inputs
    # passed to kegg_label_fields below.
    test1 = """stm:STM0001 thrL; thr operon leader peptide ; K08278 thr operon leader peptide"""
    test2 = """stm:STM0002 thrA; bifunctional aspartokinase I/homeserine dehydrogenase I (EC:2.7.2.4 1.1.1.13); K00003 homoserine dehydrogenase [EC:1.1.1.3]; K00928 aspartate kinase [EC:2.7.2.4]"""

    # Expected tuples hold: full id, species, gene id, gene name, description.
    obs = kegg_label_fields(test1)
    exp = (
        'stm:STM0001', 'stm', 'STM0001',
        'thrL', 'thr operon leader peptide ; K08278 thr operon leader peptide',
    )
    self.assertEqual(obs, exp)

    obs = kegg_label_fields(test2)
    exp = (
        'stm:STM0002', 'stm', 'STM0002', 'thrA',
        'bifunctional aspartokinase I/homeserine dehydrogenase I (EC:2.7.2.4 1.1.1.13); K00003 homoserine dehydrogenase [EC:1.1.1.3]; K00928 aspartate kinase [EC:2.7.2.4]',
    )
    self.assertEqual(obs, exp)
54
def test_ko_record_iterator(self):
    """ko_record_iterator should iterate over KO records"""
    # Materialise the iterator so the records can be counted and indexed;
    # previously ``recs`` was read without ever being built.
    recs = list(ko_record_iterator(TEST_KO_LINES))

    self.assertEqual(len(recs), 3)
    self.assertEqual(len(recs[0]), 31)

    # First and last line of the first record.
    exp = 'ENTRY K01559 KO\n'
    self.assertEqual(recs[0][0], exp)

    exp = ' RCI: RCIX1162 RCIX2396\n'
    self.assertEqual(recs[0][-1], exp)

    # First and last line of the final record.
    exp = 'ENTRY K01561 KO\n'
    self.assertEqual(recs[-1][0], exp)

    exp = ' MSE: Msed_1088\n'
    self.assertEqual(recs[-1][-1], exp)
75
def test_ko_record_splitter(self):
    """ko_record_splitter should split ko lines into a dict of groups"""
    recs = [rec for rec in ko_record_iterator(TEST_KO_LINES)]
    split_recs = ko_record_splitter(recs[0])

    # NOTE(review): this expected GENES list has fewer entries than the
    # GENES section of the first record in TEST_KO_LINES (e.g. the PHA and
    # MPT lines are absent here) -- confirm against the upstream fixture.
    exp = [
        'GENES AFM: AFUA_4G13070\n',
        ' BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n',
        ' BBA: Bd0910(catD)\n',
        ' GBE: GbCGDNIH1_0998 GbCGDNIH1_1171\n',
        ' HMA: rrnAC1925(mhpC)\n',
        ' RCI: RCIX1162 RCIX2396\n',
    ]
    self.assertEqual(exp, split_recs["GENES"])

    exp = [
        'CLASS Metabolism; Biosynthesis of Secondary Metabolites; Limonene and\n',
        ' pinene degradation [PATH:ko00903]\n',
        ' Metabolism; Xenobiotics Biodegradation and Metabolism; Caprolactam\n',
        ' degradation [PATH:ko00930]\n',
        ' Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
        ' 1,1,1-Trichloro-2,2-bis(4-chlorophenyl)ethane (DDT) degradation\n',
        ' [PATH:ko00351]\n',
        ' Metabolism; Xenobiotics Biodegradation and Metabolism; Benzoate\n',
        ' degradation via CoA ligation [PATH:ko00632]\n',
        ' Metabolism; Xenobiotics Biodegradation and Metabolism; Benzoate\n',
        ' degradation via hydroxylation [PATH:ko00362]\n',
    ]
    # The expected CLASS lines were built but never compared; assert them
    # the same way the GENES group is asserted above.
    self.assertEqual(exp, split_recs["CLASS"])
94
def test_ko_default_parser(self):
    """ko_default_parser should return the field text without label or newline"""
    # Applies to 'NAME' and 'DEFINITION' lines
    cases = [
        (['NAME E3.8.1.2\n'], 'E3.8.1.2'),
        (['DEFINITION 2-haloacid dehalogenase [EC:3.8.1.2]\n'],
         '2-haloacid dehalogenase [EC:3.8.1.2]'),
    ]
    # Each case pairs the raw field lines with the text expected back.
    for field_lines, expected in cases:
        obs = ko_default_parser(field_lines)
        self.assertEqual(obs, expected)
107
def test_ko_first_field_parser(self):
    """ko_first_field_parser should strip out newlines and join lines
    together (first field only)"""
    obs = ko_first_field_parser(['ENTRY K01559 KO\n'])
    # ``exp`` was never defined. The first field after the label is the KO
    # id, which is also the "ENTRY" value test_parse_ko asserts for this
    # record.
    # NOTE(review): confirm the expected value against ko_first_field_parser.
    exp = 'K01559'
    self.assertEqual(obs, exp)
115
def test_delete_comments(self):
    """delete_comments should delete parenthetical comments from lines"""
    # Single parenthetical comment.
    test_line = "bifunctional aspartokinase I/homeserine dehydrogenase I (EC:2.7.2.4 1.1.1.13);"
    exp = "bifunctional aspartokinase I/homeserine dehydrogenase I ;"
    obs = delete_comments(test_line)
    self.assertEqual(obs, exp)

    # Nested parenthetical comments.
    nested_test_line = "text(comment1(comment2));"
    # Without a fresh expectation this case would be compared against the
    # previous ``exp``.
    # NOTE(review): value inferred from the single-comment case above (the
    # whole parenthetical is removed) -- confirm against delete_comments.
    exp = "text;"
    obs = delete_comments(nested_test_line)
    self.assertEqual(obs, exp)
130
def test_ko_colon_fields(self):
    """ko_colon_fields should convert lines to (key, [list of values])"""
    # The input list must be bound to ``test_lines`` before use.
    test_lines = [' BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n']
    obs = ko_colon_fields(test_lines)
    exp = ('BXE', ['Bxe_B0037', 'Bxe_C0683', 'Bxe_C1002', 'Bxe_C1023'])
    self.assertEqual(obs, exp)

    # With without_comments=True the parenthetical part is dropped.
    test_lines = [' HMA: rrnAC1925(mhpC)\n']
    obs = ko_colon_fields(test_lines, without_comments=True)
    exp = ('HMA', ['rrnAC1925'])
    self.assertEqual(obs, exp)

    # With without_comments=False the parenthetical part is kept.
    test_lines = [' HMA: rrnAC1925(mhpC)\n']
    obs = ko_colon_fields(test_lines, without_comments=False)
    exp = ('HMA', ['rrnAC1925(mhpC)'])
    self.assertEqual(obs, exp)
150
def test_ko_colon_delimited_parser(self):
    """ko_colon_delimited_parser should return a dict of id: values for
    colon delimited lines"""
    # The GENES lines must be bound to ``test_lines``; both parser calls
    # below read that name.
    test_lines = [
        'GENES AFM: AFUA_4G13070\n',
        ' PHA: PSHAa2393\n',
        ' BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n',
        ' MPT: Mpe_A2274\n',
        ' BBA: Bd0910(catD)\n',
        ' GBE: GbCGDNIH1_0998 GbCGDNIH1_1171\n',
        ' HMA: rrnAC1925(mhpC)\n',
        ' RCI: RCIX1162 RCIX2396\n',
    ]

    obs = ko_colon_delimited_parser(test_lines, without_comments=True)
    self.assertEqual(obs['BXE'], ['Bxe_B0037', 'Bxe_C0683', 'Bxe_C1002', 'Bxe_C1023'])
    self.assertEqual(obs['PHA'], ['PSHAa2393'])
    # Check that comments are stripped
    self.assertEqual(obs['BBA'], ['Bd0910'])

    obs = ko_colon_delimited_parser(test_lines, without_comments=False)
    # Lines without comments shouldn't be affected
    self.assertEqual(obs['BXE'], ['Bxe_B0037', 'Bxe_C0683', 'Bxe_C1002', 'Bxe_C1023'])
    self.assertEqual(obs['PHA'], ['PSHAa2393'])
    # Comments should be preserved
    self.assertEqual(obs['BBA'], ['Bd0910(catD)'])
179
def test_is_new_kegg_rec_group(self):
    """_is_new_kegg_rec_group should check for irregular field terminators in KEGG"""
    # Handle unusual KEGG fields.
    # TODO(review): placeholder -- this test makes no assertions yet, so
    # _is_new_kegg_rec_group is currently not exercised here.
    return None
184
def test_group_by_end_char(self):
    """group_by_end_char should yield successive lines that end with a given
    char, plus the last group of lines"""
    class_lines = [
        'CLASS Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
        ' gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n',
        ' Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
        ' 1,2-Dichloroethane degradation [PATH:ko00631]\n',
    ]

    exp = [
        ['CLASS Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
         ' gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n'],
        [' Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
         ' 1,2-Dichloroethane degradation [PATH:ko00631]\n'],
    ]

    obs = list(group_by_end_char(class_lines))
    # Check the number of groups first: comparing only the groups that
    # happen to be yielded would pass vacuously if too few (or none) came
    # back.
    self.assertEqual(len(obs), len(exp))
    for group, expected in zip(obs, exp):
        self.assertEqual(group, expected)
199
def test_class_lines_to_fields(self):
    """class_lines_to_fields should split groups of lines for one KO class"""
    # The docstring above must be closed on its own line; left open it
    # swallows the statements that follow.
    class_lines1 = [
        'CLASS Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
        ' gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n',
    ]

    class_lines2 = [
        ' Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
        ' 1,2-Dichloroethane degradation [PATH:ko00631]\n',
    ]

    # Expected shape: (pathway id, (class name fields...)).
    obs = class_lines_to_fields(class_lines1)
    exp = ('PATH:ko00361', ('Metabolism', 'Xenobiotics Biodegradation and Metabolism', 'gamma-Hexachlorocyclohexane degradation'))
    self.assertEqual(obs, exp)

    obs = class_lines_to_fields(class_lines2)
    exp = ('PATH:ko00631', ('Metabolism', 'Xenobiotics Biodegradation and Metabolism', '1,2-Dichloroethane degradation'))
    self.assertEqual(obs, exp)
217
def test_ko_class_parser(self):
    """ko_class_parser should return fields"""
    # Kept as a tuple (the original built one implicitly through trailing
    # commas); the parentheses only make that explicit.
    class_lines = (
        'CLASS Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
        ' gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n',
        ' Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
        ' 1,2-Dichloroethane degradation [PATH:ko00631]\n',
    )
    exp = [
        ('PATH:ko00361', ('Metabolism', 'Xenobiotics Biodegradation and Metabolism',
                          'gamma-Hexachlorocyclohexane degradation')),
        ('PATH:ko00631', ('Metabolism', 'Xenobiotics Biodegradation and Metabolism', '1,2-Dichloroethane degradation')),
    ]

    parsed = list(ko_class_parser(class_lines))
    # Check the count before the pairwise comparison so that a parser
    # yielding too few (or no) results cannot pass unnoticed.
    self.assertEqual(len(parsed), len(exp))
    for obs, expected in zip(parsed, exp):
        self.assertEqual(obs, expected)
231
def test_parse_ko(self):
232
"""parse_ko should parse a ko record into fields """
# NOTE(review): this method is incomplete as it stands and needs restoring
# from the upstream file before it can run:
#   - ``results`` is appended to below but is never created, and the loop
#     that should produce ``result`` is absent (presumably iterating
#     parse_ko(lines) -- confirm).
#   - the expected DBLINKS dict for results[0] is never closed.
#   - the expected GENES dict for results[0] is never closed and may be
#     missing entries.
# The assertions index results[0..2], i.e. they expect three parsed records
# from TEST_KO_LINES, each behaving like a dict keyed by field name.
233
lines = TEST_KO_LINES
237
results.append(result)
238
# For each entry we expect a dict
240
self.assertEqual(results[0]["ENTRY"], "K01559")
241
self.assertEqual(results[1]["ENTRY"], "K01560")
242
self.assertEqual(results[2]["ENTRY"], "K01561")
244
self.assertEqual(results[0]["NAME"], "E3.7.1.-")
245
self.assertEqual(results[1]["NAME"], "E3.8.1.2")
246
self.assertEqual(results[2]["NAME"], "E3.8.1.3")
248
self.assertEqual(results[0].get("DEFINITION"), None) #case 1 has no def
249
self.assertEqual(results[1]["DEFINITION"],\
250
"2-haloacid dehalogenase [EC:3.8.1.2]")
251
self.assertEqual(results[2]["DEFINITION"],\
252
"haloacetate dehalogenase [EC:3.8.1.3]")
253
self.assertEqual(len(results[0]["CLASS"]), 5)
254
self.assertEqual(results[0]["CLASS"][4], \
255
('PATH:ko00362', ('Metabolism', \
256
'Xenobiotics Biodegradation and Metabolism',\
257
'Benzoate degradation via hydroxylation')))
259
self.assertEqual(results[0]["DBLINKS"], \
260
{'RN': ['R04488', 'R05100', 'R05363', \
261
'R05365', 'R06371', 'R07515', \
264
self.assertEqual(results[1]["DBLINKS"], \
265
{'GO': ['0018784'], 'RN': ['R05287'], 'COG': ['COG1011']})
267
self.assertEqual(results[2]["DBLINKS"], \
268
{'GO': ['0018785'], 'RN': ['R05287']})
270
self.assertEqual(results[0]["GENES"], \
271
{'AFM': ['AFUA_4G13070'], 'FNU': ['FN1345'],\
272
'GBE': ['GbCGDNIH1_0998', 'GbCGDNIH1_1171'],\
273
'PHA': ['PSHAa2393'], \
275
'ABO': ['ABO_0668'],\
276
'MPT': ['Mpe_A2274'],\
277
'RCI': ['RCIX1162', 'RCIX2396'], \
278
'BXE': ['Bxe_B0037', 'Bxe_C0683', 'Bxe_C1002', 'Bxe_C1023'],\
279
'HMA': ['rrnAC1925'], \
284
# Fixture: lines of a KO flat file, passed to ko_record_iterator and used as
# the input of test_parse_ko.
# NOTE(review): the list literal is never closed in the text below, and the
# fixture appears to have lost lines: the tests expect a NAME field per record
# (e.g. "E3.7.1.-") and 31 lines in the first record, but no NAME lines and no
# record separators are present here -- restore from the upstream file.
# Comments cannot be placed inside the literal: each entry continues across
# the line break, so any text added there would become string content.
TEST_KO_LINES = ['ENTRY K01559 KO\n', '\
286
PATHWAY ko00351 1,1,1-Trichloro-2,2-bis(4-chlorophenyl)ethane (DDT)\n', '\
288
ko00362 Benzoate degradation via hydroxylation\n', '\
289
ko00632 Benzoate degradation via CoA ligation\n', '\
290
ko00903 Limonene and pinene degradation\n', '\
291
ko00930 Caprolactam degradation\n', '\
292
CLASS Metabolism; Biosynthesis of Secondary Metabolites; Limonene and\n', '\
293
pinene degradation [PATH:ko00903]\n', '\
294
Metabolism; Xenobiotics Biodegradation and Metabolism; Caprolactam\n', '\
295
degradation [PATH:ko00930]\n', '\
296
Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
297
1,1,1-Trichloro-2,2-bis(4-chlorophenyl)ethane (DDT) degradation\n', '\
298
[PATH:ko00351]\n', '\
299
Metabolism; Xenobiotics Biodegradation and Metabolism; Benzoate\n', '\
300
degradation via CoA ligation [PATH:ko00632]\n', '\
301
Metabolism; Xenobiotics Biodegradation and Metabolism; Benzoate\n', '\
302
degradation via hydroxylation [PATH:ko00362]\n', '\
303
DBLINKS RN: R04488 R05100 R05363 R05365 R06371 R07515 R07831\n', '\
304
GENES AFM: AFUA_4G13070\n', '\
305
PHA: PSHAa2393\n', '\
307
BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n', '\
308
MPT: Mpe_A2274\n', '\
309
BBA: Bd0910(catD)\n', '\
310
GBE: GbCGDNIH1_0998 GbCGDNIH1_1171\n', '\
313
HMA: rrnAC1925(mhpC)\n', '\
314
RCI: RCIX1162 RCIX2396\n', '\
316
ENTRY K01560 KO\n', '\
318
DEFINITION 2-haloacid dehalogenase [EC:3.8.1.2]\n', '\
319
PATHWAY ko00361 gamma-Hexachlorocyclohexane degradation\n', '\
320
ko00631 1,2-Dichloroethane degradation\n', '\
321
CLASS Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
322
gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n', '\
323
Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
324
1,2-Dichloroethane degradation [PATH:ko00631]\n', '\
325
DBLINKS RN: R05287\n', '\
328
GENES NCR: NCU03617\n', '\
329
ANI: AN5830.2 AN7918.2\n', '\
330
AFM: AFUA_2G07750 AFUA_5G14640 AFUA_8G05870\n', '\
331
AOR: AO090001000019 AO090003001435 AO090011000921\n', '\
332
PST: PSPTO_0247(dehII)\n', '\
333
PSP: PSPPH_1747(dehII1) PSPPH_5028(dehII2)\n', '\
334
ATU: Atu0797 Atu3405(hadL)\n', '\
335
ATC: AGR_C_1458 AGR_L_2834\n', '\
336
RET: RHE_CH00996(ypch00330) RHE_PF00342(ypf00173)\n', '\
337
MSE: Msed_0732\n', '\
339
ENTRY K01561 KO\n', '\
341
DEFINITION haloacetate dehalogenase [EC:3.8.1.3]\n', '\
342
PATHWAY ko00361 gamma-Hexachlorocyclohexane degradation\n', '\
343
ko00631 1,2-Dichloroethane degradation\n', '\
344
CLASS Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
345
gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n', '\
346
Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
347
1,2-Dichloroethane degradation [PATH:ko00631]\n', '\
348
DBLINKS RN: R05287\n', '\
350
GENES RSO: RSc0256(dehH)\n', '\
351
REH: H16_A0197\n', '\
353
BPM: BURPS1710b_0537(dehH)\n', '\
354
BPD: BURPS668_0347\n', '\
356
MSE: Msed_1088\n', '\
360
if __name__=="__main__":