6
from unittest import TestCase, main
8
from cogent.parse.binary_sff import (
9
seek_pad, parse_common_header, parse_read_header, parse_read_data,
10
validate_common_header, parse_read, parse_binary_sff, UnsupportedSffError,
11
write_pad, write_common_header, write_read_header, write_read_data,
12
write_read, write_binary_sff, format_common_header, format_read_header,
13
format_read_data, format_binary_sff, base36_encode, base36_decode,
14
decode_location, decode_timestamp, decode_accession, decode_sff_filename,
17
__author__ = "Kyle Bittinger"
18
__copyright__ = "Copyright 2007-2009, The Cogent Project"
19
__credits__ = ["Kyle Bittinger"]
22
__maintainer__ = "Kyle Bittinger"
23
__email__ = "kylebittinger@gmail.com"
24
__status__ = "Production"
27
# Fixture location: the sample SFF file sits in the 'data' directory found
# one level above the directory that contains this module.
_MODULE_PATH = os.path.realpath(__file__)
TEST_DIR = os.path.dirname(os.path.dirname(_MODULE_PATH))
SFF_FP = os.path.join(TEST_DIR, 'data', 'F6AVWTA01.sff')
31
class WritingFunctionTests(TestCase):
    """Round-trip tests for the binary SFF writing functions.

    Each test writes a structure into a fresh temporary file, checks that the
    writer left the file position on an 8-byte boundary, then parses the bytes
    back and compares the result with what was written.
    """

    def setUp(self):
        """Create the scratch file that each test writes into."""
        self.output_file = tempfile.TemporaryFile()

    def tearDown(self):
        """Close the scratch file so its descriptor is not leaked."""
        self.output_file.close()

    def _assert_aligned(self):
        """Fail unless the current file position is a multiple of 8."""
        # assertEqual (rather than assertTrue on a comparison) reports the
        # actual remainder when the check fails.
        self.assertEqual(self.output_file.tell() % 8, 0)

    def test_write_pad(self):
        """Four written bytes are followed by four NUL bytes of padding."""
        self.output_file.write('\x01\x02\x03\x04')
        write_pad(self.output_file)
        self.output_file.seek(0)
        buff = self.output_file.read()
        self.assertEqual(buff, '\x01\x02\x03\x04\x00\x00\x00\x00')

    def test_write_common_header(self):
        """A written common header parses back to the same dictionary."""
        write_common_header(self.output_file, COMMON_HEADER)
        self._assert_aligned()

        self.output_file.seek(0)
        observed = parse_common_header(self.output_file)
        self.assertEqual(observed, COMMON_HEADER)
        self._assert_aligned()

    def test_write_read_header(self):
        """A written read header parses back to the same dictionary."""
        write_read_header(self.output_file, READ_HEADER)
        self._assert_aligned()

        self.output_file.seek(0)
        observed = parse_read_header(self.output_file)
        self.assertEqual(observed, READ_HEADER)
        self._assert_aligned()

    def test_write_read_data(self):
        """A written read data section parses back to the same dictionary."""
        write_read_data(self.output_file, READ_DATA)
        self._assert_aligned()

        self.output_file.seek(0)
        num_flows = len(READ_DATA['flowgram_values'])
        num_bases = len(READ_DATA['Bases'])
        observed = parse_read_data(self.output_file, num_bases, num_flows)
        self.assertEqual(observed, READ_DATA)
        self._assert_aligned()

    def test_write_read(self):
        """A written read (header plus data) parses back unchanged."""
        read = READ_HEADER.copy()
        read.update(READ_DATA)
        write_read(self.output_file, read)
        self._assert_aligned()

        self.output_file.seek(0)
        # Pass the flow count derived from the fixture instead of relying on
        # whatever parse_read assumes when the argument is omitted.
        num_flows = len(read['flowgram_values'])
        observed = parse_read(self.output_file, num_flows)
        self.assertEqual(observed, read)
        self._assert_aligned()

    def test_write_binary_sff(self):
        """A one-read SFF file parses back to the same header and read."""
        read = READ_HEADER.copy()
        read.update(READ_DATA)

        # The shared header fixture describes more reads than are written
        # here, so work on a copy that advertises exactly one.
        header = COMMON_HEADER.copy()
        header['number_of_reads'] = 1

        write_binary_sff(self.output_file, header, [read])
        self._assert_aligned()

        self.output_file.seek(0)
        observed_header, observed_reads = parse_binary_sff(
            self.output_file, native_flowgram_values=True)
        observed_reads = list(observed_reads)
        self.assertEqual(observed_header, header)
        # Check the count before indexing so that an empty result fails with
        # a readable assertion rather than an IndexError.
        self.assertEqual(len(observed_reads), 1)
        self.assertEqual(observed_reads[0], read)
        self._assert_aligned()
123
class ParsingFunctionTests(TestCase):
125
def setUp(self):
    """Open the sample SFF file used by every parsing test."""
    # The fixture is a binary file: text mode can translate or truncate its
    # bytes on some platforms, so open it explicitly as binary.
    self.sff_file = open(SFF_FP, 'rb')

def tearDown(self):
    """Close the sample SFF file opened in setUp."""
    self.sff_file.close()
127
def test_seek_pad(self):
131
self.assertEqual(f.tell(), 8)
134
self.assertEqual(f.tell(), 16)
137
self.assertEqual(f.tell(), 16)
140
self.assertEqual(f.tell(), 16)
143
self.assertEqual(f.tell(), 16)
146
self.assertEqual(f.tell(), 24)
148
def test_parse_common_header(self):
    """The common header at the start of the sample file matches the fixture."""
    self.assertEqual(parse_common_header(self.sff_file), COMMON_HEADER)
152
def test_validate_common_header(self):
154
'magic_number': 779314790,
156
'flowgram_format_code': 1,
159
'number_of_reads': 0,
162
'number_of_flows_per_read': 0,
166
self.assertEqual(validate_common_header(header), None)
167
header['version'] = 2
168
self.assertRaises(UnsupportedSffError, validate_common_header, header)
170
def test_parse_read_header(self):
    """The read header found at byte offset 440 matches the fixture."""
    first_read_offset = 440
    self.sff_file.seek(first_read_offset)
    parsed_header = parse_read_header(self.sff_file)
    self.assertEqual(parsed_header, READ_HEADER)
175
def test_parse_read_data(self):
    """The read data found at byte offset 440 + 32 matches the fixture."""
    data_offset = 440 + 32
    self.sff_file.seek(data_offset)
    # Positional arguments are (number of bases, number of flows), in the
    # same order used by the writing tests.
    parsed_data = parse_read_data(self.sff_file, 271, 400)
    self.assertEqual(parsed_data, READ_DATA)
180
def test_parse_read(self):
    """A whole read parsed from offset 440 equals header plus data fixtures."""
    self.sff_file.seek(440)
    observed = parse_read(self.sff_file, 400)
    # Merge with copy/update rather than concatenating items(): adding the
    # results of items() only works where items() returns a list.
    expected = READ_HEADER.copy()
    expected.update(READ_DATA)
    self.assertEqual(observed, expected)
186
def test_parse_sff(self):
187
header, reads = parse_binary_sff(self.sff_file)
188
self.assertEqual(header, COMMON_HEADER)
192
len(read['flowgram_values']), header['number_of_flows_per_read'])
194
self.assertEqual(counter, 20)
197
class FormattingFunctionTests(TestCase):
    """Tests for the functions that render SFF structures as text."""

    def setUp(self):
        """Create a scratch file for the tests in this class."""
        self.output_file = tempfile.TemporaryFile()

    def tearDown(self):
        """Close the scratch file so its descriptor is not leaked."""
        self.output_file.close()

    def test_format_common_header(self):
        """The common header fixture formats to the expected text."""
        self.assertEqual(
            format_common_header(COMMON_HEADER), COMMON_HEADER_TXT)

    def test_format_read_header(self):
        """The read header fixture formats to the expected text."""
        self.assertEqual(
            format_read_header(READ_HEADER), READ_HEADER_TXT)

    # This method used to share the name test_format_read_header with the one
    # above, which silently replaced it in the class so it was never run.
    def test_format_read_data(self):
        """The read data fixture formats to the expected text."""
        self.assertEqual(
            format_read_data(READ_DATA, READ_HEADER), READ_DATA_TXT)

    def test_format_binary_sff(self):
        """Formatting the sample file starts with the first read's text."""
        # Binary mode, and an explicit close, because the fixture is a binary
        # file and the handle would otherwise be left to garbage collection.
        sff_file = open(SFF_FP, 'rb')
        try:
            output_buffer = format_binary_sff(sff_file)
            output_buffer.seek(0)
            expected = COMMON_HEADER_TXT + READ_HEADER_TXT + READ_DATA_TXT
            # Only the leading part of the output is compared: the fixtures
            # describe the common header and a single read.
            observed = output_buffer.read(len(expected))
        finally:
            sff_file.close()
        self.assertEqual(observed, expected)
221
class Base36Tests(TestCase):
    """Tests for the base-36 codec and the accession/filename decoders."""

    def test_base36_encode(self):
        """Integers encode to the expected base-36 strings."""
        for number, expected in ((2, 'C'), (37, 'BB')):
            self.assertEqual(base36_encode(number), expected)

    def test_base36_decode(self):
        """Base-36 strings decode to the expected integers."""
        for text, expected in (('C', 2), ('BB', 37)):
            self.assertEqual(base36_decode(text), expected)

    def test_decode_location(self):
        """A location code decodes to a pair of integers."""
        self.assertEqual(decode_location('C'), (0, 2))

    def test_decode_timestamp(self):
        """Timestamp codes decode to six-field date/time tuples."""
        cases = (
            ('C3U5GW', (2004, 9, 22, 16, 59, 10)),
            ('GA202I', (2010, 1, 22, 13, 28, 56)),
        )
        for code, expected in cases:
            self.assertEqual(decode_timestamp(code), expected)

    def test_decode_accession(self):
        """An accession decodes to (timestamp, hash char, region, location)."""
        self.assertEqual(
            decode_accession('GA202I001ER3QL'),
            ((2010, 1, 22, 13, 28, 56), '0', 1, (1843, 859)))

    def test_decode_sff_filename(self):
        """An SFF file name decodes to (timestamp, hash char, region)."""
        self.assertEqual(
            decode_sff_filename('F6AVWTA01.sff'),
            ((2009, 11, 25, 14, 30, 19), 'A', 1))
249
'header_length': 440,
250
'flowgram_format_code': 1,
252
'magic_number': 779314790,
253
'number_of_flows_per_read': 400,
255
'flow_chars': 100 * 'TACG',
257
'key_sequence': 'TCAG',
258
'number_of_reads': 20,
259
'index_offset': 33464,
262
COMMON_HEADER_TXT = """\
264
Magic Number: 0x2E736666
273
Flow Chars: TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
279
'Name': 'GA202I001ER3QL',
280
'clip_adapter_left': 0,
281
'read_header_length': 32,
282
'clip_adapter_right': 0,
283
'number_of_bases': 271,
285
'clip_qual_right': 271,
288
READ_HEADER_TXT = """
290
Run Prefix: R_2010_01_22_13_28_56_
292
XY Location: 1843_0859
304
'flow_index_per_base': (
305
1, 2, 3, 2, 3, 3, 2, 1, 1, 2, 1, 2, 0, 2, 3, 3, 2, 3, 3, 0, 2, 0, 2, 0,
306
1, 1, 1, 2, 0, 2, 2, 1, 0, 0, 3, 0, 2, 1, 0, 1, 1, 3, 1, 2, 2, 2, 3, 2,
307
1, 0, 2, 0, 3, 0, 3, 3, 1, 3, 0, 0, 0, 0, 2, 1, 0, 2, 0, 2, 0, 2, 2, 2,
308
2, 3, 2, 2, 0, 1, 0, 0, 0, 2, 1, 3, 2, 0, 3, 3, 2, 1, 2, 0, 2, 2, 1, 2,
309
1, 2, 0, 1, 3, 0, 0, 3, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 3, 0, 2, 1, 1, 2,
310
1, 3, 2, 2, 1, 0, 3, 3, 0, 2, 0, 1, 1, 3, 3, 3, 2, 0, 0, 0, 3, 3, 2, 1,
311
1, 2, 2, 1, 1, 0, 1, 0, 2, 0, 3, 1, 1, 0, 2, 0, 0, 1, 0, 3, 2, 3, 3, 3,
312
1, 3, 2, 0, 1, 3, 3, 3, 1, 3, 2, 0, 1, 2, 2, 3, 3, 3, 2, 3, 3, 3, 0, 3,
313
3, 2, 2, 0, 3, 1, 1, 3, 0, 1, 0, 3, 2, 2, 0, 2, 0, 2, 0, 0, 2, 3, 2, 2,
314
0, 2, 0, 3, 2, 3, 1, 2, 0, 3, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 0, 3, 3, 2,
315
0, 1, 0, 3, 0, 2, 3, 1, 1, 1, 1, 3, 1, 0, 1, 1, 2, 2, 3, 1, 0, 0, 1, 1,
316
3, 3, 1, 3, 0, 1, 0),
318
101, 0, 98, 3, 0, 104, 2, 95, 1, 0, 97, 3, 0, 110, 2, 102, 102, 110, 2,
319
99, 101, 0, 195, 5, 102, 0, 5, 96, 7, 0, 95, 7, 101, 0, 8, 98, 9, 0,
320
190, 9, 201, 0, 194, 101, 107, 104, 12, 198, 13, 104, 2, 105, 295, 7,
321
4, 197, 10, 101, 195, 98, 101, 3, 10, 100, 102, 0, 100, 7, 101, 0, 96,
322
8, 11, 102, 12, 102, 203, 9, 196, 8, 13, 206, 13, 6, 103, 10, 4, 103,
323
102, 3, 7, 479, 9, 102, 202, 10, 198, 6, 195, 9, 102, 0, 100, 5, 100,
324
2, 103, 8, 8, 100, 6, 102, 7, 200, 388, 10, 97, 100, 8, 5, 100, 12, 197,
325
7, 13, 103, 8, 7, 104, 10, 101, 104, 12, 201, 12, 99, 8, 99, 106, 13,
326
103, 102, 8, 202, 108, 9, 13, 293, 7, 4, 203, 103, 202, 107, 376, 103,
327
8, 11, 188, 8, 99, 101, 104, 8, 92, 101, 12, 4, 92, 11, 101, 7, 96, 202,
328
8, 12, 93, 11, 11, 202, 7, 195, 101, 102, 6, 0, 101, 7, 7, 106, 2, 6,
329
107, 4, 404, 12, 6, 104, 8, 10, 98, 2, 105, 110, 100, 8, 95, 3, 105,
330
102, 208, 201, 13, 195, 14, 0, 99, 86, 202, 9, 301, 206, 8, 8, 85, 6,
331
101, 6, 9, 103, 8, 9, 96, 4, 7, 102, 111, 0, 8, 93, 7, 194, 111, 5, 10,
332
95, 5, 10, 104, 2, 6, 98, 103, 0, 11, 99, 15, 192, 110, 5, 98, 8, 91, 8,
333
10, 92, 5, 10, 102, 8, 7, 105, 15, 102, 7, 9, 100, 2, 3, 102, 6, 9, 203,
334
6, 14, 107, 12, 8, 107, 1, 103, 13, 202, 2, 6, 108, 103, 99, 11, 2, 201,
335
207, 14, 8, 94, 4, 95, 9, 195, 13, 193, 9, 306, 13, 100, 11, 6, 75, 13,
336
91, 12, 205, 7, 203, 10, 3, 107, 17, 111, 12, 4, 105, 106, 7, 208, 5, 9,
337
202, 8, 108, 6, 84, 16, 103, 108, 92, 16, 93, 8, 95, 94, 207, 17, 10,
338
103, 3, 0, 104, 0, 202, 217, 16, 12, 197, 4, 90, 15, 17, 108, 98, 125,
339
104, 88, 14, 15, 99, 187, 106, 109, 12, 100, 11, 81, 8, 11, 92, 304,
340
112, 107, 2, 11, 94, 7, 6, 86, 97, 19, 3, 225, 206),
342
'TCAGCAGTAGTCCTGCTGCCTTCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCT'
343
'CTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCCCGCCTACTATCTAATGGAACGCATCCCC'
344
'ATCGTCTACCGGAATACCTTTAATCATGTGAACATGTGAACTCATGATGCCATCTTGTATTAATCTTCCT'
345
'TTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGG'),
347
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
348
37, 37, 37, 37, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
349
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 37, 37, 37, 37, 37,
350
37, 37, 37, 34, 34, 34, 34, 34, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
351
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
352
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
353
38, 32, 32, 32, 32, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
354
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
355
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
356
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
357
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
358
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
359
37, 37, 37, 37, 38, 38, 38, 38, 40, 40, 40, 38, 38, 38, 38, 38, 38, 38,
360
40, 38, 38, 38, 38, 38, 38, 37, 38, 38, 36, 37, 37, 36, 33, 28, 28, 31,
361
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 30, 30, 25, 25, 25,
366
Flowgram: 1.01 0.00 0.98 0.03 0.00 1.04 0.02 0.95 0.01 0.00 0.97 0.03 0.00 1.10 0.02 1.02 1.02 1.10 0.02 0.99 1.01 0.00 1.95 0.05 1.02 0.00 0.05 0.96 0.07 0.00 0.95 0.07 1.01 0.00 0.08 0.98 0.09 0.00 1.90 0.09 2.01 0.00 1.94 1.01 1.07 1.04 0.12 1.98 0.13 1.04 0.02 1.05 2.95 0.07 0.04 1.97 0.10 1.01 1.95 0.98 1.01 0.03 0.10 1.00 1.02 0.00 1.00 0.07 1.01 0.00 0.96 0.08 0.11 1.02 0.12 1.02 2.03 0.09 1.96 0.08 0.13 2.06 0.13 0.06 1.03 0.10 0.04 1.03 1.02 0.03 0.07 4.79 0.09 1.02 2.02 0.10 1.98 0.06 1.95 0.09 1.02 0.00 1.00 0.05 1.00 0.02 1.03 0.08 0.08 1.00 0.06 1.02 0.07 2.00 3.88 0.10 0.97 1.00 0.08 0.05 1.00 0.12 1.97 0.07 0.13 1.03 0.08 0.07 1.04 0.10 1.01 1.04 0.12 2.01 0.12 0.99 0.08 0.99 1.06 0.13 1.03 1.02 0.08 2.02 1.08 0.09 0.13 2.93 0.07 0.04 2.03 1.03 2.02 1.07 3.76 1.03 0.08 0.11 1.88 0.08 0.99 1.01 1.04 0.08 0.92 1.01 0.12 0.04 0.92 0.11 1.01 0.07 0.96 2.02 0.08 0.12 0.93 0.11 0.11 2.02 0.07 1.95 1.01 1.02 0.06 0.00 1.01 0.07 0.07 1.06 0.02 0.06 1.07 0.04 4.04 0.12 0.06 1.04 0.08 0.10 0.98 0.02 1.05 1.10 1.00 0.08 0.95 0.03 1.05 1.02 2.08 2.01 0.13 1.95 0.14 0.00 0.99 0.86 2.02 0.09 3.01 2.06 0.08 0.08 0.85 0.06 1.01 0.06 0.09 1.03 0.08 0.09 0.96 0.04 0.07 1.02 1.11 0.00 0.08 0.93 0.07 1.94 1.11 0.05 0.10 0.95 0.05 0.10 1.04 0.02 0.06 0.98 1.03 0.00 0.11 0.99 0.15 1.92 1.10 0.05 0.98 0.08 0.91 0.08 0.10 0.92 0.05 0.10 1.02 0.08 0.07 1.05 0.15 1.02 0.07 0.09 1.00 0.02 0.03 1.02 0.06 0.09 2.03 0.06 0.14 1.07 0.12 0.08 1.07 0.01 1.03 0.13 2.02 0.02 0.06 1.08 1.03 0.99 0.11 0.02 2.01 2.07 0.14 0.08 0.94 0.04 0.95 0.09 1.95 0.13 1.93 0.09 3.06 0.13 1.00 0.11 0.06 0.75 0.13 0.91 0.12 2.05 0.07 2.03 0.10 0.03 1.07 0.17 1.11 0.12 0.04 1.05 1.06 0.07 2.08 0.05 0.09 2.02 0.08 1.08 0.06 0.84 0.16 1.03 1.08 0.92 0.16 0.93 0.08 0.95 0.94 2.07 0.17 0.10 1.03 0.03 0.00 1.04 0.00 2.02 2.17 0.16 0.12 1.97 0.04 0.90 0.15 0.17 1.08 0.98 1.25 1.04 0.88 0.14 0.15 0.99 1.87 1.06 1.09 0.12 1.00 0.11 0.81 0.08 0.11 0.92 3.04 1.12 1.07 0.02 0.11 0.94 0.07 0.06 0.86 0.97 0.19 0.03 
2.25 2.06
367
Flow Indexes: 1 3 6 8 11 14 16 17 18 20 21 23 23 25 28 31 33 36 39 39 41 41 43 43 44 45 46 48 48 50 52 53 53 53 56 56 58 59 59 60 61 64 65 67 69 71 74 76 77 77 79 79 82 82 85 88 89 92 92 92 92 92 94 95 95 97 97 99 99 101 103 105 107 110 112 114 114 115 115 115 115 117 118 121 123 123 126 129 131 132 134 134 136 138 139 141 142 144 144 145 148 148 148 151 151 152 153 153 154 155 155 155 155 156 159 159 161 162 163 165 166 169 171 173 174 174 177 180 180 182 182 183 184 187 190 193 195 195 195 195 198 201 203 204 205 207 209 210 211 211 212 212 214 214 217 218 219 219 221 221 221 222 222 225 227 230 233 236 237 240 242 242 243 246 249 252 253 256 258 258 259 261 263 266 269 272 274 277 280 283 283 286 289 291 293 293 296 297 298 301 301 302 302 305 307 309 309 311 311 313 313 313 315 318 320 322 322 324 324 327 329 332 333 335 335 338 338 340 342 344 345 346 348 350 351 352 352 355 358 360 360 361 361 364 364 366 369 370 371 372 373 376 377 377 378 379 381 383 386 387 387 387 388 389 392 395 396 399 399 400 400
368
Bases: tcagCAGTAGTCCTGCTGCCTTCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCCCGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGTGAACTCATGATGCCATCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGG
369
Quality Scores: 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 37 37 37 37 37 37 37 37 34 34 34 34 34 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 32 32 32 32 38 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 38 38 40 40 40 38 38 38 38 38 38 38 40 38 38 38 38 38 38 37 38 38 36 37 37 36 33 28 28 31 31 31 31 31 31 31 31 31 31 31 32 32 31 30 30 25 25 25 25
372
if __name__ == '__main__':