2
"""Parser for 454 Flowgram files in native binary format."""
4
__author__ = 'Kyle Bittinger'
5
__copyright__ = 'Copyright 2010, The Cogent Project'
8
__credits__ = ['Kyle Bittinger']
9
__maintainer__ = 'Kyle Bittinger'
10
__email__ = 'kylebittinger@gmail.com'
11
__status__ = 'Prototype'
13
from cStringIO import StringIO
17
# Sections were inspired by, but not derived from, several other implementations:
18
# * BioPython (biopython.org)
19
# * sff_extract (www.melogen.upv.es/sff_extract)
20
# * Mothur (mothur.org)
23
class NamedStruct(struct.Struct):
24
"""Enhanced Struct class that associates names with each item in the struct.
27
def __init__(self, format, keys):
28
"""Create a new NamedStruct with a list of keys.
31
super(NamedStruct, self).__init__(format)
33
def read_from(self, file):
34
"""Read the struct from a file object and return the values as a dict.
36
buff = file.read(self.size)
37
return self.unpack(buff)
39
def pack(self, dict_of_vals):
40
vals = [dict_of_vals[k] for k in self.keys]
41
return super(NamedStruct, self).pack(*vals)
43
def unpack(self, buffer):
44
vals = super(NamedStruct, self).unpack(buffer)
45
return dict(zip(self.keys, vals))
48
def seek_pad(file, unit=8):
49
"""Set a file's position to the next multiple of a given number.
51
position = file.tell()
58
def write_pad(file, unit=8):
59
"""Write zeros until the file's position is a multiple of the given number.
61
position = file.tell()
64
num_bytes = unit - rem
65
padding_bytes = '\x00' * num_bytes
66
file.write(padding_bytes)
69
common_header_fields = [
77
'number_of_flows_per_read',
78
'flowgram_format_code',
82
common_header_struct = NamedStruct('>IIQIIHHHB', common_header_fields)
85
def parse_common_header(sff_file):
86
"""Parse a Common Header section from a binary SFF file.
88
Keys in the resulting dict are identical to those defined in the
91
As a side effect, sets the position of the file object to the end
92
of the Common Header section.
94
h = common_header_struct.read_from(sff_file)
95
h['flow_chars'] = sff_file.read(h['number_of_flows_per_read'])
96
h['key_sequence'] = sff_file.read(h['key_length'])
101
def write_common_header(sff_file, header):
102
"""Write a common header section to a binary SFF file.
104
header_bytes = common_header_struct.pack(header)
105
sff_file.write(header_bytes)
106
sff_file.write(header['flow_chars'])
107
sff_file.write(header['key_sequence'])
111
common_header_formats = [
112
' Magic Number: 0x%X\n',
114
' Index Offset: %d\n',
115
' Index Length: %d\n',
117
' Header Length: %d\n',
120
' Flowgram Code: %d\n',
124
def format_common_header(header):
125
"""Format a dictionary representation of an SFF common header as text.
128
out.write('Common Header:\n')
129
for key, fmt in zip(common_header_fields, common_header_formats):
132
out.write(' Flow Chars: %s\n' % header['flow_chars'])
133
out.write(' Key Sequence: %s\n' % header['key_sequence'])
134
return out.getvalue()
137
class UnsupportedSffError(Exception):
141
def validate_common_header(header):
142
"""Validate the Common Header section of a binary SFF file.
144
Raises an UnsupportedSffError if the header is not supported.
147
'magic_number': 0x2E736666,
149
'flowgram_format_code': 1,
151
for attr_name, expected_value in supported_values.items():
152
observed_value = header[attr_name]
153
if observed_value != expected_value:
154
raise UnsupportedSffError(
155
'%s not supported. (Expected %s, observed %s)' % (
156
attr_name, expected_value, observed_value))
159
read_header_fields = [
160
'read_header_length',
166
'clip_adapter_right',
170
read_header_struct = NamedStruct('>HHIHHHH', read_header_fields)
173
def parse_read_header(sff_file):
174
"""Parse a Read Header section from a binary SFF file.
176
Keys in the resulting dict are identical to those defined in the
179
As a side effect, sets the position of the file object to the end
180
of the Read Header section.
182
data = read_header_struct.read_from(sff_file)
183
data['Name'] = sff_file.read(data['name_length'])
188
def write_read_header(sff_file, read_header):
189
"""Write a read header section to a binary SFF file.
191
header_bytes = read_header_struct.pack(read_header)
192
sff_file.write(header_bytes)
193
sff_file.write(read_header['Name'])
197
read_header_formats = [
198
' Read Header Len: %d\n',
199
' Name Length: %d\n',
201
' Clip Qual Left: %d\n',
202
' Clip Qual Right: %d\n',
203
' Clip Adap Left: %d\n',
204
' Clip Adap Right: %d\n',
208
def format_read_header(read_header):
209
"""Format a dictionary representation of an SFF read header as text.
212
out.write('\n>%s\n' % read_header['Name'])
213
timestamp, hashchar, region, location = decode_accession(read_header['Name'])
214
out.write(' Run Prefix: R_%d_%02d_%02d_%02d_%02d_%02d_\n' % timestamp)
215
out.write(' Region #: %d\n' % region)
216
out.write(' XY Location: %04d_%04d\n' % location)
218
for key, fmt in zip(read_header_fields, read_header_formats):
219
val = read_header[key]
221
return out.getvalue()
224
def parse_read_data(sff_file, number_of_bases, number_of_flows=400):
225
"""Parse a Read Data section from a binary SFF file.
227
Keys in the resulting dict are identical to those defined in the
230
As a side effect, sets the position of the file object to the end
231
of the Read Header section.
234
flow_fmt = '>' + ('H' * number_of_flows)
235
base_fmt = '>' + ('B' * number_of_bases)
236
flow_fmt_size = struct.calcsize(flow_fmt)
237
base_fmt_size = struct.calcsize(base_fmt)
239
buff = sff_file.read(flow_fmt_size)
240
data['flowgram_values'] = struct.unpack(flow_fmt, buff)
242
buff = sff_file.read(base_fmt_size)
243
data['flow_index_per_base'] = struct.unpack(base_fmt, buff)
245
data['Bases'] = sff_file.read(number_of_bases)
247
buff = sff_file.read(base_fmt_size)
248
data['quality_scores'] = struct.unpack(base_fmt, buff)
254
def write_read_data(sff_file, read_data):
255
"""Write a read data section to a binary SFF file.
257
number_of_flows = len(read_data['flowgram_values'])
258
number_of_bases = len(read_data['quality_scores'])
259
flow_fmt = '>' + ('H' * number_of_flows)
260
base_fmt = '>' + ('B' * number_of_bases)
262
flow_bytes = struct.pack(flow_fmt, *read_data['flowgram_values'])
263
sff_file.write(flow_bytes)
265
index_bytes = struct.pack(base_fmt, *read_data['flow_index_per_base'])
266
sff_file.write(index_bytes)
268
sff_file.write(read_data['Bases'])
270
qual_bytes = struct.pack(base_fmt, *read_data['quality_scores'])
271
sff_file.write(qual_bytes)
276
def format_read_data(read_data, read_header):
277
"""Format a dictionary representation of an SFF read data as text.
279
The read data is expected to be in native flowgram format.
284
out.write('Flowgram:')
285
for x in read_data['flowgram_values']:
286
out.write('\t%01.2f' % (x * 0.01))
289
out.write('Flow Indexes:')
291
for i in read_data['flow_index_per_base']:
292
current_index = current_index + i
293
out.write('\t%d' % current_index)
296
out.write('Bases:\t')
297
# Roche uses 1-based indexing
298
left_idx = read_header['clip_qual_left'] - 1
299
right_idx = read_header['clip_qual_right'] - 1
300
for i, base in enumerate(read_data['Bases']):
301
if (i < left_idx) or (i > right_idx):
302
out.write(base.lower())
304
out.write(base.upper())
307
out.write('Quality Scores:')
308
for score in read_data['quality_scores']:
309
out.write('\t%d' % score)
312
return out.getvalue()
315
def parse_read(sff_file, number_of_flows=400):
316
"""Parse a single read from a binary SFF file.
318
Keys in the resulting dict are identical to those defined in the
319
Roche documentation for the Read Header and Read Data sections.
321
As a side effect, sets the position of the file object to the end
322
of the Read Data section.
324
header_data = parse_read_header(sff_file)
325
read_data = parse_read_data(
326
sff_file, header_data['number_of_bases'], number_of_flows)
327
read_data.update(header_data)
331
def write_read(sff_file, read):
332
"""Write a single read to a binary SFF file.
334
write_read_header(sff_file, read)
335
write_read_data(sff_file, read)
338
def format_read(read):
339
"""Format a dictionary representation of an SFF read as text.
342
out.write(format_read_header(read))
343
out.write(format_read_data(read, read))
344
return out.getvalue()
347
def parse_binary_sff(sff_file, native_flowgram_values=False):
348
"""Parse a binary SFF file, returning the header and a sequence of reads.
350
In the binary file, flowgram values are stored as integers, 100
351
times larger than the normalized floating point value. Because
352
the conversion is relatively expensive, we allow the computation
353
to be skipped if the keyword argument native_flowgram_values is
356
header = parse_common_header(sff_file)
357
number_of_flows = header['number_of_flows_per_read']
358
validate_common_header(header)
360
for i in range(header['number_of_reads']):
362
# Skip the index section
363
if sff_file.tell() == header['index_offset']:
364
sff_file.seek(header['index_length'], 1)
366
read = parse_read(sff_file, number_of_flows)
368
if not native_flowgram_values:
369
read['flowgram_values'] = [x * 0.01 for x in read['flowgram_values']]
372
return header, get_reads()
375
def write_binary_sff(sff_file, header, reads):
376
"""Write a binary SFF file, using provided header and read dicts.
380
write_common_header(sff_file, header)
382
write_read(sff_file, read)
385
def format_binary_sff(sff_file, output_file=None):
386
"""Write a text version of a binary SFF file to an output file.
388
If no output file is provided, an in-memory file-like buffer is
389
used (namely, a StringIO object).
391
if output_file is None:
392
output_file = StringIO()
393
header, reads = parse_binary_sff(sff_file, True)
394
output_file.write(format_common_header(header))
396
output_file.write(format_read(read))
400
def base36_encode(n):
401
"""Convert a positive integer to a base36 string.
403
Following the conventions outlined in the Roche 454 manual, the
404
numbers 0-25 are represented by letters, and the numbers 36-35 are
405
represented by digits.
407
Based on the code example at http://en.wikipedia.org/wiki/Base_36
410
raise ValueError('Only poitive numbers are supported.')
413
n, remainder = divmod(n, 36)
414
chars.append(base36_encode.alphabet[remainder])
415
return ''.join(chars)
418
base36_encode.alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
421
def base36_decode(base36_str):
422
"""Convert a base36 string to a positive integer.
424
Following the conventions outlined in the Roche 454 manual, the
425
numbers 0-25 are represented by letters, and the numbers 36-35 are
426
represented by digits.
428
base36_str = base36_str.translate(base36_decode.translation)
429
return int(base36_str, 36)
432
base36_decode.translation = string.maketrans(
433
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',
434
'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ',
438
def decode_location(location_str):
439
"""Decode a base36-encoded well location, in Roche 454 format.
441
Such timestamps are embedded in the final 5 characters of Roche
442
\"universal\" accession numbers.
444
return divmod(base36_decode(location_str), 4096)
447
def decode_timestamp(timestamp_str):
448
"""Decode a base36-encoded timestamp, in Roche 454 format.
450
Such timestamps are embedded in the first 6 characters of Roche
451
\"universal\" accession numbers and SFF filenames.
453
n = base36_decode(timestamp_str)
454
year, n = divmod(n, 13 * 32 * 24 * 60 * 60)
456
month, n = divmod(n, 32 * 24 * 60 * 60)
457
day, n = divmod(n, 24 * 60 * 60)
458
hour, n = divmod(n, 60 * 60)
459
minute, second = divmod(n, 60)
460
return year, month, day, hour, minute, second
463
def decode_accession(accession):
464
"""Decode a Roche 454 \"universal\" accession number.
466
assert len(accession) == 14
467
timestamp = decode_timestamp(accession[:6])
468
hashchar = accession[6]
469
region = int(accession[7:9])
470
location = decode_location(accession[9:14])
471
return timestamp, hashchar, region, location
474
def decode_sff_filename(sff_filename):
475
"""Decode a Roche 454 SFF filename, returning a timestamp and other info.
477
assert len(sff_filename) == 13
478
assert sff_filename.endswith('.sff')
479
timestamp = decode_timestamp(sff_filename[:6])
480
hashchar = sff_filename[6]
481
region = int(sff_filename[7:9])
482
return timestamp, hashchar, region