12
12
from Bio.Application import generic_run
13
13
from Bio.Emboss.Applications import WaterCommandline, NeedleCommandline
14
from Bio.Emboss.Applications import SeqretCommandline
14
15
from Bio import SeqIO
15
16
from Bio import AlignIO
16
17
from Bio import MissingExternalDependencyError
17
18
from Bio.Alphabet import generic_protein, generic_dna, generic_nucleotide
18
19
from Bio.Seq import Seq, translate
19
20
from Bio.SeqRecord import SeqRecord
21
#from Bio.Data.IUPACData import ambiguous_dna_letters
21
23
#################################################################
53
55
#Top level function as this makes it easier to use for debugging:
54
56
def emboss_convert(filename, old_format, new_format):
55
57
"""Run seqret, returns handle."""
56
#TODO - Support seqret in Bio.Emboss.Applications
57
#(ideally with the -auto and -filter arguments)
58
58
#Setup, this assumes for all the format names used
59
59
#Biopython and EMBOSS names are consistent!
60
cline = exes["seqret"]
61
cline += " -sequence " + filename
62
cline += " -sformat " + old_format
63
cline += " -osformat " + new_format
64
cline += " -auto" #no prompting
65
cline += " -filter" #use stdout
60
cline = SeqretCommandline(exes["seqret"],
63
osformat = new_format,
64
auto = True, #no prompting
67
67
child = subprocess.Popen(str(cline),
68
68
stdin=subprocess.PIPE,
73
73
return child.stdout
75
75
#Top level function as this makes it easier to use for debugging:
76
def emboss_piped_SeqIO_convert(records, old_format, new_format):
77
"""Run seqret, returns records (as a generator)."""
78
#Setup, this assumes for all the format names used
79
#Biopython and EMBOSS names are consistent!
80
cline = SeqretCommandline(exes["seqret"],
82
osformat = new_format,
83
auto = True, #no prompting
86
child = subprocess.Popen(str(cline),
87
stdin=subprocess.PIPE,
88
stdout=subprocess.PIPE,
89
stderr=subprocess.PIPE,
90
shell=(sys.platform!="win32"))
91
SeqIO.write(records, child.stdin, old_format)
93
return SeqIO.parse(child.stdout, new_format)
95
#Top level function as this makes it easier to use for debugging:
96
def emboss_piped_AlignIO_convert(alignments, old_format, new_format):
97
"""Run seqret, returns alignments (as a generator)."""
98
#Setup, this assumes for all the format names used
99
#Biopython and EMBOSS names are consistent!
100
cline = SeqretCommandline(exes["seqret"],
101
sformat = old_format,
102
osformat = new_format,
103
auto = True, #no prompting
106
child = subprocess.Popen(str(cline),
107
stdin=subprocess.PIPE,
108
stdout=subprocess.PIPE,
109
stderr=subprocess.PIPE,
110
shell=(sys.platform!="win32"))
111
AlignIO.write(alignments, child.stdin, old_format)
113
return AlignIO.parse(child.stdout, new_format)
116
#Top level function as this makes it easier to use for debugging:
76
117
def compare_records(old_list, new_list) :
77
118
"""Check two lists of SeqRecords agree, raises a ValueError if mismatch."""
78
119
if len(old_list) != len(new_list) :
127
168
for temp_format in ["genbank","fasta"] :
128
169
if temp_format in skip_formats :
130
#TODO - Handle this with a pipe?
131
#i.e. can Bio.SeqIO write to the stdin of seqret?
132
filename = "Emboss/temp_%s.txt" % temp_format
133
temp_handle = open(filename,"w")
134
SeqIO.write(records, temp_handle, temp_format)
138
handle = emboss_convert(filename, temp_format, "fasta")
139
new_records = list(SeqIO.parse(handle, "fasta"))
171
new_records = list(emboss_piped_SeqIO_convert(records, temp_format, "fasta"))
142
173
self.assert_(compare_records(records, new_records))
143
174
except ValueError, err :
144
175
raise ValueError("Disagree on file %s %s in %s format: %s" \
145
176
% (in_format, in_filename, temp_format, err))
148
178
def check_EMBOSS_to_SeqIO(self, filename, old_format,
149
179
skip_formats=[]) :
174
204
"""SeqIO & EMBOSS reading each other's conversions of a GenBank file."""
175
205
self.check_SeqIO_with_EMBOSS("GenBank/cor6_6.gb", "genbank")
207
def test_genbank2(self) :
208
"""SeqIO & EMBOSS reading each other's conversions of another GenBank file."""
209
self.check_SeqIO_with_EMBOSS("GenBank/NC_000932.gb", "genbank")
177
211
def test_embl(self) :
178
212
"""SeqIO & EMBOSS reading each other's conversions of an EMBL file."""
179
213
self.check_SeqIO_with_EMBOSS("EMBL/U87107.embl", "embl")
242
276
old_aligns = list(AlignIO.parse(open(in_filename), in_format))
244
formats = ["clustal", "phylip", "ig"]
278
formats = ["clustal", "phylip"]
245
279
if len(old_aligns) == 1 :
246
280
formats.extend(["fasta","nexus"])
247
281
for temp_format in formats :
248
282
if temp_format in skip_formats :
250
#TODO - Handle this with a pipe?
251
#i.e. can Bio.SeqIO write to the stdin of seqret?
252
filename = "Emboss/temp_%s.txt" % temp_format
253
temp_handle = open(filename,"w")
255
AlignIO.write(old_aligns, temp_handle, temp_format)
257
#e.g. NEXUS file without knowing alphabet
258
#This should be tested by test_AlignIO
265
284
#PHYLIP is a simple format which explicitly supports
266
285
#multiple alignments (unlike FASTA).
267
handle = emboss_convert(filename, temp_format, "phylip")
268
new_aligns = list(AlignIO.parse(handle, "phylip"))
287
new_aligns = list(emboss_piped_AlignIO_convert(old_aligns,
290
except ValueError, e :
291
#e.g. ValueError: Need a DNA, RNA or Protein alphabet
292
#from writing Nexus files...
271
295
self.assert_(compare_alignments(old_aligns, new_aligns))
272
296
except ValueError, err :
273
297
raise ValueError("Disagree on file %s %s in %s format: %s" \
274
298
% (in_format, in_filename, temp_format, err))
277
300
def check_AlignIO_with_EMBOSS(self, filename, old_format, skip_formats=[],
345
368
cline.set_parameter("asequence", "asis:ACCCGGGCGCGGT")
346
369
cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT")
347
370
#Try using a property set here:
348
cline.outfile = "Emboss/temp_test.water"
371
cline.outfile = "Emboss/temp with space.water"
349
372
self.assertEqual(str(eval(repr(cline))), str(cline))
351
374
result, out, err = generic_run(cline)
356
379
if result.return_code != 0 : print >> sys.stderr, "\n%s"%cline
357
380
self.assertEqual(result.return_code, 0)
358
381
filename = result.get_result("outfile")
359
self.assertEqual(filename, "Emboss/temp_test.water")
382
self.assertEqual(filename, "Emboss/temp with space.water")
360
383
assert os.path.isfile(filename)
361
384
#Check we can parse the output...
362
385
align = AlignIO.read(open(filename),"emboss")
405
428
cline.set_parameter("-gapextend", "0.5")
406
429
#EMBOSS would guess this, but let's be explicit:
407
430
cline.set_parameter("-snucleotide", "True")
408
cline.set_parameter("-outfile", "Emboss/temp_test.needle")
431
cline.set_parameter("-outfile", "Emboss/temp with space.needle")
409
432
self.assertEqual(str(eval(repr(cline))), str(cline))
411
434
result, out, err = generic_run(cline)
416
439
if result.return_code != 0 : print >> sys.stderr, "\n%s"%cline
417
440
self.assertEqual(result.return_code, 0)
418
441
filename = result.get_result("outfile")
419
self.assertEqual(filename, "Emboss/temp_test.needle")
442
self.assertEqual(filename, "Emboss/temp with space.needle")
420
443
assert os.path.isfile(filename)
421
444
#Check we can parse the output...
422
445
align = AlignIO.read(open(filename),"emboss")
678
701
#Top level function as this makes it easier to use for debugging:
679
702
def check_translation(sequence, translation, table=None) :
680
703
if table is None :
682
if translation != str(sequence.translate()) \
683
or translation != str(translate(sequence)) \
684
or translation != translate(str(sequence)) :
685
raise ValueError("%s -> %s" % (sequence, translation))
687
if translation != str(sequence.translate(table)) \
688
or translation != str(translate(sequence,table)) \
689
or translation != translate(str(sequence),table) :
690
raise ValueError("%s -> %s (table %s)" \
691
% (sequence, translation, table))
707
if translation != str(sequence.translate(t)) \
708
or translation != str(translate(sequence,t)) \
709
or translation != translate(str(sequence),t) :
711
for i, amino in enumerate(translation) :
712
codon = sequence[i*3:i*3+3]
713
if amino != str(codon.translate(t)) :
714
raise ValueError("%s -> %s not %s (table %s)" \
715
% (codon, amino, codon.translate(t), t))
716
#Shouldn't reach this line:
717
raise ValueError("%s -> %s (table %s)" \
718
% (sequence, translation, t))
694
721
class TranslationTests(unittest.TestCase):
752
779
generic_nucleotide)
753
780
self.check(sequence)
782
#def test_all_ambig_dna_codons(self) :
783
# """transeq vs Bio.Seq on ambiguous DNA codons (inc. alt tables)."""
784
# self.translate_all_codons(ambiguous_dna_letters)
755
786
def test_all_unambig_dna_codons(self) :
756
787
"""transeq vs Bio.Seq on unambiguous DNA codons (inc. alt tables)."""
757
788
self.translate_all_codons("ATCGatcg")