188
169
get_tmp_filename(prefix = "uclust_test", suffix = ".clstr")
189
170
open(self.search_align_template2_fp,'w').write(search_align_template2)
172
self.ref_dna_seqs_fp = get_tmp_filename(prefix = "uclust_test", suffix = ".fasta")
173
open(self.ref_dna_seqs_fp,'w').write(ref_dna_seqs)
191
175
self.files_to_remove = [self.tmp_unsorted_fasta_filepath,
176
self.tmp_raw_dna_seqs_rc_filepath,
192
177
self.tmp_sorted_fasta_filepath,
193
178
self.tmp_uc_filepath,
194
179
self.tmp_clstr_filepath,
195
180
self.search_align_query1_fp,
196
181
self.search_align_template1_fp,
197
182
self.search_align_query2_fp,
198
self.search_align_template2_fp]
183
self.search_align_template2_fp,
184
self.ref_dna_seqs_fp]
186
self.ref_test_clusters1 = ref_test_clusters1
187
self.ref_test_failures1 = ref_test_failures1
188
self.ref_test_new_seeds1 = ref_test_new_seeds1
189
self.ref_test_clusters2 = ref_test_clusters2
190
self.ref_test_failures2 = ref_test_failures2
191
self.ref_test_new_seeds2 = ref_test_new_seeds2
192
self.uc_dna_clusters = uc_dna_clusters
193
self.uc_lines1 = uc_lines1
194
self.uc_lines_overlapping_lib_input_seq_ids = \
195
uc_lines_overlapping_lib_input_seq_ids
200
197
def tearDown(self):
201
198
remove_files(self.files_to_remove,error_on_missing=False)
235
248
self.assertEqual(uc_file_actual, uc_file_expected)
236
249
app_res.cleanUp()
238
def test_uclust_convert_uc_to_cdhit_from_filepath(self):
239
""" Given a uclust (.uc) file will return converted clstr file """
241
app_res = uclust_convert_uc_to_cdhit_from_filepath(self.tmp_uc_filepath)
244
clstr_file = open(app_res['Output'].name,"U")
246
for line in clstr_file:
247
clstr_res.append(line.replace('\t',''))
249
self.assertEqual(clstr_res, clstr_clusters)
252
def test_parse_uclust_clstr_file(self):
253
""" Ensures that list of lists of OTUs will be returned """
255
clusters_res = parse_uclust_clstr_file(clstr_clusters)
257
self.assertEqual(clusters_res, expected_cluster_list)
259
251
def test_get_output_filepaths(self):
260
252
""" Properly generates output filepath names """
262
fasta_res, uc_res, cd_hit_res, output_dir_res = \
263
255
get_output_filepaths("/tmp/","test_seqs.fasta")
265
self.assertEqual(fasta_res, "/tmp/test_seqs_sorted.fasta")
266
self.assertEqual(uc_res, "/tmp/test_seqs_sorted.uc")
267
self.assertEqual(cd_hit_res, "/tmp/test_seqs_cdhit.clstr")
268
self.assertEqual(output_dir_res, "/tmp")
257
self.assertEqual(uc_res, "/tmp/test_seqs_clusters.uc")
270
fasta_res, uc_res, cd_hit_res, output_dir_res = \
271
get_output_filepaths(".","test_seqs.fasta")
272
self.assertEqual(fasta_res, ".//test_seqs_sorted.fasta")
273
self.assertEqual(uc_res, ".//test_seqs_sorted.uc")
274
self.assertEqual(cd_hit_res, ".//test_seqs_cdhit.clstr")
275
self.assertEqual(output_dir_res, "./")
277
262
def test_get_clusters_from_fasta_filepath(self):
278
263
""" Tests for return of lists of OTUs from given fasta filepath """
281
266
get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath, \
284
self.assertEqual(clusters_res, expected_cluster_list)
267
original_fasta_path = None, percent_ID = 0.90, save_uc_files=False)
268
expected_cluster_list.sort()
269
expected_failure_list.sort()
270
expected_new_seed_list.sort()
271
clusters_res[0].sort()
272
clusters_res[1].sort()
273
clusters_res[2].sort()
274
self.assertEqual(clusters_res,(expected_cluster_list,
275
expected_failure_list,
276
expected_new_seed_list))
278
def test_get_clusters_from_fasta_filepath_reference_db_only(self):
279
""" Correct clusters returned when clustering against a database only
281
clusters_res = get_clusters_from_fasta_filepath(
282
self.tmp_unsorted_fasta_filepath,
283
original_fasta_path = None,
285
max_accepts=7,max_rejects=12,
287
subject_fasta_filepath=self.ref_dna_seqs_fp,
288
suppress_new_clusters=True,
291
self.ref_test_clusters1.sort()
292
self.ref_test_failures1.sort()
293
self.ref_test_new_seeds1.sort()
295
clusters_res[0].sort()
296
clusters_res[1].sort()
297
clusters_res[2].sort()
298
self.assertEqual(clusters_res,(self.ref_test_clusters1,
299
self.ref_test_failures1,
300
self.ref_test_new_seeds1))
302
def test_get_clusters_from_fasta_filepath_extending_reference_db(self):
303
""" Correct clusters when clustering against db and adding new clusters
305
clusters_res = get_clusters_from_fasta_filepath(
306
self.tmp_unsorted_fasta_filepath,
307
original_fasta_path = None,
308
max_accepts=7,max_rejects=12,
310
subject_fasta_filepath=self.ref_dna_seqs_fp,
311
suppress_new_clusters=False,enable_rev_strand_matching=True,
315
self.ref_test_clusters2.sort()
316
self.ref_test_failures2.sort()
317
self.ref_test_new_seeds2.sort()
319
clusters_res[0].sort()
320
clusters_res[1].sort()
321
clusters_res[2].sort()
322
self.assertEqual(clusters_res,(self.ref_test_clusters2,
323
self.ref_test_failures2,
324
self.ref_test_new_seeds2))
327
def test_get_clusters_from_fasta_filepath_optimal(self):
328
""" Test OTUs from filepath functions with optimal
330
# need to compile a small test where optimal has an effect --
331
# this currently is only testing that we don't get a failure with
334
get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
335
original_fasta_path = None, save_uc_files=False,
336
percent_ID = 0.90, optimal = True)
337
expected_cluster_list.sort()
338
expected_failure_list.sort()
339
expected_new_seed_list.sort()
340
clusters_res[0].sort()
341
clusters_res[1].sort()
342
clusters_res[2].sort()
344
self.assertEqual(clusters_res,(expected_cluster_list,
345
expected_failure_list,
346
expected_new_seed_list))
349
def test_get_clusters_from_fasta_filepath_suppress_sort(self):
350
""" Test OTUs from filepath functions with suppress sort
352
expected = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'],
353
['uclust_test_seqs_2'], ['uclust_test_seqs_3'],
354
['uclust_test_seqs_4'], ['uclust_test_seqs_5'],
355
['uclust_test_seqs_6', 'uclust_test_seqs_8'],
356
['uclust_test_seqs_7'], ['uclust_test_seqs_9']]
358
get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
359
original_fasta_path = None,
360
percent_ID = 0.90, suppress_sort = True, save_uc_files=False)
361
expected_cluster_list.sort()
362
expected_failure_list.sort()
363
expected_new_seed_list.sort()
364
clusters_res[0].sort()
365
clusters_res[1].sort()
366
clusters_res[2].sort()
368
self.assertEqual(clusters_res,(expected_cluster_list,
369
expected_failure_list,
370
expected_new_seed_list))
372
def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
373
""" Test OTUs from filepath functions with rev strand match
375
# seq and its rc don't cluster when enable_rev_strand_matching = False
376
expected_cluster_list = [['uclust_test_seqs_0'], ['uclust_test_seqs_0_rc']]
377
expected_failure_list = []
378
expected_new_seed_list = ['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']
380
get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath,
381
original_fasta_path = None, save_uc_files=False,
382
percent_ID = 0.90, enable_rev_strand_matching = False)
384
expected_cluster_list.sort()
385
expected_failure_list.sort()
386
expected_new_seed_list.sort()
387
clusters_res[0].sort()
388
clusters_res[1].sort()
389
clusters_res[2].sort()
390
self.assertEqual(clusters_res,(expected_cluster_list,
391
expected_failure_list,
392
expected_new_seed_list))
394
# seq and its rc cluster when enable_rev_strand_matching = True
395
expected_cluster_list = [['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']]
396
expected_failure_list = []
397
expected_new_seed_list = ['uclust_test_seqs_0']
399
get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath,
400
original_fasta_path = None, save_uc_files=False,
401
percent_ID = 0.90, enable_rev_strand_matching = True)
403
expected_cluster_list.sort()
404
expected_failure_list.sort()
405
expected_new_seed_list.sort()
406
clusters_res[0].sort()
407
clusters_res[1].sort()
408
clusters_res[2].sort()
409
self.assertEqual(clusters_res,(expected_cluster_list,
410
expected_failure_list,
411
expected_new_seed_list))
286
413
def test_process_uclust_pw_alignment_results(self):
287
414
"""parsing of pairwise alignment fasta pairs file functions as expected
351
483
GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
486
ref_dna_seqs = """>ref1 25 random bases appended to uclust_test_seqs_0 and one mismatch
487
ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATATTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCTATAGCAGCCCCAGCGTTTACTTCTA
488
>ref2 15 random bases prepended to uclust_test_seqs_1 and one mismatch
489
GCTGCGGCGTCCTGCGCCACGGTGGGTACAACACGTCCACTACATCTGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
490
>ref3 5 random bases prepended and 10 random bases appended to uclust_test_seqs_2
491
ATAGGCCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACTGCCTGATTCA
492
>ref4 exact match to uclust_test_seqs_3
493
CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
496
ref_test_clusters1 = [['uclust_test_seqs_0'],['uclust_test_seqs_1'],
497
['uclust_test_seqs_2'],['uclust_test_seqs_3']]
498
ref_test_failures1 = ['uclust_test_seqs_4','uclust_test_seqs_5',
499
'uclust_test_seqs_6','uclust_test_seqs_7',
500
'uclust_test_seqs_8','uclust_test_seqs_9']
501
ref_test_new_seeds1 = []
503
ref_test_clusters2 = [['uclust_test_seqs_0'],['uclust_test_seqs_1'],
504
['uclust_test_seqs_2'],['uclust_test_seqs_3'],
505
['uclust_test_seqs_4'],['uclust_test_seqs_5'],
506
['uclust_test_seqs_6','uclust_test_seqs_8'],
507
['uclust_test_seqs_7'],['uclust_test_seqs_9']]
508
ref_test_failures2 = []
509
ref_test_new_seeds2 = ['uclust_test_seqs_4','uclust_test_seqs_5','uclust_test_seqs_6',
510
'uclust_test_seqs_7','uclust_test_seqs_9']
513
raw_dna_seqs_rc = """>uclust_test_seqs_0
514
ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
515
>uclust_test_seqs_0_rc
516
AGCTCTGACACAAAACTGACGTGATGTGCCTTAAGTATCCAACCCGTTGGATGGGACGTCTTGTAGCCACCGT
354
519
sorted_dna_seqs=""">uclust_test_seqs_7
355
520
AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
356
521
>uclust_test_seqs_4
402
567
C 7 1 * * * * * uclust_test_seqs_0 *
403
568
C 8 1 * * * * * uclust_test_seqs_9 *""".split('\n')
406
clstr_clusters=['>Cluster 0\n',
407
'0 80nt, >uclust_test_seqs_7... *\n',
409
'0 79nt, >uclust_test_seqs_4... *\n',
411
'0 78nt, >uclust_test_seqs_2... *\n',
413
'0 77nt, >uclust_test_seqs_3... *\n',
415
'0 76nt, >uclust_test_seqs_1... *\n',
417
'0 75nt, >uclust_test_seqs_5... *\n',
419
'0 74nt, >uclust_test_seqs_6... *\n',
420
'1 72nt, >uclust_test_seqs_8... at +/92%\n',
422
'0 73nt, >uclust_test_seqs_0... *\n',
424
'0 71nt, >uclust_test_seqs_9... *\n']
426
570
expected_cluster_list=[['uclust_test_seqs_7'], ['uclust_test_seqs_4'], ['uclust_test_seqs_2'], ['uclust_test_seqs_3'], ['uclust_test_seqs_1'], ['uclust_test_seqs_5'], ['uclust_test_seqs_6', 'uclust_test_seqs_8'], ['uclust_test_seqs_0'], ['uclust_test_seqs_9']]
571
expected_failure_list = []
572
expected_new_seed_list = ['uclust_test_seqs_7', 'uclust_test_seqs_4', 'uclust_test_seqs_2',
573
'uclust_test_seqs_3', 'uclust_test_seqs_1', 'uclust_test_seqs_5', 'uclust_test_seqs_6',
574
'uclust_test_seqs_0', 'uclust_test_seqs_9']
428
576
search_align_query1 = """>1_like
429
577
TACGGCTACCTTGTTACGACTTCATCCCAATCATTTGTTCCACCTTCGACGGCTA
501
649
('2_like_rc RC','2','-------------------ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG---------------','AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT',100.0)]
651
uc_lines1 = """# uclust --input q.fasta --lib r.fasta --uc results.uc --id 0.90 --libonly --rev
653
# Tab-separated fields:
654
# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
655
# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
656
# For C and D types, PctId is average id with seed.
657
# QueryStart and SeedStart are zero-based relative to start of sequence.
658
# If minus strand, SeedStart is relative to reverse-complemented seed.
659
N * 80 * * * * * s1 some comment *
660
S 4 80 * * * * * s2 some other comment *
661
H 2 78 100.0 + 0 0 5I78M10I s3 yet another comment s2""".split('\n')
663
uc_lines_overlapping_lib_input_seq_ids = """# uclust --maxrejects 32 --input /tmp/OtuPickerbb092OWRWLWqlBR2BmTZ.fasta --id 0.97 --uc /tmp/uclust_clustersLf5Oqv0SvGTZo1mVWBqK.uc --rev --usersort --maxaccepts 8 --lib r.fasta
665
# Tab-separated fields:
666
# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
667
# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
668
# For C and D types, PctId is average id with seed.
669
# QueryStart and SeedStart are zero-based relative to start of sequence.
670
# If minus strand, SeedStart is relative to reverse-complemented seed.
672
H 1 24 100.0 + 0 0 24M 4 3
674
H 0 54 100.0 + 0 0 54M 2 3
675
D 0 2 * * * * 100.0 3 *
676
C 1 2 100.0 * * * * 3 *
504
679
if __name__ == '__main__':