~ubuntu-branches/ubuntu/natty/python-cogent/natty

« back to all changes in this revision

Viewing changes to tests/test_app/test_uclust.py

  • Committer: Bazaar Package Importer
  • Author(s): Steffen Moeller
  • Date: 2010-12-04 22:30:35 UTC
  • mfrom: (1.1.1 upstream)
  • Revision ID: james.westby@ubuntu.com-20101204223035-j11kinhcrrdgg2p2
Tags: 1.5-1
* Bumped standard to 3.9.1, no changes required.
* New upstream version.
  - major additions to Cookbook
  - added AlleleFreqs attribute to ensembl Variation objects.
  - added getGeneByStableId method to genome objects.
  - added Introns attribute to Transcript objects and an Intron class.
  - added Mann-Whitney test and a Monte-Carlo version
  - exploratory and confirmatory period estimation techniques (suitable for
    symbolic and continuous data)
  - Information theoretic measures (AIC and BIC) added
  - drawing of trees with collapsed nodes
  - progress display indicator support for terminal and GUI apps
  - added parser for Illumina HiSeq2000 and GAIIx sequence files as 
    cogent.parse.illumina_sequence.MinimalIlluminaSequenceParser.
  - added parser for FASTQ files, one of the output options for Illumina's
    workflow; also added cookbook demo.
  - added functionality for parsing of SFF files without the Roche tools in
    cogent.parse.binary_sff
  - thousand fold performance improvement to nmds
  - >10-fold performance improvements to some Table operations

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
1
#!/usr/bin/env python
2
2
 
3
3
"""
4
 
test_uclust.py : provides unit tests for the uclust.py module
 
4
 : provides unit tests for the uclust.py module
5
5
 
6
6
Modified from Daniel McDonald's test_cd_hit.py code on Feb-4-2010 """
7
7
 
15
15
from cogent.app.uclust import (Uclust, 
16
16
 uclust_fasta_sort_from_filepath,
17
17
 uclust_cluster_from_sorted_fasta_filepath,
18
 
 uclust_convert_uc_to_cdhit_from_filepath,
19
 
 parse_uclust_clstr_file, get_output_filepaths,
 
18
 get_output_filepaths,clusters_from_uc_file,
20
19
 get_clusters_from_fasta_filepath,
21
20
 uclust_search_and_align_from_fasta_filepath,
22
 
 process_uclust_pw_alignment_results)
 
21
 process_uclust_pw_alignment_results, UclustParseError)
23
22
 
24
23
__author__ = "William Walters"
25
24
__copyright__ = "Copyright 2007-2009, The Cogent Project"
26
25
__credits__ = ["Daniel McDonald","William Walters","Greg Caporaso"]
27
26
__license__ = "GPL"
28
 
__version__ = "1.4.1"
 
27
__version__ = "1.5.0"
29
28
__maintainer__ = "William Walters"
30
29
__email__ = "William.A.Walters@colorado.edu"
31
30
__status__ = "Development"
118
117
        self.assertEqual(uc_file_actual, uc_file_expected)
119
118
    
120
119
        test_app_res.cleanUp()
121
 
 
122
 
    def test_convert_to_cdhit_from_uc_filepath(self):
123
 
        """ Should convert given uclust (.uc) file to cdhit (.clstr) format 
124
 
        
125
 
        Since a .uc file has to be passed to the app controller for uclust,
126
 
        a temporary .uc file is created, and the clusters supplied
127
 
        in this module are written to it.  This file is sent to the app 
128
 
        controller, and the resulting .clstr file is compared to the expected
129
 
        results to ensure proper function of uclust as called by this app
130
 
        controller."""
131
 
        test_app = Uclust()
132
 
        
133
 
        test_app_res = test_app(data = \
134
 
           {'--uc2clstr':self.tmp_uc_filepath,'--output':self.tmp_clstr_filepath})
135
 
 
136
 
        clstr_file = open(test_app_res['Output'].name,"U")
137
 
        clstr_res = []
138
 
        for line in clstr_file:
139
 
            clstr_res.append(line.replace('\t',''))
140
 
            
141
 
        self.assertEqual(clstr_res, clstr_clusters)
142
 
   
143
 
        test_app_res.cleanUp()
144
120
    
145
121
class UclustConvenienceWrappers(TestCase):
146
122
    """ Unit tests for uclust convenience wrappers """
152
128
        tmp_unsorted_fasta = open(self.tmp_unsorted_fasta_filepath,"w")
153
129
        tmp_unsorted_fasta.write('\n'.join(raw_dna_seqs))
154
130
        tmp_unsorted_fasta.close()
155
 
         
156
 
         
 
131
        
 
132
        self.tmp_raw_dna_seqs_rc_filepath = \
 
133
         get_tmp_filename(prefix = "uclust_test", suffix = ".fasta")
 
134
        tmp_rc_fasta = open(self.tmp_raw_dna_seqs_rc_filepath,"w")
 
135
        tmp_rc_fasta.write('\n'.join(raw_dna_seqs_rc))
 
136
        tmp_rc_fasta.close()
 
137
        
157
138
        self.tmp_sorted_fasta_filepath = \
158
139
         get_tmp_filename(prefix = "uclust_test", suffix = ".fasta")
159
140
        tmp_sorted_fasta = open(self.tmp_sorted_fasta_filepath,"w")
188
169
         get_tmp_filename(prefix = "uclust_test", suffix = ".clstr")
189
170
        open(self.search_align_template2_fp,'w').write(search_align_template2)
190
171
        
 
172
        self.ref_dna_seqs_fp = get_tmp_filename(prefix = "uclust_test", suffix = ".fasta")
 
173
        open(self.ref_dna_seqs_fp,'w').write(ref_dna_seqs)
 
174
        
191
175
        self.files_to_remove = [self.tmp_unsorted_fasta_filepath,
 
176
                                self.tmp_raw_dna_seqs_rc_filepath,
192
177
                                self.tmp_sorted_fasta_filepath,
193
178
                                self.tmp_uc_filepath,
194
179
                                self.tmp_clstr_filepath,
195
180
                                self.search_align_query1_fp,
196
181
                                self.search_align_template1_fp,
197
182
                                self.search_align_query2_fp,
198
 
                                self.search_align_template2_fp]
 
183
                                self.search_align_template2_fp,
 
184
                                self.ref_dna_seqs_fp]
 
185
        
 
186
        self.ref_test_clusters1 = ref_test_clusters1
 
187
        self.ref_test_failures1 = ref_test_failures1
 
188
        self.ref_test_new_seeds1 = ref_test_new_seeds1
 
189
        self.ref_test_clusters2 = ref_test_clusters2
 
190
        self.ref_test_failures2 = ref_test_failures2
 
191
        self.ref_test_new_seeds2 = ref_test_new_seeds2
 
192
        self.uc_dna_clusters = uc_dna_clusters
 
193
        self.uc_lines1 = uc_lines1
 
194
        self.uc_lines_overlapping_lib_input_seq_ids = \
 
195
         uc_lines_overlapping_lib_input_seq_ids
199
196
        
200
197
    def tearDown(self):
201
198
        remove_files(self.files_to_remove,error_on_missing=False)
216
213
        app_res.cleanUp()
217
214
        
218
215
        
 
216
    def test_clusters_from_uc_file(self):
 
217
        """ clusters_from_uc_file functions as expected """
 
218
 
 
219
        expected_clusters = {'s2':['s2','s3']}
 
220
        expected_failures = ['s1']
 
221
        expected_new_seeds = ['s2']
 
222
        self.assertEqual(clusters_from_uc_file(self.uc_lines1),
 
223
         (expected_clusters,expected_failures,expected_new_seeds))
 
224
    
 
225
    def test_clusters_from_uc_file_error(self):
 
226
        """ clusters_from_uc_file raises error when lib/input seq ids overlap"""
 
227
        self.assertRaises(UclustParseError,
 
228
                          clusters_from_uc_file,
 
229
                          self.uc_lines_overlapping_lib_input_seq_ids)
 
230
        
 
231
        
219
232
    def test_uclust_cluster_from_sorted_fasta_filepath(self):
220
233
        """ Given a sorted fasta filepath, will return uclust (.uc) file """
221
234
        
234
247
        
235
248
        self.assertEqual(uc_file_actual, uc_file_expected)
236
249
        app_res.cleanUp()
237
 
        
238
 
    def test_uclust_convert_uc_to_cdhit_from_filepath(self):
239
 
        """ Given a uclust (.uc) file will return converted clstr file """
240
 
 
241
 
        app_res = uclust_convert_uc_to_cdhit_from_filepath(self.tmp_uc_filepath)
242
 
 
243
 
        
244
 
        clstr_file = open(app_res['Output'].name,"U")
245
 
        clstr_res = []
246
 
        for line in clstr_file:
247
 
            clstr_res.append(line.replace('\t',''))
248
 
            
249
 
        self.assertEqual(clstr_res, clstr_clusters)
250
 
        app_res.cleanUp()
251
 
        
252
 
    def test_parse_uclust_clstr_file(self):
253
 
        """ Ensures that list of lists of OTUs will be returned """
254
 
        
255
 
        clusters_res = parse_uclust_clstr_file(clstr_clusters)
256
 
        
257
 
        self.assertEqual(clusters_res, expected_cluster_list)
258
250
    
259
251
    def test_get_output_filepaths(self):
260
252
        """ Properly generates output filepath names """
261
253
        
262
 
        fasta_res, uc_res, cd_hit_res, output_dir_res = \
 
254
        uc_res = \
263
255
         get_output_filepaths("/tmp/","test_seqs.fasta")
264
256
        
265
 
        self.assertEqual(fasta_res, "/tmp/test_seqs_sorted.fasta")
266
 
        self.assertEqual(uc_res, "/tmp/test_seqs_sorted.uc")
267
 
        self.assertEqual(cd_hit_res, "/tmp/test_seqs_cdhit.clstr")
268
 
        self.assertEqual(output_dir_res, "/tmp")
 
257
        self.assertEqual(uc_res, "/tmp/test_seqs_clusters.uc")
 
258
 
269
259
        
270
 
        fasta_res, uc_res, cd_hit_res, output_dir_res = \
271
 
         get_output_filepaths(".","test_seqs.fasta")
272
 
        self.assertEqual(fasta_res, ".//test_seqs_sorted.fasta")
273
 
        self.assertEqual(uc_res, ".//test_seqs_sorted.uc")
274
 
        self.assertEqual(cd_hit_res, ".//test_seqs_cdhit.clstr")
275
 
        self.assertEqual(output_dir_res, "./")
 
260
 
276
261
        
277
262
    def test_get_clusters_from_fasta_filepath(self):
278
263
        """ Tests for return of lists of OTUs from given fasta filepath """
279
264
        
280
265
        clusters_res = \
281
266
         get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath, \
282
 
          percent_ID = 0.90)
283
 
 
284
 
        self.assertEqual(clusters_res, expected_cluster_list)
 
267
          original_fasta_path = None, percent_ID = 0.90, save_uc_files=False)
 
268
        expected_cluster_list.sort()
 
269
        expected_failure_list.sort()
 
270
        expected_new_seed_list.sort()
 
271
        clusters_res[0].sort()
 
272
        clusters_res[1].sort()
 
273
        clusters_res[2].sort()
 
274
        self.assertEqual(clusters_res,(expected_cluster_list,
 
275
                                       expected_failure_list,
 
276
                                       expected_new_seed_list))
 
277
                                       
 
278
    def test_get_clusters_from_fasta_filepath_reference_db_only(self):
 
279
        """ Correct clusters returned when clustering against a database only 
 
280
        """
 
281
        clusters_res = get_clusters_from_fasta_filepath(
 
282
          self.tmp_unsorted_fasta_filepath,
 
283
          original_fasta_path = None, 
 
284
          save_uc_files=False,
 
285
          max_accepts=7,max_rejects=12,
 
286
          percent_ID = 0.90,
 
287
          subject_fasta_filepath=self.ref_dna_seqs_fp,
 
288
          suppress_new_clusters=True,
 
289
          HALT_EXEC=False)
 
290
        
 
291
        self.ref_test_clusters1.sort()
 
292
        self.ref_test_failures1.sort()
 
293
        self.ref_test_new_seeds1.sort()
 
294
        
 
295
        clusters_res[0].sort()
 
296
        clusters_res[1].sort()
 
297
        clusters_res[2].sort()
 
298
        self.assertEqual(clusters_res,(self.ref_test_clusters1,
 
299
                                       self.ref_test_failures1,
 
300
                                       self.ref_test_new_seeds1))
 
301
                                       
 
302
    def test_get_clusters_from_fasta_filepath_extending_reference_db(self):
 
303
        """ Correct clusters when clustering against db and adding new clusters
 
304
        """
 
305
        clusters_res = get_clusters_from_fasta_filepath(
 
306
          self.tmp_unsorted_fasta_filepath,
 
307
          original_fasta_path = None,
 
308
          max_accepts=7,max_rejects=12,
 
309
          percent_ID = 0.90,
 
310
          subject_fasta_filepath=self.ref_dna_seqs_fp,
 
311
          suppress_new_clusters=False,enable_rev_strand_matching=True,
 
312
          HALT_EXEC=False,
 
313
          save_uc_files=False)
 
314
        
 
315
        self.ref_test_clusters2.sort()
 
316
        self.ref_test_failures2.sort()
 
317
        self.ref_test_new_seeds2.sort()
 
318
        
 
319
        clusters_res[0].sort()
 
320
        clusters_res[1].sort()
 
321
        clusters_res[2].sort()
 
322
        self.assertEqual(clusters_res,(self.ref_test_clusters2,
 
323
                                       self.ref_test_failures2,
 
324
                                       self.ref_test_new_seeds2))
 
325
        
 
326
 
 
327
    def test_get_clusters_from_fasta_filepath_optimal(self):
 
328
        """ Test OTUs from filepath functions with optimal
 
329
        """
 
330
        # need to compile a small test where optimal has an effect --
 
331
        # this currently is only testing that we don't get a failure with
 
332
        # optimal
 
333
        clusters_res = \
 
334
         get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
 
335
          original_fasta_path = None, save_uc_files=False,
 
336
          percent_ID = 0.90, optimal = True)
 
337
        expected_cluster_list.sort()
 
338
        expected_failure_list.sort()
 
339
        expected_new_seed_list.sort()
 
340
        clusters_res[0].sort()
 
341
        clusters_res[1].sort()
 
342
        clusters_res[2].sort()
 
343
        
 
344
        self.assertEqual(clusters_res,(expected_cluster_list,
 
345
                                       expected_failure_list,
 
346
                                       expected_new_seed_list))
 
347
 
 
348
        
 
349
    def test_get_clusters_from_fasta_filepath_suppress_sort(self):
 
350
        """ Test OTUs from filepath functions with suppress sort
 
351
        """
 
352
        expected = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'],
 
353
                    ['uclust_test_seqs_2'], ['uclust_test_seqs_3'],
 
354
                    ['uclust_test_seqs_4'], ['uclust_test_seqs_5'],
 
355
                    ['uclust_test_seqs_6', 'uclust_test_seqs_8'],
 
356
                    ['uclust_test_seqs_7'], ['uclust_test_seqs_9']]
 
357
        clusters_res = \
 
358
         get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
 
359
          original_fasta_path = None,
 
360
          percent_ID = 0.90, suppress_sort = True, save_uc_files=False)
 
361
        expected_cluster_list.sort()
 
362
        expected_failure_list.sort()
 
363
        expected_new_seed_list.sort()
 
364
        clusters_res[0].sort()
 
365
        clusters_res[1].sort()
 
366
        clusters_res[2].sort()
 
367
        
 
368
        self.assertEqual(clusters_res,(expected_cluster_list,
 
369
                                       expected_failure_list,
 
370
                                       expected_new_seed_list))
 
371
        
 
372
    def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
 
373
        """ Test OTUs from filepath functions with rev strand match
 
374
        """
 
375
        # seq and its rc don't cluster when enable_rev_strand_matching = False
 
376
        expected_cluster_list = [['uclust_test_seqs_0'], ['uclust_test_seqs_0_rc']]
 
377
        expected_failure_list = []
 
378
        expected_new_seed_list = ['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']
 
379
        clusters_res = \
 
380
         get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath,
 
381
          original_fasta_path = None, save_uc_files=False,
 
382
          percent_ID = 0.90, enable_rev_strand_matching = False)
 
383
        
 
384
        expected_cluster_list.sort()
 
385
        expected_failure_list.sort()
 
386
        expected_new_seed_list.sort()
 
387
        clusters_res[0].sort()
 
388
        clusters_res[1].sort()
 
389
        clusters_res[2].sort()
 
390
        self.assertEqual(clusters_res,(expected_cluster_list,
 
391
                                       expected_failure_list,
 
392
                                       expected_new_seed_list))
 
393
        
 
394
        # seq and its rc cluster when enable_rev_strand_matching = True
 
395
        expected_cluster_list = [['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']]
 
396
        expected_failure_list = []
 
397
        expected_new_seed_list = ['uclust_test_seqs_0']
 
398
        clusters_res = \
 
399
         get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath,
 
400
          original_fasta_path = None, save_uc_files=False,
 
401
          percent_ID = 0.90, enable_rev_strand_matching = True)
 
402
        
 
403
        expected_cluster_list.sort()
 
404
        expected_failure_list.sort()
 
405
        expected_new_seed_list.sort()
 
406
        clusters_res[0].sort()
 
407
        clusters_res[1].sort()
 
408
        clusters_res[2].sort()
 
409
        self.assertEqual(clusters_res,(expected_cluster_list,
 
410
                                       expected_failure_list,
 
411
                                       expected_new_seed_list))
285
412
        
286
413
    def test_process_uclust_pw_alignment_results(self):
287
414
        """parsing of pairwise alignment fasta pairs file functions as expected
323
450
        proc = Popen(command,shell=True,universal_newlines=True,\
324
451
                         stdout=PIPE,stderr=STDOUT)
325
452
        stdout = proc.stdout.read()
326
 
        version_string = stdout.strip().split('v')[-1]
327
 
        version = tuple(map(int,version_string.split('.')))
328
 
        self.assertTrue(version >= (1,1,577),\
329
 
         "Unsupported uclust version. 1.1.577 or later "+\
 
453
        version_string = stdout.strip().split('v')[-1].strip('q')
 
454
        try:
 
455
            version = tuple(map(int,version_string.split('.')))
 
456
            acceptable_version = version >= (1,2,21)
 
457
        except ValueError:
 
458
            acceptable_version = False
 
459
        
 
460
        self.assertTrue(acceptable_version,\
 
461
         "Unsupported uclust version. 1.2.21 or later "+\
330
462
         "is required, but running %s." % version_string)
331
463
 
332
464
raw_dna_seqs = """>uclust_test_seqs_0
351
483
GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
352
484
""".split('\n')
353
485
 
 
486
ref_dna_seqs = """>ref1 25 random bases appended to uclust_test_seqs_0 and one mismatch
 
487
ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATATTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCTATAGCAGCCCCAGCGTTTACTTCTA
 
488
>ref2 15 random bases prepended to uclust_test_seqs_1 and one mismatch
 
489
GCTGCGGCGTCCTGCGCCACGGTGGGTACAACACGTCCACTACATCTGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
 
490
>ref3 5 random bases prepended and 10 random bases appended to uclust_test_seqs_2
 
491
ATAGGCCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACTGCCTGATTCA
 
492
>ref4 exact match to uclust_test_seqs_3
 
493
CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
 
494
"""
 
495
 
 
496
ref_test_clusters1 = [['uclust_test_seqs_0'],['uclust_test_seqs_1'],
 
497
                      ['uclust_test_seqs_2'],['uclust_test_seqs_3']]
 
498
ref_test_failures1 = ['uclust_test_seqs_4','uclust_test_seqs_5',
 
499
                      'uclust_test_seqs_6','uclust_test_seqs_7',
 
500
                      'uclust_test_seqs_8','uclust_test_seqs_9']
 
501
ref_test_new_seeds1 = []
 
502
 
 
503
ref_test_clusters2 = [['uclust_test_seqs_0'],['uclust_test_seqs_1'],
 
504
                      ['uclust_test_seqs_2'],['uclust_test_seqs_3'],
 
505
                      ['uclust_test_seqs_4'],['uclust_test_seqs_5'],
 
506
                      ['uclust_test_seqs_6','uclust_test_seqs_8'],
 
507
                      ['uclust_test_seqs_7'],['uclust_test_seqs_9']]
 
508
ref_test_failures2 = []
 
509
ref_test_new_seeds2 = ['uclust_test_seqs_4','uclust_test_seqs_5','uclust_test_seqs_6',
 
510
 'uclust_test_seqs_7','uclust_test_seqs_9']
 
511
 
 
512
 
 
513
raw_dna_seqs_rc = """>uclust_test_seqs_0
 
514
ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
 
515
>uclust_test_seqs_0_rc
 
516
AGCTCTGACACAAAACTGACGTGATGTGCCTTAAGTATCCAACCCGTTGGATGGGACGTCTTGTAGCCACCGT
 
517
""".split('\n')
 
518
 
354
519
sorted_dna_seqs=""">uclust_test_seqs_7
355
520
AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
356
521
>uclust_test_seqs_4
402
567
C       7       1       *       *       *       *       *       uclust_test_seqs_0      *
403
568
C       8       1       *       *       *       *       *       uclust_test_seqs_9      *""".split('\n')
404
569
 
405
 
 
406
 
clstr_clusters=['>Cluster 0\n',
407
 
'0       80nt, >uclust_test_seqs_7... *\n',
408
 
'>Cluster 1\n',
409
 
'0       79nt, >uclust_test_seqs_4... *\n',
410
 
'>Cluster 2\n',
411
 
'0       78nt, >uclust_test_seqs_2... *\n',
412
 
'>Cluster 3\n',
413
 
'0       77nt, >uclust_test_seqs_3... *\n',
414
 
'>Cluster 4\n',
415
 
'0       76nt, >uclust_test_seqs_1... *\n',
416
 
'>Cluster 5\n',
417
 
'0       75nt, >uclust_test_seqs_5... *\n',
418
 
'>Cluster 6\n',
419
 
'0       74nt, >uclust_test_seqs_6... *\n',
420
 
'1       72nt, >uclust_test_seqs_8... at +/92%\n',
421
 
'>Cluster 7\n',
422
 
'0       73nt, >uclust_test_seqs_0... *\n',
423
 
'>Cluster 8\n',
424
 
'0       71nt, >uclust_test_seqs_9... *\n']
425
 
 
426
570
expected_cluster_list=[['uclust_test_seqs_7'], ['uclust_test_seqs_4'], ['uclust_test_seqs_2'], ['uclust_test_seqs_3'], ['uclust_test_seqs_1'], ['uclust_test_seqs_5'], ['uclust_test_seqs_6', 'uclust_test_seqs_8'], ['uclust_test_seqs_0'], ['uclust_test_seqs_9']]
 
571
expected_failure_list = []
 
572
expected_new_seed_list = ['uclust_test_seqs_7', 'uclust_test_seqs_4', 'uclust_test_seqs_2',
 
573
 'uclust_test_seqs_3', 'uclust_test_seqs_1', 'uclust_test_seqs_5', 'uclust_test_seqs_6',
 
574
 'uclust_test_seqs_0', 'uclust_test_seqs_9']
427
575
 
428
576
search_align_query1 = """>1_like
429
577
TACGGCTACCTTGTTACGACTTCATCCCAATCATTTGTTCCACCTTCGACGGCTA
500
648
         
501
649
         ('2_like_rc RC','2','-------------------ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG---------------','AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT',100.0)]
502
650
 
503
 
         
 
651
uc_lines1 = """# uclust --input q.fasta --lib r.fasta --uc results.uc --id 0.90 --libonly --rev
 
652
# version=1.1.579
 
653
# Tab-separated fields:
 
654
# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
 
655
# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
 
656
# For C and D types, PctId is average id with seed.
 
657
# QueryStart and SeedStart are zero-based relative to start of sequence.
 
658
# If minus strand, SeedStart is relative to reverse-complemented seed.
 
659
N       *       80      *       *       *       *       *       s1 some comment *
 
660
S       4       80      *       *       *       *       *       s2 some other comment   *
 
661
H       2       78      100.0   +       0       0       5I78M10I        s3 yet another comment  s2""".split('\n')
 
662
 
 
663
uc_lines_overlapping_lib_input_seq_ids = """# uclust --maxrejects 32 --input /tmp/OtuPickerbb092OWRWLWqlBR2BmTZ.fasta --id 0.97 --uc /tmp/uclust_clustersLf5Oqv0SvGTZo1mVWBqK.uc --rev --usersort --maxaccepts 8 --lib r.fasta
 
664
# version=1.1.16
 
665
# Tab-separated fields:
 
666
# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
 
667
# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
 
668
# For C and D types, PctId is average id with seed.
 
669
# QueryStart and SeedStart are zero-based relative to start of sequence.
 
670
# If minus strand, SeedStart is relative to reverse-complemented seed.
 
671
S       1       24      *       *       *       *       *       3       *
 
672
H       1       24      100.0   +       0       0       24M     4       3
 
673
L       0       54      *       *       *       *       *       3       *
 
674
H       0       54      100.0   +       0       0       54M     2       3
 
675
D       0       2       *       *       *       *       100.0   3       *
 
676
C       1       2       100.0   *       *       *       *       3       *
 
677
""".split('\n')
 
678
 
504
679
if __name__ == '__main__':
505
680
    main()