2
"""Application controller for uclust version 1.1.579
4
Includes application controllers for uclust and
5
convenience wrappers for different functions of uclust including
6
sorting fasta files, finding clusters, converting to cd-hit format and
7
searching and aligning against a database. Also contains
8
a parser for the resulting .clstr file.
10
Modified from cogent.app.cd_hit.py on 1-21-10, written by Daniel McDonald.
13
__author__ = "William Walters"
14
__copyright__ = "Copyright 2007-2011, The PyCogent Project"
15
__credits__ = ["William Walters","Greg Caporaso"]
17
__version__ = "1.2.0.dev"
18
__maintainer__ = "William Walters"
19
__email__ = "william.a.walters@colorado.edu"
20
__status__ = "Release"
22
from os import remove, makedirs
23
from os.path import split, splitext, basename, isdir, abspath, isfile
24
from cogent.parse.fasta import MinimalFastaParser
25
from cogent.app.parameters import ValuedParameter, FlagParameter
26
from cogent.app.util import CommandLineApplication, ResultPath,\
27
get_tmp_filename, ApplicationError, ApplicationNotFoundError
28
from cogent.util.misc import remove_files
29
from cogent import DNA
31
class UclustParseError(Exception):
34
class Uclust(CommandLineApplication):
35
""" Uclust ApplicationController
40
_input_handler = '_input_as_parameters'
43
# Fasta input file for merge-sort function
44
'--mergesort':ValuedParameter('--',Name='mergesort',Delimiter=' ',
47
# Output file, used by several difference functions
48
'--output':ValuedParameter('--',Name='output',Delimiter=' ',
51
# Sets temp directory for uclust to create temp fasta file
52
'--tmpdir':ValuedParameter('--',Name='tmpdir',Delimiter=' ',
55
# input filename, fasta format
56
'--input':ValuedParameter('--',Name='input',Delimiter=' ',
59
# Output filename will be in uclust (.uc) format
60
# Output cluster file, required parameter
61
'--uc':ValuedParameter('--',Name='uc',Delimiter=' ',
64
# ID percent for OTU, by default is 97%
65
'--id':ValuedParameter('--',Name='id',Delimiter=' ',IsPath=False),
67
# Disable reverse comparison option, if norev is disabled
68
# memory usage is expected to double for uclust
69
'--rev':FlagParameter('--',Name='rev'),
71
# 'library' file -- a reference of sequences representing pre-existing
73
'--lib':ValuedParameter('--',Name='lib',Delimiter=' ',IsPath=True),
75
# only compare sequences to the library file, don't add new clusters
76
# for sequences which don't hit the library
77
'--libonly':FlagParameter('--',Name='libonly'),
79
# Maximum hits before quitting search (default 1, 0=infinity).
80
'--maxaccepts':ValuedParameter('--',Name='maxaccepts',Delimiter=' '),
82
# Maximum rejects before quitting search (default 8, 0=infinity).
83
'--maxrejects':ValuedParameter('--',Name='maxrejects',Delimiter=' '),
85
# Target nr. of common words (default 8, 0=don't step)
86
'--stepwords':ValuedParameter('--',Name='stepwords',Delimiter=' '),
88
# Word length for windex (default 5 aa.s, 8 nuc.s).
89
'--w':ValuedParameter('--',Name='w',Delimiter=' '),
91
# output fp for pairwise aligned sequences
92
'--fastapairs':ValuedParameter('--',Name='fastapairs',Delimiter=' ',
95
# input filename, .uc format
96
'--uc2clstr':ValuedParameter('--', Name='uc2clstr', Delimiter=' ',
99
# Don't assume input is sorted by length (default assume sorted).
100
'--usersort':FlagParameter('--',Name='usersort'),
102
# Same as --maxrejects 0 --nowordcountreject.
103
# comes with a performance hit.
104
'--exact':FlagParameter('--',Name='exact'),
106
# Same as --maxrejects 0 --maxaccepts 0 --nowordcountreject --
107
# comes with a performance hit.
108
'--optimal':FlagParameter('--',Name='optimal'),
110
'--stable_sort':FlagParameter('--',Name='stable_sort'),
113
_suppress_stdout = False
114
_suppress_stderr = False
116
def _input_as_parameters(self,data):
117
""" Set the input path (a fasta filepath)
119
# The list of values which can be passed on a per-run basis
120
allowed_values = ['--input','--uc','--fastapairs',\
121
'--uc2clstr','--output','--mergesort']
123
unsupported_parameters = set(data.keys()) - set(allowed_values)
124
if unsupported_parameters:
125
raise ApplicationError,\
126
"Unsupported parameter(s) passed when calling uclust: %s" %\
127
' '.join(unsupported_parameters)
129
for v in allowed_values:
130
# turn the parameter off so subsequent runs are not
131
# affected by parameter settings from previous runs
132
self.Parameters[v].off()
134
# turn the parameter on if specified by the user
135
self.Parameters[v].on(data[v])
139
def _get_result_paths(self,data):
140
""" Set the result paths """
144
result['Output'] = ResultPath(\
145
Path=self.Parameters['--output'].Value,\
146
IsWritten=self.Parameters['--output'].isOn())
148
result['ClusterFile'] = ResultPath(
149
Path = self.Parameters['--uc'].Value,
150
IsWritten=self.Parameters['--uc'].isOn())
152
result['PairwiseAlignments'] = ResultPath(
153
Path = self.Parameters['--fastapairs'].Value,
154
IsWritten=self.Parameters['--fastapairs'].isOn())
158
def _accept_exit_status(self,exit_status):
159
""" Test for acceptable exit status
161
uclust can seg fault and still generate a parsable .uc file
162
so we explicitly check the exit status
165
return exit_status == 0
168
"""Method that points to documentation"""
172
http://www.drive5.com/uclust/
174
The following papers should be cited if this resource is used:
176
Paper pending. Check with Robert Edgar who is writing the paper
177
for uclust as of March 2010. Cite the above URL for the time being.
181
## Start functions for processing uclust output files
182
def get_next_record_type(lines,types):
185
if line and line[0] in types:
186
yield line.split('\t')
189
def get_next_two_fasta_records(lines):
191
for record in MinimalFastaParser(lines):
192
result.append(record)
198
def process_uclust_pw_alignment_results(fasta_pairs_lines,uc_lines):
199
""" Process results of uclust search and align """
200
alignments = get_next_two_fasta_records(fasta_pairs_lines)
201
for hit in get_next_record_type(uc_lines,'H'):
202
matching_strand = hit[4]
203
if matching_strand == '-':
205
target_rev_match = True
206
elif matching_strand == '+':
208
target_rev_match = False
209
elif matching_strand == '.':
210
# protein sequence, so no strand information
212
target_rev_match = False
214
raise UclustParseError, "Unknown strand type: %s" % matching_strand
216
uc_target_id = hit[9]
217
percent_id = float(hit[3])
219
fasta_pair = alignments.next()
221
fasta_query_id = fasta_pair[0][0]
222
aligned_query = fasta_pair[0][1]
224
if fasta_query_id != uc_query_id:
225
raise UclustParseError,\
226
"Order of fasta and uc files do not match."+\
227
" Got query %s but expected %s." %\
228
(fasta_query_id, uc_query_id)
230
fasta_target_id = fasta_pair[1][0]
231
aligned_target = fasta_pair[1][1]
233
if fasta_target_id != uc_target_id + strand_id:
234
raise UclustParseError, \
235
"Order of fasta and uc files do not match."+\
236
" Got target %s but expected %s." %\
237
(fasta_target_id, uc_target_id + strand_id)
240
query_id = uc_query_id + ' RC'
241
aligned_query = DNA.rc(aligned_query)
242
target_id = uc_target_id
243
aligned_target = DNA.rc(aligned_target)
245
query_id = uc_query_id
246
aligned_query = aligned_query
247
target_id = uc_target_id
248
aligned_target = aligned_target
250
yield (query_id, target_id, aligned_query, aligned_target, percent_id)
252
def clusters_from_uc_file(uc_lines):
253
""" Given an open .uc file, return lists (clusters, failures, new_seeds)
255
uc_lines: open .uc file, or similar object -- this is the output
256
generated by uclust's -uc parameter
258
This function processes all hit (H), seed (S), and no hit (N) lines
259
to return all clusters, failures, and new_seeds generated in
260
a uclust run. failures should only arise when users have passed
261
--lib and --libonly, and a sequence doesn't cluster to any existing
262
reference database sequences.
268
# the types of hit lines we're interested in here
269
# are hit (H), seed (S), library seed (L) and no hit (N)
270
hit_types={}.fromkeys(list('HSNL'))
271
for record in get_next_record_type(uc_lines,hit_types):
273
# sequence identifiers from the fasta header lines only
274
# (no comment data) are stored to identify a sequence in
275
# a cluster -- strip off any comments here as this value
276
# is used in several places
277
query_id = record[8].split()[0]
278
target_cluster = record[9].split()[0]
280
# add the hit to it's existing cluster (either library
282
clusters[target_cluster].append(query_id)
283
elif hit_type == 'S':
284
# a new seed was identified -- create a cluster with this
285
# sequence as the first instance
286
if query_id in clusters:
287
raise UclustParseError,\
288
("A seq id was provided as a seed, but that seq id already "
289
"represents a cluster. Are there overlapping seq ids in your "
290
"reference and input files or repeated seq ids in either? "
291
"Offending seq id is %s" % query_id)
292
clusters[query_id] = [query_id]
293
seeds.append(query_id)
294
elif hit_type == 'L':
295
# a library seed was identified -- create a cluster with this
296
# id as the index, but don't give it any instances yet bc the hit
297
# line will be specified separately. note we need to handle these
298
# lines separately from the H lines to detect overlapping seq ids
299
# between the reference and the input fasta files
300
if query_id in clusters:
301
raise UclustParseError,\
302
("A seq id was provided as a seed, but that seq id already "
303
"represents a cluster. Are there overlapping seq ids in your "
304
"reference and input files or repeated seq ids in either? "
305
"Offending seq id is %s" % query_id)
306
clusters[query_id] = []
307
elif hit_type == 'N':
308
# a failure was identified -- add it to the failures list
309
failures.append(query_id)
311
# shouldn't be possible to get here, but provided for
313
raise UclustParseError,\
314
"Unexpected result parsing line:\n%s" % '\t'.join(record)
316
# will need to return the full clusters dict, I think, to support
317
# useful identifiers in reference database clustering
318
#return clusters.values(), failures, seeds
319
return clusters, failures, seeds
321
## End functions for processing uclust output files
324
## Start uclust convenience functions
325
def uclust_fasta_sort_from_filepath(
327
output_filepath=None,
329
"""Generates sorted fasta file via uclust --mergesort."""
330
output_filepath = output_filepath or \
331
get_tmp_filename(prefix='uclust_fasta_sort', suffix='.fasta')
332
tmp_working_dir = split(output_filepath)[0]
334
app = Uclust(params={'--tmpdir':tmp_working_dir},HALT_EXEC=HALT_EXEC)
336
app_result = app(data={'--mergesort':fasta_filepath,\
337
'--output':output_filepath})
341
def uclust_search_and_align_from_fasta_filepath(
342
query_fasta_filepath,
343
subject_fasta_filepath,
345
enable_rev_strand_matching=True,
349
""" query seqs against subject fasta using uclust,
351
return global pw alignment of best match
354
# Explanation of parameter settings
355
# id - min percent id to count a match
356
# maxaccepts = 8 , searches for best match rather than first match
357
# (0 => infinite accepts, or good matches before
360
# libonly = True , does not add sequences to the library if they don't
361
# match something there already. this effectively makes
362
# uclust a search tool rather than a clustering tool
364
params = {'--id':percent_ID,
365
'--maxaccepts':max_accepts,
366
'--maxrejects':max_rejects,
368
'--lib':subject_fasta_filepath}
370
if enable_rev_strand_matching:
371
params['--rev'] = True
373
# instantiate the application controller
374
app = Uclust(params,HALT_EXEC=HALT_EXEC)
377
alignment_filepath = \
378
get_tmp_filename(prefix='uclust_alignments',suffix='.fasta')
380
get_tmp_filename(prefix='uclust_results',suffix='.uc')
381
input_data = {'--input':query_fasta_filepath,
382
'--fastapairs':alignment_filepath,
384
app_result = app(input_data)
386
# yield the pairwise alignments
387
for result in process_uclust_pw_alignment_results(
388
app_result['PairwiseAlignments'],app_result['ClusterFile']):
391
except GeneratorExit:
394
# clean up the temp files that were generated
399
def uclust_cluster_from_sorted_fasta_filepath(
401
uc_save_filepath=None,
409
suppress_sort = False,
410
enable_rev_strand_matching=False,
411
subject_fasta_filepath=None,
412
suppress_new_clusters=False,
415
""" Returns clustered uclust file from sorted fasta"""
416
output_filepath = uc_save_filepath or \
417
get_tmp_filename(prefix='uclust_clusters',suffix='.uc')
420
params = {'--id':percent_ID,
421
'--maxaccepts':max_accepts,
422
'--maxrejects':max_rejects,
423
'--stepwords':stepwords,
425
app = Uclust(params,HALT_EXEC=HALT_EXEC)
427
# Set any additional parameters specified by the user
428
if enable_rev_strand_matching: app.Parameters['--rev'].on()
429
if optimal: app.Parameters['--optimal'].on()
430
if exact: app.Parameters['--exact'].on()
431
if suppress_sort: app.Parameters['--usersort'].on()
432
if subject_fasta_filepath: app.Parameters['--lib'].on(subject_fasta_filepath)
433
if suppress_new_clusters: app.Parameters['--libonly'].on()
434
if stable_sort: app.Parameters['--stable_sort'].on()
436
app_result = app({'--input':fasta_filepath,'--uc':output_filepath})
440
def get_output_filepaths(output_dir, fasta_filepath):
441
""" Returns filepaths for intermediate file to be kept """
443
if not output_dir.endswith('/'):
446
output_file_basename = "".join(basename(fasta_filepath).split('.')[0:-1])
447
uc_save_filepath = '%s%s_clusters.uc' % \
448
(output_dir, output_file_basename)
450
return uc_save_filepath
455
def get_clusters_from_fasta_filepath(
467
enable_rev_strand_matching=False,
468
subject_fasta_filepath=None,
469
suppress_new_clusters=False,
470
return_cluster_maps=False,
474
""" Main convenience wrapper for using uclust to generate cluster files
476
A source fasta file is required for the fasta_filepath. This will be
477
sorted to be in order of longest to shortest length sequences. Following
478
this, the sorted fasta file is used to generate a cluster file in the
479
uclust (.uc) format. Next the .uc file is converted to cd-hit format
480
(.clstr). Finally this file is parsed and returned as a list of lists,
481
where each sublist a cluster of sequences. If an output_dir is
482
specified, the intermediate files will be preserved, otherwise all
483
files created are temporary and will be deleted at the end of this
486
The percent_ID parameter specifies the percent identity for a clusters,
487
i.e., if 99% were the parameter, all sequences that were 99% identical
488
would be grouped as a cluster.
493
# Create readable intermediate filenames if they are to be kept
495
fasta_output_filepath = None
496
uc_output_filepath = None
497
cd_hit_filepath = None
499
if output_dir and not output_dir.endswith('/'):
504
uc_save_filepath = get_output_filepaths(output_dir, original_fasta_path)
506
uc_save_filepath = None
508
sorted_fasta_filepath = ""
513
# Error check in case any app controller fails
516
if not suppress_sort:
517
# Sort fasta input file from largest to smallest sequence
518
sort_fasta = uclust_fasta_sort_from_filepath(fasta_filepath, \
519
output_filepath=fasta_output_filepath)
521
# Get sorted fasta name from application wrapper
522
sorted_fasta_filepath = sort_fasta['Output'].name
523
files_to_remove.append(sorted_fasta_filepath)
527
sorted_fasta_filepath = fasta_filepath
529
# Generate uclust cluster file (.uc format)
530
uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
531
sorted_fasta_filepath,
533
percent_ID=percent_ID,
534
max_accepts=max_accepts,
535
max_rejects=max_rejects,
537
word_length=word_length,
540
suppress_sort=suppress_sort,
541
enable_rev_strand_matching=enable_rev_strand_matching,
542
subject_fasta_filepath=subject_fasta_filepath,
543
suppress_new_clusters=suppress_new_clusters,
544
stable_sort=stable_sort,
546
# Get cluster file name from application wrapper
547
remove_files(files_to_remove)
548
except ApplicationError:
549
remove_files(files_to_remove)
550
raise ApplicationError, ('Error running uclust. Possible causes are '
551
'unsupported version (current supported version is v1.2.22) is installed or '
552
'improperly formatted input file was provided')
553
except ApplicationNotFoundError:
554
remove_files(files_to_remove)
555
raise ApplicationNotFoundError('uclust not found, is it properly '+\
558
# Get list of lists for each cluster
559
clusters, failures, seeds = \
560
clusters_from_uc_file(uclust_cluster['ClusterFile'])
562
# Remove temp files unless user specifies output filepath
563
if not save_uc_files:
564
uclust_cluster.cleanUp()
566
if return_cluster_maps:
567
return clusters, failures, seeds
569
return clusters.values(), failures, seeds
571
## End uclust convenience functions