2
# File created on 30 Dec 2009.
3
from __future__ import division
6
from subprocess import Popen, PIPE, STDOUT
7
from os import makedirs, listdir
9
from os.path import split, splitext, join, dirname, abspath
10
from datetime import datetime
11
from numpy import array
12
from cogent.util.misc import safe_md5
13
from cogent.parse.fasta import MinimalFastaParser
14
from qiime.parse import parse_mapping_file, parse_qiime_parameters
15
from qiime.util import (compute_seqs_per_library_stats,
16
get_qiime_scripts_dir,
17
create_dir, guess_even_sampling_depth,
18
get_interesting_mapping_fields,qiime_system_call,
19
get_qiime_library_version)
20
from biom.parse import parse_biom_table
21
from cogent.core.moltype import IUPAC_DNA_ambiguities
24
__author__ = "Greg Caporaso"
25
__copyright__ = "Copyright 2011, The QIIME Project"
26
__credits__ = ["Greg Caporaso", "Kyle Bittinger", "Justin Kuczynski"]
29
__maintainer__ = "Greg Caporaso"
30
__email__ = "gregcaporaso@gmail.com"
31
__status__ = "Release"
"""
This file contains the QIIME workflow functions which string together
independent scripts. For usage examples see the related files in the
scripts directory.
"""
## Start utilities used by the workflow functions
41
def generate_log_fp(output_dir,
                    basefile_name='log',
                    suffix='txt',
                    timestamp_pattern='%Y%m%d%H%M%S'):
    """Return a timestamped log filepath under output_dir.

    The result is output_dir/<basefile_name>_<timestamp>.<suffix>,
    where the timestamp is the current time formatted with
    timestamp_pattern, so successive runs get distinct log files.
    """
    timestamp = datetime.now().strftime(timestamp_pattern)
    filename = '%s_%s.%s' % (basefile_name,timestamp,suffix)
    return join(output_dir,filename)
49
class WorkflowError(Exception):
    """Raised when a step in a QIIME workflow fails."""
    pass
52
class WorkflowLogger(object):
    """File-backed logger for workflow runs.

    On construction, writes a header recording the start time, the QIIME
    library version, and the supplied qiime_config and parameter values.
    When log_fp is None all writes are silently discarded, so callers
    don't need to special-case logging being disabled.
    """

    def __init__(self,log_fp=None,params=None,qiime_config=None,open_mode='w'):
        if log_fp:
            self._f = open(log_fp,open_mode)
        else:
            self._f = None
        start_time = datetime.now().strftime('%H:%M:%S on %d %b %Y')
        self.write('Logging started at %s\n' % start_time)
        self.write('QIIME version: %s\n\n' % get_qiime_library_version())
        self.writeQiimeConfig(qiime_config)
        self.writeParams(params)

    def write(self,s):
        # No-op when logging is disabled (log_fp was None).
        if self._f:
            self._f.write(s)
            # Flush here so users can see what step they're
            # on after each write, since some steps can take
            # a long time, and a relatively small amount of
            # data is being written to the log files.
            self._f.flush()

    def writeQiimeConfig(self,qiime_config):
        """Log each qiime_config key/value, one per line."""
        if qiime_config == None:
            self.write('No qiime config provided.\n')
        else:
            self.write('qiime_config values:\n')
            for k,v in qiime_config.items():
                self.write('%s\t%s\n' % (k,v))

    def writeParams(self,params):
        """Log the parameter file values as script:option<TAB>value lines."""
        if params == None:
            self.write('No params provided.\n')
        else:
            self.write('parameter file values:\n')
            for k,v in params.items():
                # params maps script name -> {option: value}; an empty/None
                # value means a flag-style option, logged as 'True'.
                for inner_k,inner_v in v.items():
                    val = inner_v or 'True'
                    self.write('%s:%s\t%s\n' % (k,inner_k,val))

    def close(self):
        """Write a stop-time footer and close the underlying file."""
        end_time = datetime.now().strftime('%H:%M:%S on %d %b %Y')
        self.write('\nLogging stopped at %s\n' % end_time)
        if self._f:
            self._f.close()
105
def print_commands(commands,
                   status_update_callback,
                   logger,
                   close_logger_on_success=True):
    """Print list of commands to run without executing them.

    commands is a list of lists of (description, command_string) pairs,
    matching the structure built by the workflow functions. Each step's
    description is passed to status_update_callback and the command is
    written to the logger.
    """
    logger.write("Printing commands only.\n\n")
    for c in commands:
        for e in c:
            status_update_callback('#%s' % e[0])
            logger.write('# %s command\n%s\n\n' % e)
117
def call_commands_serially(commands,
                           status_update_callback,
                           logger,
                           close_logger_on_success=True):
    """Run list of commands, one after another.

    commands is a list of lists of (description, command_string) pairs.
    Each command is run via qiime_system_call; on a non-zero exit status
    the error is logged, the logger is closed, and WorkflowError is
    raised. On success each command's stdout/stderr is written to the
    log and echoed to this process's stdout/stderr.
    """
    logger.write("Executing commands.\n\n")
    for c in commands:
        for e in c:
            status_update_callback('%s\n%s' % e)
            logger.write('# %s command \n%s\n\n' % e)
            stdout, stderr, return_value = qiime_system_call(e[1])
            if return_value != 0:
                msg = "\n\n*** ERROR RAISED DURING STEP: %s\n" % e[0] +\
                 "Command run was:\n %s\n" % e[1] +\
                 "Command returned exit status: %d\n" % return_value +\
                 "Stdout:\n%s\nStderr\n%s\n" % (stdout,stderr)
                # Record the failure in the log before aborting the workflow.
                logger.write(msg)
                logger.close()
                raise WorkflowError(msg)
            # in the no error case, we write commands' output to the log
            # and also echo to this proc's stdout/stderr
            # write stdout and stderr to log file
            logger.write("Stdout:\n%s\nStderr:\n%s\n" % (stdout,stderr))
            # write stdout to stdout
            sys.stdout.write(stdout)
            # write stderr to stderr
            sys.stderr.write(stderr)
    if close_logger_on_success: logger.close()
149
def print_to_stdout(s):
    """Status-update callback that echoes each step description to stdout."""
    print(s)
152
def no_status_updates(s):
    """Status-update callback that silently discards all updates."""
    pass
155
def get_params_str(params):
    """Format a dict of parameters as a command-line option string.

    Each key becomes a --key flag; a non-None value is appended as that
    option's argument. E.g. {'verbose': None} -> '--verbose' and
    {'depth': '10'} -> '--depth 10'.
    """
    result = []
    for param_id, param_value in params.items():
        result.append('--%s' % (param_id))
        if param_value != None:
            result.append(param_value)
    return ' '.join(result)
163
def validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser):
    """Validate the jobs_to_start (-O) option and record it in params.

    A non-default jobs_to_start only makes sense for a parallel run, so
    passing -O without -a (parallel) is reported as a usage error via
    option_parser.error. The validated value is stored (as a string) in
    params['parallel']['jobs_to_start'] for later use by the workflow.
    """
    if (jobs_to_start != int(default_jobs_to_start)) and \
       not parallel:
        option_parser.error("Passing -O requires that -a is also passed.")
    params['parallel']['jobs_to_start'] = str(jobs_to_start)
173
def log_input_md5s(logger,fps):
    """Log the md5 checksum of each input filepath.

    fps may contain None entries (optional inputs that weren't
    provided, e.g. tree_fp); those are skipped rather than opened.
    """
    logger.write("Input file md5 sums:\n")
    for fp in fps:
        if fp != None:
            logger.write("%s: %s\n" % (fp, safe_md5(open(fp)).hexdigest()))
    logger.write("\n")
181
## End utilities used by the workflow functions
183
## Begin task-specific workflow functions
184
def run_pick_otus_through_otu_table(input_fp,
191
status_update_callback=print_to_stdout):
192
""" Run the data preparation steps of Qiime
194
The steps performed by this function are:
196
2) Pick a representative set;
197
3) Align the representative set;
199
5) Filter the alignment prior to tree building - remove positions
200
which are all gaps, and specified as 0 in the lanemask
201
6) Build a phylogenetic tree;
202
7) Build an OTU table.
206
# Prepare some variables for the later steps
207
input_dir, input_filename = split(input_fp)
208
input_basename, input_ext = splitext(input_filename)
209
create_dir(output_dir)
211
python_exe_fp = qiime_config['python_exe_fp']
212
script_dir = get_qiime_scripts_dir()
213
cluster_failures = False
215
logger = WorkflowLogger(generate_log_fp(output_dir),
217
qiime_config=qiime_config)
218
close_logger_on_success = True
220
close_logger_on_success = False
222
log_input_md5s(logger,[input_fp])
224
# Prep the OTU picking command
226
otu_picking_method = params['pick_otus']['otu_picking_method']
228
otu_picking_method = 'uclust'
229
pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
230
otu_fp = '%s/%s_otus.txt' % (pick_otu_dir,input_basename)
231
if parallel and (otu_picking_method == 'blast' or
232
otu_picking_method == 'uclust_ref'):
233
# Grab the parallel-specific parameters
235
params_str = get_params_str(params['parallel'])
239
# Grab the OTU picker parameters
241
# Want to find a cleaner strategy for this: the parallel script
242
# is method-specific, so doesn't take a --otu_picking_method
243
# option. This works for now though.
244
d = params['pick_otus'].copy()
245
del d['otu_picking_method']
249
if otu_picking_method == 'uclust_ref':
251
suppress_new_clusters = d['suppress_new_clusters']
252
del d['suppress_new_clusters']
253
cluster_failures = False
255
cluster_failures = True
256
failure_otu_picking_method = 'uclust'
258
params_str += ' %s' % get_params_str(d)
259
otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
260
# Build the OTU picking command
261
pick_otus_cmd = '%s %s/%s -i %s -o %s -T %s' % (python_exe_fp,
269
params_str = get_params_str(params['pick_otus'])
272
# Build the OTU picking command
273
pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s %s' %\
274
(python_exe_fp, script_dir, input_fp, pick_otu_dir, params_str)
276
commands.append([('Pick OTUs', pick_otus_cmd)])
279
reference_otu_fp = otu_fp
280
clustered_failures_dir = '%s/failure_otus/' % pick_otu_dir
283
d = params['pick_otus'].copy()
284
del d['otu_picking_method']
288
if 'uclust_otu_id_prefix' not in d:
289
d['uclust_otu_id_prefix'] = 'DeNovoOTU'
290
params_str = ' %s' % get_params_str(d)
292
failures_list_fp = '%s/%s_failures.txt' % \
293
(pick_otu_dir,input_basename)
294
failures_fasta_fp = '%s/%s_failures.fasta' % \
295
(pick_otu_dir,input_basename)
297
filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
298
(input_fp,failures_list_fp,failures_fasta_fp)
300
commands.append([('Generate failures fasta file',
303
# Prep the OTU picking command for
304
failure_otu_fp = '%s/%s_failures_otus.txt' % (clustered_failures_dir,input_basename)
305
# Build the OTU picking command
306
pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -m %s %s' %\
307
(python_exe_fp, script_dir, failures_fasta_fp, clustered_failures_dir,
308
failure_otu_picking_method, params_str)
310
commands.append([('Pick de novo OTUs for new clusters', pick_otus_cmd)])
312
merged_otu_map_fp = '%s/merged_otu_map.txt' % clustered_failures_dir
313
cat_otu_tables_cmd = 'cat %s %s >> %s' %\
314
(reference_otu_fp,failure_otu_fp,merged_otu_map_fp)
315
commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
316
otu_fp = merged_otu_map_fp
318
# Prep the representative set picking command
319
rep_set_dir = '%s/rep_set/' % output_dir
321
makedirs(rep_set_dir)
324
rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir,input_basename)
325
rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir,input_basename)
328
params_str = get_params_str(params['pick_rep_set'])
331
# Build the representative set picking command
332
pick_rep_set_cmd = '%s %s/pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
333
(python_exe_fp, script_dir, otu_fp, input_fp, rep_set_log_fp,\
334
rep_set_fp, params_str)
335
commands.append([('Pick representative set', pick_rep_set_cmd)])
337
# Prep the taxonomy assignment command
339
assignment_method = params['assign_taxonomy']['assignment_method']
341
assignment_method = 'rdp'
342
assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
343
(output_dir,assignment_method)
344
taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
345
(assign_taxonomy_dir,input_basename)
346
if parallel and (assignment_method == 'rdp' or assignment_method == 'blast'):
347
# Grab the parallel-specific parameters
349
params_str = get_params_str(params['parallel'])
353
# Grab the OTU picker parameters
355
# Want to find a cleaner strategy for this: the parallel script
356
# is method-specific, so doesn't take a --assignment_method
357
# option. This works for now though.
358
d = params['assign_taxonomy'].copy()
359
del d['assignment_method']
360
params_str += ' %s' % get_params_str(d)
364
# Build the parallel taxonomy assignment command
365
assign_taxonomy_cmd = \
366
'%s %s/parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
367
(python_exe_fp, script_dir, assignment_method, rep_set_fp,\
368
assign_taxonomy_dir, params_str)
371
params_str = get_params_str(params['assign_taxonomy'])
374
# Build the taxonomy assignment command
375
assign_taxonomy_cmd = '%s %s/assign_taxonomy.py -o %s -i %s %s' %\
376
(python_exe_fp, script_dir, assign_taxonomy_dir,\
377
rep_set_fp, params_str)
379
commands.append([('Assign taxonomy',assign_taxonomy_cmd)])
381
# Prep the OTU table building command
382
otu_table_fp = '%s/otu_table.biom' % output_dir
384
params_str = get_params_str(params['make_otu_table'])
387
# Build the OTU table building command
388
make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\
389
(python_exe_fp, script_dir, otu_fp, taxonomy_fp, otu_table_fp, params_str)
391
commands.append([('Make OTU table', make_otu_table_cmd)])
394
reference_otu_table_fp = '%s/reference_only_otu_table.biom' % output_dir
395
# Build the OTU table building command
396
make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\
397
(python_exe_fp, script_dir, reference_otu_fp, taxonomy_fp,
398
reference_otu_table_fp, params_str)
400
commands.append([('Make reference-only OTU table', make_otu_table_cmd)])
402
# Prep the pynast alignment command
404
alignment_method = params['align_seqs']['alignment_method']
406
alignment_method = 'pynast'
407
pynast_dir = '%s/%s_aligned_seqs' % (output_dir,alignment_method)
408
aln_fp = '%s/%s_rep_set_aligned.fasta' % (pynast_dir,input_basename)
409
if parallel and alignment_method == 'pynast':
410
# Grab the parallel-specific parameters
412
params_str = get_params_str(params['parallel'])
416
# Grab the alignment parameters
417
# Want to find a cleaner strategy for this: the parallel script
418
# is method-specific, so doesn't take a --alignment_method
419
# option. This works for now though.
421
d = params['align_seqs'].copy()
425
del d['alignment_method']
428
params_str += ' %s' % get_params_str(d)
430
# Build the parallel pynast alignment command
431
align_seqs_cmd = '%s %s/parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
432
(python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str)
435
params_str = get_params_str(params['align_seqs'])
438
# Build the pynast alignment command
439
align_seqs_cmd = '%s %s/align_seqs.py -i %s -o %s %s' %\
440
(python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str)
441
commands.append([('Align sequences', align_seqs_cmd)])
443
# Prep the alignment filtering command
444
filtered_aln_fp = '%s/%s_rep_set_aligned_pfiltered.fasta' %\
445
(pynast_dir,input_basename)
447
params_str = get_params_str(params['filter_alignment'])
450
# Build the alignment filtering command
451
filter_alignment_cmd = '%s %s/filter_alignment.py -o %s -i %s %s' %\
452
(python_exe_fp, script_dir, pynast_dir, aln_fp, params_str)
453
commands.append([('Filter alignment', filter_alignment_cmd)])
455
# Prep the tree building command
456
tree_fp = '%s/rep_set.tre' % output_dir
458
params_str = get_params_str(params['make_phylogeny'])
461
# Build the tree building command
462
make_phylogeny_cmd = '%s %s/make_phylogeny.py -i %s -o %s %s' %\
463
(python_exe_fp, script_dir, filtered_aln_fp, tree_fp,\
465
commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
467
# Call the command handler on the list of commands
468
command_handler(commands,
469
status_update_callback,
471
close_logger_on_success=close_logger_on_success)
473
return abspath(tree_fp), abspath(otu_table_fp)
474
run_qiime_data_preparation = run_pick_otus_through_otu_table
477
## Start reference otu picking workflow
480
## Begin task-specific workflow functions
481
def run_pick_reference_otus_through_otu_table(
491
status_update_callback=print_to_stdout):
492
""" Run the data preparation steps of Qiime
494
The steps performed by this function are:
496
2) Build an OTU table with optional pre-defined taxonmy.
500
# confirm that a valid otu picking method was supplied before doing
502
reference_otu_picking_methods = ['blast','uclust_ref']
505
otu_picking_method = params['pick_otus']['otu_picking_method']
507
otu_picking_method = 'uclust_ref'
508
assert otu_picking_method in reference_otu_picking_methods,\
509
"Invalid OTU picking method supplied: %s. Valid choices are: %s"\
510
% (otu_picking_method,' '.join(reference_otu_picking_methods))
512
# Prepare some variables for the later steps
513
input_dir, input_filename = split(input_fp)
514
input_basename, input_ext = splitext(input_filename)
515
create_dir(output_dir)
517
python_exe_fp = qiime_config['python_exe_fp']
518
script_dir = get_qiime_scripts_dir()
520
logger = WorkflowLogger(generate_log_fp(output_dir),
522
qiime_config=qiime_config)
523
close_logger_on_success = True
525
close_logger_on_success = False
527
log_input_md5s(logger,[input_fp,refseqs_fp,taxonomy_fp])
529
# Prep the OTU picking command
530
pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
531
otu_fp = '%s/%s_otus.txt' % (pick_otu_dir,input_basename)
532
if parallel and (otu_picking_method == 'blast' or
533
otu_picking_method == 'uclust_ref'):
534
# Grab the parallel-specific parameters
536
params_str = get_params_str(params['parallel'])
540
# Grab the OTU picker parameters
542
# Want to find a cleaner strategy for this: the parallel script
543
# is method-specific, so doesn't take a --alignment_method
544
# option. This works for now though.
545
d = params['pick_otus'].copy()
546
if 'otu_picking_method' in d:
547
del d['otu_picking_method']
548
params_str += ' %s' % get_params_str(d)
551
otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
552
# Build the OTU picking command
553
pick_otus_cmd = '%s %s/%s -i %s -o %s -r %s -T %s' %\
563
params_str = get_params_str(params['pick_otus'])
566
# Since this is reference-based OTU picking we always want to
567
# suppress new clusters -- force it here.
568
params_str+= ' --suppress_new_clusters'
569
logger.write("Forcing --suppress_new_clusters as this is reference-based OTU picking.\n\n")
570
# Build the OTU picking command
571
pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -r %s -m %s %s' %\
580
commands.append([('Pick OTUs', pick_otus_cmd)])
582
# Prep the OTU table building command
583
otu_table_fp = '%s/otu_table.biom' % pick_otu_dir
585
params_str = get_params_str(params['make_otu_table'])
589
taxonomy_str = '-t %s' % taxonomy_fp
592
# Build the OTU table building command
593
make_otu_table_cmd = '%s %s/make_otu_table.py -i %s %s -o %s %s' %\
594
(python_exe_fp, script_dir, otu_fp, taxonomy_str, otu_table_fp, params_str)
596
commands.append([('Make OTU table', make_otu_table_cmd)])
599
# Call the command handler on the list of commands
600
command_handler(commands,
601
status_update_callback,
603
close_logger_on_success=close_logger_on_success)
607
def run_beta_diversity_through_plots(otu_table_fp, mapping_fp,
608
output_dir, command_handler, params, qiime_config,
609
color_by_interesting_fields_only=True,sampling_depth=None,
610
histogram_categories=None,
611
tree_fp=None, parallel=False, logger=None, suppress_3d_plots=False,
612
suppress_2d_plots=False,status_update_callback=print_to_stdout):
613
""" Run the data preparation steps of Qiime
615
The steps performed by this function are:
616
1) Compute a beta diversity distance matrix;
617
2) Peform a principal coordinates analysis on the result of
619
3) Generate a 3D prefs file for optimized coloring of continuous
621
4) Generate a 3D plot for all mapping fields with colors
622
optimized for continuous data;
623
5) Generate a 3D plot for all mapping fields with colors
624
optimized for discrete data.
627
# Prepare some variables for the later steps
628
otu_table_dir, otu_table_filename = split(otu_table_fp)
629
otu_table_basename, otu_table_ext = splitext(otu_table_filename)
630
create_dir(output_dir)
632
python_exe_fp = qiime_config['python_exe_fp']
633
script_dir = get_qiime_scripts_dir()
635
logger = WorkflowLogger(generate_log_fp(output_dir),
637
qiime_config=qiime_config)
638
close_logger_on_success = True
640
close_logger_on_success = False
642
log_input_md5s(logger,[otu_table_fp,mapping_fp,tree_fp])
644
mapping_data, mapping_header, mapping_comments =\
645
parse_mapping_file(open(mapping_fp,'U'))
646
if histogram_categories:
647
invalid_categories = set(histogram_categories) - set(mapping_header)
648
if invalid_categories:
650
"Invalid histogram categories - these must exactly match "+\
651
"mapping file column headers: %s" % (' '.join(invalid_categories))
652
# Get the interesting mapping fields to color by -- if none are
653
# interesting, take all of them. Interesting is defined as those
654
# which have greater than one value and fewer values than the number
656
if color_by_interesting_fields_only:
658
get_interesting_mapping_fields(mapping_data, mapping_header) or mapping_header
660
mapping_fields = mapping_header
661
mapping_fields = ','.join(mapping_fields)
664
# Sample the OTU table at even depth
665
even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
666
(output_dir, otu_table_basename,
667
sampling_depth, otu_table_ext)
668
single_rarefaction_cmd = \
669
'%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
670
(python_exe_fp, script_dir, otu_table_fp,
671
even_sampled_otu_table_fp, sampling_depth)
673
('Sample OTU table at %d seqs/sample' % sampling_depth,
674
single_rarefaction_cmd)])
675
otu_table_fp = even_sampled_otu_table_fp
676
otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
677
otu_table_basename, otu_table_ext = splitext(otu_table_filename)
679
beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
681
beta_diversity_metrics = ['weighted_unifrac','unweighted_unifrac']
683
# Prep the 3d prefs file generator command
684
prefs_fp = '%s/prefs.txt' % output_dir
686
params_str = get_params_str(params['make_prefs_file'])
689
if not 'mapping_headers_to_use' in params['make_prefs_file']:
690
params_str = '%s --mapping_headers_to_use %s' \
691
% (params_str,mapping_fields)
692
# Build the 3d prefs file generator command
694
'%s %s/make_prefs_file.py -m %s -o %s %s' %\
695
(python_exe_fp, script_dir, mapping_fp, prefs_fp, params_str)
696
commands.append([('Build prefs file', prefs_cmd)])
699
for beta_diversity_metric in beta_diversity_metrics:
701
# Prep the beta-diversity command
703
bdiv_params_copy = params['beta_diversity'].copy()
705
bdiv_params_copy = {}
707
del bdiv_params_copy['metrics']
711
params_str = get_params_str(bdiv_params_copy)
714
params_str = '%s -t %s ' % (params_str,tree_fp)
716
# Build the beta-diversity command
718
# Grab the parallel-specific parameters
720
params_str += get_params_str(params['parallel'])
723
beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
724
(python_exe_fp, script_dir, otu_table_fp,
725
output_dir, beta_diversity_metric, params_str)
727
[('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
729
beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
730
(python_exe_fp, script_dir, otu_table_fp,
731
output_dir, beta_diversity_metric, params_str)
733
[('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
736
orig_beta_div_fp = '%s/%s_%s.txt' % \
737
(output_dir, beta_diversity_metric, otu_table_basename)
738
beta_div_fp = '%s/%s_dm.txt' % \
739
(output_dir, beta_diversity_metric)
740
commands.append([('Rename distance matrix (%s)' % beta_diversity_metric,
741
'mv %s %s' % (orig_beta_div_fp, beta_div_fp))])
742
dm_fps.append((beta_diversity_metric, beta_div_fp))
744
# Prep the principal coordinates command
745
pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
747
params_str = get_params_str(params['principal_coordinates'])
750
# Build the principal coordinates command
751
pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
752
(python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
754
[('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])
757
if not suppress_3d_plots:
758
# Prep the continuous-coloring 3d plots command
759
continuous_3d_dir = '%s/%s_3d_continuous/' %\
760
(output_dir, beta_diversity_metric)
762
makedirs(continuous_3d_dir)
766
params_str = get_params_str(params['make_3d_plots'])
769
# Build the continuous-coloring 3d plots command
770
continuous_3d_command = \
771
'%s %s/make_3d_plots.py -p %s -i %s -o %s -m %s %s' %\
772
(python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_3d_dir,\
773
mapping_fp, params_str)
775
# Prep the discrete-coloring 3d plots command
776
discrete_3d_dir = '%s/%s_3d_discrete/' %\
777
(output_dir, beta_diversity_metric)
779
makedirs(discrete_3d_dir)
783
params_str = get_params_str(params['make_3d_plots'])
786
# Build the discrete-coloring 3d plots command
787
discrete_3d_command = \
788
'%s %s/make_3d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
789
(python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_3d_dir,\
790
mapping_fp, params_str)
793
('Make 3D plots (continuous coloring, %s)' %\
794
beta_diversity_metric,continuous_3d_command),\
795
('Make 3D plots (discrete coloring, %s)' %\
796
beta_diversity_metric,discrete_3d_command,)])
799
if not suppress_2d_plots:
800
# Prep the continuous-coloring 3d plots command
801
continuous_2d_dir = '%s/%s_2d_continuous/' %\
802
(output_dir, beta_diversity_metric)
804
makedirs(continuous_2d_dir)
808
params_str = get_params_str(params['make_2d_plots'])
811
# Build the continuous-coloring 3d plots command
812
continuous_2d_command = \
813
'%s %s/make_2d_plots.py -p %s -i %s -o %s -m %s %s' %\
814
(python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_2d_dir,\
815
mapping_fp, params_str)
817
# Prep the discrete-coloring 3d plots command
818
discrete_2d_dir = '%s/%s_2d_discrete/' %\
819
(output_dir, beta_diversity_metric)
821
makedirs(discrete_2d_dir)
825
params_str = get_params_str(params['make_2d_plots'])
828
# Build the discrete-coloring 2d plots command
829
discrete_2d_command = \
830
'%s %s/make_2d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
831
(python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_2d_dir,\
832
mapping_fp, params_str)
835
('Make 2D plots (continuous coloring, %s)' %\
836
beta_diversity_metric,continuous_2d_command),\
837
('Make 2D plots (discrete coloring, %s)' %\
838
beta_diversity_metric,discrete_2d_command,)])
840
if histogram_categories:
841
# Prep the discrete-coloring 3d plots command
842
histograms_dir = '%s/%s_histograms/' %\
843
(output_dir, beta_diversity_metric)
845
makedirs(histograms_dir)
849
params_str = get_params_str(params['make_distance_histograms'])
852
# Build the make_distance_histograms command
853
distance_histograms_command = \
854
'%s %s/make_distance_histograms.py -d %s -o %s -m %s -f "%s" %s' %\
855
(python_exe_fp, script_dir, beta_div_fp,
856
histograms_dir, mapping_fp,
857
','.join(histogram_categories), params_str)
860
('Make Distance Histograms (%s)' %\
861
beta_diversity_metric,distance_histograms_command)])
863
# Call the command handler on the list of commands
864
command_handler(commands,
865
status_update_callback,
867
close_logger_on_success=close_logger_on_success)
872
def run_qiime_alpha_rarefaction(otu_table_fp, mapping_fp,
873
output_dir, command_handler, params, qiime_config, tree_fp=None,
874
num_steps=10, parallel=False, logger=None, min_rare_depth=10,
875
max_rare_depth=None,status_update_callback=print_to_stdout):
876
""" Run the data preparation steps of Qiime
878
The steps performed by this function are:
879
1) Generate rarefied OTU tables;
880
2) Compute alpha diversity metrics for each rarefied OTU table;
881
3) Collate alpha diversity results;
882
4) Generate alpha rarefaction plots.
885
# Prepare some variables for the later steps
886
otu_table_dir, otu_table_filename = split(otu_table_fp)
887
otu_table_basename, otu_table_ext = splitext(otu_table_filename)
888
create_dir(output_dir)
890
python_exe_fp = qiime_config['python_exe_fp']
891
script_dir = get_qiime_scripts_dir()
893
logger = WorkflowLogger(generate_log_fp(output_dir),
895
qiime_config=qiime_config)
896
close_logger_on_success = True
898
close_logger_on_success = False
900
log_input_md5s(logger,[otu_table_fp,mapping_fp,tree_fp])
902
# Prep the rarefaction command
904
open(otu_table_fp,'U')
906
logger.write('OTU table filepath cannot be opened. Does it exist?\n' +
907
' %s\n' % otu_table_fp +
908
'Original Error:\n%s\n' % str(e))
911
if max_rare_depth == None:
912
min_count, max_count, median_count, mean_count, counts_per_sample =\
913
compute_seqs_per_library_stats(parse_biom_table(open(otu_table_fp,'U')))
914
max_rare_depth = median_count
915
step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
916
max_rare_depth = int(max_rare_depth)
918
rarefaction_dir = '%s/rarefaction/' % output_dir
920
makedirs(rarefaction_dir)
924
params_str = get_params_str(params['multiple_rarefactions'])
928
params_str += ' %s' % get_params_str(params['parallel'])
929
# Build the rarefaction command
931
'%s %s/parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
932
(python_exe_fp, script_dir, otu_table_fp, min_rare_depth, max_rare_depth, \
933
step, rarefaction_dir, params_str)
935
# Build the rarefaction command
937
'%s %s/multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
938
(python_exe_fp, script_dir, otu_table_fp, min_rare_depth, max_rare_depth, \
939
step, rarefaction_dir, params_str)
940
commands.append([('Alpha rarefaction', rarefaction_cmd)])
942
# Prep the alpha diversity command
943
alpha_diversity_dir = '%s/alpha_div/' % output_dir
945
makedirs(alpha_diversity_dir)
949
params_str = get_params_str(params['alpha_diversity'])
953
params_str += ' -t %s' % tree_fp
955
params_str += ' %s' % get_params_str(params['parallel'])
956
# Build the alpha diversity command
957
alpha_diversity_cmd = \
958
"%s %s/parallel_alpha_diversity.py -T -i %s -o %s %s" %\
959
(python_exe_fp, script_dir, rarefaction_dir, alpha_diversity_dir, \
962
# Build the alpha diversity command
963
alpha_diversity_cmd = \
964
"%s %s/alpha_diversity.py -i %s -o %s %s" %\
965
(python_exe_fp, script_dir, rarefaction_dir, alpha_diversity_dir, \
969
[('Alpha diversity on rarefied OTU tables',alpha_diversity_cmd)])
971
# Prep the alpha diversity collation command
972
# python $qdir/collate_alpha.py -i Fasting_Alpha_Metrics/ -o Fasting_Alpha_Collated/
973
alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
975
makedirs(alpha_collated_dir)
979
params_str = get_params_str(params['collate_alpha'])
982
# Build the alpha diversity collation command
983
alpha_collated_cmd = '%s %s/collate_alpha.py -i %s -o %s %s' %\
984
(python_exe_fp, script_dir, alpha_diversity_dir, \
985
alpha_collated_dir, params_str)
986
commands.append([('Collate alpha',alpha_collated_cmd)])
988
# Prep the make rarefaction plot command(s)
989
rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
991
makedirs(rarefaction_plot_dir)
995
params_str = get_params_str(params['make_rarefaction_plots'])
998
# Build the make rarefaction plot command(s)
999
#for metric in alpha_diversity_metrics:
1000
make_rarefaction_plot_cmd =\
1001
'%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
1002
(python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
1003
rarefaction_plot_dir, params_str)
1005
[('Rarefaction plot: %s' % 'All metrics',make_rarefaction_plot_cmd)])
1007
# Call the command handler on the list of commands
1008
command_handler(commands,
1009
status_update_callback,
1011
close_logger_on_success=close_logger_on_success)
1013
def run_jackknifed_beta_diversity(otu_table_fp,tree_fp,seqs_per_sample,
    output_dir, command_handler, params, qiime_config, mapping_fp,
    parallel=False,logger=None,
    status_update_callback=print_to_stdout, master_tree=None):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Compute beta diversity distance matrix from otu table (and
           tree, if applicable)
          2) Build rarefied OTU tables;
          3) Build UPGMA tree from full distance matrix;
          4) Compute distance matrics for rarefied OTU tables;
          5) Build UPGMA trees from rarefied OTU table distance matrices;
          5.5) Build a consensus tree from the rarefied UPGMA trees
          6) Compare rarefied OTU table distance matrix UPGMA trees
           to tree full UPGMA tree and write support file and newick tree
           with support values as node labels.

        master_tree can be 'full' or 'consensus', default full
    """
    # NOTE(review): reconstructed from a garbled extraction of this function;
    # the try/except wrappers and commands.append scaffolding follow the
    # pattern used by the sibling workflow functions in this file.

    # Prepare some variables for the later steps
    if master_tree == None:
        master_tree = 'full'
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        # No logger supplied: create one and own its lifetime
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(logger,[otu_table_fp,mapping_fp,tree_fp])

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        # default metrics when none are given in the parameters file
        beta_diversity_metrics = ['weighted_unifrac','unweighted_unifrac']

    # Prep the beta-diversity command
    try:
        params_str = get_params_str(params['beta_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str = '%s -t %s' % (params_str,tree_fp)
    # Build the beta-diversity command
    beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, output_dir, params_str)
    commands.append(\
     [('Beta Diversity (%s)' % ', '.join(beta_diversity_metrics), beta_div_cmd)])

    # Prep rarefaction command
    rarefaction_dir = '%s/rarefaction/' % output_dir
    try:
        makedirs(rarefaction_dir)
    except OSError:
        # output directory already exists
        pass
    try:
        params_str = get_params_str(params['multiple_rarefactions_even_depth'])
    except KeyError:
        params_str = ''
    # if parallel:
    #     params_str += ' %s' % get_params_str(params['parallel'])
    #     # Build the parallel rarefaction command
    #     rarefaction_cmd = \
    #      '%s %s/parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s 1 -o %s %s' %\
    #      (python_exe_fp, script_dir, otu_table_fp, seqs_per_sample,\
    #       seqs_per_sample, rarefaction_dir, params_str)
    # else:
    # Build the serial rarefaction command
    rarefaction_cmd = \
     '%s %s/multiple_rarefactions_even_depth.py -i %s -d %d -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, seqs_per_sample,
      rarefaction_dir, params_str)
    commands.append([('Rarefaction', rarefaction_cmd)])

    # Begin iterating over beta diversity distance metrics, if more than one
    for beta_diversity_metric in beta_diversity_metrics:
        metric_output_dir = '%s/%s/' % (output_dir, beta_diversity_metric)
        distance_matrix_fp = '%s/%s_%s.txt' % \
         (output_dir, beta_diversity_metric, otu_table_basename)

        # Prep the hierarchical clustering command (for full distance matrix)
        full_tree_fp = '%s/%s_upgma.tre' % (metric_output_dir,otu_table_basename)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for full distance matrix)
        hierarchical_cluster_cmd = '%s %s/upgma_cluster.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, distance_matrix_fp, full_tree_fp, params_str)
        commands.append(\
         [('UPGMA on full distance matrix: %s' % beta_diversity_metric,
           hierarchical_cluster_cmd)])

        # Prep the beta diversity command (for rarefied OTU tables)
        dm_dir = '%s/rare_dm/' % metric_output_dir
        try:
            makedirs(dm_dir)
        except OSError:
            pass
        # the metrics parameter needs to be ignored as we need to run
        # beta_diversity one metric at a time to keep the per-metric
        # output files in separate directories
        d = params['beta_diversity'].copy()
        try:
            del d['metrics']
        except KeyError:
            pass
        params_str = get_params_str(d) + ' -m %s ' % beta_diversity_metric
        if tree_fp:
            params_str = '%s -t %s' % (params_str,tree_fp)
        if parallel:
            params_str += ' %s' % get_params_str(params['parallel'])
            # Build the parallel beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
             '%s %s/parallel_beta_diversity.py -T -i %s -o %s %s' %\
             (python_exe_fp, script_dir, rarefaction_dir, dm_dir, params_str)
        else:
            # Build the serial beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
             '%s %s/beta_diversity.py -i %s -o %s %s' %\
             (python_exe_fp, script_dir, rarefaction_dir, dm_dir, params_str)
        commands.append(\
         [('Beta diversity on rarefied OTU tables (%s)' % beta_diversity_metric,
           beta_div_rarefied_cmd)])

        # Prep the hierarchical clustering command (for rarefied
        # distance matrices)
        upgma_dir = '%s/rare_upgma/' % metric_output_dir
        try:
            makedirs(upgma_dir)
        except OSError:
            pass
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for rarefied
        # distance matrices)
        hierarchical_cluster_cmd =\
         '%s %s/upgma_cluster.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, dm_dir, upgma_dir, params_str)
        commands.append(\
         [('UPGMA on rarefied distance matrix (%s)' % beta_diversity_metric,
           hierarchical_cluster_cmd)])

        # Build the consensus tree command (reuses the upgma_cluster params)
        consensus_tree_cmd =\
         '%s %s/consensus_tree.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, upgma_dir, upgma_dir + "/consensus.tre",
          params_str)
        commands.append(\
         [('consensus on rarefied distance matrices (%s)' % beta_diversity_metric,
           consensus_tree_cmd)])

        # Prep the tree compare command
        tree_compare_dir = '%s/upgma_cmp/' % metric_output_dir
        try:
            makedirs(tree_compare_dir)
        except OSError:
            pass
        try:
            params_str = get_params_str(params['tree_compare'])
        except KeyError:
            params_str = ''
        # Build the tree compare command
        if master_tree == "full":
            master_tree_fp = full_tree_fp
        elif master_tree == "consensus":
            master_tree_fp = upgma_dir + "/consensus.tre"
        else:
            raise RuntimeError('master tree method "%s" not found' % (master_tree,))
        tree_compare_cmd = '%s %s/tree_compare.py -s %s -m %s -o %s %s' %\
         (python_exe_fp, script_dir, upgma_dir, master_tree_fp,
          tree_compare_dir, params_str)
        commands.append(\
         [('Tree compare (%s)' % beta_diversity_metric,
           tree_compare_cmd)])

        # Prep the PCoA command
        pcoa_dir = '%s/pcoa/' % metric_output_dir
        try:
            makedirs(pcoa_dir)
        except OSError:
            pass
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the PCoA command
        pcoa_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, dm_dir, pcoa_dir, params_str)
        commands.append(\
         [('Principal coordinates (%s)' % beta_diversity_metric, pcoa_cmd)])

        # Prep the 2D plots command
        plots_2d_dir = '%s/2d_plots/' % metric_output_dir
        try:
            makedirs(plots_2d_dir)
        except OSError:
            pass
        try:
            params_str = get_params_str(params['make_2d_plots'])
        except KeyError:
            params_str = ''
        # Build the 2d plots command
        plots_2d_cmd = '%s %s/make_2d_plots.py -i %s -o %s -m %s %s' %\
         (python_exe_fp, script_dir, pcoa_dir, plots_2d_dir,
          mapping_fp, params_str)
        commands.append(\
         [('2d plots (%s)' % beta_diversity_metric, plots_2d_cmd)])

        # Prep the 3D plots command
        plots_3d_dir = '%s/3d_plots/' % metric_output_dir
        try:
            makedirs(plots_3d_dir)
        except OSError:
            pass
        try:
            params_str = get_params_str(params['make_3d_plots'])
        except KeyError:
            params_str = ''
        # Build the 3d plots command
        plots_3d_cmd = '%s %s/make_3d_plots.py -i %s -o %s -m %s %s' %\
         (python_exe_fp, script_dir, pcoa_dir, plots_3d_dir,
          mapping_fp, params_str)
        commands.append(\
         [('3d plots (%s)' % beta_diversity_metric, plots_3d_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger,
                    close_logger_on_success=close_logger_on_success)
1266
def format_index_link(link_description,relative_path):
    """ Return an HTML table-row fragment linking to relative_path.

        The link text is the filename component of relative_path, and
        runs of slashes are collapsed to a single slash so the href is
        a clean relative URL.
    """
    return '<td>%s</td><td> <a href="%s">%s</a></td>' % (link_description,
                                        re.sub('/+','/',relative_path),
                                        split(relative_path)[1])
1272
def generate_index_page(index_links,index_fp):
    """ Write an HTML index page for a QIIME run to index_fp.

        index_links: list of 3-tuples of
         (link_description, relative_path, section_heading); entries are
         grouped into one table section per heading.
        index_fp: output filepath for the HTML page.
    """
    # get containing directory for index_fp
    top_level_dir = split(split(index_fp)[0])[1]
    index_page_header = "<html><head><title>QIIME results</title></head><body>\n"
    index_lines = [index_page_header]
    # group the links by their section heading (e[2])
    d = {}
    for e in index_links:
        try:
            d[e[2]].append((e[0],e[1]))
        except KeyError:
            d[e[2]] = [(e[0],e[1])]
    index_lines.append('<table border=1>\n')
    for k,v in d.items():
        index_lines.append(
         '<tr colspan=2 align=center bgcolor=wheat><td colspan=2 align=center>%s</td></tr>\n' % k)
        for description,path in v:
            # make paths relative to the top-level output directory
            path = re.sub('.*%s' % top_level_dir,'./',path)
            index_lines.append('<tr>%s</tr>\n' % format_index_link(description,path))
    index_lines.append('</table>')

    index_page_footer = "</body></html>"
    index_lines.append(index_page_footer)

    # write the page, closing the handle explicitly (the original left
    # the file object dangling)
    index_f = open(index_fp,'w')
    index_f.write(''.join(index_lines))
    index_f.close()
1297
# Run QIIME method comparison workflow
1298
def run_core_qiime_analyses(
# NOTE(review): this block is a corrupted extraction — the original file's
# line numbers are interleaved as bare integer lines, and interior lines
# (including most of the parameter list, e.g. fna_fps/qual_fps/mapping_fp/
# output_dir/command_handler/params/qiime_config/categories, and the
# try/except scaffolding) are missing. Left byte-identical pending recovery
# of the full original; do not attempt to run as-is.
1307
sampling_depth=None,
1308
even_sampling_keeps_all_samples=False,
1309
suppress_split_libraries=False,
1310
arare_min_rare_depth=10,
1312
reference_tree_fp=None,
1314
status_update_callback=print_to_stdout):
1315
""" Run full QIIME workflow generating output files for method comparison
1319
# Prepare some variables for the later steps
1320
fna_fp0 = fna_fps.split(',')[0]
1321
input_dir, input_filename = split(fna_fp0)
1322
input_basename, input_ext = splitext(input_filename)
1323
mapping_categories = parse_mapping_file(open(mapping_fp,'U'))[1]
1325
# split categories skipping any empty strings (to handle
1326
# e.g. trailing commas)
1327
categories = [c for c in categories.split(',') if c]
1328
for c in categories:
1329
if c not in mapping_categories:
1330
raise ValueError, ("Category '%s' is not a column header "
1331
"in your mapping file. "
1332
"Categories are case and white space sensitive. Valid "
1333
"choices are: (%s)" % (c,', '.join(mapping_categories)))
1337
params = parse_qiime_parameters([])
1338
create_dir(output_dir)
1339
index_fp = '%s/index.html' % output_dir
1342
python_exe_fp = qiime_config['python_exe_fp']
1343
script_dir = get_qiime_scripts_dir()
1344
log_fp = generate_log_fp(output_dir)
1345
index_links.append(('Master run log',log_fp,'Log files'))
1346
logger = WorkflowLogger(log_fp,
1348
qiime_config=qiime_config)
1349
input_fps = fna_fps.split(',') + [mapping_fp]
1350
if qual_fps != None:
1351
input_fps += qual_fps.split(',')
1352
log_input_md5s(logger,input_fps)
1355
# Prep the split_libraries command
1356
if suppress_split_libraries:
1357
if len(fna_fps.split(',')) > 1:
1359
"Only one fasta file can be passed when suppress_split_libraries=True"
1360
split_libraries_seqs_fp = fna_fp0
1362
split_libraries_output_dir = \
1363
'%s/sl_out/' % output_dir
1364
split_libraries_seqs_fp = \
1365
'%s/seqs.fna' % split_libraries_output_dir
1366
split_libraries_hist_fp = \
1367
'%s/histograms.txt' % split_libraries_output_dir
1368
split_libraries_log_fp = \
1369
'%s/split_library_log.txt' % split_libraries_output_dir
1370
index_links.append(('Demultiplexed sequences',
1371
split_libraries_seqs_fp,
1372
'Split libraries results'))
1373
index_links.append(('Split libraries log',
1374
split_libraries_log_fp,
1375
'Split libraries results'))
1376
index_links.append(('Sequence length histograms',
1377
split_libraries_hist_fp,
1378
'Split libraries results'))
1380
params_str = get_params_str(params['split_libraries'])
1383
# Build the split libraries command
1385
qual_str = '-q %s' % qual_fps
1388
split_libraries_cmd = 'split_libraries.py -f %s %s -m %s -o %s %s' %\
1389
(fna_fps, qual_str, mapping_fp, split_libraries_output_dir, params_str)
1391
commands.append([('Split libraries', split_libraries_cmd)])
1393
# Call the command handler on the list of commands
1394
command_handler(commands,
1395
status_update_callback,
1397
close_logger_on_success=False)
1398
# Reset the commands list
1401
## OTU picking through OTU table workflow
1402
data_analysis_output_dir = '%s/otus/' % output_dir
1403
de_novo_tree_fp, otu_table_fp = \
1404
run_qiime_data_preparation(input_fp=split_libraries_seqs_fp,
1405
output_dir=data_analysis_output_dir,
1406
command_handler=command_handler,
1408
qiime_config=qiime_config,
1411
status_update_callback=status_update_callback)
1412
index_links.append(('Phylogenetic tree',de_novo_tree_fp,'OTU workflow results'))
1413
index_links.append(('OTU table',otu_table_fp,'OTU workflow results'))
1415
# If a reference tree was passed, use it for downstream analysis. Otherwise
1416
# use the de novo tree.
1417
tree_fp = reference_tree_fp or de_novo_tree_fp
1419
if sampling_depth == None:
1420
min_count, max_count, median_count, mean_count, counts_per_sample =\
1421
compute_seqs_per_library_stats(parse_biom_table(open(otu_table_fp)))
1422
if even_sampling_keeps_all_samples:
1423
sampling_depth = min_count
1426
guess_even_sampling_depth(counts_per_sample.values())
1428
## Beta diversity through 3D plots workflow
1429
bdiv_full_output_dir = '%s/bdiv/' % output_dir
1430
full_dm_fps = run_beta_diversity_through_plots(
1431
otu_table_fp=otu_table_fp,
1432
mapping_fp=mapping_fp,
1433
output_dir=bdiv_full_output_dir,
1434
command_handler=command_handler,
1436
qiime_config=qiime_config,
1437
sampling_depth=None,
1438
histogram_categories=categories,
1442
status_update_callback=status_update_callback)
1444
# cluster quality stub
1445
for bdiv_metric, dm_fp in full_dm_fps:
1446
for category in categories:
1447
cluster_quality_fp = '%s/%s_%s_cluster_quality.txt' %\
1448
(bdiv_full_output_dir,bdiv_metric,category)
1450
params_str = get_params_str(params['cluster_quality'])
1453
# Build the cluster quality command
1454
cluster_quality_cmd = \
1455
'cluster_quality.py -i %s -c %s -o %s -m %s %s' %\
1456
(dm_fp, category, cluster_quality_fp, mapping_fp, params_str)
1459
('Cluster quality (%s; %s)' % (bdiv_metric, category),
1460
cluster_quality_cmd)])
1462
index_links.append(('Cluster quality results (%s, %s)' % (bdiv_metric,category),
1464
'Beta diversity results'))
1465
# Create links for the bdiv results
1466
index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric,
1467
'%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
1468
(bdiv_full_output_dir,bdiv_metric,bdiv_metric),
1469
'Beta diversity results'))
1470
index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric,
1471
'%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
1472
(bdiv_full_output_dir,bdiv_metric,bdiv_metric),
1473
'Beta diversity results'))
1474
index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric,
1475
'%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
1476
(bdiv_full_output_dir,bdiv_metric,bdiv_metric),
1477
'Beta diversity results'))
1478
index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric,
1479
'%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
1480
(bdiv_full_output_dir,bdiv_metric,bdiv_metric),
1481
'Beta diversity results'))
1482
index_links.append(('Distance histograms (%s)' % bdiv_metric,
1483
'%s/%s_histograms/%s_dm_distance_histograms.html' % \
1484
(bdiv_full_output_dir,bdiv_metric,bdiv_metric),
1485
'Beta diversity results'))
1486
index_links.append(('Distance matrix (%s)' % bdiv_metric,
1488
(bdiv_full_output_dir,bdiv_metric),
1489
'Beta diversity results'))
1490
index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
1492
(bdiv_full_output_dir,bdiv_metric),
1493
'Beta diversity results'))
1497
# NOTE(review): second bdiv pass below repeats the analysis at an even
# sampling depth — presumably to control for unequal library sizes; confirm
# against the recovered original before relying on this description.
bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
1498
even_dm_fps = run_beta_diversity_through_plots(
1499
otu_table_fp=otu_table_fp,
1500
mapping_fp=mapping_fp,
1501
output_dir=bdiv_even_output_dir,
1502
command_handler=command_handler,
1504
qiime_config=qiime_config,
1505
sampling_depth=sampling_depth,
1506
histogram_categories=categories,
1510
status_update_callback=status_update_callback)
1511
for bdiv_metric, dm_fp in even_dm_fps:
1512
for category in categories:
1513
cluster_quality_fp = '%s/%s_%s_cluster_quality.txt' %\
1514
(bdiv_even_output_dir,bdiv_metric,category)
1516
params_str = get_params_str(params['cluster_quality'])
1519
# Build the cluster quality command
1520
cluster_quality_cmd = \
1521
'cluster_quality.py -i %s -c %s -o %s -m %s %s' %\
1522
(dm_fp, category, cluster_quality_fp, mapping_fp, params_str)
1525
('Cluster quality (%s; %s)' % (bdiv_metric, category),
1526
cluster_quality_cmd)])
1527
index_links.append(('Cluster quality results (%s, %s)' % (bdiv_metric,category),
1529
'Beta diversity results (even sampling: %d)' % sampling_depth))
1530
# Create links for the bdiv results
1531
index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric,
1532
'%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
1533
(bdiv_even_output_dir,bdiv_metric,bdiv_metric),
1534
'Beta diversity results (even sampling: %d)' % sampling_depth))
1535
index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric,
1536
'%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
1537
(bdiv_even_output_dir,bdiv_metric,bdiv_metric),
1538
'Beta diversity results (even sampling: %d)' % sampling_depth))
1539
index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric,
1540
'%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
1541
(bdiv_even_output_dir,bdiv_metric,bdiv_metric),
1542
'Beta diversity results (even sampling: %d)' % sampling_depth))
1543
index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric,
1544
'%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
1545
(bdiv_even_output_dir,bdiv_metric,bdiv_metric),
1546
'Beta diversity results (even sampling: %d)' % sampling_depth))
1547
index_links.append(('Distance histograms (%s)' % bdiv_metric,
1548
'%s/%s_histograms/%s_dm_distance_histograms.html' % \
1549
(bdiv_even_output_dir,bdiv_metric,bdiv_metric),
1550
'Beta diversity results (even sampling: %d)' % sampling_depth))
1551
index_links.append(('Distance matrix (%s)' % bdiv_metric,
1553
(bdiv_even_output_dir,bdiv_metric),
1554
'Beta diversity results (even sampling: %d)' % sampling_depth))
1555
index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
1557
(bdiv_even_output_dir,bdiv_metric),
1558
'Beta diversity results (even sampling: %d)' % sampling_depth))
1560
## Alpha rarefaction workflow
1561
arare_full_output_dir = '%s/arare/' % output_dir
1562
run_qiime_alpha_rarefaction(
1563
otu_table_fp=otu_table_fp,
1564
mapping_fp=mapping_fp,
1565
output_dir=arare_full_output_dir,
1566
command_handler=command_handler,
1568
qiime_config=qiime_config,
1570
num_steps=arare_num_steps,
1573
min_rare_depth=arare_min_rare_depth,
1574
status_update_callback=status_update_callback)
1576
index_links.append(('Alpha rarefaction plots',
1577
'%s/alpha_rarefaction_plots/rarefaction_plots.html'\
1578
% arare_full_output_dir,
1579
"Alpha rarefaction results"))
1582
# OTU category significance and supervised learning
1583
for category in categories:
1584
category_signifance_fp = \
1585
'%s/category_significance_%s.txt' % (output_dir, category)
1587
params_str = get_params_str(params['otu_category_significance'])
1590
# Build the OTU cateogry significance command
1591
category_significance_cmd = \
1592
'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
1593
(otu_table_fp, mapping_fp, category,
1594
category_signifance_fp, params_str)
1595
commands.append([('OTU category significance (%s)' % category,
1596
category_significance_cmd)])
1598
supervised_learning_dir = \
1599
'%s/supervised_learning_%s' % (output_dir, category)
1601
params_str = get_params_str(params['supervised_learning'])
1604
# Build the supervised_learning command
1605
supervised_learning_cmd = \
1606
'supervised_learning.py -i %s -m %s -c %s -o %s %s' %\
1607
(otu_table_fp, mapping_fp, category,
1608
supervised_learning_dir, params_str)
1609
commands.append([('Supervised learning (%s)' % category,
1610
supervised_learning_cmd)])
1612
index_links.append(('Category significance (%s)' % category,
1613
category_signifance_fp,
1614
"Category results"))
1615
index_links.append(('Supervised learning (%s)' % category,
1616
supervised_learning_dir,
1617
"Supervised learning results"))
1619
command_handler(commands, status_update_callback, logger)
1620
generate_index_page(index_links,index_fp)
1622
def run_summarize_taxa_through_plots(otu_table_fp, mapping_fp,
    output_dir, mapping_cat, sort, command_handler, params, qiime_config,
    logger=None, status_update_callback=print_to_stdout):
    """ Run the data preparation for summarizing taxonomies and generating plots

        The steps performed by this function are:
          1) Summarize OTU by Category
          2) Summarize Taxonomy
          3) Plot Taxonomy Summary
    """
    # NOTE(review): reconstructed from a garbled extraction of this function;
    # control-flow scaffolding follows the sibling workflow functions here.

    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger,[otu_table_fp,mapping_fp])

    # Prep the summarize otu by category command; fail early (with a logged
    # explanation) if the OTU table can't be opened
    try:
        otu_table_f = open(otu_table_fp,'U')
    except IOError as e:
        logger.write('OTU table filepath cannot be opened. Does it exist?\n' +
                     ' %s\n' % otu_table_fp +
                     'Original Error:\n%s\n' % str(e))
        logger.close()
        raise

    # if mapping category not passed via command-line,
    # check if it is passed in params file
    if not mapping_cat:
        try:
            mapping_cat=params['summarize_otu_by_cat']['mapping_category']
        except:
            mapping_cat=None

    try:
        params_str = get_params_str(params['summarize_otu_by_cat'])
        # Need to remove the mapping category option, since it is defined above.
        # Using this method since we don't want to change the params dict
        split_params=params_str.split('--')
        updated_params_str=[]
        for i in split_params:
            if not i.startswith('mapping_category'):
                updated_params_str.append(i)
        params_str='--'.join(updated_params_str)
    except KeyError:
        params_str = ''

    if mapping_cat:
        output_fp=join(output_dir,'%s_otu_table.biom' % (mapping_cat))
        # Build the summarize otu by category command
        summarize_otu_by_cat_cmd = \
         "%s %s/summarize_otu_by_cat.py -i %s -c %s -o %s -m %s %s" %\
         (python_exe_fp, script_dir, mapping_fp, otu_table_fp, output_fp,
          mapping_cat, params_str)
        commands.append(\
         [('Summarize OTU table by Category',summarize_otu_by_cat_cmd)])
        otu_table_fp=output_fp

    # Build the sort OTU table command
    if sort:
        # Prep the sort_otu_table command
        try:
            params_str = get_params_str(params['sort_otu_table'])
        except KeyError:
            params_str = ''
        # define output otu table
        sorted_fp=join(output_dir,
                       splitext(split(otu_table_fp)[-1])[0]+'_sorted.biom')
        if mapping_cat or params_str=='':
            # for this case we don't have a collapsed mapping file so must
            # sort without passing the mapping file
            sort_otu_table_cmd = \
             "%s %s/sort_otu_table.py -i %s -o %s" %\
             (python_exe_fp, script_dir, otu_table_fp, sorted_fp)
        else:
            sort_otu_table_cmd = \
             "%s %s/sort_otu_table.py -i %s -o %s -m %s %s" %\
             (python_exe_fp, script_dir, otu_table_fp, sorted_fp,
              mapping_fp, params_str)
        commands.append([('Sort OTU Table',sort_otu_table_cmd)])
        # redefine otu_table_fp to use
        otu_table_fp=sorted_fp

    # Prep the summarize taxonomy command
    try:
        params_str = get_params_str(params['summarize_taxa'])
    except KeyError:
        params_str = ''
    try:
        sum_taxa_levels=params['summarize_taxa']['level']
    except KeyError:
        sum_taxa_levels=None

    # Build the summarize taxonomy command
    summarize_taxa_cmd = '%s %s/summarize_taxa.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp,
      output_dir, params_str)
    commands.append([('Summarize Taxonomy',summarize_taxa_cmd)])

    # determine the per-level summary filepaths that summarize_taxa will write
    sum_taxa_fps=[]
    if sum_taxa_levels:
        basename=join(output_dir,splitext(split(otu_table_fp)[-1])[0])
        for i in sum_taxa_levels.split(','):
            sum_taxa_fps.append(basename+'_L%s.txt' % (str(i)))
    else:
        basename=join(output_dir,splitext(split(otu_table_fp)[-1])[0])
        # this is the default levels from summarize_taxa, but cannot import
        # script to get these values
        for i in [2,3,4,5,6]:
            sum_taxa_fps.append(basename+'_L%s.txt' % (str(i)))

    # Prep the plot taxa summary plot command(s)
    taxa_summary_plots_dir = '%s/taxa_summary_plots/' % output_dir
    try:
        makedirs(taxa_summary_plots_dir)
    except OSError:
        pass
    try:
        params_str = get_params_str(params['plot_taxa_summary'])
    except KeyError:
        params_str = ''
    # Build the plot taxa summary plot command(s)
    plot_taxa_summary_cmd =\
     '%s %s/plot_taxa_summary.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, ','.join(sum_taxa_fps),
      taxa_summary_plots_dir, params_str)
    commands.append(\
     [('Plot Taxonomy Summary',plot_taxa_summary_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger,
                    close_logger_on_success=close_logger_on_success)
1783
def run_ampliconnoise(mapping_fp,
    output_dir, command_handler, params, qiime_config,
    logger=None, status_update_callback=print_to_stdout,
    chimera_alpha=-3.8228,chimera_beta=0.6200, sff_txt_fp=None, numnodes=2,
    suppress_perseus=True, output_filepath=None, platform='flx',
    seqnoise_resolution=None, truncate_len=None):
    """ Run the ampliconnoise pipeline

        The steps performed by this function are:
        1. Split input sff.txt file into one file per sample

        2. Run scripts required for PyroNoise

        3. Run scripts required for SeqNoise

        4. Run scripts requred for Perseus (chimera removal)

        5. Merge output files into one file similar to the output of split_libraries.py

        output_filepath should be absolute
        seqnoise_resolution should be string
        environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
        careful passing command handlers that don't spawn child processes, as they
        may not inherit the correct environment variable setting
    """
    # NOTE(review): reconstructed from a garbled extraction of this function.
    map_data,headers,comments = parse_mapping_file(open(mapping_fp,'U'))
    create_dir(output_dir)

    # platform-dependent defaults for the two tunable pipeline settings
    if seqnoise_resolution == None:
        if platform=='flx': seqnoise_resolution = '30.0'
        elif platform=='titanium': seqnoise_resolution = '25.0'
        else: raise RuntimeError('seqnoise_resolution not set, and no'+\
            ' default for platform '+platform)

    if truncate_len == None:
        if platform=='flx': truncate_len = '220'
        elif platform=='titanium': truncate_len = '400'
        else: raise RuntimeError('truncate_len not set, and no'+\
            ' default for platform '+platform)

    sample_names = [] # these are filenames minus extension, and are sample IDs
    primer_seqs = [] # same order as sample_names
    bc_seqs = [] # same order as sample_names
    for i in range(len(map_data)):
        sample_names.append(map_data[i][headers.index('SampleID')])
        bc_seqs.append(map_data[i][headers.index('BarcodeSequence')])
        # don't know why don't just take off the primer now.
        # but that's done later
        # primer += (map_data[i][headers.index('LinkerPrimerSequence')])
        # for char, bases in IUPAC_DNA_ambiguities.items():
        #     primer = primer.replace(char,'['+''.join(bases)+']')

        # expand IUPAC ambiguity codes into regex character classes
        primer = (map_data[i][headers.index('LinkerPrimerSequence')])
        for char, bases in IUPAC_DNA_ambiguities.items():
            primer = primer.replace(char,'['+''.join(bases)+']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger,[mapping_fp,sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    fh = open(os.path.join(output_dir,'map.csv'),'w')
    for i in range(len(sample_names)):
        fh.write(sample_names[i]+','+bc_seqs[i]+'\n')
    fh.close()

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_'+truncate_len
    if suppress_perseus == True:
        fasta_result_names = [sample_name + post_pyro_tail+'_seqnoise_cd.fa'
         for sample_name in sample_names]
    else:
        fasta_result_names = [sample_name + '_Good.fa'
         for sample_name in sample_names]

    cmd = 'cd '+output_dir # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
        cmd)])

    cmd = 'SplitKeys.pl '+one_primer+' map.csv < '+\
        os.path.join(called_dir,sff_txt_fp)+\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for i, sample_name in enumerate(sample_names):

        # Build the flowgram cleaning command
        if platform == 'flx':
            # per-sample .raw flowgram file written by SplitKeys.pl above
            # NOTE(review): input filename reconstructed — confirm extension
            cmd = 'Clean360.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows '+sample_name, cmd)])

            # these run through the whole sff file once per sample, I think
            # cmd = "FlowsFA.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        elif platform == 'titanium':
            cmd = 'CleanMinMax.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows '+sample_name, cmd)])

            # cmd = "FlowsMinMax.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = "mpirun -np "+str(numnodes)+" PyroDist -in "+\
            sample_name+".dat -out "+sample_name+ " > "+sample_name+".pdout"
        commands.append([('pyrodist '+sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+".fdist -out "+sample_name+\
            " > "+sample_name+".fcout"
        commands.append([('fcluster pyrodist '+sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = "mpirun -np "+str(numnodes)+" PyroNoise -din "+\
            sample_name+".dat -out "+\
            sample_name+"_pyronoise "+"-lin "+\
            sample_name+".list -s 60.0 -c 0.01 > "+\
            sample_name+"_pyronoise.pnout"
        commands.append([('pyronoise '+sample_name, cmd)])

        cmd = 'Parse.pl '+bc_seqs[i]+one_primer+' '+truncate_len+' < '+\
            sample_name+'_pyronoise_cd.fa'+' > '+ sample_name+'_'+\
            truncate_len+'.fa'
        commands.append([('truncate '+sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = "mpirun -np "+str(numnodes)+" SeqDist -in "+\
            sample_name+post_pyro_tail+\
            ".fa > "+sample_name+post_pyro_tail+".seqdist"
        commands.append([('seqdist '+sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+"fcl > "+\
            sample_name+post_pyro_tail+".fcout"
        commands.append([('fcluster seqdist '+sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 >
        # PC.354_pyronoise_cd.snout
        cmd = "mpirun -np "+str(numnodes)+" SeqNoise -in "+\
            sample_name+post_pyro_tail+\
            ".fa -din "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+\
            "_seqnoise -lin "+sample_name+post_pyro_tail+'fcl.list -min '+\
            sample_name+'_pyronoise'+\
            '.mapping -s '+seqnoise_resolution+' -c 0.08 > '+\
            sample_name+post_pyro_tail+'.snout'
        commands.append([('seqnoise '+sample_name, cmd)])

        if suppress_perseus == False:
            # chimera removal via Perseus
            cmd = 'Perseus -sin '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa > ' +\
                sample_name+'.per'
            commands.append([('Perseus '+sample_name, cmd)])

            cmd = 'Class.pl '+sample_name+'.per '+\
                str(chimera_alpha) + ' '+ str(chimera_beta)+\
                ' > '+sample_name+'.class'
            commands.append([('Class.pl '+sample_name, cmd)])

            cmd = 'FilterGoodClass.pl '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa '+\
                sample_name+'.class 0.5 > '+sample_name+'_Chi.fa 2> '+\
                sample_name+'_Good.fa'
            commands.append([('FilterGoodClass '+sample_name, cmd)])

        cmd = '%s %s/unweight_fasta.py -i %s -o %s -l %s' %\
            (python_exe_fp, script_dir, fasta_result_names[i],
             sample_name+'_unw.fna', sample_name)
        commands.append([('unweight fasta '+sample_name, cmd)])

    cmd = 'cat ' +\
        ' '.join([sample_name+'_unw.fna' for sample_name in sample_names]) +\
        ' > ' + output_filepath # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger,
                    close_logger_on_success=close_logger_on_success)