3
3
__author__ = "Rob Knight, Greg Caporaso"
4
4
__copyright__ = "Copyright 2011, The QIIME Project"
5
__credits__ = ["Rob Knight", "Greg Caporaso", "Kyle Bittinger", "Antonio Gonzalez Pena", "David Soergel"]
5
__credits__ = ["Rob Knight", "Greg Caporaso", "Kyle Bittinger",
6
"Antonio Gonzalez Pena", "David Soergel", "Jai Ram Rideout"]
8
9
__maintainer__ = "Greg Caporaso"
9
10
__email__ = "gregcaporaso@gmail.com"
10
11
__status__ = "Release"
15
from os import system, remove, path, mkdir
16
from os.path import split, splitext
18
18
from itertools import count
19
19
from string import strip
20
20
from shutil import copy as copy_file
21
21
from tempfile import NamedTemporaryFile
22
22
from cStringIO import StringIO
23
23
from cogent import LoadSeqs, DNA
24
from qiime.util import get_tmp_filename
25
24
from cogent.app.formatdb import build_blast_db_from_fasta_path
26
25
from cogent.app.blast import blast_seqs, Blastall, BlastResult
27
26
from qiime.pycogent_backports import rdp_classifier
28
from qiime.pycogent_backports import rtax
27
from cogent.app import rtax
28
from qiime.pycogent_backports import mothur
29
from cogent.app.util import ApplicationNotFoundError
29
30
from cogent.parse.fasta import MinimalFastaParser
30
from qiime.util import FunctionWithParams, get_rdp_jarpath
31
from qiime.util import FunctionWithParams, get_rdp_jarpath, get_qiime_temp_dir
33
# Load Tax2Tree if it's available. If it's not, skip it, but set up
34
# to raise errors if the user tries to use it.
36
from t2t.nlevel import load_consensus_map, load_tree, determine_rank_order
37
from qiime.pycogent_backports import tax2tree
40
def raise_tax2tree_not_found_error(*args, **kwargs):
41
raise ApplicationNotFoundError,\
42
"Tax2Tree cannot be found.\nIs Tax2Tree installed? Is it in your $PYTHONPATH?"+\
43
"\nYou can obtain Tax2Tree from http://sourceforge.net/projects/tax2tree/."
44
#set functions which cannot be imported to raise_tax2tree_not_found_error
45
load_consensus = load_tree = determine_rank_order = tax2tree_controller = raise_tax2tree_not_found_error
33
47
"""Contains code for assigning taxonomy, using several techniques.
35
49
This module has the responsibility for taking a set of sequences and
36
50
providing a taxon assignment for each sequence."""
38
def guess_rdp_version(rdp_jarpath=None):
53
def validate_rdp_version(rdp_jarpath=None):
39
54
if rdp_jarpath is None:
40
55
rdp_jarpath = get_rdp_jarpath()
41
56
if rdp_jarpath is None:
43
"RDP classifier is not installed or "
44
"not accessible to QIIME. See install instructions here: "
45
"http://qiime.org/install/install.html#rdp-install")
46
elif "2.2" in rdp_jarpath:
50
"RDP classifier filename does not look like version 2.2. Only "
51
"version 2.2 is supported by QIIME. RDP jar path is:"
58
"RDP classifier is not installed or not accessible to QIIME. "
59
"See install instructions here: "
60
"http://qiime.org/install/install.html#rdp-install"
63
rdp_jarname = os.path.basename(rdp_jarpath)
64
version_match = re.search("\d\.\d", rdp_jarname)
65
if version_match is None:
67
"Unable to detect RDP Classifier version in file %s" % rdp_jarname
70
version = float(version_match.group())
73
"RDP Classifier does not look like version 2.2 or greater."
74
"Versions of the software prior to 2.2 have different "
75
"formatting conventions and are no longer supported by QIIME. "
76
"Detected version %s from file %s" % (version, rdp_jarpath)
55
81
class TaxonAssigner(FunctionWithParams):
333
class MothurTaxonAssigner(TaxonAssigner):
334
"""Assign taxonomy using Mothur's naive Bayes implementation
336
Name = 'MothurTaxonAssigner'
337
Application = "Mothur"
339
"Schloss, P.D., et al., Introducing mothur: Open-source, platform-"
340
"independent, community-supported software for describing and "
341
"comparing microbial communities. Appl Environ Microbiol, 2009. "
344
_tracked_properties = ['Application', 'Citation']
346
def __init__(self, params):
351
'id_to_taxonomy_fp': None,
352
'reference_sequences_fp': None,
354
_params.update(params)
355
super(MothurTaxonAssigner, self).__init__(_params)
357
def __call__(self, seq_path, result_path=None, log_path=None):
358
seq_file = open(seq_path)
359
percent_confidence = int(self.Params['Confidence'] * 100)
360
result = mothur.mothur_classify_file(
362
ref_fp=self.Params['reference_sequences_fp'],
363
tax_fp=self.Params['id_to_taxonomy_fp'],
364
cutoff=percent_confidence,
365
iters=self.Params['Iterations'],
366
ksize=self.Params['KmerSize'],
367
output_fp=result_path,
370
self.writeLog(log_path)
305
374
class RdpTaxonAssigner(TaxonAssigner):
306
375
"""Assign taxon using RDP's naive Bayesian classifier
308
377
Name = "RdpTaxonAssigner"
309
Application = "RDP classfier, version 2.2"
378
Application = "RDP classfier"
310
379
Citation = "Wang, Q, G. M. Garrity, J. M. Tiedje, and J. R. Cole. 2007. Naive Bayesian Classifier for Rapid Assignment of rRNA Sequences into the New Bacterial Taxonomy. Appl Environ Microbiol. 73(16):5261-7."
312
381
_tracked_properties = ['Application','Citation','Taxonomy']
346
407
result to the desired path instead of returning it.
347
408
log_path: path to log, which should include dump of params.
410
tmp_dir = get_qiime_temp_dir()
350
411
min_conf = self.Params['Confidence']
351
412
training_data_properties_fp = self.Params['training_data_properties_fp']
352
413
reference_sequences_fp = self.Params['reference_sequences_fp']
353
414
id_to_taxonomy_fp = self.Params['id_to_taxonomy_fp']
354
415
max_memory = self.Params['max_memory']
356
seq_file = open(seq_path, 'r')
417
seq_file = open(seq_path, 'U')
357
418
if reference_sequences_fp and id_to_taxonomy_fp:
358
419
# Train and assign taxonomy
359
420
taxonomy_file, training_seqs_file = self._generate_training_files()
360
results = self._train_fcn(
421
results = rdp_classifier.train_rdp_classifier_and_assign_taxonomy(
361
422
training_seqs_file, taxonomy_file, seq_file,
362
423
min_confidence=min_conf,
363
424
classification_output_fp=result_path,
364
max_memory=max_memory)
425
max_memory=max_memory, tmp_dir=tmp_dir)
366
428
if result_path is None:
367
429
results = self._training_set.fix_results(results)
383
449
"""Returns a tuple of file objects suitable for passing to the
384
450
RdpTrainer application controller.
452
tmp_dir = get_qiime_temp_dir()
386
453
training_set = RdpTrainingSet()
387
reference_seqs_file = open(self.Params['reference_sequences_fp'], 'r')
388
id_to_taxonomy_file = open(self.Params['id_to_taxonomy_fp'], 'r')
454
reference_seqs_file = open(self.Params['reference_sequences_fp'], 'U')
455
id_to_taxonomy_file = open(self.Params['id_to_taxonomy_fp'], 'U')
390
457
for seq_id, seq in MinimalFastaParser(reference_seqs_file):
391
458
training_set.add_sequence(seq_id, seq)
397
464
training_set.dereplicate_taxa()
399
466
rdp_taxonomy_file = NamedTemporaryFile(
400
prefix='RdpTaxonAssigner_taxonomy_', suffix='.txt')
467
prefix='RdpTaxonAssigner_taxonomy_', suffix='.txt', dir=tmp_dir)
401
468
rdp_taxonomy_file.write(training_set.get_rdp_taxonomy())
402
469
rdp_taxonomy_file.seek(0)
404
471
rdp_training_seqs_file = NamedTemporaryFile(
405
prefix='RdpTaxonAssigner_training_seqs_', suffix='.fasta')
472
prefix='RdpTaxonAssigner_training_seqs_', suffix='.fasta',
406
474
for rdp_id, seq in training_set.get_training_seqs():
407
475
rdp_training_seqs_file.write('>%s\n%s\n' % (rdp_id, seq))
408
476
rdp_training_seqs_file.seek(0)
417
485
self._tree = RdpTree()
418
486
self.sequences = {}
419
487
self.sequence_nodes = {}
488
self.lineage_depth = None
421
490
def add_sequence(self, seq_id, seq):
422
491
self.sequences[seq_id] = seq
424
493
def add_lineage(self, seq_id, lineage_str):
494
for char, escape_str in _QIIME_RDP_ESCAPES:
495
lineage_str = re.sub(char, escape_str, lineage_str)
425
496
lineage = self._parse_lineage(lineage_str)
426
497
seq_node = self._tree.insert_lineage(lineage)
427
498
self.sequence_nodes[seq_id] = seq_node
434
505
lineage string of an id_to_taxonomy file.
436
507
lineage = lineage_str.strip().split(';')
437
if len(lineage) != 6:
508
if self.lineage_depth is None:
509
self.lineage_depth = len(lineage)
510
if len(lineage) != self.lineage_depth:
438
511
raise ValueError(
439
'Each reference assignment must contain 6 items, specifying '
440
'domain, phylum, class, order, family, and genus. '
441
'Detected %s items in "%s": %s.' % \
442
(len(lineage), lineage_str, lineage))
512
'Because the RDP Classifier operates in a bottom-up manner, '
513
'each taxonomy assignment in the id-to-taxonomy file must have '
514
'the same number of ranks. Detected %s ranks in the first '
515
'item of the file, but detected %s ranks later in the file. '
516
'Offending taxonomy string: %s' %
517
(self.lineage_depth, len(lineage), lineage_str))
445
520
def get_training_seqs(self):
469
544
# Ultimate hack to replace mangled taxa names
470
545
temp_results = StringIO()
471
546
for line in open(result_path):
472
untagged_line = re.sub(
473
548
_QIIME_RDP_TAXON_TAG + "[^;\n\t]*", '', line)
474
temp_results.write(untagged_line)
549
for char, escape_str in _QIIME_RDP_ESCAPES:
550
line = re.sub(escape_str, char, line)
551
temp_results.write(line)
475
552
open(result_path, 'w').write(temp_results.getvalue())
477
554
def fix_results(self, results_dict):
478
555
for seq_id, assignment in results_dict.iteritems():
479
556
lineage, confidence = assignment
480
revised_lineage = re.sub(
481
558
_QIIME_RDP_TAXON_TAG + "[^;\n\t]*", '', lineage)
482
results_dict[seq_id] = (revised_lineage, confidence)
559
for char, escape_str in _QIIME_RDP_ESCAPES:
560
lineage = re.sub(escape_str, char, lineage)
561
results_dict[seq_id] = (lineage, confidence)
483
562
return results_dict
538
616
def dereplicate_taxa(self):
617
# We check that there are no duplicate taxon names (case insensitive)
618
# at a given depth. We must do a case insensitive check because the RDP
619
# classifier converts taxon names to lowercase when it checks for
620
# duplicates, and will throw an error otherwise.
539
621
taxa_by_depth = {}
540
622
for node in self.get_nodes():
542
624
depth = node.depth
543
625
current_names = taxa_by_depth.get(depth, set())
544
if name in current_names:
626
if name.lower() in current_names:
545
627
node.name = name + _QIIME_RDP_TAXON_TAG + str(node.id)
547
current_names.add(name)
629
current_names.add(name.lower())
548
630
taxa_by_depth[depth] = current_names
550
632
def get_rdp_taxonomy(self):
580
673
Name = "RtaxTaxonAssigner"
581
674
Application = "RTAX classifier" # ", version 0.98" # don't hardcode the version number, as it may change, and then the log output test would fail
582
Citation = "Soergel D.A.W., Dey N., Knight R., and Brenner S.E. 2012. Selection of primers for optimal taxonomic classification of environmental 16S rRNA gene sequences. ISME J."
675
Citation = "Soergel D.A.W., Dey N., Knight R., and Brenner S.E. 2012. Selection of primers for optimal taxonomic classification of environmental 16S rRNA gene sequences. ISME J (6), 1440-1444"
583
676
_tracked_properties = ['Application','Citation']
585
678
def __init__(self, params):
639
734
amplicon_id_regex=self.Params['amplicon_id_regex']
641
736
# seq_file = open(seq_path, 'r')
642
738
results = rtax.assign_taxonomy(seq_path, reference_sequences_fp, id_to_taxonomy_fp,
643
read_1_seqs_fp, read_2_seqs_fp, single_ok=single_ok,
739
read_1_seqs_fp, read_2_seqs_fp, single_ok=single_ok, no_single_ok_generic=no_single_ok_generic,
644
740
header_id_regex=header_id_regex, read_id_regex=read_id_regex,
645
741
amplicon_id_regex=amplicon_id_regex, output_fp=result_path,
742
log_path=log_path,base_tmp_dir=get_qiime_temp_dir())
747
class Tax2TreeTaxonAssigner(TaxonAssigner):
748
"""Assign taxon using Tax2Tree
750
Name = "Tax2TreeTaxonAssigner"
751
Application = "Tax2Tree"
752
Citation = "Daniel McDonald"
754
def __init__(self, params):
755
"""Returns a new Tax2TreeAssigner object with specified params
758
#Required. Used as consensus map.
759
'id_to_taxonomy_fp': None,
760
#Required. The aligned and filtered tree of combined input and reference seqs.
763
_params.update(params)
764
TaxonAssigner.__init__(self, _params)
766
def __call__(self, seq_path=None, result_path=None, log_path=None):
767
"""Returns a dict mapping {seq_id:(taxonomy, confidence)} for each seq
769
Keep in mind, "confidence" is only done for consistency and in fact
770
all assignments will have a score of 0 because a method for determining
771
confidence is not currently implemented.
774
seq_path: path to file of sequences. The sequences themselves are
775
never actually used, but they are needed for their ids.
776
result_path: path to file of results. If specified, dumps the
777
result to the desired path instead of returning it.
778
log_path: path to log, which should include dump of params.
781
# initialize the logger
782
logger = self._get_logger(log_path)
783
logger.info(str(self))
785
with open(seq_path, 'U') as f:
786
seqs = dict(MinimalFastaParser(f))
788
consensus_map = tax2tree.prep_consensus(open(self.Params['id_to_taxonomy_fp']), seqs.keys())
789
seed_con = consensus_map[0].strip().split('\t')[1]
790
determine_rank_order(seed_con)
792
tipnames_map = load_consensus_map(consensus_map, False)
794
tree = load_tree(open(self.Params['tree_fp']), tipnames_map)
796
results = tax2tree.generate_constrings(tree, tipnames_map)
797
results = tax2tree.clean_output(results, seqs.keys())
800
# if the user provided a result_path, write the
802
with open(result_path,'w') as f:
803
for seq_id, (lineage, confidence) in results.iteritems():
804
f.write('%s\t%s\t%s\n' %(seq_id, lineage, confidence))
805
logger.info('Result path: %s' % result_path)
810
def _get_logger(self, log_path=None):
811
if log_path is not None:
812
handler = logging.FileHandler(log_path, mode='w')
814
class NullHandler(logging.Handler):
815
def emit(self, record): pass
816
handler = NullHandler()
817
logger = logging.getLogger("Tax2TreeTaxonAssigner logger")
818
logger.addHandler(handler)
819
logger.setLevel(logging.INFO)