2
# File created on 19 May 2011
3
from __future__ import division
5
__author__ = "William Van Treuren"
6
__copyright__ = "Copyright 2011, The QIIME project"
7
__credits__ = ["William Van Treuren","Greg Caporaso"]
10
__maintainer__ = "William Van Treuren"
11
__email__ = "vantreur@colorado.edu"
12
__status__ = "Release"
14
from numpy import array, isnan
15
from qiime.parse import parse_mapping_file_to_dict, parse_rarefaction
16
from cogent.maths.stats.test import t_two_sample
18
def make_value_pairs_from_category(mapping_data, category):
    """creates all pairs of unique category values from mapping data

    input:
        mapping_data - a nested dictionary that maps SampleIds to
        descriptor categories (e.g. {'Id1': {'Weight':'Fat'}})

        category - a string which specifies a category found in the
        mapping data (e.g. 'Weight')

    output:
        unique_pairs - a list of unique pairs of the values found under
        category, each value paired once with every value seen after it
        (e.g. [('Obese','Fat'),('Obese','notFat'),('Fat','notFat')])

    raises:
        ValueError - if any sample in mapping_data lacks the category
    """

    # collect the category value of every sample; a sample without the
    # category means the request cannot be satisfied
    categories = []
    for key in mapping_data:
        try:
            categories.append(mapping_data[key][category])
        except KeyError:
            raise ValueError(('the specified category ({0}) was '
                'not found in the mapping file.').format(category))

    # strip duplicate values from this list, keeping first-seen order
    unique_vals = []
    for val in categories:
        if val not in unique_vals:
            unique_vals.append(val)

    # pair every unique value with each value that follows it, so no
    # pair is produced twice and no value is paired with itself
    unique_pairs = []
    for i, first in enumerate(unique_vals):
        for second in unique_vals[i+1:]:
            unique_pairs.append((first, second))

    return unique_pairs

def make_category_values_Id_dict(mapping_data, category):
    """makes dict that lists all SampleIds that have given category value

    input:
        mapping_data - a nested dictionary that maps SampleID to
        descriptor categories (e.g. {'Id1': {'Weight':'Fat'}})

        category - a string which specifies a category found in the
        mapping data (e.g. 'Weight')

    output:
        cat_val_Ids - a dictionary with category values as keys
        and a list of SampleIds which have the specific category value
        as values
        (e.g. {'Fat':['Id1','Id2'],'notFat':['Id3'],'Obese':['Id4']}).

    raises:
        KeyError - if a sample in mapping_data lacks the category
    """

    # group the SampleIds by their value for the category in one pass;
    # each list keeps the SampleIds in mapping_data iteration order
    cat_val_Ids = {}
    for key in mapping_data:
        cat_val_Ids.setdefault(mapping_data[key][category], []).append(key)

    return cat_val_Ids

def map_category_value_pairs_to_Ids(value_pairs, cat_val_Ids):
    """maps category value pairs to Id's which have that category value

    input:
        value_pairs - a list of pairs of categories (e.g.
        [('Obese','Fat'),('Fat','notFat'),('Obese','notFat')])

        cat_val_Ids - a dictionary with category values as keys
        and a list of SampleId's which have the specific category value
        as values
        (e.g. {'Fat':['Id1','Id2'],'notFat':['Id3'],'Obese':['Id4']}).

    output:
        mapped_pairs - the list of value_pairs with the values replaced
        by the SampleIds which have the values specified in the pair e.g
        [(['Id4'],['Id1','Id2']),(['Id1','Id2'],['Id3']),
        (['Id4'],['Id3'])]

    raises:
        KeyError - if a value in a pair is not a key of cat_val_Ids
    """

    # the Id lists are looked up, not copied: each tuple refers to the
    # same list objects held by cat_val_Ids
    mapped_pairs = [(cat_val_Ids[pair[0]], cat_val_Ids[pair[1]])
                    for pair in value_pairs]

    return mapped_pairs

def make_SampleIds_rarefaction_columns_dict(rarefaction_list):
    """maps SampleId to column in parsed rarefaction file output

    input:
        rarefaction_list - output of parse_rarefaction.py. a nested list
        of scores and SampleIds and other fields.

    output:
        map_from_Id_to_col - a dict which has as keys the SampleIds, and
        as values the col they are in in the parsed rarefaction list,
        counted from the first SampleId (which is column 0)
    """

    # the first 3 entries in the header row are not SampleIDs
    # so we dump them
    Ids = rarefaction_list[0][3:]

    # setdefault keeps the first column seen should a SampleId repeat,
    # which is the answer Ids.index(Id) would give, without rescanning
    # the list for every SampleId
    map_from_Id_to_col = {}
    for col, Id in enumerate(Ids):
        map_from_Id_to_col.setdefault(Id, col)

    return map_from_Id_to_col

def extract_rarefaction_scores_at_depth(depth, rarefaction_list):
    """makes rarefaction matrix with row=iteration and col=SampleId

    input:
        depth - an integer which corresponds to the depth of the
        rarefaction. also called the "sequences per sample" in the
        rarefaction file.

        rarefaction_list - output of parse_rarefaction.py. a nested list
        of scores and SampleIds and other fields.

    output:
        result - a matrix with rows=rarefaction scores at a given depth
        and iteration, and cols=SampleIds.

    raises:
        ValueError - if no row has the requested depth, or if any score
        at that depth is NaN
    """

    # the 4th element of rarefaction_list is a list of score rows. keep
    # the rows whose first element equals depth; the first two elements
    # are just rarefaction depth and iteration, throw these away
    score_matrix = [line[2:] for line in rarefaction_list[3]
                    if line[0] == depth]

    # raise error if rarefaction depth not found in rarefaction file
    if not score_matrix:
        raise ValueError(('Specified depth ({0}) was not found in '
            'the rarefaction file.').format(depth))

    result = array(score_matrix)

    # raise error if any rarefaction score at spec. depth is NaN
    if isnan(result).any():
        raise ValueError(('Specified depth ({0}) has NaNs for some '
            'rarefaction scores.').format(depth))

    return result

def convert_SampleIds_to_rarefaction_mtx(chosen_SampleIds,score_matrix,\
                                         map_from_SampleIds_to_cols):
    """converts list of SampleIDs to score mtx from rarefaction file

    input:
        chosen_SampleIds - a list of SampleIds

        score_matrix - a matrix created by
        extract_rarefaction_scores_at_depth which represents the
        rarefaction scores for a given depth

        map_from_SampleIds_to_cols - a dict which maps a SampleId to
        the column its scores are in in the score matrix

    output:
        reduced_scores_matrix - a matrix which is the input scores mtx
        with only the cols that correspond to the chosen_SampleIds, in
        the order the SampleIds were given

    raises:
        KeyError - if a chosen SampleId is not in the mapping dict
    """

    # translate each chosen SampleId into its column index in the
    # score matrix
    cols = [map_from_SampleIds_to_cols[Id] for Id in chosen_SampleIds]

    # grab only the columns we need; take() with axis=1 selects whole
    # columns and returns them in the order listed in cols
    reduced_scores_matrix = score_matrix.take(cols, axis=1)

    return reduced_scores_matrix

def compare_alpha_diversities(rarefaction_lines, mapping_lines,
253
"""compares alpha diversities
256
rarefaction_file - rarefaction file which gives scores for
257
various rarefactions and depths
259
mapping_file - file that has ID's and categories that the ID's
262
category - the category to be compared, is a string
264
depth - the depth of the rarefaction_file to use, is an integer
267
results - a nested dictionary which specifies the category as
268
the top level key, and as its value, dictionaries which give the
269
results of the t_two_sample test for all unique pairs of values
270
in the specified category
274
rarefaction_data = parse_rarefaction(rarefaction_lines)
275
mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
276
value_pairs = make_value_pairs_from_category(mapping_data, category)
278
category_values_Ids = make_category_values_Id_dict(mapping_data,
281
SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
284
map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
287
reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(depth,
290
results = {category:{}}
292
for pair in range(len(SampleId_pairs)):
293
i=(convert_SampleIds_to_rarefaction_mtx(SampleId_pairs[pair][0],
294
reduced_rarefaction_mtx, map_from_Id_to_col))
296
j=(convert_SampleIds_to_rarefaction_mtx(SampleId_pairs[pair][1],
297
reduced_rarefaction_mtx, map_from_Id_to_col))
299
results[category][(str(value_pairs[pair][0]),
300
str(value_pairs[pair][1]))] =\