2
// File GeneMapperCsvExport.cpp
3
// Authors : Sylvain Gaillard
4
// Last modification : April 2, 2008
2
// File: GeneMapperCsvExport.cpp
3
// Author: Sylvain Gaillard
4
// Created: April 2, 2008
8
Copyright or © or Copr. CNRS, (April 2, 2008)
8
Copyright or © or Copr. Bio++ Development Team, (April 2, 2008)
10
10
This software is a computer program whose purpose is to provide classes
11
11
for population genetics analysis.
13
13
This software is governed by the CeCILL license under French law and
14
abiding by the rules of distribution of free software. You can use,
14
abiding by the rules of distribution of free software. You can use,
15
15
modify and/ or redistribute the software under the terms of the CeCILL
16
16
license as circulated by CEA, CNRS and INRIA at the following URL
17
"http://www.cecill.info".
17
"http://www.cecill.info".
19
19
As a counterpart to the access to the source code and rights to copy,
20
20
modify and redistribute granted by the license, users are provided only
21
21
with a limited warranty and the software's author, the holder of the
22
22
economic rights, and the successive licensors have only limited
25
25
In this respect, the user's attention is drawn to the risks associated
26
26
with loading, using, modifying and/or developing or reproducing the
29
29
therefore means that it is reserved for developers and experienced
30
30
professionals having in-depth computer knowledge. Users are therefore
31
31
encouraged to load and test the software's suitability as regards their
32
requirements in conditions enabling the security of their systems and/or
33
data to be ensured and, more generally, to use and operate it in the
34
same conditions as regards security.
32
requirements in conditions enabling the security of their systems and/or
33
data to be ensured and, more generally, to use and operate it in the
34
same conditions as regards security.
36
36
The fact that you are presently reading this means that you have had
37
37
knowledge of the CeCILL license and that you accept its terms.
40
40
#include "GeneMapperCsvExport.h"
42
42
using namespace bpp;
43
43
using namespace std;
45
/**
 * @brief Build a GeneMapperCsvExport object.
 *
 * @param ia Value copied into the IndependentAlleles_ member.
 */
GeneMapperCsvExport::GeneMapperCsvExport(bool ia) :
  IndependentAlleles_(ia)
{}
45
// Column-header labels of the GeneMapper export table. Within this file,
// SAMPLE_NAME_H and MARKER_H are used to look up DataTable columns by name,
// and ALLELE_H is matched as a prefix of column names to find allele columns.
// NOTE(review): the labels ending in a space ("Allele ", "Size ", "Height ",
// "Peak Area ") are presumably prefixes completed by an allele index -- confirm
// against the export format.
const std::string GeneMapperCsvExport::SAMPLE_FILE_H = "Sample File";
46
const std::string GeneMapperCsvExport::SAMPLE_NAME_H = "Sample Name";
47
const std::string GeneMapperCsvExport::PANEL_H = "Panel";
48
const std::string GeneMapperCsvExport::MARKER_H = "Marker";
49
const std::string GeneMapperCsvExport::DYE_H = "Dye";
50
const std::string GeneMapperCsvExport::ALLELE_H = "Allele ";
51
const std::string GeneMapperCsvExport::SIZE_H = "Size ";
52
const std::string GeneMapperCsvExport::HEIGHT_H = "Height ";
53
const std::string GeneMapperCsvExport::PEAK_AREA_H = "Peak Area ";
54
const std::string GeneMapperCsvExport::DAC_H = "DAC";
55
const std::string GeneMapperCsvExport::AN_H = "AN";
57
/**
 * @brief Build a GeneMapperCsvExport object.
 *
 * @param ia Value copied into the IndependentAlleles_ member.
 */
GeneMapperCsvExport::GeneMapperCsvExport(bool ia) :
  IndependentAlleles_(ia)
{}
47
59
/**
 * @brief Destructor; the body is empty.
 */
GeneMapperCsvExport::~GeneMapperCsvExport()
{}
55
67
* Feed a DataTable with the data
57
DataTable dt = * DataTable::read(is, "\t", true, -1);
69
DataTable* dtp = DataTable::read(is, "\t", true, -1);
60
73
* Fix the individuals' names if there are duplicates in the file
62
75
vector<string> ind_names;
63
76
vector<string> markers;
65
ind_names = dt.getColumn("Sample Name");
66
markers = dt.getColumn("Marker");
79
ind_names = dt.getColumn(SAMPLE_NAME_H);
80
markers = dt.getColumn(MARKER_H);
68
catch (Exception &e) {
71
86
map<string, int> indname_marker;
72
for (unsigned int i = 0 ; i < dt.getNumberOfRows() ; i++) {
73
string test_lab = dt(i, "Sample Name") + dt(i, "Marker");
74
if (indname_marker.find(test_lab) != indname_marker.end()) {
75
string new_lab = dt(i, "Sample Name") + "_" + TextTools::toString(indname_marker[test_lab] + 1);
76
dt (i, "Sample Name") = new_lab;
87
for (size_t i = 0; i < dt.getNumberOfRows(); i++)
89
string test_lab = dt(i, SAMPLE_NAME_H) + dt(i, MARKER_H);
90
if (indname_marker.find(test_lab) != indname_marker.end())
92
string new_lab = dt(i, SAMPLE_NAME_H) + "_" + TextTools::toString(indname_marker[test_lab] + 1);
93
dt (i, SAMPLE_NAME_H) = new_lab;
78
95
indname_marker[test_lab]++;
80
ind_names = dt.getColumn("Sample Name");
97
ind_names = dt.getColumn(SAMPLE_NAME_H);
82
map<string, unsigned int> ind_count = VectorTools::countValues(ind_names);
99
map<string, size_t> ind_count = VectorTools::countValues(ind_names);
83
100
ind_names = VectorTools::unique(ind_names);
84
101
markers = VectorTools::unique(markers);
85
unsigned int loc_nbr = markers.size();
102
size_t loc_nbr = markers.size();
93
110
* Group of individuals
95
112
data_set.addEmptyGroup(0);
96
for (unsigned int i = 0 ; i < ind_names.size() ; i++) {
113
for (unsigned int i = 0; i < ind_names.size(); i++)
97
115
Individual ind(ind_names[i]);
98
116
data_set.addIndividualToGroup(data_set.getGroupPosition(0), ind);
104
122
AnalyzedLoci al(markers.size());
105
123
vector<string> col_names = dt.getColumnNames();
125
// Finds columns containing allele data
106
126
vector<unsigned int> alleles_cols;
107
for (unsigned int i = 0 ; i < col_names.size() ; i++)
108
if (TextTools::count(col_names[i], "Allele "))
127
for (unsigned int i = 0; i < col_names.size(); i++)
129
if (TextTools::startsWith(col_names[i], ALLELE_H))
109
130
alleles_cols.push_back(i);
110
vector<vector <unsigned int> > alleles_pos;
111
for (unsigned int i = 0 ; i < markers.size() ; i++) {
133
vector<vector<unsigned int> > alleles_pos;
134
for (unsigned int i = 0; i < markers.size(); i++)
112
136
al.setLocusInfo(i, LocusInfo(markers[i], LocusInfo::UNKNOWN));
113
vector<unsigned int> v = VectorTools::whichAll(dt.getColumn("Marker"), markers[i]);
114
alleles_pos.push_back(v);
116
for (unsigned int i = 0 ; i < alleles_cols.size() ; i++) {
117
for (unsigned int j = 0 ; j < markers.size() ; j++) {
118
vector<string> m_allele;
119
for (unsigned int k = 0 ; k < alleles_pos[j].size() ; k++) {
120
if (dt(alleles_pos[j][k],alleles_cols[i]) != string(""))
121
m_allele.push_back(dt(alleles_pos[j][k],alleles_cols[i]));
138
std::map< std::string, std::set< std::string > > markerAlleles;
139
for (unsigned int i = 0; i < dt.getNumberOfRows(); ++i)
141
for (unsigned int j = 0; j < alleles_cols.size(); ++j)
143
if (dt(i, alleles_cols[j]) != "")
145
markerAlleles[dt(i, MARKER_H)].insert(dt(i, alleles_cols[j]));
123
m_allele = VectorTools::unique(m_allele);
124
if (m_allele.size() > 0)
125
al.addAlleleInfoByLocusName(markers[j], BasicAlleleInfo(m_allele[0]));
149
for (std::map< std::string, std::set< std::string > >::iterator itm = markerAlleles.begin(); itm != markerAlleles.end(); itm++)
151
std::set< std::string >& s = itm->second;
152
for (std::set< std::string >::iterator its = s.begin(); its != s.end(); its++)
154
al.addAlleleInfoByLocusName(itm->first, BasicAlleleInfo(*its));
128
157
data_set.setAnalyzedLoci(al);
131
160
* Individuals' information
133
unsigned int ind_col_index = VectorTools::which(dt.getColumnNames(), string("Sample Name"));
134
unsigned int mark_col_index = VectorTools::which(dt.getColumnNames(), string("Marker"));
135
for (unsigned int i = 0 ; i < dt.getNumberOfRows() ; i++) {
136
vector<unsigned int> alleles;
137
for (unsigned int j = 0 ; j < alleles_cols.size() ; j++) {
138
if (!TextTools::isEmpty(dt(i,alleles_cols[j]))) {
139
unsigned int num = (data_set.getLocusInfoByName(dt(i, mark_col_index))).getAlleleInfoKey(dt(i,alleles_cols[j]));
162
size_t ind_col_index = VectorTools::which(dt.getColumnNames(), SAMPLE_NAME_H);
163
size_t mark_col_index = VectorTools::which(dt.getColumnNames(), MARKER_H);
164
for (size_t i = 0; i < dt.getNumberOfRows(); i++)
166
vector<size_t> alleles;
167
for (size_t j = 0; j < alleles_cols.size(); j++)
169
if (!TextTools::isEmpty(dt(i, alleles_cols[j])))
171
unsigned int num = (data_set.getLocusInfoByName(dt(i, mark_col_index))).getAlleleInfoKey(dt(i, alleles_cols[j]));
140
172
alleles.push_back(num);
143
175
alleles = VectorTools::unique(alleles);
144
176
MultiAlleleMonolocusGenotype ma(alleles);
145
if (!data_set.getIndividualByIdFromGroup(0,dt(i, ind_col_index))->hasGenotype())
146
data_set.initIndividualGenotypeInGroup(0, data_set.getIndividualPositionInGroup(0,dt(i, ind_col_index)));
177
if (!data_set.getIndividualByIdFromGroup(0, dt(i, ind_col_index))->hasGenotype())
178
data_set.initIndividualGenotypeInGroup(0, data_set.getIndividualPositionInGroup(0, dt(i, ind_col_index)));
147
179
if (alleles.size())
148
data_set.setIndividualMonolocusGenotypeInGroup(0, data_set.getIndividualPositionInGroup(0, dt(i, ind_col_index)), data_set.getAnalyzedLoci()->getLocusInfoPosition(dt(i, mark_col_index)),ma);
180
data_set.setIndividualMonolocusGenotypeInGroup(0, data_set.getIndividualPositionInGroup(0, dt(i, ind_col_index)), data_set.getAnalyzedLoci()->getLocusInfoPosition(dt(i, mark_col_index)), ma);
152
185
void GeneMapperCsvExport::read(const std::string& path, DataSet& data_set) throw (Exception)
159
192
return AbstractIDataSet::read(is);
162
DataSet* GeneMapperCsvExport::read(const std::string& path) throw (Exception)
195
DataSet* GeneMapperCsvExport::read(const std::string& path) throw (Exception)
164
197
return AbstractIDataSet::read(path);
200
// --- GeneMapperCsvExport::Record ---
201
GeneMapperCsvExport::Record::Record(const std::string& row) : sampleFile_(),
210
StringTokenizer st(row, "\t", true, false);
212
if (st.numberOfRemainingTokens() != 7 + 4 * alleleNumber) {
213
throw Exception("GeneMapperCsvExport::Record::Record: bad number of allele");
216
size_t itemNum = st.numberOfRemainingTokens();
217
size_t alleleNum = (itemNum - 7) / 4;
218
sampleFile_ = st.getToken(0);
219
sampleName_ = st.getToken(1);
220
panel_ = st.getToken(2);
221
markerName_ = st.getToken(3);
222
dye_ = st.getToken(4);
223
dac_ = st.getToken(itemNum - 2);
224
an_ = TextTools::toDouble(st.getToken(itemNum - 1));
225
for (unsigned int i = 0; i < alleleNum; ++i)
227
GeneMapperCsvExport::Allele al(
229
TextTools::toDouble(st.getToken(5 + alleleNum + i)),
230
TextTools::to<unsigned int>(st.getToken(5 + (2 * alleleNum) + i)),
231
TextTools::toDouble(st.getToken(5 + (3 * alleleNum) + i))
233
alleles_.push_back(al);