1
// -*- mode: C++; tab-width: 2; -*-
4
// --------------------------------------------------------------------------
5
// OpenMS Mass Spectrometry Framework
6
// --------------------------------------------------------------------------
7
// Copyright (C) 2003-2011 -- Oliver Kohlbacher, Knut Reinert
9
// This library is free software; you can redistribute it and/or
10
// modify it under the terms of the GNU Lesser General Public
11
// License as published by the Free Software Foundation; either
12
// version 2.1 of the License, or (at your option) any later version.
14
// This library is distributed in the hope that it will be useful,
15
// but WITHOUT ANY WARRANTY; without even the implied warranty of
16
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
// Lesser General Public License for more details.
19
// You should have received a copy of the GNU Lesser General Public
20
// License along with this library; if not, write to the Free Software
21
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
// --------------------------------------------------------------------------
24
// $Maintainer: Mathias Walzer $
25
// $Authors: Nico Pfeifer, Mathias Walzer $
26
// --------------------------------------------------------------------------
28
#include <OpenMS/ANALYSIS/SVM/SVMWrapper.h>
29
#include <OpenMS/FORMAT/IdXMLFile.h>
30
#include <OpenMS/FORMAT/TextFile.h>
31
#include <OpenMS/FORMAT/LibSVMEncoder.h>
32
#include <OpenMS/METADATA/ProteinIdentification.h>
33
#include <OpenMS/APPLICATIONS/TOPPBase.h>
34
#include <OpenMS/MATH/STATISTICS/StatisticFunctions.h>
38
using namespace OpenMS;
41
//-------------------------------------------------------------
43
//-------------------------------------------------------------
46
@page TOPP_RTPredict RTPredict
48
@brief This application is used to predict retention times
49
for peptides or peptide separation.
51
This methods and applications of this model are described
52
in several publications:
54
Nico Pfeifer, Andreas Leinenbach, Christian G. Huber and Oliver Kohlbacher
55
Statistical learning of peptide retention behavior in chromatographic separations: A new kernel-based approach for computational proteomics.
56
BMC Bioinformatics 2007, 8:468
58
Nico Pfeifer, Andreas Leinenbach, Christian G. Huber and Oliver Kohlbacher
59
Improving Peptide Identification in Proteome Analysis by a Two-Dimensional Retention Time Filtering Approach
60
J. Proteome Res. 2009, 8(8):4109-15
63
The input of this application
64
is an svm model and an IdXML
65
file with peptide identifications. The svm model file is specified
66
by the <b>svm_model</b> parameter in the command line or the ini file.
67
This file should have been produced by the @ref TOPP_RTModel application.
69
For retention time prediction the peptide sequences are extracted
70
from the IdXML inputfile
71
and passed to the svm. The svm then predicts retention times
72
according to the trained model. The predicted retention times
73
are stored as @code <userParam name="predicted_retention_time" value="<predicted retention time>" />
74
@endcode inside the peptide entities in the IdXML output file.
76
For separation prediction you have to specify two output file names.
77
'out_id:positive' is the filename of the peptides which are predicted
78
to be collected by the column and 'out_id:negative' is the file
79
of the predicted flowthrough peptides.
81
Retention time prediction and separation prediction cannot be combined!
83
<B>The command line parameters of this tool are:</B>
84
@verbinclude TOPP_RTPredict.cli
86
@todo This need serious clean up! Combining certain input and output options will
87
result in strange behaviour, especially when using text output/input.
90
// We do not want this class to show up in the docu:
98
: TOPPBase("RTPredict","Predicts retention times for peptides using a model trained by RTModel.")
104
void registerOptionsAndFlags_()
107
registerInputFile_("in_id","<file>","","peptides with precursor information", false);
108
setValidFormats_("in_id",StringList::create("idXML"));
109
registerInputFile_("in_text","<file>", "","peptides as text-based file", false);
112
registerTOPPSubsection_("out_id", "Output files in idXML format");
113
registerOutputFile_("out_id:file","<file>","","Output file with peptide RT prediction", false);
114
setValidFormats_("out_id:file",StringList::create("idXML"));
115
registerOutputFile_("out_id:positive","<file>","","Output file in IdXML format containing positive predictions (peptide separation prediction - requires negative file to be present as well)\n", false);
116
setValidFormats_("out_id:positive",StringList::create("idXML"));
117
registerOutputFile_("out_id:negative","<file>","","Output file in IdXML format containing negative predictions (peptide separation prediction - requires positive file to be present as well)\n", false);
118
setValidFormats_("out_id:negative",StringList::create("idXML"));
119
registerFlag_("out_id:rewrite_peptideidentification_rtmz", "rewrites each peptideidentification's rt and mz from prediction and calculation (according to the best hit)",true);
121
registerTOPPSubsection_("out_text","Output files in text format");
122
registerOutputFile_("out_text:file","<file>","","Output file with predicted RT values", false);
124
registerInputFile_("svm_model","<file>","","svm model in libsvm format (can be produced by RTModel)");
125
registerDoubleOption_("total_gradient_time","<time>",1.0,"the time (in seconds) of the gradient (peptide RT prediction)", false);
126
setMinFloat_("total_gradient_time", 0.00001);
127
registerIntOption_("max_number_of_peptides", "<int>",100000,"the maximum number of peptides considered at once (bigger number will lead to faster results but needs more memory).\n", false, true);
130
void loadStrings_(String filename, std::vector<String>& sequences)
132
TextFile text_file(filename.c_str(), true);
133
TextFile::iterator it;
137
it = text_file.begin();
138
while(it != text_file.end())
140
sequences.push_back((*it).trim());
145
void writeStringLabelLines_(String filename, map< String, DoubleReal > predicted_data)
148
map< String, DoubleReal >::iterator it;
150
os.open(filename.c_str(), ofstream::out);
152
for(it = predicted_data.begin(); it != predicted_data.end(); ++it)
154
os << it->first << " " << it->second << "\n";
160
ExitCodes main_(int , const char**)
162
IdXMLFile IdXML_file;
163
vector<ProteinIdentification> protein_identifications;
164
vector<PeptideIdentification> identifications;
165
vector< String > peptides;
166
vector< AASequence > modified_peptides;
167
vector< DoubleReal > training_retention_times;
168
vector<PeptideHit> temp_peptide_hits;
170
LibSVMEncoder encoder;
171
String allowed_amino_acid_characters = "ACDEFGHIKLMNPQRSTVWY";
172
vector<DoubleReal> predicted_retention_times;
173
vector<DoubleReal> all_predicted_retention_times;
174
map< String, DoubleReal > predicted_data;
175
map< AASequence, DoubleReal > predicted_modified_data;
176
svm_problem* prediction_data = NULL;
177
SVMData training_samples;
178
SVMData prediction_samples;
179
UInt border_length = 0;
180
UInt k_mer_length = 0;
181
DoubleReal sigma = 0;
182
DoubleReal sigma_0 = 0;
183
DoubleReal sigma_max = 0;
184
String temp_string = "";
185
UInt maximum_length = 50;
186
pair<DoubleReal, DoubleReal> temp_point;
187
vector<Real> performance_retention_times;
188
String svmfile_name = "";
189
Real total_gradient_time = 1.f;
190
bool separation_prediction = false;
191
vector<PeptideIdentification> identifications_positive;
192
vector<PeptideIdentification> identifications_negative;
193
bool first_dim_rt = false;
194
Size number_of_peptides = 0;
195
Size max_number_of_peptides = getIntOption_("max_number_of_peptides");
197
//-------------------------------------------------------------
198
// parsing parameters
199
//-------------------------------------------------------------
201
String outputfile_name_positive = getStringOption_("out_id:positive");
202
String outputfile_name_negative = getStringOption_("out_id:negative");
203
// for separation prediction, we require both files to be present!
204
if (outputfile_name_positive != "" || outputfile_name_negative != "")
206
if (outputfile_name_positive != "" && outputfile_name_negative != "")
208
separation_prediction = true;
212
writeLog_("Both files for separation prediction required. Please specify the other one as well. Aborting!");
213
return ILLEGAL_PARAMETERS;
218
String input_id = getStringOption_("in_id");
219
String input_text = getStringOption_("in_text");
220
if (input_text != "" && input_id != "")
222
writeLog_("Two input parameter files given, only one allowed! Use either -in_id:file or -in_text:file!");
223
return ILLEGAL_PARAMETERS;
225
else if (input_text == "" && input_id == "")
227
writeLog_("No input file given. Aborting...");
228
return ILLEGAL_PARAMETERS;
233
String output_id = getStringOption_("out_id:file");
234
String output_text = getStringOption_("out_text:file");
235
if (output_text == "" && output_id == "" && !separation_prediction)
237
writeLog_("No output files given. Aborting...");
238
return ILLEGAL_PARAMETERS;
241
svmfile_name = getStringOption_("svm_model");
242
total_gradient_time = getDoubleOption_("total_gradient_time");
244
//-------------------------------------------------------------
246
//-------------------------------------------------------------
248
svm.loadModel(svmfile_name);
250
if ((svm.getIntParameter(SVMWrapper::SVM_TYPE) == C_SVC || svm.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVC) && !separation_prediction)
252
writeLog_("You cannot perform peptide separation prediction with a model trained for"
253
+ String("\npeptide retention time prediction. Aborting!"));
254
return ILLEGAL_PARAMETERS;
256
if ((svm.getIntParameter(SVMWrapper::SVM_TYPE) != C_SVC && svm.getIntParameter(SVMWrapper::SVM_TYPE) != NU_SVC) && separation_prediction)
258
writeLog_("You cannot perform peptide retention time prediction with a model trained for\n"
259
+ String("peptide separation prediction. Aborting!"));
260
return ILLEGAL_PARAMETERS;
263
// Since the POBK is not included in the libsvm we have to load
264
// additional parameters from additional files.
265
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
267
inputFileReadable_(svmfile_name + "_additional_parameters", "svm_model (derived)");
269
Param additional_parameters;
271
additional_parameters.load(svmfile_name + "_additional_parameters");
272
if (additional_parameters.exists("first_dim_rt")
273
&& additional_parameters.getValue("first_dim_rt") != DataValue::EMPTY)
275
first_dim_rt = additional_parameters.getValue("first_dim_rt").toBool();
277
if (additional_parameters.getValue("kernel_type") != DataValue::EMPTY)
279
svm.setParameter(SVMWrapper::KERNEL_TYPE, ((String) additional_parameters.getValue("kernel_type")).toInt());
282
if (additional_parameters.getValue("border_length") == DataValue::EMPTY
283
&& svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
285
writeLog_("No border length saved in additional parameters file. Aborting!");
286
cout << "No border length saved in additional parameters file. Aborting!" << endl;
287
return ILLEGAL_PARAMETERS;
289
border_length = ((String)additional_parameters.getValue("border_length")).toInt();
290
if (additional_parameters.getValue("k_mer_length") == DataValue::EMPTY
291
&& svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
293
writeLog_("No k-mer length saved in additional parameters file. Aborting!");
294
cout << "No k-mer length saved in additional parameters file. Aborting!" << endl;
295
return ILLEGAL_PARAMETERS;
297
k_mer_length = ((String)additional_parameters.getValue("k_mer_length")).toInt();
298
if (additional_parameters.getValue("sigma") == DataValue::EMPTY
299
&& svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
301
writeLog_("No sigma saved in additional parameters file. Aborting!");
302
cout << "No sigma saved in additional parameters file. Aborting!" << endl;
303
return ILLEGAL_PARAMETERS;
305
sigma = ((String)additional_parameters.getValue("sigma")).toFloat();
306
if (!separation_prediction && additional_parameters.getValue("sigma_0") == DataValue::EMPTY)
308
writeLog_("No sigma_0 saved in additional parameters file. Aborting!");
309
cout << "No sigma_0 length saved in additional parameters file. Aborting!" << endl;
310
return ILLEGAL_PARAMETERS;
312
if (!separation_prediction && additional_parameters.getValue("sigma_0") != DataValue::EMPTY)
314
sigma_0 = additional_parameters.getValue("sigma_0");
316
if (!separation_prediction && additional_parameters.getValue("sigma_max") == DataValue::EMPTY)
318
writeLog_("No sigma_max saved in additional parameters file. Aborting!");
319
cout << "No sigma_max length saved in additional parameters file. Aborting!" << endl;
320
return ILLEGAL_PARAMETERS;
322
if (!separation_prediction && additional_parameters.getValue("sigma_max") != DataValue::EMPTY)
324
sigma_max = additional_parameters.getValue("sigma_max");
328
if ( input_text != "" )
330
loadStrings_(input_text, peptides);
331
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
333
for (Size i = 0; i < peptides.size(); ++i)
335
modified_peptides.push_back(AASequence(peptides[i]));
343
IdXML_file.load(input_id, protein_identifications, identifications, document_id);
346
//-------------------------------------------------------------
348
//-------------------------------------------------------------
352
for (Size i = 0; i < identifications.size(); i++)
354
temp_peptide_hits = identifications[i].getHits();
355
for (Size j = 0; j < temp_peptide_hits.size(); j++)
357
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
359
modified_peptides.push_back(temp_peptide_hits[j].getSequence());
363
peptides.push_back(temp_peptide_hits[j].getSequence().toUnmodifiedString());
368
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
370
number_of_peptides = modified_peptides.size();
374
number_of_peptides = peptides.size();
377
vector<DoubleReal> rts;
378
rts.resize(number_of_peptides, 0);
380
vector<String>::iterator it_from = peptides.begin();
381
vector<String>::iterator it_to = peptides.begin();
382
vector<AASequence>::iterator it_from_mod = modified_peptides.begin();
383
vector<AASequence>::iterator it_to_mod = modified_peptides.begin();
385
while(counter < number_of_peptides)
387
vector<String> temp_peptides;
388
vector<AASequence> temp_modified_peptides;
389
vector<DoubleReal> temp_rts;
391
Size temp_counter = 0;
392
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) != SVMWrapper::OLIGO)
394
while(temp_counter <= max_number_of_peptides && it_to != peptides.end())
399
temp_peptides.insert(temp_peptides.end(), it_from, it_to);
400
//temp_peptides.insert(temp_peptides.end(), peptides.begin(), peptides.end());
401
temp_rts.resize(temp_peptides.size(), 0);
404
encoder.encodeLibSVMProblemWithCompositionAndLengthVectors(temp_peptides,
406
allowed_amino_acid_characters,
410
else if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
412
while(temp_counter < max_number_of_peptides && it_to_mod != modified_peptides.end())
417
temp_modified_peptides.insert(temp_modified_peptides.end(), it_from_mod, it_to_mod);
418
// temp_modified_peptides.insert(temp_modified_peptides.end(), modified_peptides.begin(), modified_peptides.end());
419
temp_rts.resize(temp_modified_peptides.size(), 0);
421
encoder.encodeProblemWithOligoBorderVectors(temp_modified_peptides,
423
allowed_amino_acid_characters,
425
prediction_samples.sequences);
426
prediction_samples.labels = temp_rts;
427
it_from_mod = it_to_mod;
429
counter += temp_counter;
431
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
433
inputFileReadable_((svmfile_name + "_samples").c_str(), "svm_model (derived)");
435
training_samples.load(svmfile_name + "_samples");
436
svm.setTrainingSample(training_samples);
438
svm.setParameter(SVMWrapper::BORDER_LENGTH, (Int) border_length);
439
svm.setParameter(SVMWrapper::SIGMA, sigma);
440
svm.predict(prediction_samples, predicted_retention_times);
441
prediction_samples.labels.clear();
442
prediction_samples.sequences.clear();
446
svm.predict(prediction_data, predicted_retention_times);
447
LibSVMEncoder::destroyProblem(prediction_data);
449
for (Size i = 0; i < temp_counter; ++i)
451
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO && output_text=="")
453
predicted_modified_data.insert(make_pair(temp_modified_peptides[i],
454
(predicted_retention_times[i] * total_gradient_time)));
456
else if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) != SVMWrapper::OLIGO)
458
predicted_data.insert(make_pair(temp_peptides[i],
459
(predicted_retention_times[i] * total_gradient_time)));
463
predicted_data.insert(make_pair(temp_modified_peptides[i].toString(),
464
(predicted_retention_times[i] * total_gradient_time)));
467
all_predicted_retention_times.insert(all_predicted_retention_times.end(), predicted_retention_times.begin(), predicted_retention_times.end());
468
predicted_retention_times.clear();
473
if (!separation_prediction)
475
for (Size i = 0; i < identifications.size(); i++)
477
temp_peptide_hits = identifications[i].getHits();
479
for (Size j = 0; j < temp_peptide_hits.size(); j++)
481
DoubleReal temp_rt = 0.;
482
DoubleReal temp_p_value = 0.;
484
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
486
temp_rt = predicted_modified_data[temp_peptide_hits[j].getSequence()];
490
temp_rt = predicted_data[temp_peptide_hits[j].getSequence().toUnmodifiedString()];
495
temp_point.first = identifications[i].getMetaValue("first_dim_rt");
499
temp_point.first = 0;
500
if (identifications[i].metaValueExists("RT"))
502
temp_point.first = identifications[i].getMetaValue("RT");
505
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
507
temp_point.second = temp_rt;
508
temp_p_value = svm.getPValue(sigma_0, sigma_max, temp_point);
512
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
514
temp_peptide_hits[j].setMetaValue("predicted_RT_p_value_first_dim",temp_p_value);
516
temp_peptide_hits[j].setMetaValue("predicted_RT_first_dim",temp_rt);
517
performance_retention_times.push_back(identifications[i].getMetaValue("first_dim_rt"));
521
if (identifications[i].metaValueExists("RT"))
523
performance_retention_times.push_back(identifications[i].getMetaValue("RT"));
527
performance_retention_times.push_back(0);
529
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
531
temp_peptide_hits[j].setMetaValue("predicted_RT_p_value",temp_p_value);
533
temp_peptide_hits[j].setMetaValue("predicted_RT",temp_rt);
536
identifications[i].setHits(temp_peptide_hits);
537
if (getFlag_("out_id:rewrite_peptideidentification_rtmz"))
539
identifications[i].sort();
540
Int charge = identifications[i].getHits().front().getCharge();
541
DoubleReal mz = identifications[i].getHits().front().getSequence().getMonoWeight(Residue::Full, charge) / DoubleReal(charge);
542
DoubleReal rt = identifications[i].getHits().front().getMetaValue("predicted_RT");
544
identifications[i].setMetaValue("RT",rt);
545
identifications[i].setMetaValue("MZ",mz);
548
identifications[i].setHits(temp_peptide_hits);
551
else // separation prediction
553
vector<PeptideHit> hits_positive;
554
vector<PeptideHit> hits_negative;
556
PeptideIdentification temp_identification;
558
for (Size i = 0; i < identifications.size(); i++)
560
hits_negative.clear();
561
hits_positive.clear();
563
temp_peptide_hits = identifications[i].getHits();
564
for(vector<PeptideHit>::iterator it = temp_peptide_hits.begin();
565
it != temp_peptide_hits.end();
568
if (svm.getIntParameter(SVMWrapper::KERNEL_TYPE) == SVMWrapper::OLIGO)
570
if (predicted_modified_data[it->getSequence()] > 0)
572
hits_positive.push_back(*it);
576
hits_negative.push_back(*it);
581
if (predicted_data[it->getSequence().toUnmodifiedString()] > 0)
583
hits_positive.push_back(*it);
587
hits_negative.push_back(*it);
592
if (identifications[i].metaValueExists("MZ"))
594
temp_identification.setMetaValue("MZ",identifications[i].getMetaValue("MZ"));
596
if (identifications[i].metaValueExists("RT"))
598
temp_identification.setMetaValue("RT",identifications[i].getMetaValue("RT"));
601
temp_identification = identifications[i];
602
temp_identification.setHits(hits_positive);
603
identifications_positive.push_back(temp_identification);
604
temp_identification.setHits(hits_negative);
605
identifications_negative.push_back(temp_identification);
609
//-------------------------------------------------------------
611
//-------------------------------------------------------------
613
if (separation_prediction)
615
IdXML_file.store(outputfile_name_positive,
616
protein_identifications,
617
identifications_positive);
618
IdXML_file.store(outputfile_name_negative,
619
protein_identifications,
620
identifications_negative);
624
if (output_text != "") // text
626
writeStringLabelLines_(output_text, predicted_data);
628
if (output_id != "") // idXML
630
IdXML_file.store(output_id,
631
protein_identifications,
633
writeDebug_("Linear correlation between predicted and measured rt is: "
634
+ String(Math::pearsonCorrelationCoefficient(all_predicted_retention_times.begin(),
635
all_predicted_retention_times.end(),
636
performance_retention_times.begin(),
637
performance_retention_times.end())), 1);
638
writeDebug_("MSE between predicted and measured rt is: "
639
+ String(Math::meanSquareError(all_predicted_retention_times.begin(),
640
all_predicted_retention_times.end(),
641
performance_retention_times.begin(),
642
performance_retention_times.end())), 1);
651
int main( int argc, const char** argv )
654
return tool.main(argc,argv);