1
// -*- Mode: C++; tab-width: 2; -*-
4
// --------------------------------------------------------------------------
5
// OpenMS Mass Spectrometry Framework
6
// --------------------------------------------------------------------------
7
// Copyright (C) 2003-2011 -- Oliver Kohlbacher, Knut Reinert
9
// This library is free software; you can redistribute it and/or
10
// modify it under the terms of the GNU Lesser General Public
11
// License as published by the Free Software Foundation; either
12
// version 2.1 of the License, or (at your option) any later version.
14
// This library is distributed in the hope that it will be useful,
15
// but WITHOUT ANY WARRANTY; without even the implied warranty of
16
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
// Lesser General Public License for more details.
19
// You should have received a copy of the GNU Lesser General Public
20
// License along with this library; if not, write to the Free Software
21
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
// --------------------------------------------------------------------------
24
// $Maintainer: Andreas Bertsch $
25
// $Authors: Andreas Bertsch $
26
// --------------------------------------------------------------------------
28
#include <OpenMS/FILTERING/TRANSFORMERS/Normalizer.h>
29
#include <OpenMS/COMPARISON/SPECTRA/SpectrumAlignmentScore.h>
30
#include <OpenMS/CONCEPT/Factory.h>
31
#include <OpenMS/ANALYSIS/ID/PILISCrossValidation.h>
32
#include <OpenMS/CHEMISTRY/TheoreticalSpectrumGenerator.h>
33
#include <OpenMS/ANALYSIS/ID/PILISModel.h>
34
#include <OpenMS/ANALYSIS/MRM/MRMFragmentSelection.h>
40
PILISCrossValidation::PILISCrossValidation()
41
: DefaultParamHandler("PILISCrossValidation")
43
defaults_.setValue("nfold", 10, "Number of partitions to use for cross validation");
44
defaults_.setValue("optimization_method", "tophit_against_all_others", "Scoring method used for optimization");
45
defaults_.setValidStrings("optimization_method",StringList::create("tophit_against_all_others,only_top_hit,top_n_ions,top_n_ions_by"));
46
defaults_.setValue("compare_function", "SpectrumAlignmentScore", "Spectra scoring function to use");
47
defaults_.setValidStrings("compare_function", StringList::create("SpectrumAlignmentScore,ZhangSimilarityScore"));
48
defaults_.setValue("num_top_peaks", 2, "Number of highest abundant peaks to consider with top_n_ion and top_n_ions_by optimization_methods");
49
defaults_.setValue("min_intensity", 0.30, "Min relative intensity of highest abundant peaks to consider in top_n_ions_by");
50
defaults_.setValue("fragment_mass_tolerance", 0.5, "Fragment mass tolerance, mainly used in compare function.");
51
defaults_.setValue("normalize_to_TIC", "true", "Whether the spectra should be normalized to TIC before training, to max of one otherwise.");
52
defaults_.setValidStrings("normalize_to_TIC", StringList::create("true,false"));
56
PILISCrossValidation::PILISCrossValidation(const PILISCrossValidation& rhs)
57
: DefaultParamHandler(rhs),
58
cv_options_(rhs.cv_options_)
62
PILISCrossValidation::~PILISCrossValidation()
66
PILISCrossValidation& PILISCrossValidation::operator = (const PILISCrossValidation& rhs)
70
cv_options_ = rhs.cv_options_;
75
void PILISCrossValidation::partition_(vector<vector<Peptide> >& parts, const vector<Peptide>& source)
77
Size nfold(param_.getValue("nfold"));
78
parts = vector<vector<Peptide> >(nfold);
81
set<SignedSize> used_numbers;
83
while (count != source.size())
85
// get new random variable [0, source.size()]
86
Size r = (Size)(DoubleReal(rand())/DoubleReal(RAND_MAX) * (source.size()));
88
if (used_numbers.find(r) == used_numbers.end())
91
used_numbers.insert(r);
92
parts[part++].push_back(source[r]);
104
// generate parameter permutations
105
void PILISCrossValidation::generateParameters_(const Param& param, const Map<String, Option>& options, vector<Param>& parameters)
109
parameters.push_back(param);
112
for (Map<String, Option>::ConstIterator it = options.begin(); it != options.end(); ++it)
114
Map<String, Option> new_options = options;
115
new_options.erase(new_options.find(it->first));
116
Param new_param = param;
118
if (it->second.type == Option::DOUBLE)
120
DoubleReal dbl_min(it->second.dbl_min), dbl_max(it->second.dbl_max);
121
if (dbl_min > dbl_max)
123
cerr << "PILISCrossValidation: " << it->first << " min-value > max-value! (" << dbl_min << ", " << dbl_max << ")" << endl;
125
for (DoubleReal value = dbl_min; value <= dbl_max; value += it->second.dbl_stepsize)
127
new_param.setValue(it->first, value);
128
generateParameters_(new_param, new_options, parameters);
132
if (it->second.type == Option::INT)
134
Int int_min(it->second.int_min), int_max(it->second.int_max);
135
if (int_min > int_max)
137
cerr << "PILISCrossValidation: " << it->first << " min-value > max-value! (" << int_min << ", " << int_max << ")" << endl;
139
for (Int value = int_min; value <= int_max; value += it->second.int_stepsize)
141
new_param.setValue(it->first, value);
142
generateParameters_(new_param, new_options, parameters);
150
void PILISCrossValidation::apply(Param& PILIS_param, const PILISModel& base_model, const vector<Peptide>& peptides)
152
vector<vector<Peptide> > partitions;
153
partition_(partitions, peptides);
155
SpectrumAlignmentScore sa;
156
Param sa_param(sa.getParameters());
157
sa_param.setValue("tolerance", (DoubleReal)PILIS_param.getValue("fragment_mass_tolerance"));
158
sa_param.setValue("use_linear_factor", "true");
159
sa.setParameters(sa_param);
161
Param model_param = base_model.getParameters();
162
vector<Param> all_parameters;
163
generateParameters_(model_param, cv_options_, all_parameters);
166
vector<Param> all_new_parameters;
167
for (vector<Param>::const_iterator it1 = all_parameters.begin(); it1 != all_parameters.end(); ++it1)
170
for (vector<Param>::const_iterator it2 = all_new_parameters.begin(); it2 != all_new_parameters.end(); ++it2)
179
all_new_parameters.push_back(*it1);
182
all_parameters = all_new_parameters;
183
//cerr << all_parameters.size() << " parameters generated" << endl;
184
for (Size i = 0; i != all_parameters.size(); ++i)
186
for (Param::ParamIterator it = all_parameters[i].begin(); it != all_parameters[i].end(); ++it)
188
//cerr << i+1 << " " << it->name << " " << it->value.toString() << endl;
192
Normalizer to_one_normalizer, tic_normalizer;
193
Param normalizer_param(to_one_normalizer.getParameters());
194
normalizer_param.setValue("method", "to_one");
195
to_one_normalizer.setParameters(normalizer_param);
196
normalizer_param.setValue("method", "to_TIC");
197
tic_normalizer.setParameters(normalizer_param);
199
bool normalize_to_TIC(param_.getValue("normalize_to_TIC").toBool());
201
// iterate over all parameter setting permutations
202
vector<DoubleReal> scores;
203
for (vector<Param>::const_iterator pait = all_parameters.begin(); pait != all_parameters.end(); ++pait)
205
cerr << "Param #" << pait - all_parameters.begin() + 1 << "/" << all_parameters.size() << endl;
206
PILISModel model = base_model;
207
model.setParameters(*pait);
208
vector<DoubleReal> top_scores, non_top_scores;
209
vector<vector<RichPeakSpectrum> > exp_spectra;
210
vector<vector<vector<RichPeakSpectrum> > > sim_spectra;
212
// perform cross validation
213
for (Size i = 0; i != partitions.size(); ++i)
215
cerr << "Partition #" << i + 1 << endl;
216
for (Size j = 0; j != partitions.size(); ++j)
218
for (vector<Peptide>::const_iterator it = partitions[j].begin(); it != partitions[j].end(); ++it)
222
RichPeakSpectrum spec = it->spec;
225
if (normalize_to_TIC)
227
tic_normalizer.filterSpectrum(spec);
231
to_one_normalizer.filterSpectrum(spec);
233
//cerr << "training with " << it->sequence << " " << it->charge << "...";
234
model.train(it->spec, it->sequence, it->charge);
235
//cerr << "ended" << endl;
240
vector<vector<RichPeakSpectrum> > sim_spectra_part;
241
vector<RichPeakSpectrum> exp_spectra_part;
242
// evaluate this setting
243
for (vector<Peptide>::const_iterator it = partitions[i].begin(); it != partitions[i].end(); ++it)
245
RichPeakSpectrum exp_spec = it->spec;
246
to_one_normalizer.filterSpectrum(exp_spec);
247
exp_spec.sortByPosition();
248
exp_spectra_part.push_back(exp_spec);
250
//cerr << "Evalulating...(#peptides=" << it->hits.size() << ") ";
251
vector<DoubleReal> new_scores;
253
vector<RichPeakSpectrum> sim_spectra_hits;
254
for (vector<PeptideHit>::const_iterator pit = it->hits.begin(); pit != it->hits.end(); ++pit)
256
RichPeakSpectrum sim_spec;
257
//cerr << "simulate " << pit->getSequence() << " " << pit->getCharge() << "..";
258
model.getSpectrum(sim_spec, pit->getSequence(), pit->getCharge());
261
prec.setCharge(pit->getCharge());
262
prec.setPosition((pit->getSequence().getMonoWeight() + (DoubleReal)pit->getCharge())/(DoubleReal)pit->getCharge());
263
sim_spec.getPrecursors().push_back(prec);
265
//sim_spec.getPrecursorPeak().setCharge(pit->getCharge());
266
//sim_spec.getPrecursorPeak().setPosition((pit->getSequence().getMonoWeight() + (DoubleReal)pit->getCharge())/(DoubleReal)pit->getCharge());
268
to_one_normalizer.filterSpectrum(sim_spec);
269
sim_spec.sortByPosition();
270
sim_spectra_hits.push_back(sim_spec);
272
//cerr << "ended" << endl;
273
sim_spectra_part.push_back(sim_spectra_hits);
275
sim_spectra.push_back(sim_spectra_part);
276
exp_spectra.push_back(exp_spectra_part);
279
scores.push_back(scoreHits(sim_spectra, exp_spectra));
282
Size best_param_pos(0);
283
DoubleReal max_score(0);
285
for (vector<DoubleReal>::const_iterator it = scores.begin(); it != scores.end(); ++it, ++pos)
290
best_param_pos = pos;
295
cerr << "Best parameters (score=" << max_score << ")" << endl;
296
for (Param::ParamIterator it = all_parameters[best_param_pos].begin(); it != all_parameters[best_param_pos].end(); ++it)
298
cerr << " " << it->name << " " << it->value.toString() << endl;
300
PILIS_param = all_parameters[best_param_pos];
305
DoubleReal PILISCrossValidation::scoreHits(const vector<vector<vector<RichPeakSpectrum> > >& sim_spectra, const vector<vector<RichPeakSpectrum> >& exp_spectra)
307
String optimization_method = param_.getValue("optimization_method");
309
if (optimization_method == "tophit_against_all_others")
311
// consider all against all
312
vector<DoubleReal> top_scores, non_top_scores;
313
for (Size i = 0; i != sim_spectra.size(); ++i)
315
for (Size j = 0; j != sim_spectra[i].size(); ++j)
317
for (Size k = 0; k != sim_spectra[i][j].size(); ++k)
319
DoubleReal score = scoreSpectra_(sim_spectra[i][j][k], exp_spectra[i][j]);
322
top_scores.push_back(score);
326
non_top_scores.push_back(score);
333
for (vector<DoubleReal>::const_iterator it1 = top_scores.begin(); it1 != top_scores.end(); ++it1)
335
for (vector<DoubleReal>::const_iterator it2 = non_top_scores.begin(); it2 != non_top_scores.end(); ++it2)
340
DoubleReal score = sum / (DoubleReal)(top_scores.size() * non_top_scores.size());
341
cerr << "Avg. score-diff for param: " << score << endl;
345
if (optimization_method == "only_top_hit")
347
// consider only top-scores
348
vector<DoubleReal> top_scores;
349
for (Size i = 0; i != sim_spectra.size(); ++i)
351
for (Size j = 0; j != sim_spectra[i].size(); ++j)
353
if ( !sim_spectra[i][j].empty())
355
DoubleReal score = scoreSpectra_(sim_spectra[i][j][0], exp_spectra[i][j]);
356
top_scores.push_back(score);
361
for (vector<DoubleReal>::const_iterator it = top_scores.begin(); it != top_scores.end(); ++it)
365
DoubleReal score(sum / (DoubleReal)top_scores.size());
366
cerr << "Avg. score for param: " << score << endl;
370
if (optimization_method == "top_n_ions")
372
MRMFragmentSelection fragment_selection;
373
Size num_correct_topn(0), num_all(0);
374
for (Size i = 0; i != sim_spectra.size(); ++i)
376
for (Size j = 0; j != sim_spectra[i].size(); ++j)
379
if ( !sim_spectra[i][j].empty() )
381
vector<RichPeak1D> exp_highest_peak, sim_highest_peak;
382
fragment_selection.selectFragments(exp_highest_peak, exp_spectra[i][j]);
383
fragment_selection.selectFragments(sim_highest_peak, sim_spectra[i][j][0]);
385
DoubleReal fragment_mass_tolerance((DoubleReal)param_.getValue("fragment_mass_tolerance"));
386
bool has_topn(false);
387
for (Size ii = 0; ii < exp_highest_peak.size(); ++ii)
389
for (Size jj = ii; jj < sim_highest_peak.size(); ++jj)
391
if (fabs(exp_highest_peak[ii].getMZ() - sim_highest_peak[jj].getMZ()) < fragment_mass_tolerance)
406
DoubleReal score((DoubleReal)num_correct_topn / (DoubleReal)num_all);
407
cerr << "Avg. score in top " << param_.getValue("num_top_peaks") << ": " << score << endl;
411
if (optimization_method == "top_n_ions_by")
413
MRMFragmentSelection fragment_selection;
414
DoubleReal fragment_mass_tolerance((DoubleReal)param_.getValue("fragment_mass_tolerance"));
415
DoubleReal min_intensity((DoubleReal)param_.getValue("min_intensity"));
417
TheoreticalSpectrumGenerator tsg;
418
Param tsg_param(tsg.getParameters());
419
tsg_param.setValue("add_metainfo", "true");
420
tsg.setParameters(tsg_param);
422
SpectrumAlignment aligner;
423
Param aligner_param(aligner.getParameters());
424
aligner_param.setValue("tolerance", fragment_mass_tolerance);
425
aligner.setParameters(aligner_param);
427
Size num_correct_topn(0), num_all(0);
428
for (Size i = 0; i != sim_spectra.size(); ++i)
430
for (Size j = 0; j != sim_spectra[i].size(); ++j)
433
if ( !sim_spectra[i][j].empty() )
435
vector<RichPeak1D> sim_highest_peak;
436
//cerr << "Peptide: " << exp_spectra[i][j].getPeptideIdentifications().begin()->getHits().begin()->getSequence() << " "
437
// << exp_spectra[i][j].getPeptideIdentifications().begin()->getHits().begin()->getCharge() << endl;
438
cerr << "Sim-highest peaks: (sim-spectrum size=" << sim_spectra[i][j][0].size() << ") ";
440
for (Size p = 0; p != sim_spectra[i][j][0].size(); ++p)
442
cerr << sim_spectra[i][j][0][p].getMZ() << " " << sim_spectra[i][j][0][p].getIntensity() << " " << sim_spectra[i][j][0][p].getMetaValue("IonName") << endl;
444
fragment_selection.selectFragments(sim_highest_peak, sim_spectra[i][j][0]);
445
cerr << sim_highest_peak.size();
448
// normalize the exp_spectrum to the highest peaks which could be picked
449
RichPeakSpectrum exp_spec(exp_spectra[i][j]);
450
RichPeakSpectrum theo_spec;
451
const AASequence& peptide(exp_spec.getPeptideIdentifications().begin()->getHits().begin()->getSequence());
452
tsg.addPeaks(theo_spec, peptide, Residue::BIon, 1);
453
//tsg.addPeaks(theo_spec, peptide, Residue::BIon, 2); // TODO check which charge states are allowed
454
tsg.addPeaks(theo_spec, peptide, Residue::YIon, 1);
455
//tsg.addPeaks(theo_spec, peptide, Residue::YIon, 2);
456
theo_spec.sortByPosition();
457
vector<pair<Size, Size> > alignment;
458
aligner.getSpectrumAlignment(alignment, exp_spec, theo_spec);
459
vector<RichPeak1D> exp_highest_peak;
461
for (vector<pair<Size, Size> >::const_iterator it = alignment.begin(); it != alignment.end(); ++it)
463
exp_spec[it->first].setMetaValue("IonName", (String)theo_spec[it->second].getMetaValue("IonName"));
464
exp_highest_peak.push_back(exp_spec[it->first]);
467
//cerr << "Exp-highest peaks: ";
468
fragment_selection.selectFragments(exp_highest_peak, exp_spec);
471
// normalize the exp spectrum to max possible intensity to be selected
472
DoubleReal max_exp_int(0);
473
for (vector<RichPeak1D>::const_iterator it = exp_highest_peak.begin(); it != exp_highest_peak.end(); ++it)
475
if (it->getIntensity() > max_exp_int)
477
max_exp_int = it->getIntensity();
481
for (vector<RichPeak1D>::iterator it = exp_highest_peak.begin(); it != exp_highest_peak.end(); ++it)
483
it->setIntensity(it->getIntensity() / max_exp_int);
486
bool has_topn(false);
487
for (Size ii = 0; ii < exp_highest_peak.size(); ++ii)
489
for (Size jj = 0; jj < sim_highest_peak.size(); ++jj)
491
//cerr << ii << " " << jj << " " << exp_highest_peak[ii].getMZ() << " " << sim_highest_peak[jj].getMZ() << " " << exp_highest_peak[ii].getIntensity() << endl;
492
if (fabs(exp_highest_peak[ii].getMZ() - sim_highest_peak[jj].getMZ()) < fragment_mass_tolerance
493
&& exp_highest_peak[ii].getIntensity() >= min_intensity)
495
cerr << "Found: " << exp_spec.getPeptideIdentifications().begin()->getHits().begin()->getSequence()
496
<< " " << exp_spec.getPeptideIdentifications().begin()->getHits().begin()->getCharge()
497
<< " exp m/z=" << exp_highest_peak[ii].getMZ() << ", sim m/z=" << sim_highest_peak[jj].getMZ()
498
<< " exp int=" << exp_highest_peak[ii].getIntensity() << ", sim m/z=" << sim_highest_peak[jj].getIntensity() << endl;
512
DoubleReal score((DoubleReal)num_correct_topn / (DoubleReal)num_all);
513
cerr << "Avg. score in top " << param_.getValue("num_top_peaks") << " with a least " << min_intensity * 100.0 << "% intensity: " << score << endl;
517
cerr << "PILISCrossValidation: unknown optimization_method: " << optimization_method << endl;
521
DoubleReal PILISCrossValidation::scoreSpectra_(const RichPeakSpectrum& spec1, const RichPeakSpectrum& spec2)
524
for (RichPeakSpectrum::ConstIterator piit = spec1.begin(); piit != spec1.end(); ++piit)
527
p.setPosition(piit->getPosition()[0]);
528
p.setIntensity(piit->getIntensity());
531
for (RichPeakSpectrum::ConstIterator piit = spec2.begin(); piit != spec2.end(); ++piit)
534
p.setPosition(piit->getPosition()[0]);
535
p.setIntensity(piit->getIntensity());
538
return (*pscf_)(s1, s2);
541
void PILISCrossValidation::updateMembers_()
543
pscf_ = Factory<PeakSpectrumCompareFunctor>::create(param_.getValue("compare_function"));
544
Param compare_param(pscf_->getParameters());
545
if (compare_param.exists("tolerance"))
547
compare_param.setValue("tolerance", (DoubleReal)param_.getValue("fragment_mass_tolerance"));
548
pscf_->setParameters(compare_param);