1
// -*- mode: C++; tab-width: 2; -*-
4
// --------------------------------------------------------------------------
5
// OpenMS Mass Spectrometry Framework
6
// --------------------------------------------------------------------------
7
// Copyright (C) 2003-2011 -- Oliver Kohlbacher, Knut Reinert
9
// This library is free software; you can redistribute it and/or
10
// modify it under the terms of the GNU Lesser General Public
11
// License as published by the Free Software Foundation; either
12
// version 2.1 of the License, or (at your option) any later version.
14
// This library is distributed in the hope that it will be useful,
15
// but WITHOUT ANY WARRANTY; without even the implied warranty of
16
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
// Lesser General Public License for more details.
19
// You should have received a copy of the GNU Lesser General Public
20
// License along with this library; if not, write to the Free Software
21
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
// --------------------------------------------------------------------------
24
// $Maintainer: Chris Bielow $
25
// $Authors: Andreas Bertsch, Chris Bielow $
26
// --------------------------------------------------------------------------
28
#include <OpenMS/ANALYSIS/ID/FalseDiscoveryRate.h>
29
#include <OpenMS/DATASTRUCTURES/StringList.h>
30
#include <OpenMS/CONCEPT/LogStream.h>
34
#define FALSE_DISCOVERY_RATE_DEBUG
35
#undef FALSE_DISCOVERY_RATE_DEBUG
41
FalseDiscoveryRate::FalseDiscoveryRate()
42
: DefaultParamHandler("FalseDiscoveryRate")
44
defaults_.setValue("q_value", "true", "If 'true', the q-values will be calculated instead of the FDRs");
45
defaults_.setValidStrings("q_value", StringList::create("true,false"));
46
defaults_.setValue("use_all_hits", "false", "If 'true' not only the first hit, but all are used (peptides only)");
47
defaults_.setValidStrings("use_all_hits", StringList::create("true,false"));
48
defaults_.setValue("split_charge_variants", "false", "If set to 'true' charge variants are treated separately (for peptides of combined target/decoy searches only).");
49
defaults_.setValidStrings("split_charge_variants", StringList::create("true,false"));
50
defaults_.setValue("treat_runs_separately", "false", "If set to 'true' different search runs are treated separately (for peptides of combined target/decoy searches only).");
51
defaults_.setValidStrings("treat_runs_separately", StringList::create("true,false"));
52
defaults_.setValue("decoy_string", "_rev", "String which is appended at the accession of the protein to indicate that it is a decoy protein (for proteins only).");
53
defaults_.setValue("add_decoy_peptides","false", "If set to true, decoy peptides will be written to output file, too. The q-value is set to the closest target score.");
54
defaults_.setValidStrings("add_decoy_peptides", StringList::create("true,false"));
58
void FalseDiscoveryRate::apply(vector<PeptideIdentification>& ids)
60
bool q_value = param_.getValue("q_value").toBool();
61
bool use_all_hits = param_.getValue("use_all_hits").toBool();
62
bool treat_runs_separately = param_.getValue("treat_runs_separately").toBool();
63
bool split_charge_variants = param_.getValue("split_charge_variants").toBool();
64
bool add_decoy_peptides = param_.getValue("add_decoy_peptides").toBool();
65
#ifdef FALSE_DISCOVERY_RATE_DEBUG
66
cerr << "Parameters: q_value=" << q_value << ", use_all_hits=" << use_all_hits << ", treat_runs_separately=" << treat_runs_separately << ", split_charge_variants=" << split_charge_variants << endl;
72
LOG_WARN << "No peptide identifications given to FalseDiscoveryRate! No calculation performed.\n";
76
// first search for all identifiers and charge variants
77
set<String> identifiers;
78
set<SignedSize> charge_variants;
79
for (vector<PeptideIdentification>::iterator it = ids.begin(); it != ids.end(); ++it)
81
identifiers.insert(it->getIdentifier());
86
vector<PeptideHit> hits = it->getHits();
91
for (vector<PeptideHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
93
charge_variants.insert(pit->getCharge());
97
#ifdef FALSE_DISCOVERY_RATE_DEBUG
98
cerr << "#id-runs: " << identifiers.size() << " ";
99
for (set<String>::const_iterator it = identifiers.begin(); it != identifiers.end(); ++it)
106
cerr << "#of charge states: " << charge_variants.size() << " ";
107
for (set<SignedSize>::const_iterator it = charge_variants.begin(); it != charge_variants.end(); ++it)
114
for (set<SignedSize>::const_iterator zit = charge_variants.begin(); zit != charge_variants.end(); ++zit)
116
#ifdef FALSE_DISCOVERY_RATE_DEBUG
117
cerr << "Charge variant=" << *zit << endl;
120
// for all identifiers
121
for (set<String>::const_iterator iit = identifiers.begin(); iit != identifiers.end(); ++iit)
123
if (!treat_runs_separately && iit != identifiers.begin())
128
#ifdef FALSE_DISCOVERY_RATE_DEBUG
129
cerr << "Id-run: " << *iit << endl;
131
// get the scores of all peptide hits
132
vector<DoubleReal> target_scores, decoy_scores;
133
for (vector<PeptideIdentification>::iterator it = ids.begin(); it != ids.end(); ++it)
135
// if runs should be treated separately, the identifiers must be the same
136
if (treat_runs_separately && it->getIdentifier() != *iit)
141
for (Size i = 0; i < it->getHits().size(); ++i)
143
if (split_charge_variants && it->getHits()[i].getCharge() != *zit)
148
if (!it->getHits()[i].metaValueExists("target_decoy"))
150
LOG_FATAL_ERROR << "Meta value 'target_decoy' does not exists, reindex the idXML file with 'PeptideIndexer' first (run-id='" << it->getIdentifier() << ", rank=" << i+1 << " of " << it->getHits().size() << ")!" << endl;
151
throw Exception::MissingInformation(__FILE__,__LINE__,__PRETTY_FUNCTION__,"Meta value 'target_decoy' does not exist!");
154
String target_decoy(it->getHits()[i].getMetaValue("target_decoy"));
155
if (target_decoy == "target")
157
target_scores.push_back(it->getHits()[i].getScore());
161
if (target_decoy == "decoy" || target_decoy == "target+decoy")
163
decoy_scores.push_back(it->getHits()[i].getScore());
167
if (target_decoy != "")
169
LOG_FATAL_ERROR << "Unknown value of meta value 'target_decoy': '" << target_decoy << "'!" << endl;
176
#ifdef FALSE_DISCOVERY_RATE_DEBUG
177
cerr << "#target-scores=" << target_scores.size() << ", #decoy-scores=" << decoy_scores.size() << endl;
180
// check decoy scores
181
if (decoy_scores.empty())
183
String error_string = "FalseDiscoveryRate: #decoy sequences is zero! Setting all target sequences to q-value/FDR 0! ";
184
if (split_charge_variants || treat_runs_separately)
187
if (split_charge_variants)
189
error_string += "charge_variant=" + String(*zit) + " ";
191
if (treat_runs_separately)
193
error_string += "run-id=" + *iit;
197
LOG_ERROR << error_string;
200
// check target scores
201
if (target_scores.empty())
203
String error_string = "FalseDiscoveryRate: #target sequences is zero! Ignoring. ";
204
if (split_charge_variants || treat_runs_separately)
207
if (split_charge_variants)
209
error_string += "charge_variant=" + String(*zit) + " ";
211
if (treat_runs_separately)
213
error_string += "run-id=" + *iit;
217
LOG_ERROR << error_string;
220
if (target_scores.empty() || decoy_scores.empty())
222
// no remove the the relevant entries, or put 'pseudo-scores' in
223
for (vector<PeptideIdentification>::iterator it = ids.begin(); it != ids.end(); ++it)
225
// if runs should be treated separately, the identifiers must be the same
226
if (treat_runs_separately && it->getIdentifier() != *iit)
231
vector<PeptideHit> hits(it->getHits()), new_hits;
232
for (Size i = 0; i < hits.size(); ++i)
234
if (split_charge_variants && hits[i].getCharge() != *zit)
236
new_hits.push_back(hits[i]);
240
if (!hits[i].metaValueExists("target_decoy"))
242
LOG_FATAL_ERROR << "Meta value 'target_decoy' does not exists, reindex the idXML file with 'PeptideIndexer' first (run-id='" << it->getIdentifier() << ", rank=" << i+1 << " of " << hits.size() << ")!" << endl;
243
throw Exception::MissingInformation(__FILE__,__LINE__,__PRETTY_FUNCTION__,"Meta value 'target_decoy' does not exist!");
246
String target_decoy(hits[i].getMetaValue("target_decoy"));
247
if (target_decoy == "target")
249
// if it is a target hit, there are now decoys, fdr/q-value should be zero then
250
new_hits.push_back(hits[i]);
251
String score_type = it->getScoreType() + "_score";
252
new_hits.back().setMetaValue(score_type, new_hits.back().getScore());
253
new_hits.back().setScore(0);
257
if (target_decoy != "decoy" && target_decoy != "target+decoy")
259
LOG_FATAL_ERROR << "Unknown value of meta value 'target_decoy': '" << target_decoy << "'!" << endl;
263
it->setHits(new_hits);
268
// calculate fdr for the forward scores
269
bool higher_score_better(ids.begin()->isHigherScoreBetter());
270
Map<DoubleReal, DoubleReal> score_to_fdr;
271
calculateFDRs_(score_to_fdr, target_scores, decoy_scores, q_value, higher_score_better);
274
for (vector<PeptideIdentification>::iterator it = ids.begin(); it != ids.end(); ++it)
276
// if runs should be treated separately, the identifiers must be the same
277
if (treat_runs_separately && it->getIdentifier() != *iit)
282
String score_type = it->getScoreType() + "_score";
283
vector<PeptideHit> hits;
284
for (vector<PeptideHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
286
PeptideHit hit = *pit;
288
if (split_charge_variants && pit->getCharge() != *zit)
290
hits.push_back(*pit);
293
if (hit.metaValueExists("target_decoy"))
295
String meta_value = (String)hit.getMetaValue("target_decoy");
296
if ((meta_value == "decoy" || meta_value == "target+decoy") && !add_decoy_peptides)
301
hit.setMetaValue(score_type, pit->getScore());
302
hit.setScore(score_to_fdr[pit->getScore()]);
308
if (!split_charge_variants)
314
// higher-score-better can be set now, calculations are finished
315
for (vector<PeptideIdentification>::iterator it = ids.begin(); it != ids.end(); ++it)
319
if (it->getScoreType() != "q-value")
321
it->setScoreType("q-value");
326
if (it->getScoreType() != "FDR")
328
it->setScoreType("FDR");
331
it->setHigherScoreBetter(false);
338
void FalseDiscoveryRate::apply(vector<PeptideIdentification>& fwd_ids, vector<PeptideIdentification>& rev_ids)
340
if (fwd_ids.empty() || rev_ids.empty())
344
vector<DoubleReal> target_scores, decoy_scores;
345
// get the scores of all peptide hits
346
for (vector<PeptideIdentification>::const_iterator it = fwd_ids.begin(); it != fwd_ids.end(); ++it)
348
for (vector<PeptideHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
350
target_scores.push_back(pit->getScore());
354
for (vector<PeptideIdentification>::const_iterator it = rev_ids.begin(); it != rev_ids.end(); ++it)
356
for (vector<PeptideHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
358
decoy_scores.push_back(pit->getScore());
362
bool q_value(param_.getValue("q_value").toBool());
363
bool higher_score_better(fwd_ids.begin()->isHigherScoreBetter());
364
bool add_decoy_peptides = param_.getValue("add_decoy_peptides").toBool();
365
// calculate fdr for the forward scores
366
Map<DoubleReal, DoubleReal> score_to_fdr;
367
calculateFDRs_(score_to_fdr, target_scores, decoy_scores, q_value, higher_score_better);
370
String score_type = fwd_ids.begin()->getScoreType() + "_score";
371
for (vector<PeptideIdentification>::iterator it = fwd_ids.begin(); it != fwd_ids.end(); ++it)
375
it->setScoreType("q-value");
379
it->setScoreType("FDR");
382
it->setHigherScoreBetter(false);
383
vector<PeptideHit> hits = it->getHits();
384
for (vector<PeptideHit>::iterator pit = hits.begin(); pit != hits.end(); ++pit)
386
#ifdef FALSE_DISCOVERY_RATE_DEBUG
387
cerr << pit->getScore() << " " << score_to_fdr[pit->getScore()] << endl;
389
pit->setMetaValue(score_type, pit->getScore());
390
pit->setScore(score_to_fdr[pit->getScore()]);
394
//write as well decoy peptides
395
if(add_decoy_peptides)
397
score_type = rev_ids.begin()->getScoreType() + "_score";
398
for (vector<PeptideIdentification>::iterator it = rev_ids.begin(); it != rev_ids.end(); ++it)
402
it->setScoreType("q-value");
406
it->setScoreType("FDR");
409
it->setHigherScoreBetter(false);
410
vector<PeptideHit> hits = it->getHits();
411
for (vector<PeptideHit>::iterator pit = hits.begin(); pit != hits.end(); ++pit)
413
#ifdef FALSE_DISCOVERY_RATE_DEBUG
414
cerr << pit->getScore() << " " << score_to_fdr[pit->getScore()] << endl;
416
pit->setMetaValue(score_type, pit->getScore());
417
pit->setScore(score_to_fdr[pit->getScore()]);
426
void FalseDiscoveryRate::apply(vector<ProteinIdentification>& ids)
430
LOG_WARN << "No protein identifications given to FalseDiscoveryRate! No calculation performed.\n";
434
vector<DoubleReal> target_scores, decoy_scores;
435
String decoy_string = (String)param_.getValue("decoy_string");
436
for (vector<ProteinIdentification>::const_iterator it = ids.begin(); it != ids.end(); ++it)
438
for (vector<ProteinHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
440
if (pit->getAccession().hasSubstring(decoy_string))
442
decoy_scores.push_back(pit->getScore());
446
target_scores.push_back(pit->getScore());
451
bool q_value(param_.getValue("q_value").toBool());
452
bool higher_score_better(ids.begin()->isHigherScoreBetter());
454
// calculate fdr for the forward scores
455
Map<DoubleReal, DoubleReal> score_to_fdr;
456
calculateFDRs_(score_to_fdr, target_scores, decoy_scores, q_value, higher_score_better);
459
String score_type = ids.begin()->getScoreType() + "_score";
460
for (vector<ProteinIdentification>::iterator it = ids.begin(); it != ids.end(); ++it)
464
it->setScoreType("q-value");
468
it->setScoreType("FDR");
470
it->setHigherScoreBetter(false);
471
vector<ProteinHit> hits = it->getHits();
472
for (vector<ProteinHit>::iterator pit = hits.begin(); pit != hits.end(); ++pit)
474
pit->setMetaValue(score_type, pit->getScore());
475
pit->setScore(score_to_fdr[pit->getScore()]);
483
void FalseDiscoveryRate::apply(vector<ProteinIdentification>& fwd_ids, vector<ProteinIdentification>& rev_ids)
485
if (fwd_ids.empty() || rev_ids.empty())
489
vector<DoubleReal> target_scores, decoy_scores;
490
// get the scores of all peptide hits
491
for (vector<ProteinIdentification>::const_iterator it = fwd_ids.begin(); it != fwd_ids.end(); ++it)
493
for (vector<ProteinHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
495
target_scores.push_back(pit->getScore());
498
for (vector<ProteinIdentification>::const_iterator it = rev_ids.begin(); it != rev_ids.end(); ++it)
500
for (vector<ProteinHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
502
decoy_scores.push_back(pit->getScore());
506
bool q_value(param_.getValue("q_value").toBool());
507
bool higher_score_better(fwd_ids.begin()->isHigherScoreBetter());
508
// calculate fdr for the forward scores
509
Map<DoubleReal, DoubleReal> score_to_fdr;
510
calculateFDRs_(score_to_fdr, target_scores, decoy_scores, q_value, higher_score_better);
513
String score_type = fwd_ids.begin()->getScoreType() + "_score";
514
for (vector<ProteinIdentification>::iterator it = fwd_ids.begin(); it != fwd_ids.end(); ++it)
518
it->setScoreType("q-value");
522
it->setScoreType("FDR");
524
it->setHigherScoreBetter(false);
525
vector<ProteinHit> hits = it->getHits();
526
for (vector<ProteinHit>::iterator pit = hits.begin(); pit != hits.end(); ++pit)
528
pit->setMetaValue(score_type, pit->getScore());
529
pit->setScore(score_to_fdr[pit->getScore()]);
538
void FalseDiscoveryRate::calculateFDRs_(Map<DoubleReal, DoubleReal>& score_to_fdr, vector<DoubleReal>& target_scores, vector<DoubleReal>& decoy_scores, bool q_value, bool higher_score_better)
540
Size number_of_target_scores = target_scores.size();
542
if (higher_score_better && !q_value)
544
sort(target_scores.rbegin(), target_scores.rend());
545
sort(decoy_scores.rbegin(), decoy_scores.rend());
547
else if (!higher_score_better && !q_value)
549
sort(target_scores.begin(), target_scores.end());
550
sort(decoy_scores.begin(), decoy_scores.end());
552
else if (higher_score_better)
554
sort(target_scores.begin(), target_scores.end());
555
sort(decoy_scores.rbegin(), decoy_scores.rend());
559
sort(target_scores.rbegin(), target_scores.rend());
560
sort(decoy_scores.begin(), decoy_scores.end());
564
DoubleReal minimal_fdr = 1.;
568
for (Size i = 0; i != target_scores.size(); ++i)
570
if (decoy_scores.empty())
572
// set FDR to 0 (done below automatically)
574
else if (i == 0 && j == 0)
576
while (j != decoy_scores.size()
577
&&((target_scores[i] <= decoy_scores[j] && higher_score_better) ||
578
(target_scores[i] >= decoy_scores[j] && !higher_score_better)))
585
if (j == decoy_scores.size())
590
&&((target_scores[i] > decoy_scores[j] && higher_score_better) ||
591
(target_scores[i] < decoy_scores[j] && !higher_score_better)))
595
// Since j has to be equal to the number of fps above the threshold we add one
596
if ((target_scores[i] <= decoy_scores[j] && higher_score_better)
597
|| (target_scores[i] >= decoy_scores[j] && !higher_score_better))
603
#ifdef FALSE_DISCOVERY_RATE_DEBUG
604
cerr << target_scores[i] << " " << decoy_scores[j] << " " << i << " " << j << " ";
609
if (minimal_fdr >= (DoubleReal)j / (number_of_target_scores - i))
611
minimal_fdr = (DoubleReal)j / (number_of_target_scores - i);
615
#ifdef FALSE_DISCOVERY_RATE_DEBUG
618
score_to_fdr[target_scores[i]] = fdr;
624
for (Size i = 0; i != target_scores.size(); ++i)
626
while (j != decoy_scores.size() &&
627
((target_scores[i] <= decoy_scores[j] && higher_score_better) ||
628
(target_scores[i] >= decoy_scores[j] && !higher_score_better)))
633
#ifdef FALSE_DISCOVERY_RATE_DEBUG
634
cerr << target_scores[i] << " " << decoy_scores[j] << " " << i << " " << j << " ";
638
fdr = (DoubleReal)j / (DoubleReal)(i + 1);
640
#ifdef FALSE_DISCOVERY_RATE_DEBUG
643
score_to_fdr[target_scores[i]] = fdr;
648
// assign q-value of decoy_score to closest target_score
649
for (Size i = 0; i != decoy_scores.size(); ++i)
651
Size closest_idx = 0;
652
for (Size j = 0; j != target_scores.size(); ++j)
654
if (fabs(decoy_scores[i] - target_scores[j]) < fabs(decoy_scores[i] - target_scores[closest_idx]))
659
score_to_fdr[decoy_scores[i]] = score_to_fdr[target_scores[closest_idx]];
664
} // namespace OpenMS