~ubuntu-branches/ubuntu/wily/openms/wily

« back to all changes in this revision

Viewing changes to source/MATH/STATISTICS/PosteriorErrorProbabilityModel.C

  • Committer: Package Import Robot
  • Author(s): Filippo Rusconi
  • Date: 2013-12-20 11:30:16 UTC
  • mfrom: (5.1.2 sid)
  • Revision ID: package-import@ubuntu.com-20131220113016-wre5g9bteeheq6he
Tags: 1.11.1-3
* remove version number from libbost development package names;
* ensure that AUTHORS is correctly shipped in all packages.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
// -*- mode: C++; tab-width: 2; -*-
2
 
// vi: set ts=2:
3
 
//
4
 
// --------------------------------------------------------------------------
5
 
//                   OpenMS Mass Spectrometry Framework
6
 
// --------------------------------------------------------------------------
7
 
//  Copyright (C) 2003-2009 -- Oliver Kohlbacher, Knut Reinert
8
 
//
9
 
//  This library is free software; you can redistribute it and/or
10
 
//  modify it under the terms of the GNU Lesser General Public
11
 
//  License as published by the Free Software Foundation; either
12
 
//  version 2.1 of the License, or (at your option) any later version.
13
 
//
14
 
//  This library is distributed in the hope that it will be useful,
15
 
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 
//  Lesser General Public License for more details.
18
 
//
19
 
//  You should have received a copy of the GNU Lesser General Public
20
 
//  License along with this library; if not, write to the Free Software
21
 
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
1
// --------------------------------------------------------------------------
 
2
//                   OpenMS -- Open-Source Mass Spectrometry
 
3
// --------------------------------------------------------------------------
 
4
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
 
5
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
 
6
//
 
7
// This software is released under a three-clause BSD license:
 
8
//  * Redistributions of source code must retain the above copyright
 
9
//    notice, this list of conditions and the following disclaimer.
 
10
//  * Redistributions in binary form must reproduce the above copyright
 
11
//    notice, this list of conditions and the following disclaimer in the
 
12
//    documentation and/or other materials provided with the distribution.
 
13
//  * Neither the name of any author or any participating institution
 
14
//    may be used to endorse or promote products derived from this software
 
15
//    without specific prior written permission.
 
16
// For a full list of authors, refer to the file AUTHORS.
 
17
// --------------------------------------------------------------------------
 
18
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 
19
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 
20
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 
21
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
 
22
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 
23
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 
24
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 
25
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 
26
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 
27
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 
28
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22
29
//
23
30
// --------------------------------------------------------------------------
24
31
// $Maintainer: David Wojnar $
36
43
 
37
44
namespace OpenMS
38
45
{
39
 
        namespace Math
40
 
        {
41
 
                PosteriorErrorProbabilityModel::PosteriorErrorProbabilityModel()
42
 
                : DefaultParamHandler("PosteriorErrorProbabilityModel"), negative_prior_(0.5),max_incorrectly_(0),max_correctly_(0), smallest_score_(0)
43
 
        {
44
 
                        defaults_.setValue("number_of_bins", 100, "Number of bins used for visualization. Only needed if each iteration step of the EM-Algorithm will be visualized", StringList::create("advanced"));
45
 
                        defaults_.setValue("output_plots","false","If true every step of the EM-algorithm will be written to a file as a gnuplot formula",StringList::create("advanced"));
46
 
                        defaults_.setValidStrings("output_plots",StringList::create("true,false"));
47
 
                        defaults_.setValue("output_name","", "if output_plots is on, the output files will be saved in the following manner: <output_name>scores.txt for the scores and <output_name> which contains each step of the EM-algorithm e.g. output_name = /usr/home/OMSSA123 then /usr/home/OMSSA123_scores.txt, /usr/home/OMSSA123 will be written. If no directory is specified, e.g. instead of '/usr/home/OMSSA123' just OMSSA123, the files will be written into the working directory.",StringList::create("advanced,output file"));
48
 
                        defaults_.setValue("incorrectly_assigned","Gumbel", "for 'Gumbel', the Gumbel distribution is used to plot incorrectly assigned sequences. For 'Gauss', the Gauss distribution is used.",StringList::create("advanced"));
49
 
                        defaults_.setValidStrings("incorrectly_assigned",StringList::create("Gumbel,Gauss"));
50
 
                        defaultsToParam_();
51
 
                        calc_incorrect_ = &PosteriorErrorProbabilityModel::getGumbel;
52
 
                        calc_correct_ = &PosteriorErrorProbabilityModel::getGauss;
53
 
                        getNegativeGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGumbelGnuplotFormula;
54
 
                        getPositiveGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGaussGnuplotFormula;
55
 
                }
56
 
                
57
 
                PosteriorErrorProbabilityModel::~PosteriorErrorProbabilityModel()
58
 
                {
59
 
                }
60
 
                                
61
 
                bool PosteriorErrorProbabilityModel::fit( std::vector<double>& search_engine_scores)
62
 
                {       
63
 
                        if(search_engine_scores.empty())
64
 
                        {
65
 
                                return false;
66
 
                        }
67
 
                        //-------------------------------------------------------------
68
 
                        // Initializing Parameters
69
 
                        //-------------------------------------------------------------
70
 
                        sort(search_engine_scores.begin(),search_engine_scores.end());          
71
 
                        
72
 
                        smallest_score_ = search_engine_scores[0];
73
 
                        vector<double> x_scores;
74
 
                        x_scores.resize(search_engine_scores.size());
75
 
                        std::vector< double>::iterator it = x_scores.begin();
76
 
                        for(std::vector< double>::iterator iti = search_engine_scores.begin(); iti < search_engine_scores.end(); ++it,++iti)
77
 
                        {
78
 
                                *it = *iti + fabs(smallest_score_) + 0.001;
79
 
                        }
80
 
                        negative_prior_ = 0.7;
81
 
                        if(param_.getValue("incorrectly_assigned") == "Gumbel")
82
 
                        {
83
 
                                incorrectly_assigned_fit_param_.x0 = gsl_stats_mean(&x_scores[0], 1, ceil(0.5* x_scores.size())) + x_scores[0];
84
 
                                incorrectly_assigned_fit_param_.sigma = gsl_stats_sd(&x_scores[0], 1, x_scores.size() - 1);//pow(gsl_stats_sd_with_fixed_mean(&probabilities[x_score_start], 1, probabilities.size() - x_score_start, gauss_fit_param_.x0),2);
85
 
                                incorrectly_assigned_fit_param_.A = 1   /sqrt(2*3.14159*pow(incorrectly_assigned_fit_param_.sigma,2));  
86
 
                                //TODO: compute directly with gauss. Workaround:
87
 
                                calc_incorrect_ = &PosteriorErrorProbabilityModel::getGauss;
88
 
                                getNegativeGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGumbelGnuplotFormula;
89
 
                        }
90
 
                        else
91
 
                        {
92
 
                                incorrectly_assigned_fit_param_.x0 = gsl_stats_mean(&x_scores[0], 1, ceil(0.5* x_scores.size())) + x_scores[0];
93
 
                                incorrectly_assigned_fit_param_.sigma = gsl_stats_sd(&x_scores[0], 1, x_scores.size() - 1);//pow(gsl_stats_sd_with_fixed_mean(&probabilities[x_score_start], 1, probabilities.size() - x_score_start, gauss_fit_param_.x0),2);
94
 
                                incorrectly_assigned_fit_param_.A = 1   /sqrt(2*3.14159*pow(incorrectly_assigned_fit_param_.sigma,2));          
95
 
                                calc_incorrect_ = &PosteriorErrorProbabilityModel::getGauss;
96
 
                                getNegativeGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGaussGnuplotFormula;
97
 
                        }
98
 
                        getPositiveGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGaussGnuplotFormula;
99
 
                        calc_correct_ = &PosteriorErrorProbabilityModel::getGauss;
100
 
                        Size x_score_start = std::min(x_scores.size()-1, (Size) ceil(x_scores.size()*0.7)); // if only one score is present, ceil(...) will yield 1, which is an invalid index
101
 
                        correctly_assigned_fit_param_.x0 = gsl_stats_mean(&x_scores[x_score_start], 1, x_scores.size() - x_score_start) + x_scores[x_score_start];//(gauss_scores.begin()->getX() + (gauss_scores.end()-1)->getX())/2; 
102
 
                        correctly_assigned_fit_param_.sigma = incorrectly_assigned_fit_param_.sigma;
103
 
                        correctly_assigned_fit_param_.A = 1.0   / sqrt(2*3.14159*pow(correctly_assigned_fit_param_.sigma,2));           
104
 
 
105
 
                        DoubleReal maxlike(0);
106
 
                        vector<DoubleReal> incorrect_density;
107
 
                        vector<DoubleReal> correct_density;
108
 
                        
109
 
                        fillDensities(x_scores,incorrect_density,correct_density);
110
 
                        
111
 
                        
112
 
                        maxlike = computeMaxLikelihood(incorrect_density,correct_density);
113
 
                        //-------------------------------------------------------------
114
 
                        // create files for output
115
 
                        //-------------------------------------------------------------
116
 
                        bool output_plots  = param_.getValue("output_plots").toBool();
117
 
                        TextFile* file = NULL;
118
 
                        if(output_plots)
119
 
                        {
120
 
                                file = InitPlots(x_scores);
121
 
                        }                       
122
 
                        //-------------------------------------------------------------
123
 
                        // Estimate Parameters - EM algorithm
124
 
                        //-------------------------------------------------------------
125
 
                        bool stop_em_init = false;
126
 
                        do
127
 
                        { 
128
 
                                //E-STEP
129
 
                                DoubleReal one_minus_sum_posterior = one_minus_sum_post(incorrect_density,correct_density);
130
 
                                DoubleReal sum_posterior = sum_post(incorrect_density,correct_density);
131
 
                                
132
 
                                //new mean                              
133
 
                                DoubleReal sum_positive_x0 = sum_pos_x0(x_scores, incorrect_density,correct_density);
134
 
                                DoubleReal sum_negative_x0 = sum_neg_x0(x_scores, incorrect_density,correct_density);
135
 
                                
136
 
                                DoubleReal positive_mean = sum_positive_x0/one_minus_sum_posterior; 
137
 
                                DoubleReal negative_mean = sum_negative_x0/sum_posterior; 
138
 
                                
139
 
                                //new standard deviation
140
 
                                DoubleReal sum_positive_sigma = sum_pos_sigma(x_scores,incorrect_density,correct_density, positive_mean);
141
 
                                DoubleReal sum_negative_sigma = sum_neg_sigma(x_scores,incorrect_density,correct_density, negative_mean);
142
 
                                
143
 
                                //update parameters
144
 
                                correctly_assigned_fit_param_.x0 = positive_mean;
145
 
                                if(sum_positive_sigma  != 0 && one_minus_sum_posterior != 0)
146
 
                                {
147
 
                                        correctly_assigned_fit_param_.sigma = sqrt(sum_positive_sigma/one_minus_sum_posterior); 
148
 
                                        correctly_assigned_fit_param_.A = 1/sqrt(2*3.14159*pow(correctly_assigned_fit_param_.sigma,2));
149
 
                                }
150
 
                                                
151
 
                                incorrectly_assigned_fit_param_.x0 = negative_mean;
152
 
                                if(sum_negative_sigma  != 0 && sum_posterior != 0)
153
 
                                {
154
 
                                        incorrectly_assigned_fit_param_.sigma = sqrt(sum_negative_sigma/sum_posterior);
155
 
                                        incorrectly_assigned_fit_param_.A = 1/sqrt(2*3.14159*pow(incorrectly_assigned_fit_param_.sigma,2));
156
 
                                }                               
157
 
 
158
 
                                        
159
 
                                //compute new prior probabilities negative peptides
160
 
                                fillDensities(x_scores,incorrect_density,correct_density);
161
 
                                sum_posterior = sum_post(incorrect_density,correct_density);
162
 
                                negative_prior_ = sum_posterior/x_scores.size();
163
 
                                
164
 
                                DoubleReal new_maxlike(computeMaxLikelihood(incorrect_density,correct_density));
165
 
        if(boost::math::isnan(new_maxlike - maxlike))
166
 
                                {
167
 
                                        return false;
168
 
                                        //throw Exception::UnableToFit(__FILE__,__LINE__,__PRETTY_FUNCTION__,"UnableToFit-PosteriorErrorProbability","Could not fit mixture model to data");                                    
169
 
                                }
170
 
        if(fabs(new_maxlike - maxlike) < 0.001)
171
 
        {
172
 
                stop_em_init = true;
173
 
                sum_posterior = sum_post(incorrect_density,correct_density);
174
 
                                        negative_prior_ = sum_posterior/x_scores.size();
175
 
                
176
 
        }
177
 
                                if(output_plots)
178
 
                                {
179
 
                                        String formula1, formula2,formula3;
180
 
                                        formula1 = ((this)->*(getNegativeGnuplotFormula_))(incorrectly_assigned_fit_param_)+ "* " + String(negative_prior_);//String(incorrectly_assigned_fit_param_.A) +" * exp(-(x - " + String(incorrectly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(incorrectly_assigned_fit_param_.sigma) + ") ** 2)"+ "*" + String(negative_prior_);
181
 
                                        formula2 = ((this)->*(getPositiveGnuplotFormula_))(correctly_assigned_fit_param_)+ "* (1 - " + String(negative_prior_) + ")";//String(correctly_assigned_fit_param_.A) +" * exp(-(x - " + String(correctly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(correctly_assigned_fit_param_.sigma) + ") ** 2)"+ "* (1 - " + String(negative_prior_) + ")";
182
 
                                        formula3 = getBothGnuplotFormula(incorrectly_assigned_fit_param_,correctly_assigned_fit_param_);
183
 
                                        (*file)<<("plot \""+(String)param_.getValue("output_name") +"_scores.txt\" with boxes, " + formula1 + " , " + formula2 + " , " + formula3);
184
 
                                }
185
 
                                //update maximum likelihood
186
 
                                maxlike = new_maxlike;                          
187
 
                        }while(!stop_em_init);
188
 
                        //-------------------------------------------------------------
189
 
                        // Finished fitting
190
 
                        //-------------------------------------------------------------                 
191
 
                        //!!Workaround:
192
 
                        if(param_.getValue("incorrectly_assigned") == "Gumbel")
193
 
                        {
194
 
                                calc_incorrect_ = &PosteriorErrorProbabilityModel::getGumbel;
195
 
                        }
196
 
                        max_incorrectly_ = ((this)->*(calc_incorrect_))(incorrectly_assigned_fit_param_.x0, incorrectly_assigned_fit_param_);
197
 
                        max_correctly_ = ((this)->*(calc_correct_))(correctly_assigned_fit_param_.x0, correctly_assigned_fit_param_ );
198
 
                        if(output_plots)
199
 
                        {
200
 
                                String formula1, formula2,formula3;
201
 
                                formula1 = ((this)->*(getNegativeGnuplotFormula_))(incorrectly_assigned_fit_param_)+ "*" + String(negative_prior_);//String(incorrectly_assigned_fit_param_.A) +" * exp(-(x - " + String(incorrectly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(incorrectly_assigned_fit_param_.sigma) + ") ** 2)"+ "*" + String(negative_prior_);
202
 
                                formula2 = ((this)->*(getPositiveGnuplotFormula_))(correctly_assigned_fit_param_)+ "* (1 - " + String(negative_prior_) + ")";// String(correctly_assigned_fit_param_.A) +" * exp(-(x - " + String(correctly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(correctly_assigned_fit_param_.sigma) + ") ** 2)"+ "* (1 - " + String(negative_prior_) + ")";
203
 
                                formula3 = getBothGnuplotFormula(incorrectly_assigned_fit_param_,correctly_assigned_fit_param_);
204
 
                                (*file)<<("plot \""+(String)param_.getValue("output_name") +"_scores.txt\" with boxes, " + formula1 + " , " + formula2 + " , " + formula3);
205
 
                                file->store((String)param_.getValue("output_name"));
206
 
                                delete file;
207
 
                        }
208
 
                        return true;
209
 
                }
210
 
                
211
 
                bool PosteriorErrorProbabilityModel::fit(  std::vector<double>& search_engine_scores, vector<double>& probabilities)
212
 
                {       
213
 
                        bool return_value;
214
 
                        return_value = fit(search_engine_scores);
215
 
                        if(!return_value) return false;
216
 
                        probabilities.resize(search_engine_scores.size());
217
 
                        vector<double>::iterator probs =probabilities.begin();
218
 
                        for(vector<double>::iterator scores = search_engine_scores.begin(); scores != search_engine_scores.end(); ++scores, ++probs)
219
 
                        {
220
 
                                *probs = computeProbability(*scores);
221
 
                        }
222
 
                        return true;
223
 
                }
224
 
                
225
 
                void PosteriorErrorProbabilityModel::fillDensities(vector<double>& x_scores,vector<DoubleReal>& incorrect_density,vector<DoubleReal>& correct_density)
226
 
                {
227
 
                        if(incorrect_density.size() != x_scores.size())
228
 
                        {
229
 
                                incorrect_density.resize(x_scores.size());
230
 
                                correct_density.resize(x_scores.size());
231
 
                        }
232
 
                        vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
233
 
                        vector<DoubleReal>::iterator correct = correct_density.begin();
234
 
                        for(vector<double>::iterator scores = x_scores.begin(); scores != x_scores.end(); ++scores, ++incorrect, ++correct)
235
 
                        {
236
 
                                *incorrect = ((this)->*(calc_incorrect_))(*scores, incorrectly_assigned_fit_param_);
237
 
                                *correct = ((this)->*(calc_correct_))(*scores, correctly_assigned_fit_param_ );
238
 
                        }
239
 
                }
240
 
                
241
 
                DoubleReal PosteriorErrorProbabilityModel::computeMaxLikelihood(vector<DoubleReal>& incorrect_density, vector<DoubleReal>& correct_density)
242
 
                {
243
 
                        DoubleReal maxlike(0);
244
 
                        vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
245
 
                        for(vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end() ; ++correct, ++incorrect)
246
 
                        {
247
 
                                maxlike += log10(negative_prior_* (*incorrect) +(1-negative_prior_)* (*correct));
248
 
                        }
249
 
                        return maxlike;
250
 
                }
251
 
                DoubleReal PosteriorErrorProbabilityModel::one_minus_sum_post(vector<DoubleReal>& incorrect_density, vector<DoubleReal>& correct_density)
252
 
                {
253
 
                        DoubleReal one_min(0);
254
 
                        vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
255
 
                        for(vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end() ; ++correct, ++incorrect)
256
 
                        {
257
 
                                one_min +=  1  - ((negative_prior_* (*incorrect) )/((negative_prior_* (*incorrect) ) + (1-negative_prior_)* (*correct) ));
258
 
                        }
259
 
                        return one_min;         
260
 
                }
261
 
 
262
 
                DoubleReal PosteriorErrorProbabilityModel::sum_post(vector<DoubleReal>& incorrect_density, vector<DoubleReal>& correct_density)
263
 
                {
264
 
                        DoubleReal post(0);
265
 
                        vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
266
 
                        for(vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end() ; ++correct, ++incorrect)
267
 
                        {
268
 
                                post += ((negative_prior_* (*incorrect) )/((negative_prior_* (*incorrect) ) + (1-negative_prior_)* (*correct) ));
269
 
                        }
270
 
                        return post;            
271
 
                }
272
 
                
273
 
                DoubleReal PosteriorErrorProbabilityModel::sum_pos_x0(vector<double>& x_scores, vector<DoubleReal>& incorrect_density, vector<DoubleReal>& correct_density)
274
 
                {
275
 
                        DoubleReal pos_x0(0);
276
 
                        vector<double>::iterator the_x = x_scores.begin();
277
 
                        vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
278
 
                        for(vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end() ; ++correct, ++incorrect,++the_x)
279
 
                        {
280
 
                                pos_x0 += ((1  - ((negative_prior_* (*incorrect) )/((negative_prior_* (*incorrect) ) + (1-negative_prior_)* (*correct) )))* (*the_x) );
281
 
                        }
282
 
                        return pos_x0;  
283
 
                }
284
 
                
285
 
                DoubleReal PosteriorErrorProbabilityModel::sum_neg_x0(vector<double>& x_scores, vector<DoubleReal>& incorrect_density, vector<DoubleReal>& correct_density)
286
 
                {
287
 
                        DoubleReal neg_x0(0);
288
 
                        vector<double>::iterator the_x = x_scores.begin();
289
 
                        vector<DoubleReal>::iterator correct = correct_density.begin();
290
 
                        for(vector<DoubleReal>::iterator incorrect = incorrect_density.begin(); incorrect < incorrect_density.end() ; ++correct, ++incorrect,++the_x)
291
 
                        {
292
 
                                neg_x0 += ((((negative_prior_* (*incorrect) )/((negative_prior_* (*incorrect) ) + (1-negative_prior_)* (*correct) )))* (*the_x) );
293
 
                        }
294
 
                        return neg_x0;  
295
 
                }               
296
 
                
297
 
                DoubleReal PosteriorErrorProbabilityModel::sum_pos_sigma(vector<double>& x_scores, vector<DoubleReal>& incorrect_density, vector<DoubleReal>& correct_density, DoubleReal positive_mean)
298
 
                {
299
 
                        DoubleReal pos_sigma(0);
300
 
                        vector<double>::iterator the_x = x_scores.begin();
301
 
                        vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
302
 
                        for(vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end() ; ++correct, ++incorrect,++the_x)
303
 
                        {
304
 
                                pos_sigma += ((1  - ((negative_prior_* (*incorrect) )/((negative_prior_* (*incorrect) ) + (1-negative_prior_)* (*correct) )))*pow( (*the_x) - positive_mean,2));
305
 
                        }
306
 
                        return pos_sigma;
307
 
                }
308
 
                
309
 
                DoubleReal PosteriorErrorProbabilityModel::sum_neg_sigma(vector<double>& x_scores, vector<DoubleReal>& incorrect_density, vector<DoubleReal>& correct_density, DoubleReal positive_mean)
310
 
                {
311
 
                        DoubleReal neg_sigma(0);
312
 
                        vector<double>::iterator the_x = x_scores.begin();
313
 
                        vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
314
 
                        for(vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end() ; ++correct, ++incorrect,++the_x)
315
 
                        {
316
 
                                neg_sigma += ((((negative_prior_* (*incorrect) )/((negative_prior_* (*incorrect) ) + (1-negative_prior_)* (*correct) )))*pow( (*the_x) - positive_mean,2));
317
 
                        }
318
 
                        return neg_sigma;
319
 
                }
320
 
                DoubleReal PosteriorErrorProbabilityModel::computeProbability(DoubleReal score)
321
 
                {
322
 
                        score = score + fabs(smallest_score_) + 0.001;
323
 
                        DoubleReal x_neg;
324
 
                        DoubleReal x_pos;
325
 
                        //the score is smaller than the peak of incorreclty assigned sequences. To ensure that the probabilies wont rise again use the incorrectly assigend peak for computation
326
 
                        if(score < incorrectly_assigned_fit_param_.x0)
327
 
                        {
328
 
                                x_neg = max_incorrectly_;       
329
 
                                x_pos = ((this)->*(calc_correct_))(score, correctly_assigned_fit_param_ );
330
 
                        }
331
 
                        //same as above. However, this time to ensure that probabilities wont drop again.
332
 
                        else if(score > correctly_assigned_fit_param_.x0)
333
 
                        {
334
 
                                x_neg = ((this)->*(calc_incorrect_))(score, incorrectly_assigned_fit_param_);
335
 
                                x_pos = max_correctly_;
336
 
                        }
337
 
                        //if its in between use the normal formula
338
 
                        else
339
 
                        {
340
 
                                x_neg = ((this)->*(calc_incorrect_))(score, incorrectly_assigned_fit_param_);
341
 
                                x_pos = ((this)->*(calc_correct_))(score, correctly_assigned_fit_param_ );
342
 
                        }
343
 
                         return (negative_prior_*x_neg)/((negative_prior_*x_neg) + (1-negative_prior_)*x_pos);          
344
 
                }               
345
 
                TextFile* PosteriorErrorProbabilityModel::InitPlots(vector<double> & x_scores)
346
 
                {               
347
 
                                TextFile* file = new TextFile;
348
 
                                String output;
349
 
                                std::vector<DPosition<2> > points;
350
 
                                Int number_of_bins = param_.getValue("number_of_bins");
351
 
                                points.resize(number_of_bins);
352
 
                                DPosition<2> temp;
353
 
                                double dividing_score = (x_scores.back() -x_scores[0])/number_of_bins;
354
 
 
355
 
                                temp.setX(dividing_score/2);
356
 
                                temp.setY(0);
357
 
                                Int bin = 0;
358
 
                                points[bin] = temp;
359
 
                                double temp_divider = dividing_score;
360
 
                                for(std::vector< double>::iterator it = x_scores.begin(); it < x_scores.end(); ++it)
361
 
                                {
362
 
                                        if(temp_divider - *it >= 0 && bin < number_of_bins - 1 )
363
 
                                        {
364
 
                                                        points[bin].setY(points[bin].getY()+1);
365
 
                                        }
366
 
                                        else if(bin  == number_of_bins - 1 )
367
 
                                        {
368
 
                                                points[bin].setY(points[bin].getY()+1);
369
 
                                        }
370
 
                                        else
371
 
                                        {
372
 
                                                temp.setX((temp_divider + temp_divider + dividing_score)/2);
373
 
                                                temp.setY(1);
374
 
                                                ++bin;
375
 
                                                points[bin] = temp;
376
 
                                                temp_divider += dividing_score;
377
 
                                        }
378
 
                                }       
379
 
 
380
 
                                for(vector<DPosition<2> >::iterator it = points.begin(); it < points.end(); ++it)
381
 
                                {
382
 
                                        it->setY( it->getY()/( x_scores.size()  *dividing_score));
383
 
                                }
384
 
                                        
385
 
                                TextFile data_points;
386
 
                                for(vector<DPosition<2> >::iterator it = points.begin(); it < points.end() ; ++it)
387
 
                                {
388
 
                                        String temp  = it->getX();
389
 
                                        temp += "\t";
390
 
                                        temp += it->getY();
391
 
                                        data_points<<temp;
392
 
                                }
393
 
                                data_points.store((String)param_.getValue("output_name") + "_scores.txt");
394
 
                                output = "set output \""+ (String)param_.getValue("output_name") +".ps\"";
395
 
                                (*file)<<"set terminal postscript color solid linewidth 2.0 rounded";
396
 
                                //(*file)<<"set style empty solid 0.5 border -1";
397
 
                                //(*file)<<"set style function lines";
398
 
                                (*file)<<"set xlabel \"discriminant score\"";
399
 
                                (*file)<<"set ylabel \"density\"";
400
 
                                //TODO: (*file)<<"set title ";
401
 
                                (*file)<<"set key off";
402
 
                                (*file)<<(output);
403
 
                                String formula1, formula2;
404
 
                                formula1 = ((this)->*(getNegativeGnuplotFormula_))(incorrectly_assigned_fit_param_)+ "* " + String(negative_prior_);//String(incorrectly_assigned_fit_param_.A) +" * exp(-(x - " + String(incorrectly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(incorrectly_assigned_fit_param_.sigma) + ") ** 2)"+ "*" + String(negative_prior_);
405
 
                                formula2 = ((this)->*(getPositiveGnuplotFormula_))(correctly_assigned_fit_param_)+ "* (1 - " + String(negative_prior_) + ")";//String(correctly_assigned_fit_param_.A) +" * exp(-(x - " + String(correctly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(correctly_assigned_fit_param_.sigma) + ") ** 2)"+ "* (1 - " + String(negative_prior_) + ")";
406
 
                                (*file)<< ("plot \""+(String)param_.getValue("output_name") +"_scores.txt\" with boxes, " + formula1 + " , " + formula2);
407
 
                                return file;
408
 
                }
409
 
 
410
 
                const String PosteriorErrorProbabilityModel::getGumbelGnuplotFormula(const GaussFitter::GaussFitResult& params) const
411
 
                {
412
 
                        // build a formula with the fitted parameters for gnuplot
413
 
                        stringstream formula;
414
 
                        formula << "(1/" << params.sigma <<") * " << "exp(( "<< params.x0<< "- x)/"<< params.sigma <<") * exp(-exp(("<<params.x0<<" - x)/"<<params.sigma<<"))";
415
 
                        return  formula.str();
416
 
                }
417
 
                                
418
 
                const String PosteriorErrorProbabilityModel::getGaussGnuplotFormula(const GaussFitter::GaussFitResult& params) const
419
 
                {
420
 
                        stringstream formula;
421
 
                        formula << params.A << " * exp(-(x - " << params.x0 << ") ** 2 / 2 / (" << params.sigma << ") ** 2)";
422
 
                        return formula.str();
423
 
                }
424
 
                
425
 
                const String PosteriorErrorProbabilityModel::getBothGnuplotFormula(const GaussFitter::GaussFitResult& incorrect,const GaussFitter::GaussFitResult& correct) const
426
 
                {
427
 
                        stringstream formula;
428
 
                        formula << negative_prior_<<"*"<<  ((this)->*(getNegativeGnuplotFormula_))(incorrect) <<" + (1-"<<negative_prior_<<")*"<< ((this)->*(getPositiveGnuplotFormula_))(correct);
429
 
                        return formula.str();
430
 
                }
431
 
                
432
 
                void PosteriorErrorProbabilityModel::plotTargetDecoyEstimation(vector<double> &target,vector<double> & decoy)
433
 
                {
434
 
                                TextFile file;
435
 
                                String output;
436
 
                                std::vector<DPosition<3> > points;
437
 
                                Int number_of_bins = param_.getValue("number_of_bins");
438
 
                                points.resize(number_of_bins);
439
 
                                DPosition<3> temp;
440
 
 
441
 
                                sort(target.begin(),target.end());
442
 
                                sort(decoy.begin(),decoy.end());
443
 
                                
444
 
                                double dividing_score = (max(target.back(),decoy.back())/*scores.back()*/ - min(target[0],decoy[0])/*scores[0]*/)/number_of_bins;
445
 
                                
446
 
                                temp[0] = (dividing_score/2);
447
 
                                temp[1] = 0;
448
 
                                temp[2] = 0;
449
 
                                Int bin = 0;
450
 
                                points[bin] = temp;
451
 
                                double temp_divider = dividing_score;
452
 
                                for(std::vector< double>::iterator it = target.begin(); it < target.end(); ++it)
453
 
                                {
454
 
                                        *it = *it + fabs(smallest_score_) + 0.001;
455
 
                                        if(temp_divider - *it >= 0 && bin < number_of_bins - 1 )
456
 
                                        {
457
 
                                                        points[bin][1] = (points[bin][1]+1);
458
 
                                        }
459
 
                                        else if(bin  == number_of_bins - 1 )
460
 
                                        {
461
 
                                                points[bin][1] = (points[bin][1]+1);
462
 
                                        }
463
 
                                        else
464
 
                                        {
465
 
                                                temp[0] = ((temp_divider + temp_divider + dividing_score)/2);
466
 
                                                temp[1] = 1;
467
 
                                                ++bin;
468
 
                                                points[bin] = temp;
469
 
                                                temp_divider += dividing_score;
470
 
                                        }
471
 
                                }       
472
 
 
473
 
                                bin = 0;
474
 
                                temp_divider = dividing_score;
475
 
                                for(std::vector< double>::iterator it = decoy.begin(); it < decoy.end(); ++it)
476
 
                                {       
477
 
                                        *it = *it + fabs(smallest_score_) + 0.001;
478
 
                                        if(temp_divider - *it >= 0 && bin < number_of_bins - 1 )
479
 
                                        {
480
 
                                                        points[bin][2] = (points[bin][2]+1);
481
 
                                        }
482
 
                                        else if(bin  == number_of_bins - 1 )
483
 
                                        {
484
 
                                                points[bin][2] = (points[bin][2]+1);
485
 
                                        }
486
 
                                        else
487
 
                                        {
488
 
                                                //temp[0] = ((temp_divider + temp_divider + dividing_score)/2);
489
 
                                        //      temp[2] = 1;
490
 
                                                ++bin;
491
 
                                                points[bin][2] = 1;
492
 
                                                temp_divider += dividing_score;
493
 
                                        }
494
 
                                }       
495
 
 
496
 
                                for(vector<DPosition<3> >::iterator it = points.begin(); it < points.end(); ++it)
497
 
                                {
498
 
                                //      if((*it)[1] > (*it)[2])
499
 
                                //      {(*it)[1] = (*it)[1] + (*it)[2];}
500
 
                                /*      else{/(*it)[2] = (*it)[1] + (*it)[2];//}*/
501
 
                                        
502
 
                                        (*it)[1] = ( (*it)[1]/( (decoy.size()+target.size())  *dividing_score));
503
 
                                        (*it)[2] = ( (*it)[2]/( (decoy.size()+target.size())  *dividing_score));
504
 
                                }
505
 
                                        
506
 
                                TextFile data_points;
507
 
                                for(vector<DPosition<3> >::iterator it = points.begin(); it < points.end() ; ++it)
508
 
                                {
509
 
                                        String temp  = (*it)[0];
510
 
                                        temp += "\t";
511
 
                                        temp += (*it)[1];
512
 
                                        temp += "\t";
513
 
                                        temp += (*it)[2];
514
 
                                        data_points<<temp;
515
 
                                }
516
 
                                data_points.store( (String)param_.getValue("output_name") + "_target_decoy_scores.txt");
517
 
                                output = String("set output \"")+  (String)param_.getValue("output_name") + "_target_decoy.ps\"";
518
 
                                (file)<<"set terminal postscript color solid linewidth 2.0 rounded";
519
 
                                //(*file)<<"set style empty solid 0.5 border -1";
520
 
                                //(*file)<<"set style function lines";
521
 
                                (file)<<"set xlabel \"discriminant score\"";
522
 
                                (file)<<"set ylabel \"density\"";
523
 
                                //TODO: (*file)<<"set title ";
524
 
                                (file)<<"set key off";
525
 
                                (file)<<(output);
526
 
                                String formula1, formula2;
527
 
                                formula1 = getGumbelGnuplotFormula(getIncorrectlyAssignedFitResult())+ "* " + String(getNegativePrior());//String(incorrectly_assigned_fit_param_.A) +" * exp(-(x - " + String(incorrectly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(incorrectly_assigned_fit_param_.sigma) + ") ** 2)"+ "*" + String(negative_prior_);
528
 
                                formula2 = getGaussGnuplotFormula(getCorrectlyAssignedFitResult())+ "* (1 - " + String(getNegativePrior()) + ")";//String(correctly_assigned_fit_param_.A) +" * exp(-(x - " + String(correctly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(correctly_assigned_fit_param_.sigma) + ") ** 2)"+ "* (1 - " + String(negative_prior_) + ")";
529
 
                                (file)<< ("plot \"" + (String)param_.getValue("output_name") + "_target_decoy_scores.txt\"   using 1:3  with boxes fill solid 0.8 noborder, \"" + (String)param_.getValue("output_name") + "_target_decoy_scores.txt\"  using 1:2  with boxes, " + formula1 + " , " + formula2);
530
 
                                file.store( (String)param_.getValue("output_name") + "_target_decoy");
531
 
                        }               
532
 
                
533
 
        } //namespace Math
 
46
  namespace Math
 
47
  {
 
48
    PosteriorErrorProbabilityModel::PosteriorErrorProbabilityModel() :
 
49
      DefaultParamHandler("PosteriorErrorProbabilityModel"), negative_prior_(0.5), max_incorrectly_(0), max_correctly_(0), smallest_score_(0)
 
50
    {
 
51
      defaults_.setValue("number_of_bins", 100, "Number of bins used for visualization. Only needed if each iteration step of the EM-Algorithm will be visualized", StringList::create("advanced"));
 
52
      defaults_.setValue("output_plots", "false", "If true every step of the EM-algorithm will be written to a file as a gnuplot formula", StringList::create("advanced"));
 
53
      defaults_.setValidStrings("output_plots", StringList::create("true,false"));
 
54
      defaults_.setValue("output_name", "", "If output_plots is on, the output files will be saved in the following manner: <output_name>scores.txt for the scores and <output_name> which contains each step of the EM-algorithm e.g. output_name = /usr/home/OMSSA123 then /usr/home/OMSSA123_scores.txt, /usr/home/OMSSA123 will be written. If no directory is specified, e.g. instead of '/usr/home/OMSSA123' just OMSSA123, the files will be written into the working directory.", StringList::create("advanced,output file"));
 
55
      defaults_.setValue("incorrectly_assigned", "Gumbel", "for 'Gumbel', the Gumbel distribution is used to plot incorrectly assigned sequences. For 'Gauss', the Gauss distribution is used.", StringList::create("advanced"));
 
56
      defaults_.setValidStrings("incorrectly_assigned", StringList::create("Gumbel,Gauss"));
 
57
      defaultsToParam_();
 
58
      calc_incorrect_ = &PosteriorErrorProbabilityModel::getGumbel;
 
59
      calc_correct_ = &PosteriorErrorProbabilityModel::getGauss;
 
60
      getNegativeGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGumbelGnuplotFormula;
 
61
      getPositiveGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGaussGnuplotFormula;
 
62
    }
 
63
 
 
64
    PosteriorErrorProbabilityModel::~PosteriorErrorProbabilityModel()
 
65
    {
 
66
    }
 
67
 
 
68
    bool PosteriorErrorProbabilityModel::fit(std::vector<double> & search_engine_scores)
 
69
    {
 
70
      if (search_engine_scores.empty())
 
71
      {
 
72
        return false;
 
73
      }
 
74
      //-------------------------------------------------------------
 
75
      // Initializing Parameters
 
76
      //-------------------------------------------------------------
 
77
      sort(search_engine_scores.begin(), search_engine_scores.end());
 
78
 
 
79
      smallest_score_ = search_engine_scores[0];
 
80
      vector<double> x_scores;
 
81
      x_scores.resize(search_engine_scores.size());
 
82
      std::vector<double>::iterator it = x_scores.begin();
 
83
      for (std::vector<double>::iterator iti = search_engine_scores.begin(); iti < search_engine_scores.end(); ++it, ++iti)
 
84
      {
 
85
        *it = *iti + fabs(smallest_score_) + 0.001;
 
86
      }
 
87
      negative_prior_ = 0.7;
 
88
      if (param_.getValue("incorrectly_assigned") == "Gumbel")
 
89
      {
 
90
        incorrectly_assigned_fit_param_.x0 = gsl_stats_mean(&x_scores[0], 1, ceil(0.5 * x_scores.size())) + x_scores[0];
 
91
        incorrectly_assigned_fit_param_.sigma = gsl_stats_sd(&x_scores[0], 1, x_scores.size() - 1);        //pow(gsl_stats_sd_with_fixed_mean(&probabilities[x_score_start], 1, probabilities.size() - x_score_start, gauss_fit_param_.x0),2);
 
92
        incorrectly_assigned_fit_param_.A = 1   / sqrt(2 * 3.14159 * pow(incorrectly_assigned_fit_param_.sigma, 2));
 
93
        //TODO: compute directly with gauss. Workaround:
 
94
        calc_incorrect_ = &PosteriorErrorProbabilityModel::getGauss;
 
95
        getNegativeGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGumbelGnuplotFormula;
 
96
      }
 
97
      else
 
98
      {
 
99
        incorrectly_assigned_fit_param_.x0 = gsl_stats_mean(&x_scores[0], 1, ceil(0.5 * x_scores.size())) + x_scores[0];
 
100
        incorrectly_assigned_fit_param_.sigma = gsl_stats_sd(&x_scores[0], 1, x_scores.size() - 1);        //pow(gsl_stats_sd_with_fixed_mean(&probabilities[x_score_start], 1, probabilities.size() - x_score_start, gauss_fit_param_.x0),2);
 
101
        incorrectly_assigned_fit_param_.A = 1   / sqrt(2 * 3.14159 * pow(incorrectly_assigned_fit_param_.sigma, 2));
 
102
        calc_incorrect_ = &PosteriorErrorProbabilityModel::getGauss;
 
103
        getNegativeGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGaussGnuplotFormula;
 
104
      }
 
105
      getPositiveGnuplotFormula_ = &PosteriorErrorProbabilityModel::getGaussGnuplotFormula;
 
106
      calc_correct_ = &PosteriorErrorProbabilityModel::getGauss;
 
107
      Size x_score_start = std::min(x_scores.size() - 1, (Size) ceil(x_scores.size() * 0.7));   // if only one score is present, ceil(...) will yield 1, which is an invalid index
 
108
      correctly_assigned_fit_param_.x0 = gsl_stats_mean(&x_scores[x_score_start], 1, x_scores.size() - x_score_start) + x_scores[x_score_start];      //(gauss_scores.begin()->getX() + (gauss_scores.end()-1)->getX())/2;
 
109
      correctly_assigned_fit_param_.sigma = incorrectly_assigned_fit_param_.sigma;
 
110
      correctly_assigned_fit_param_.A = 1.0   / sqrt(2 * 3.14159 * pow(correctly_assigned_fit_param_.sigma, 2));
 
111
 
 
112
      DoubleReal maxlike(0);
 
113
      vector<DoubleReal> incorrect_density;
 
114
      vector<DoubleReal> correct_density;
 
115
 
 
116
      fillDensities(x_scores, incorrect_density, correct_density);
 
117
 
 
118
 
 
119
      maxlike = computeMaxLikelihood(incorrect_density, correct_density);
 
120
      //-------------------------------------------------------------
 
121
      // create files for output
 
122
      //-------------------------------------------------------------
 
123
      bool output_plots  = param_.getValue("output_plots").toBool();
 
124
      TextFile * file = NULL;
 
125
      if (output_plots)
 
126
      {
 
127
        file = InitPlots(x_scores);
 
128
      }
 
129
      //-------------------------------------------------------------
 
130
      // Estimate Parameters - EM algorithm
 
131
      //-------------------------------------------------------------
 
132
      bool stop_em_init = false;
 
133
      do
 
134
      {
 
135
        //E-STEP
 
136
        DoubleReal one_minus_sum_posterior = one_minus_sum_post(incorrect_density, correct_density);
 
137
        DoubleReal sum_posterior = sum_post(incorrect_density, correct_density);
 
138
 
 
139
        //new mean
 
140
        DoubleReal sum_positive_x0 = sum_pos_x0(x_scores, incorrect_density, correct_density);
 
141
        DoubleReal sum_negative_x0 = sum_neg_x0(x_scores, incorrect_density, correct_density);
 
142
 
 
143
        DoubleReal positive_mean = sum_positive_x0 / one_minus_sum_posterior;
 
144
        DoubleReal negative_mean = sum_negative_x0 / sum_posterior;
 
145
 
 
146
        //new standard deviation
 
147
        DoubleReal sum_positive_sigma = sum_pos_sigma(x_scores, incorrect_density, correct_density, positive_mean);
 
148
        DoubleReal sum_negative_sigma = sum_neg_sigma(x_scores, incorrect_density, correct_density, negative_mean);
 
149
 
 
150
        //update parameters
 
151
        correctly_assigned_fit_param_.x0 = positive_mean;
 
152
        if (sum_positive_sigma  != 0 && one_minus_sum_posterior != 0)
 
153
        {
 
154
          correctly_assigned_fit_param_.sigma = sqrt(sum_positive_sigma / one_minus_sum_posterior);
 
155
          correctly_assigned_fit_param_.A = 1 / sqrt(2 * 3.14159 * pow(correctly_assigned_fit_param_.sigma, 2));
 
156
        }
 
157
 
 
158
        incorrectly_assigned_fit_param_.x0 = negative_mean;
 
159
        if (sum_negative_sigma  != 0 && sum_posterior != 0)
 
160
        {
 
161
          incorrectly_assigned_fit_param_.sigma = sqrt(sum_negative_sigma / sum_posterior);
 
162
          incorrectly_assigned_fit_param_.A = 1 / sqrt(2 * 3.14159 * pow(incorrectly_assigned_fit_param_.sigma, 2));
 
163
        }
 
164
 
 
165
 
 
166
        //compute new prior probabilities negative peptides
 
167
        fillDensities(x_scores, incorrect_density, correct_density);
 
168
        sum_posterior = sum_post(incorrect_density, correct_density);
 
169
        negative_prior_ = sum_posterior / x_scores.size();
 
170
 
 
171
        DoubleReal new_maxlike(computeMaxLikelihood(incorrect_density, correct_density));
 
172
        if (boost::math::isnan(new_maxlike - maxlike))
 
173
        {
 
174
          return false;
 
175
          //throw Exception::UnableToFit(__FILE__,__LINE__,__PRETTY_FUNCTION__,"UnableToFit-PosteriorErrorProbability","Could not fit mixture model to data");
 
176
        }
 
177
        if (fabs(new_maxlike - maxlike) < 0.001)
 
178
        {
 
179
          stop_em_init = true;
 
180
          sum_posterior = sum_post(incorrect_density, correct_density);
 
181
          negative_prior_ = sum_posterior / x_scores.size();
 
182
 
 
183
        }
 
184
        if (output_plots)
 
185
        {
 
186
          String formula1, formula2, formula3;
 
187
          formula1 = ((this)->*(getNegativeGnuplotFormula_))(incorrectly_assigned_fit_param_) + "* " + String(negative_prior_);         //String(incorrectly_assigned_fit_param_.A) +" * exp(-(x - " + String(incorrectly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(incorrectly_assigned_fit_param_.sigma) + ") ** 2)"+ "*" + String(negative_prior_);
 
188
          formula2 = ((this)->*(getPositiveGnuplotFormula_))(correctly_assigned_fit_param_) + "* (1 - " + String(negative_prior_) + ")";         //String(correctly_assigned_fit_param_.A) +" * exp(-(x - " + String(correctly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(correctly_assigned_fit_param_.sigma) + ") ** 2)"+ "* (1 - " + String(negative_prior_) + ")";
 
189
          formula3 = getBothGnuplotFormula(incorrectly_assigned_fit_param_, correctly_assigned_fit_param_);
 
190
          (*file) << ("plot \"" + (String)param_.getValue("output_name") + "_scores.txt\" with boxes, " + formula1 + " , " + formula2 + " , " + formula3);
 
191
        }
 
192
        //update maximum likelihood
 
193
        maxlike = new_maxlike;
 
194
      }
 
195
      while (!stop_em_init);
 
196
      //-------------------------------------------------------------
 
197
      // Finished fitting
 
198
      //-------------------------------------------------------------
 
199
      //!!Workaround:
 
200
      if (param_.getValue("incorrectly_assigned") == "Gumbel")
 
201
      {
 
202
        calc_incorrect_ = &PosteriorErrorProbabilityModel::getGumbel;
 
203
      }
 
204
      max_incorrectly_ = ((this)->*(calc_incorrect_))(incorrectly_assigned_fit_param_.x0, incorrectly_assigned_fit_param_);
 
205
      max_correctly_ = ((this)->*(calc_correct_))(correctly_assigned_fit_param_.x0, correctly_assigned_fit_param_);
 
206
      if (output_plots)
 
207
      {
 
208
        String formula1, formula2, formula3;
 
209
        formula1 = ((this)->*(getNegativeGnuplotFormula_))(incorrectly_assigned_fit_param_) + "*" + String(negative_prior_);       //String(incorrectly_assigned_fit_param_.A) +" * exp(-(x - " + String(incorrectly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(incorrectly_assigned_fit_param_.sigma) + ") ** 2)"+ "*" + String(negative_prior_);
 
210
        formula2 = ((this)->*(getPositiveGnuplotFormula_))(correctly_assigned_fit_param_) + "* (1 - " + String(negative_prior_) + ")";       // String(correctly_assigned_fit_param_.A) +" * exp(-(x - " + String(correctly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(correctly_assigned_fit_param_.sigma) + ") ** 2)"+ "* (1 - " + String(negative_prior_) + ")";
 
211
        formula3 = getBothGnuplotFormula(incorrectly_assigned_fit_param_, correctly_assigned_fit_param_);
 
212
        (*file) << ("plot \"" + (String)param_.getValue("output_name") + "_scores.txt\" with boxes, " + formula1 + " , " + formula2 + " , " + formula3);
 
213
        file->store((String)param_.getValue("output_name"));
 
214
        delete file;
 
215
      }
 
216
      return true;
 
217
    }
 
218
 
 
219
    bool PosteriorErrorProbabilityModel::fit(std::vector<double> & search_engine_scores, vector<double> & probabilities)
 
220
    {
 
221
      bool return_value;
 
222
      return_value = fit(search_engine_scores);
 
223
      if (!return_value)
 
224
        return false;
 
225
 
 
226
      probabilities.resize(search_engine_scores.size());
 
227
      vector<double>::iterator probs = probabilities.begin();
 
228
      for (vector<double>::iterator scores = search_engine_scores.begin(); scores != search_engine_scores.end(); ++scores, ++probs)
 
229
      {
 
230
        *probs = computeProbability(*scores);
 
231
      }
 
232
      return true;
 
233
    }
 
234
 
 
235
    void PosteriorErrorProbabilityModel::fillDensities(vector<double> & x_scores, vector<DoubleReal> & incorrect_density, vector<DoubleReal> & correct_density)
 
236
    {
 
237
      if (incorrect_density.size() != x_scores.size())
 
238
      {
 
239
        incorrect_density.resize(x_scores.size());
 
240
        correct_density.resize(x_scores.size());
 
241
      }
 
242
      vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
 
243
      vector<DoubleReal>::iterator correct = correct_density.begin();
 
244
      for (vector<double>::iterator scores = x_scores.begin(); scores != x_scores.end(); ++scores, ++incorrect, ++correct)
 
245
      {
 
246
        *incorrect = ((this)->*(calc_incorrect_))(*scores, incorrectly_assigned_fit_param_);
 
247
        *correct = ((this)->*(calc_correct_))(*scores, correctly_assigned_fit_param_);
 
248
      }
 
249
    }
 
250
 
 
251
    DoubleReal PosteriorErrorProbabilityModel::computeMaxLikelihood(vector<DoubleReal> & incorrect_density, vector<DoubleReal> & correct_density)
 
252
    {
 
253
      DoubleReal maxlike(0);
 
254
      vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
 
255
      for (vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end(); ++correct, ++incorrect)
 
256
      {
 
257
        maxlike += log10(negative_prior_ * (*incorrect) + (1 - negative_prior_) * (*correct));
 
258
      }
 
259
      return maxlike;
 
260
    }
 
261
 
 
262
    DoubleReal PosteriorErrorProbabilityModel::one_minus_sum_post(vector<DoubleReal> & incorrect_density, vector<DoubleReal> & correct_density)
 
263
    {
 
264
      DoubleReal one_min(0);
 
265
      vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
 
266
      for (vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end(); ++correct, ++incorrect)
 
267
      {
 
268
        one_min +=  1  - ((negative_prior_ * (*incorrect)) / ((negative_prior_ * (*incorrect)) + (1 - negative_prior_) * (*correct)));
 
269
      }
 
270
      return one_min;
 
271
    }
 
272
 
 
273
    DoubleReal PosteriorErrorProbabilityModel::sum_post(vector<DoubleReal> & incorrect_density, vector<DoubleReal> & correct_density)
 
274
    {
 
275
      DoubleReal post(0);
 
276
      vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
 
277
      for (vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end(); ++correct, ++incorrect)
 
278
      {
 
279
        post += ((negative_prior_ * (*incorrect)) / ((negative_prior_ * (*incorrect)) + (1 - negative_prior_) * (*correct)));
 
280
      }
 
281
      return post;
 
282
    }
 
283
 
 
284
    DoubleReal PosteriorErrorProbabilityModel::sum_pos_x0(vector<double> & x_scores, vector<DoubleReal> & incorrect_density, vector<DoubleReal> & correct_density)
 
285
    {
 
286
      DoubleReal pos_x0(0);
 
287
      vector<double>::iterator the_x = x_scores.begin();
 
288
      vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
 
289
      for (vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end(); ++correct, ++incorrect, ++the_x)
 
290
      {
 
291
        pos_x0 += ((1  - ((negative_prior_ * (*incorrect)) / ((negative_prior_ * (*incorrect)) + (1 - negative_prior_) * (*correct)))) * (*the_x));
 
292
      }
 
293
      return pos_x0;
 
294
    }
 
295
 
 
296
    DoubleReal PosteriorErrorProbabilityModel::sum_neg_x0(vector<double> & x_scores, vector<DoubleReal> & incorrect_density, vector<DoubleReal> & correct_density)
 
297
    {
 
298
      DoubleReal neg_x0(0);
 
299
      vector<double>::iterator the_x = x_scores.begin();
 
300
      vector<DoubleReal>::iterator correct = correct_density.begin();
 
301
      for (vector<DoubleReal>::iterator incorrect = incorrect_density.begin(); incorrect < incorrect_density.end(); ++correct, ++incorrect, ++the_x)
 
302
      {
 
303
        neg_x0 += ((((negative_prior_ * (*incorrect)) / ((negative_prior_ * (*incorrect)) + (1 - negative_prior_) * (*correct)))) * (*the_x));
 
304
      }
 
305
      return neg_x0;
 
306
    }
 
307
 
 
308
    DoubleReal PosteriorErrorProbabilityModel::sum_pos_sigma(vector<double> & x_scores, vector<DoubleReal> & incorrect_density, vector<DoubleReal> & correct_density, DoubleReal positive_mean)
 
309
    {
 
310
      DoubleReal pos_sigma(0);
 
311
      vector<double>::iterator the_x = x_scores.begin();
 
312
      vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
 
313
      for (vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end(); ++correct, ++incorrect, ++the_x)
 
314
      {
 
315
        pos_sigma += ((1  - ((negative_prior_ * (*incorrect)) / ((negative_prior_ * (*incorrect)) + (1 - negative_prior_) * (*correct)))) * pow((*the_x) - positive_mean, 2));
 
316
      }
 
317
      return pos_sigma;
 
318
    }
 
319
 
 
320
    DoubleReal PosteriorErrorProbabilityModel::sum_neg_sigma(vector<double> & x_scores, vector<DoubleReal> & incorrect_density, vector<DoubleReal> & correct_density, DoubleReal positive_mean)
 
321
    {
 
322
      DoubleReal neg_sigma(0);
 
323
      vector<double>::iterator the_x = x_scores.begin();
 
324
      vector<DoubleReal>::iterator incorrect = incorrect_density.begin();
 
325
      for (vector<DoubleReal>::iterator correct = correct_density.begin(); correct < correct_density.end(); ++correct, ++incorrect, ++the_x)
 
326
      {
 
327
        neg_sigma += ((((negative_prior_ * (*incorrect)) / ((negative_prior_ * (*incorrect)) + (1 - negative_prior_) * (*correct)))) * pow((*the_x) - positive_mean, 2));
 
328
      }
 
329
      return neg_sigma;
 
330
    }
 
331
 
 
332
    DoubleReal PosteriorErrorProbabilityModel::computeProbability(DoubleReal score)
 
333
    {
 
334
      score = score + fabs(smallest_score_) + 0.001;
 
335
      DoubleReal x_neg;
 
336
      DoubleReal x_pos;
 
337
      // the score is smaller than the peak of incorrectly assigned sequences. To ensure that the probabilities wont rise again use the incorrectly assigned peak for computation
 
338
      if (score < incorrectly_assigned_fit_param_.x0)
 
339
      {
 
340
        x_neg = max_incorrectly_;
 
341
        x_pos = ((this)->*(calc_correct_))(score, correctly_assigned_fit_param_);
 
342
      }
 
343
      // same as above. However, this time to ensure that probabilities wont drop again.
 
344
      else if (score > correctly_assigned_fit_param_.x0)
 
345
      {
 
346
        x_neg = ((this)->*(calc_incorrect_))(score, incorrectly_assigned_fit_param_);
 
347
        x_pos = max_correctly_;
 
348
      }
 
349
      // if its in between use the normal formula
 
350
      else
 
351
      {
 
352
        x_neg = ((this)->*(calc_incorrect_))(score, incorrectly_assigned_fit_param_);
 
353
        x_pos = ((this)->*(calc_correct_))(score, correctly_assigned_fit_param_);
 
354
      }
 
355
      return (negative_prior_ * x_neg) / ((negative_prior_ * x_neg) + (1 - negative_prior_) * x_pos);
 
356
    }
 
357
 
 
358
    TextFile * PosteriorErrorProbabilityModel::InitPlots(vector<double> & x_scores)
 
359
    {
 
360
      TextFile * file = new TextFile;
 
361
      String output;
 
362
      std::vector<DPosition<2> > points;
 
363
      Int number_of_bins = param_.getValue("number_of_bins");
 
364
      points.resize(number_of_bins);
 
365
      DPosition<2> temp;
 
366
      double dividing_score = (x_scores.back() - x_scores[0]) / number_of_bins;
 
367
 
 
368
      temp.setX(dividing_score / 2);
 
369
      temp.setY(0);
 
370
      Int bin = 0;
 
371
      points[bin] = temp;
 
372
      double temp_divider = dividing_score;
 
373
      for (std::vector<double>::iterator it = x_scores.begin(); it < x_scores.end(); ++it)
 
374
      {
 
375
        if (temp_divider - *it >= 0 && bin < number_of_bins - 1)
 
376
        {
 
377
          points[bin].setY(points[bin].getY() + 1);
 
378
        }
 
379
        else if (bin  == number_of_bins - 1)
 
380
        {
 
381
          points[bin].setY(points[bin].getY() + 1);
 
382
        }
 
383
        else
 
384
        {
 
385
          temp.setX((temp_divider + temp_divider + dividing_score) / 2);
 
386
          temp.setY(1);
 
387
          ++bin;
 
388
          points[bin] = temp;
 
389
          temp_divider += dividing_score;
 
390
        }
 
391
      }
 
392
 
 
393
      for (vector<DPosition<2> >::iterator it = points.begin(); it < points.end(); ++it)
 
394
      {
 
395
        it->setY(it->getY() / (x_scores.size()  * dividing_score));
 
396
      }
 
397
 
 
398
      TextFile data_points;
 
399
      for (vector<DPosition<2> >::iterator it = points.begin(); it < points.end(); ++it)
 
400
      {
 
401
        String temp  = it->getX();
 
402
        temp += "\t";
 
403
        temp += it->getY();
 
404
        data_points << temp;
 
405
      }
 
406
      data_points.store((String)param_.getValue("output_name") + "_scores.txt");
 
407
      output = "set output \"" + (String)param_.getValue("output_name") + ".ps\"";
 
408
      (*file) << "set terminal postscript color solid linewidth 2.0 rounded";
 
409
      //(*file)<<"set style empty solid 0.5 border -1";
 
410
      //(*file)<<"set style function lines";
 
411
      (*file) << "set xlabel \"discriminant score\"";
 
412
      (*file) << "set ylabel \"density\"";
 
413
      //TODO: (*file)<<"set title ";
 
414
      (*file) << "set key off";
 
415
      (*file) << (output);
 
416
      String formula1, formula2;
 
417
      formula1 = ((this)->*(getNegativeGnuplotFormula_))(incorrectly_assigned_fit_param_) + "* " + String(negative_prior_);         //String(incorrectly_assigned_fit_param_.A) +" * exp(-(x - " + String(incorrectly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(incorrectly_assigned_fit_param_.sigma) + ") ** 2)"+ "*" + String(negative_prior_);
 
418
      formula2 = ((this)->*(getPositiveGnuplotFormula_))(correctly_assigned_fit_param_) + "* (1 - " + String(negative_prior_) + ")";         //String(correctly_assigned_fit_param_.A) +" * exp(-(x - " + String(correctly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(correctly_assigned_fit_param_.sigma) + ") ** 2)"+ "* (1 - " + String(negative_prior_) + ")";
 
419
      (*file) << ("plot \"" + (String)param_.getValue("output_name") + "_scores.txt\" with boxes, " + formula1 + " , " + formula2);
 
420
      return file;
 
421
    }
 
422
 
 
423
    const String PosteriorErrorProbabilityModel::getGumbelGnuplotFormula(const GaussFitter::GaussFitResult & params) const
 
424
    {
 
425
      // build a formula with the fitted parameters for gnuplot
 
426
      stringstream formula;
 
427
      formula << "(1/" << params.sigma << ") * " << "exp(( " << params.x0 << "- x)/" << params.sigma << ") * exp(-exp((" << params.x0 << " - x)/" << params.sigma << "))";
 
428
      return formula.str();
 
429
    }
 
430
 
 
431
    const String PosteriorErrorProbabilityModel::getGaussGnuplotFormula(const GaussFitter::GaussFitResult & params) const
 
432
    {
 
433
      stringstream formula;
 
434
      formula << params.A << " * exp(-(x - " << params.x0 << ") ** 2 / 2 / (" << params.sigma << ") ** 2)";
 
435
      return formula.str();
 
436
    }
 
437
 
 
438
    const String PosteriorErrorProbabilityModel::getBothGnuplotFormula(const GaussFitter::GaussFitResult & incorrect, const GaussFitter::GaussFitResult & correct) const
 
439
    {
 
440
      stringstream formula;
 
441
      formula << negative_prior_ << "*" <<  ((this)->*(getNegativeGnuplotFormula_))(incorrect) << " + (1-" << negative_prior_ << ")*" << ((this)->*(getPositiveGnuplotFormula_))(correct);
 
442
      return formula.str();
 
443
    }
 
444
 
 
445
    void PosteriorErrorProbabilityModel::plotTargetDecoyEstimation(vector<double> & target, vector<double> & decoy)
 
446
    {
 
447
      TextFile file;
 
448
      String output;
 
449
      std::vector<DPosition<3> > points;
 
450
      Int number_of_bins = param_.getValue("number_of_bins");
 
451
      points.resize(number_of_bins);
 
452
      DPosition<3> temp;
 
453
 
 
454
      sort(target.begin(), target.end());
 
455
      sort(decoy.begin(), decoy.end());
 
456
 
 
457
      double dividing_score = (max(target.back(), decoy.back()) /*scores.back()*/ - min(target[0], decoy[0]) /*scores[0]*/) / number_of_bins;
 
458
 
 
459
      temp[0] = (dividing_score / 2);
 
460
      temp[1] = 0;
 
461
      temp[2] = 0;
 
462
      Int bin = 0;
 
463
      points[bin] = temp;
 
464
      double temp_divider = dividing_score;
 
465
      for (std::vector<double>::iterator it = target.begin(); it < target.end(); ++it)
 
466
      {
 
467
        *it = *it + fabs(smallest_score_) + 0.001;
 
468
        if (temp_divider - *it >= 0 && bin < number_of_bins - 1)
 
469
        {
 
470
          points[bin][1] = (points[bin][1] + 1);
 
471
        }
 
472
        else if (bin  == number_of_bins - 1)
 
473
        {
 
474
          points[bin][1] = (points[bin][1] + 1);
 
475
        }
 
476
        else
 
477
        {
 
478
          temp[0] = ((temp_divider + temp_divider + dividing_score) / 2);
 
479
          temp[1] = 1;
 
480
          ++bin;
 
481
          points[bin] = temp;
 
482
          temp_divider += dividing_score;
 
483
        }
 
484
      }
 
485
 
 
486
      bin = 0;
 
487
      temp_divider = dividing_score;
 
488
      for (std::vector<double>::iterator it = decoy.begin(); it < decoy.end(); ++it)
 
489
      {
 
490
        *it = *it + fabs(smallest_score_) + 0.001;
 
491
        if (temp_divider - *it >= 0 && bin < number_of_bins - 1)
 
492
        {
 
493
          points[bin][2] = (points[bin][2] + 1);
 
494
        }
 
495
        else if (bin  == number_of_bins - 1)
 
496
        {
 
497
          points[bin][2] = (points[bin][2] + 1);
 
498
        }
 
499
        else
 
500
        {
 
501
          // temp[0] = ((temp_divider + temp_divider + dividing_score)/2);
 
502
          // temp[2] = 1;
 
503
          ++bin;
 
504
          points[bin][2] = 1;
 
505
          temp_divider += dividing_score;
 
506
        }
 
507
      }
 
508
 
 
509
      for (vector<DPosition<3> >::iterator it = points.begin(); it < points.end(); ++it)
 
510
      {
 
511
        // if((*it)[1] > (*it)[2])
 
512
        // {(*it)[1] = (*it)[1] + (*it)[2];}
 
513
        /* else{/(*it)[2] = (*it)[1] + (*it)[2];//}*/
 
514
 
 
515
        (*it)[1] = ((*it)[1] / ((decoy.size() + target.size())  * dividing_score));
 
516
        (*it)[2] = ((*it)[2] / ((decoy.size() + target.size())  * dividing_score));
 
517
      }
 
518
 
 
519
      TextFile data_points;
 
520
      for (vector<DPosition<3> >::iterator it = points.begin(); it < points.end(); ++it)
 
521
      {
 
522
        String temp  = (*it)[0];
 
523
        temp += "\t";
 
524
        temp += (*it)[1];
 
525
        temp += "\t";
 
526
        temp += (*it)[2];
 
527
        data_points << temp;
 
528
      }
 
529
      data_points.store((String)param_.getValue("output_name") + "_target_decoy_scores.txt");
 
530
      output = String("set output \"") +  (String)param_.getValue("output_name") + "_target_decoy.ps\"";
 
531
      (file) << "set terminal postscript color solid linewidth 2.0 rounded";
 
532
      //(*file)<<"set style empty solid 0.5 border -1";
 
533
      //(*file)<<"set style function lines";
 
534
      (file) << "set xlabel \"discriminant score\"";
 
535
      (file) << "set ylabel \"density\"";
 
536
      //TODO: (*file)<<"set title ";
 
537
      (file) << "set key off";
 
538
      (file) << (output);
 
539
      String formula1, formula2;
 
540
      formula1 = getGumbelGnuplotFormula(getIncorrectlyAssignedFitResult()) + "* " + String(getNegativePrior());         //String(incorrectly_assigned_fit_param_.A) +" * exp(-(x - " + String(incorrectly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(incorrectly_assigned_fit_param_.sigma) + ") ** 2)"+ "*" + String(negative_prior_);
 
541
      formula2 = getGaussGnuplotFormula(getCorrectlyAssignedFitResult()) + "* (1 - " + String(getNegativePrior()) + ")";         //String(correctly_assigned_fit_param_.A) +" * exp(-(x - " + String(correctly_assigned_fit_param_.x0) + ") ** 2 / 2 / (" + String(correctly_assigned_fit_param_.sigma) + ") ** 2)"+ "* (1 - " + String(negative_prior_) + ")";
 
542
      (file) << ("plot \"" + (String)param_.getValue("output_name") + "_target_decoy_scores.txt\"   using 1:3  with boxes fill solid 0.8 noborder, \"" + (String)param_.getValue("output_name") + "_target_decoy_scores.txt\"  using 1:2  with boxes, " + formula1 + " , " + formula2);
 
543
      file.store((String)param_.getValue("output_name") + "_target_decoy");
 
544
    }
 
545
 
 
546
  }   //namespace Math
534
547
} // namespace OpenMS
535