~ubuntu-branches/ubuntu/wily/openms/wily

« back to all changes in this revision

Viewing changes to source/CHEMISTRY/SvmTheoreticalSpectrumGeneratorTrainer.C

  • Committer: Package Import Robot
  • Author(s): Filippo Rusconi
  • Date: 2013-12-20 11:30:16 UTC
  • mfrom: (5.1.2 sid)
  • Revision ID: package-import@ubuntu.com-20131220113016-wre5g9bteeheq6he
Tags: 1.11.1-3
* remove version number from libbost development package names;
* ensure that AUTHORS is correctly shipped in all packages.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
// -*- mode: C++; tab-width: 2; -*-
2
 
// vi: set ts=2:
3
 
//
4
 
// --------------------------------------------------------------------------
5
 
//                   OpenMS Mass Spectrometry Framework
6
 
// --------------------------------------------------------------------------
7
 
//  Copyright (C) 2003-2011 -- Oliver Kohlbacher, Knut Reinert
8
 
//
9
 
//  This library is free software; you can redistribute it and/or
10
 
//  modify it under the terms of the GNU Lesser General Public
11
 
//  License as published by the Free Software Foundation; either
12
 
//  version 2.1 of the License, or (at your option) any later version.
13
 
//
14
 
//  This library is distributed in the hope that it will be useful,
15
 
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 
//  Lesser General Public License for more details.
18
 
//
19
 
//  You should have received a copy of the GNU Lesser General Public
20
 
//  License along with this library; if not, write to the Free Software
21
 
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
1
// --------------------------------------------------------------------------
 
2
//                   OpenMS -- Open-Source Mass Spectrometry
 
3
// --------------------------------------------------------------------------
 
4
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
 
5
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
 
6
//
 
7
// This software is released under a three-clause BSD license:
 
8
//  * Redistributions of source code must retain the above copyright
 
9
//    notice, this list of conditions and the following disclaimer.
 
10
//  * Redistributions in binary form must reproduce the above copyright
 
11
//    notice, this list of conditions and the following disclaimer in the
 
12
//    documentation and/or other materials provided with the distribution.
 
13
//  * Neither the name of any author or any participating institution
 
14
//    may be used to endorse or promote products derived from this software
 
15
//    without specific prior written permission.
 
16
// For a full list of authors, refer to the file AUTHORS.
 
17
// --------------------------------------------------------------------------
 
18
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 
19
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 
20
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 
21
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
 
22
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 
23
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 
24
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 
25
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 
26
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 
27
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 
28
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22
29
//
23
30
// --------------------------------------------------------------------------
24
31
// $Maintainer: Sandro Andreotti $
43
50
namespace OpenMS
44
51
{
45
52
  SvmTheoreticalSpectrumGeneratorTrainer::SvmTheoreticalSpectrumGeneratorTrainer() :
46
 
      DefaultParamHandler("SvmTheoreticalSpectrumGeneratorTrainer")
 
53
    DefaultParamHandler("SvmTheoreticalSpectrumGeneratorTrainer")
47
54
  {
48
 
    defaults_.setValue("write_training_files", "false", "If set to true no models are trained but files (<Filename>_<ion_type>_training.dat) are produced \
49
 
for the selected primary ion types. They can be used as input for LibSVM command line tools");
50
 
 
 
55
    defaults_.setValue("write_training_files", "false", "If set to true no models are trained but files (<Filename>_<ion_type>_training.dat) are produced for the selected primary ion types. They can be used as input for LibSVM command line tools");
51
56
    defaults_.setValidStrings("write_training_files", StringList::create("true,false"));
52
57
 
53
58
    defaults_.setValue("number_intensity_levels", 7, "The number of intensity bins (for secondary type models)");
76
81
    defaults_.setValidStrings("add_y2_ions", StringList::create("true,false"));
77
82
 
78
83
    defaults_.setValue("svm:svc_type", 0, "Type of the SVC: 0=C_SVC 1=NU_SVC");
79
 
    defaults_.setMinInt("svm:svc_type",0);
80
 
    defaults_.setMaxInt("svm:svc_type",1);
 
84
    defaults_.setMinInt("svm:svc_type", 0);
 
85
    defaults_.setMaxInt("svm:svc_type", 1);
81
86
 
82
87
    defaults_.setValue("svm:svr_type", 1, "Type of the SVR: 0=EPSILON_SVR 1=NU_SVR");
83
 
    defaults_.setMinInt("svm:svr_type",0);
84
 
    defaults_.setMaxInt("svm:svr_type",1);
 
88
    defaults_.setMinInt("svm:svr_type", 0);
 
89
    defaults_.setMaxInt("svm:svr_type", 1);
85
90
 
86
91
    defaults_.setValue("svm:svc:kernel_type", 2, "Type of the kernel:  0=LINEAR 1=POLY 2=RBF 3=SIGMOID");
87
 
    defaults_.setMinInt("svm:svc:kernel_type",0);
88
 
    defaults_.setMaxInt("svm:svc:kernel_type",3);
 
92
    defaults_.setMinInt("svm:svc:kernel_type", 0);
 
93
    defaults_.setMaxInt("svm:svc:kernel_type", 3);
89
94
 
90
95
    defaults_.setValue("svm:svr:kernel_type", 2, "Type of the kernel:  0=LINEAR 1=POLY 2=RBF 3=SIGMOID");
91
 
    defaults_.setMinInt("svm:svr:kernel_type",0);
92
 
    defaults_.setMaxInt("svm:svr:kernel_type",3);
 
96
    defaults_.setMinInt("svm:svr:kernel_type", 0);
 
97
    defaults_.setMaxInt("svm:svr:kernel_type", 3);
93
98
 
94
99
 
95
100
    defaults_.setValue("svm:svc:degree", 3, "For POLY");
96
 
    defaults_.setMinInt("svm:svc:degree",1);
 
101
    defaults_.setMinInt("svm:svc:degree", 1);
97
102
 
98
103
    defaults_.setValue("svm:svr:degree", 3, "For POLY");
99
 
    defaults_.setMinInt("svm:svr:degree",1);
 
104
    defaults_.setMinInt("svm:svr:degree", 1);
100
105
 
101
106
    defaults_.setValue("svm:svc:gamma", 0.0, "For POLY/RBF/SIGMOID");
102
 
    defaults_.setMinFloat("svm:svc:gamma",0.0);
 
107
    defaults_.setMinFloat("svm:svc:gamma", 0.0);
103
108
 
104
109
    defaults_.setValue("svm:svr:gamma", 0.0, "For POLY/RBF/SIGMOID");
105
 
    defaults_.setMinFloat("svm:svr:gamma",0.0);
 
110
    defaults_.setMinFloat("svm:svr:gamma", 0.0);
106
111
 
107
112
    //defaults_.setValue("svm:svc:coef0", 0.0, "For POLY/SIGMOID");
108
113
    //defaults_.setMinFloat("svm:svc:coef0",0.0);
141
146
    defaults_.setSectionDescription("svm:svr", "Parameters for svm - regression of peak intensities");
142
147
 
143
148
    defaults_.setValue("svm:n_fold", 5, "n_fold cross validation is performed");
144
 
    defaults_.setMinInt("svm:n_fold",1);
 
149
    defaults_.setMinInt("svm:n_fold", 1);
145
150
 
146
151
    defaults_.setValue("svm:grid", "false", "Perform grid search");
147
152
    defaults_.setValidStrings("svm:grid", StringList::create("true,false"));
149
154
    defaults_.setValue("svm:additive_cv", "false", "Additive step size (if false multiplicative)");
150
155
    defaults_.setValidStrings("svm:additive_cv", StringList::create("true,false"));
151
156
 
152
 
    defaults_.setValue("svm:svc:degree_start",1,"starting point of degree");
 
157
    defaults_.setValue("svm:svc:degree_start", 1, "starting point of degree");
153
158
    defaults_.setMinInt("svm:svc:degree_start", 1);
154
 
    defaults_.setValue("svm:svc:degree_step_size",2,"step size point of degree");
155
 
    defaults_.setValue("svm:svc:degree_stop",4,"stopping point of degree");
 
159
    defaults_.setValue("svm:svc:degree_step_size", 2, "step size point of degree");
 
160
    defaults_.setValue("svm:svc:degree_stop", 4, "stopping point of degree");
156
161
 
157
 
    defaults_.setValue("svm:svc:gamma_start",0.00001,"starting point of gamma");
 
162
    defaults_.setValue("svm:svc:gamma_start", 0.00001, "starting point of gamma");
158
163
    defaults_.setMinFloat("svm:svc:gamma_start", 0.0);
159
164
    defaults_.setMaxFloat("svm:svc:gamma_start", 1.0);
160
 
    defaults_.setValue("svm:svc:gamma_step_size",100,"step size point of gamma");
161
 
    defaults_.setValue("svm:svc:gamma_stop",0.1,"stopping point of gamma");
162
 
 
163
 
    defaults_.setValue("svm:svc:c_start",0.1,"starting point of c");
164
 
    defaults_.setValue("svm:svc:c_step_size",100,"step size of c");
165
 
    defaults_.setValue("svm:svc:c_stop",1000,"stopping point of c");
166
 
 
167
 
    defaults_.setValue("svm:svc:nu_start",0.3,"starting point of nu");
 
165
    defaults_.setValue("svm:svc:gamma_step_size", 100, "step size point of gamma");
 
166
    defaults_.setValue("svm:svc:gamma_stop", 0.1, "stopping point of gamma");
 
167
 
 
168
    defaults_.setValue("svm:svc:c_start", 0.1, "starting point of c");
 
169
    defaults_.setValue("svm:svc:c_step_size", 100, "step size of c");
 
170
    defaults_.setValue("svm:svc:c_stop", 1000, "stopping point of c");
 
171
 
 
172
    defaults_.setValue("svm:svc:nu_start", 0.3, "starting point of nu");
168
173
    defaults_.setMinFloat("svm:svc:nu_start", 0);
169
174
    defaults_.setMaxFloat("svm:svc:nu_start", 1);
170
 
    defaults_.setValue("svm:svc:nu_step_size",2,"step size of nu");
171
 
    defaults_.setValue("svm:svc:nu_stop",0.6,"stopping point of nu");
 
175
    defaults_.setValue("svm:svc:nu_step_size", 2, "step size of nu");
 
176
    defaults_.setValue("svm:svc:nu_stop", 0.6, "stopping point of nu");
172
177
    defaults_.setMinFloat("svm:svc:nu_stop", 0);
173
178
    defaults_.setMaxFloat("svm:svc:nu_stop", 1);
174
179
 
175
 
    defaults_.setValue("svm:svr:degree_start",1,"starting point of degree");
 
180
    defaults_.setValue("svm:svr:degree_start", 1, "starting point of degree");
176
181
    defaults_.setMinInt("svm:svr:degree_start", 1);
177
 
    defaults_.setValue("svm:svr:degree_step_size",2,"step size point of degree");
178
 
    defaults_.setValue("svm:svr:degree_stop",4,"stopping point of degree");
 
182
    defaults_.setValue("svm:svr:degree_step_size", 2, "step size point of degree");
 
183
    defaults_.setValue("svm:svr:degree_stop", 4, "stopping point of degree");
179
184
 
180
 
    defaults_.setValue("svm:svr:gamma_start",0.00001,"starting point of gamma");
 
185
    defaults_.setValue("svm:svr:gamma_start", 0.00001, "starting point of gamma");
181
186
    defaults_.setMinFloat("svm:svr:gamma_start", 0.0);
182
187
    defaults_.setMaxFloat("svm:svr:gamma_start", 1.0);
183
 
    defaults_.setValue("svm:svr:gamma_step_size",100,"step size point of gamma");
184
 
    defaults_.setValue("svm:svr:gamma_stop",0.1,"stopping point of gamma");
185
 
 
186
 
    defaults_.setValue("svm:svr:p_start",0.00001,"starting point of p");
187
 
    defaults_.setValue("svm:svr:p_step_size",100,"step size point of p");
188
 
    defaults_.setValue("svm:svr:p_stop",0.1,"stopping point of p");
189
 
 
190
 
    defaults_.setValue("svm:svr:c_start",0.1,"starting point of c");
191
 
    defaults_.setValue("svm:svr:c_step_size",100,"step size of c");
192
 
    defaults_.setValue("svm:svr:c_stop",1000,"stopping point of c");
193
 
 
194
 
    defaults_.setValue("svm:svr:nu_start",0.3,"starting point of nu");
 
188
    defaults_.setValue("svm:svr:gamma_step_size", 100, "step size point of gamma");
 
189
    defaults_.setValue("svm:svr:gamma_stop", 0.1, "stopping point of gamma");
 
190
 
 
191
    defaults_.setValue("svm:svr:p_start", 0.00001, "starting point of p");
 
192
    defaults_.setValue("svm:svr:p_step_size", 100, "step size point of p");
 
193
    defaults_.setValue("svm:svr:p_stop", 0.1, "stopping point of p");
 
194
 
 
195
    defaults_.setValue("svm:svr:c_start", 0.1, "starting point of c");
 
196
    defaults_.setValue("svm:svr:c_step_size", 100, "step size of c");
 
197
    defaults_.setValue("svm:svr:c_stop", 1000, "stopping point of c");
 
198
 
 
199
    defaults_.setValue("svm:svr:nu_start", 0.3, "starting point of nu");
195
200
    defaults_.setMinFloat("svm:svr:nu_start", 0);
196
201
    defaults_.setMaxFloat("svm:svr:nu_start", 1);
197
 
    defaults_.setValue("svm:svr:nu_step_size",2,"step size of nu");
198
 
    defaults_.setValue("svm:svr:nu_stop",0.6,"stopping point of nu");
 
202
    defaults_.setValue("svm:svr:nu_step_size", 2, "step size of nu");
 
203
    defaults_.setValue("svm:svr:nu_stop", 0.6, "stopping point of nu");
199
204
    defaults_.setMinFloat("svm:svr:nu_stop", 0);
200
205
    defaults_.setMaxFloat("svm:svr:nu_stop", 1);
201
206
 
204
209
    defaultsToParam_();
205
210
  }
206
211
 
207
 
  SvmTheoreticalSpectrumGeneratorTrainer::SvmTheoreticalSpectrumGeneratorTrainer(const SvmTheoreticalSpectrumGeneratorTrainer& rhs) :
 
212
  SvmTheoreticalSpectrumGeneratorTrainer::SvmTheoreticalSpectrumGeneratorTrainer(const SvmTheoreticalSpectrumGeneratorTrainer & rhs) :
208
213
    DefaultParamHandler(rhs)
209
214
  {
210
215
    updateMembers_();
211
216
  }
212
217
 
213
 
  SvmTheoreticalSpectrumGeneratorTrainer& SvmTheoreticalSpectrumGeneratorTrainer::operator =(const SvmTheoreticalSpectrumGeneratorTrainer& rhs)
 
218
  SvmTheoreticalSpectrumGeneratorTrainer & SvmTheoreticalSpectrumGeneratorTrainer::operator=(const SvmTheoreticalSpectrumGeneratorTrainer & rhs)
214
219
  {
215
220
    if (this != &rhs)
216
221
    {
217
 
      DefaultParamHandler::operator=(rhs);      
 
222
      DefaultParamHandler::operator=(rhs);
218
223
      updateMembers_();
219
224
    }
220
225
    return *this;
224
229
  {
225
230
  }
226
231
 
227
 
 
228
 
  void SvmTheoreticalSpectrumGeneratorTrainer::trainModel(const PeakMap &spectra, const std::vector<AASequence>&annotations, String filename, Int precursor_charge)
 
232
  void SvmTheoreticalSpectrumGeneratorTrainer::trainModel(const PeakMap & spectra, const std::vector<AASequence> & annotations, String filename, Int precursor_charge)
229
233
  {
230
234
    //----------- BEGIN OF PARAMETER READING-------------------------
231
235
 
245
249
 
246
250
    //build set of ion types
247
251
    std::vector<IonType> ion_types;
248
 
    std::vector<bool>is_primary;
 
252
    std::vector<bool> is_primary;
249
253
 
250
254
    if (DataValue(param_.getValue("add_b_ions")).toBool())
251
255
    {
256
260
    {
257
261
      ion_types.push_back(IonType(Residue::YIon, EmpiricalFormula(), 1));
258
262
      is_primary.push_back(true);
259
 
    }    
 
263
    }
260
264
    if (DataValue(param_.getValue("add_x_ions")).toBool())
261
265
    {
262
266
      ion_types.push_back(IonType(Residue::XIon, EmpiricalFormula(), 1));
263
267
      is_primary.push_back(false);
264
 
      secondary_types=true;
 
268
      secondary_types = true;
265
269
    }
266
270
    if (DataValue(param_.getValue("add_a_ions")).toBool())
267
271
    {
268
272
      ion_types.push_back(IonType(Residue::AIon, EmpiricalFormula(), 1));
269
273
      is_primary.push_back(false);
270
 
      secondary_types=true;
 
274
      secondary_types = true;
271
275
    }
272
276
    if (DataValue(param_.getValue("add_z_ions")).toBool())
273
277
    {
274
278
      ion_types.push_back(IonType(Residue::ZIon, EmpiricalFormula(), 1));
275
279
      is_primary.push_back(false);
276
 
      secondary_types=true;
 
280
      secondary_types = true;
277
281
    }
278
282
    if (DataValue(param_.getValue("add_c_ions")).toBool())
279
283
    {
280
284
      ion_types.push_back(IonType(Residue::CIon, EmpiricalFormula(), 1));
281
285
      is_primary.push_back(false);
282
 
      secondary_types=true;
 
286
      secondary_types = true;
283
287
    }
284
288
 
285
289
    if (DataValue(param_.getValue("add_losses")).toBool())
293
297
        ion_types.push_back(IonType(Residue::BIon, loss_water, 1));
294
298
        is_primary.push_back(false);
295
299
        is_primary.push_back(false);
296
 
        secondary_types=true;
 
300
        secondary_types = true;
297
301
      }
298
302
      if (DataValue(param_.getValue("add_y_ions")).toBool())
299
303
      {
301
305
        ion_types.push_back(IonType(Residue::YIon, loss_water, 1));
302
306
        is_primary.push_back(false);
303
307
        is_primary.push_back(false);
304
 
        secondary_types=true;
 
308
        secondary_types = true;
305
309
      }
306
310
    }
307
311
 
309
313
    {
310
314
      ion_types.push_back(IonType(Residue::YIon, EmpiricalFormula(), 2));
311
315
      is_primary.push_back(false);
312
 
      secondary_types=true;
 
316
      secondary_types = true;
313
317
    }
314
318
    if (DataValue(param_.getValue("add_b2_ions")).toBool())
315
319
    {
316
320
      ion_types.push_back(IonType(Residue::BIon, EmpiricalFormula(), 2));
317
321
      is_primary.push_back(false);
318
 
      secondary_types=true;
 
322
      secondary_types = true;
319
323
    }
320
324
 
321
325
    //-----------------------LOADING THE SVM PARAMETERS------------------
326
330
    bool scaling = param_.getValue("svm:scaling").toBool();
327
331
    if (scaling)
328
332
    {
329
 
      upper=param_.getValue("svm:scaling_upper");
330
 
      lower=param_.getValue("svm:scaling_lower");
 
333
      upper = param_.getValue("svm:scaling_upper");
 
334
      lower = param_.getValue("svm:scaling_lower");
331
335
    }
332
336
 
333
337
    SVMWrapper wrap_reg, wrap_class;
346
350
    wrap_class.setParameter(SVMWrapper::DEGREE, (Int)param_.getValue("svm:svc:degree"));
347
351
    wrap_class.setParameter(SVMWrapper::C, (DoubleReal)param_.getValue("svm:svc:C"));
348
352
    wrap_class.setParameter(SVMWrapper::NU, (DoubleReal)param_.getValue("svm:svc:nu"));
349
 
    wrap_class.setParameter(SVMWrapper::GAMMA, (DoubleReal)param_.getValue("svm:svc:gamma"));    
 
353
    wrap_class.setParameter(SVMWrapper::GAMMA, (DoubleReal)param_.getValue("svm:svc:gamma"));
350
354
 
351
355
    //-----------------------LOADING THE Grid Search Parameters------------------
352
356
 
354
358
    bool additive_cv = param_.getValue("svm:additive_cv").toBool();
355
359
 
356
360
    //SVR Grid params
357
 
    std::map< SVMWrapper::SVM_parameter_type, DoubleReal > start_values_reg, end_values_reg, step_sizes_reg;
358
 
    if(grid)
 
361
    std::map<SVMWrapper::SVM_parameter_type, DoubleReal> start_values_reg, end_values_reg, step_sizes_reg;
 
362
    if (grid)
359
363
    {
360
364
      DoubleReal gamma_start = (DoubleReal)param_.getValue("svm:svr:gamma_start");
361
365
      DoubleReal gamma_step_size = (DoubleReal)param_.getValue("svm:svr:gamma_step_size");
410
414
      DoubleReal c_step_size = 0.;
411
415
      DoubleReal c_stop = 0.;
412
416
 
413
 
      c_start =(DoubleReal)param_.getValue("svm:svr:c_start");
414
 
      c_step_size =(DoubleReal)param_.getValue("svm:svr:c_step_size");
 
417
      c_start = (DoubleReal)param_.getValue("svm:svr:c_start");
 
418
      c_step_size = (DoubleReal)param_.getValue("svm:svr:c_step_size");
415
419
      if (!additive_cv && c_step_size <= 1)
416
420
      {
417
421
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Step size of c <= 1 and additive_cv is false. Aborting!");
422
426
      step_sizes_reg.insert(make_pair(SVMWrapper::C, c_step_size));
423
427
      end_values_reg.insert(make_pair(SVMWrapper::C, c_stop));
424
428
 
425
 
      if ( (wrap_reg.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVR || wrap_reg.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVC))
 
429
      if ((wrap_reg.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVR || wrap_reg.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVC))
426
430
      {
427
431
        DoubleReal nu_start = 0.;
428
432
        DoubleReal nu_step_size = 0.;
443
447
    }
444
448
 
445
449
    //SVC Grid params
446
 
    std::map< SVMWrapper::SVM_parameter_type, DoubleReal > start_values_class, end_values_class, step_sizes_class;
 
450
    std::map<SVMWrapper::SVM_parameter_type, DoubleReal> start_values_class, end_values_class, step_sizes_class;
447
451
 
448
 
    if(grid)
 
452
    if (grid)
449
453
    {
450
454
      DoubleReal gamma_start = (DoubleReal)param_.getValue("svm:svc:gamma_start");
451
455
      DoubleReal gamma_step_size = (DoubleReal)param_.getValue("svm:svc:gamma_step_size");
475
479
        start_values_class.insert(make_pair(SVMWrapper::DEGREE, degree_start));
476
480
        step_sizes_class.insert(make_pair(SVMWrapper::DEGREE, degree_step_size));
477
481
        end_values_class.insert(make_pair(SVMWrapper::DEGREE, degree_stop));
478
 
      }      
 
482
      }
479
483
 
480
484
      DoubleReal c_start = 0.;
481
485
      DoubleReal c_step_size = 0.;
482
486
      DoubleReal c_stop = 0.;
483
487
 
484
 
      c_start =(DoubleReal)param_.getValue("svm:svc:c_start");
485
 
      c_step_size =(DoubleReal)param_.getValue("svm:svc:c_step_size");
 
488
      c_start = (DoubleReal)param_.getValue("svm:svc:c_start");
 
489
      c_step_size = (DoubleReal)param_.getValue("svm:svc:c_step_size");
486
490
      if (!additive_cv && c_step_size <= 1)
487
491
      {
488
492
        throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Step size of c <= 1 and additive_cv is false. Aborting!");
493
497
      step_sizes_class.insert(make_pair(SVMWrapper::C, c_step_size));
494
498
      end_values_class.insert(make_pair(SVMWrapper::C, c_stop));
495
499
 
496
 
      if ( (wrap_class.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVR || wrap_class.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVC))
 
500
      if ((wrap_class.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVR || wrap_class.getIntParameter(SVMWrapper::SVM_TYPE) == NU_SVC))
497
501
      {
498
502
        DoubleReal nu_start = 0.;
499
 
        DoubleReal nu_step_size = 0.;
 
503
        DoubleReal  nu_step_size = 0.;
500
504
        DoubleReal nu_stop = 0.;
501
505
 
502
506
        nu_start = (DoubleReal)param_.getValue("svm:svc:nu_start");
526
530
    info_outfile.push_back("<PrimaryTypes>");
527
531
 
528
532
    //count number of usable spectra
529
 
    Size usable_spectra=spectra.size();
530
 
    std::vector<bool>is_spectrum_usable(spectra.size(), true);
 
533
    Size usable_spectra = spectra.size();
 
534
    std::vector<bool> is_spectrum_usable(spectra.size(), true);
531
535
 
532
536
    for (Size index = 0; index < spectra.size(); ++index)
533
537
    {
534
538
      //test whether annotation for spectrum match. If theoretical and empirical mass differ too much skip this spectrum
535
 
      Int empirical_charge=spectra[index].getPrecursors()[0].getCharge();
536
 
      DoubleReal empirical_parent_mass=spectra[index].getPrecursors()[0].getMZ();
537
 
      DoubleReal theoretical_parent_mass=annotations[index].getMonoWeight(Residue::Full, empirical_charge)/empirical_charge;
 
539
      Int empirical_charge = spectra[index].getPrecursors()[0].getCharge();
 
540
      DoubleReal empirical_parent_mass = spectra[index].getPrecursors()[0].getMZ();
 
541
      DoubleReal theoretical_parent_mass = annotations[index].getMonoWeight(Residue::Full, empirical_charge) / empirical_charge;
538
542
 
539
 
      if (abs(empirical_parent_mass-theoretical_parent_mass) > parent_tolerance)
 
543
      if (abs(empirical_parent_mass - theoretical_parent_mass) > parent_tolerance)
540
544
      {
541
545
        is_spectrum_usable[index] = false;
542
546
        --usable_spectra;
543
 
        std::cerr << "skipping spectrum " << index <<" due to parent mass missmatch"<<std::endl;
 
547
        std::cerr << "skipping spectrum " << index << " due to parent mass missmatch" << std::endl;
544
548
      }
545
549
      else if (precursor_charge != empirical_charge)
546
550
      {
547
551
        is_spectrum_usable[index] = false;
548
552
        --usable_spectra;
549
 
        std::cerr << "skipping spectrum " << index <<" due to wrong precursor charge"<<std::endl;
 
553
        std::cerr << "skipping spectrum " << index << " due to wrong precursor charge" << std::endl;
550
554
      }
551
555
    }
552
556
 
553
557
    //only required for generatrion of descriptors
554
 
    SvmTheoreticalSpectrumGenerator spec_gen;    
 
558
    SvmTheoreticalSpectrumGenerator spec_gen;
555
559
 
556
 
    //Amino acid sequences used for obtaining prefix and suffix mass    
 
560
    //Amino acid sequences used for obtaining prefix and suffix mass
557
561
    AASequence prefix, suffix;
558
562
 
559
563
    DescriptorSet tmp_desc;
560
 
    Size num_features = spec_gen.generateDescriptorSet_(annotations[0],0,ion_types[0], 1, tmp_desc);
 
564
    Size num_features = spec_gen.generateDescriptorSet_(annotations[0], 0, ion_types[0], 1, tmp_desc);
561
565
 
562
566
    //use number of features to adapt gamma parameter if set to 0 (same is used in svm-train.c)
563
 
    if(wrap_reg.getDoubleParameter(SVMWrapper::GAMMA) == 0. )
 
567
    if (wrap_reg.getDoubleParameter(SVMWrapper::GAMMA) == 0.)
564
568
    {
565
 
      wrap_reg.setParameter(SVMWrapper::GAMMA, 1.0/num_features);
 
569
      wrap_reg.setParameter(SVMWrapper::GAMMA, 1.0 / num_features);
566
570
    }
567
 
    if(wrap_class.getDoubleParameter(SVMWrapper::GAMMA) == 0. )
 
571
    if (wrap_class.getDoubleParameter(SVMWrapper::GAMMA) == 0.)
568
572
    {
569
 
      wrap_class.setParameter(SVMWrapper::GAMMA, 1.0/num_features);
 
573
      wrap_class.setParameter(SVMWrapper::GAMMA, 1.0 / num_features);
570
574
    }
571
575
 
572
576
    //vectors to store the minimum and maximum value appearing in the training data for each feature (required for scaling)
573
577
    ObservedIntensMap observed_intensities;
574
 
    std::map<Size, std::vector<DescriptorSet> >training_input;
575
 
    std::map<Size, std::vector<double> >training_output;
 
578
    std::map<Size, std::vector<DescriptorSet> > training_input;
 
579
    std::map<Size, std::vector<double> > training_output;
576
580
 
577
581
    Size spec_index = 0;
578
582
    Size x = 1;
579
583
    //run over all input spectra
580
 
    for (PeakMap::const_iterator map_it = spectra.begin(); map_it < spectra.end(); map_it+=x, spec_index+=x)
581
 
    {      
 
584
    for (PeakMap::const_iterator map_it = spectra.begin(); map_it < spectra.end(); map_it += x, spec_index += x)
 
585
    {
582
586
      if (!is_spectrum_usable[spec_index])
583
587
      {
584
588
        continue;
585
 
      }      
 
589
      }
586
590
 
587
591
      Size precursor_charge = map_it->getPrecursors()[0].getCharge();
588
592
      PeakSpectrum input_spec_norm(*map_it);
590
594
      normalizeIntensity(input_spec_norm);
591
595
 
592
596
      for (Size type_nr = 0; type_nr < ion_types.size(); ++type_nr)
593
 
      {        
 
597
      {
594
598
        //store the intensities of the detected peaks for the given type in the actual spectrum (required for sec. type prob model)
595
599
        countIntensities_(input_spec_norm, annotations[spec_index], ion_types[type_nr], observed_intensities, peak_tolerance, number_of_regions);
596
600
      }
597
601
 
598
602
      for (Size type_nr = 0; type_nr < ion_types.size(); ++type_nr)
599
603
      {
600
 
        if(!is_primary[type_nr])
 
604
        if (!is_primary[type_nr])
601
605
        {
602
606
          continue;
603
 
        }        
 
607
        }
604
608
 
605
609
        DoubleReal true_offset_mass = 0.0;
606
610
        Residue::ResidueType residue = ion_types[type_nr].residue;
608
612
        EmpiricalFormula loss = ion_types[type_nr].loss;
609
613
 
610
614
        training_input[type_nr].reserve(usable_spectra);
611
 
        training_output[type_nr].reserve(usable_spectra);               
 
615
        training_output[type_nr].reserve(usable_spectra);
612
616
 
613
617
        for (Size frag_pos = 1; frag_pos < annotations[spec_index].size(); ++frag_pos)
614
618
        {
639
643
          }
640
644
 
641
645
          DescriptorSet descriptors;
642
 
          spec_gen.generateDescriptorSet_(annotations[spec_index],frag_pos-1,ion_types[type_nr], precursor_charge, descriptors);
 
646
          spec_gen.generateDescriptorSet_(annotations[spec_index], frag_pos - 1, ion_types[type_nr], precursor_charge, descriptors);
643
647
 
644
648
          training_input[type_nr].push_back(descriptors);
645
 
          training_output[type_nr].push_back(observed_peak_intensity);          
 
649
          training_output[type_nr].push_back(observed_peak_intensity);
646
650
        }
647
 
      }//end of running over all types
648
 
    }//end of running over all spectra
 
651
      } //end of running over all types
 
652
    } //end of running over all spectra
649
653
 
650
 
    //scale the input data    
651
 
    std::vector<double>min_features;
 
654
    //scale the input data
 
655
    std::vector<double> min_features;
652
656
    min_features.reserve(num_features);
653
 
    std::vector<double>max_features;
 
657
    std::vector<double> max_features;
654
658
    max_features.reserve(num_features);
655
659
 
656
660
    min_features.assign(num_features, std::numeric_limits<double>::infinity());
657
661
    max_features.assign(num_features, -1 * std::numeric_limits<double>::infinity());
658
662
 
659
663
    //SCALING
660
 
    if(scaling)
 
664
    if (scaling)
661
665
    {
662
666
      for (Size type_nr = 0; type_nr < ion_types.size(); ++type_nr)
663
667
      {
664
668
        //first compute the minimal and maximal entries for each feature
665
669
        std::vector<svm_node>::iterator it;
666
 
        for(Size train_row=0; train_row<training_input[type_nr].size(); ++train_row)
 
670
        for (Size train_row = 0; train_row < training_input[type_nr].size(); ++train_row)
667
671
        {
668
672
          Size index = 0;
669
 
          for(it = training_input[type_nr][train_row].descriptors.begin(); it != training_input[type_nr][train_row].descriptors.end(); ++it)
 
673
          for (it = training_input[type_nr][train_row].descriptors.begin(); it != training_input[type_nr][train_row].descriptors.end(); ++it)
670
674
          {
671
 
            while( (Int) index < it->index-1)
 
675
            while ((Int) index < it->index - 1)
672
676
            {
673
677
              min_features[index] = std::min(min_features[index], 0.0);
674
 
              max_features[index] = std::max(max_features[index],0.0);
 
678
              max_features[index] = std::max(max_features[index], 0.0);
675
679
              ++index;
676
680
            }
677
681
            min_features[index] = std::min(min_features[index], it->value);
679
683
            ++index;
680
684
          }
681
685
 
682
 
          while(index<num_features)
 
686
          while (index < num_features)
683
687
          {
684
688
            min_features[index] = std::min(min_features[index], 0.0);
685
 
            max_features[index] = std::max(max_features[index],0.0);
 
689
            max_features[index] = std::max(max_features[index], 0.0);
686
690
            ++index;
687
691
          }
688
692
        }
695
699
 
696
700
      for (Size type_nr = 0; type_nr < ion_types.size(); ++type_nr)
697
701
      {
698
 
        if(!is_primary[type_nr])
 
702
        if (!is_primary[type_nr])
699
703
        {
700
704
          continue;
701
705
        }
702
706
 
703
707
        //with the minimal and maximal now scale
704
 
        for(Size train_row=0; train_row<training_input[type_nr].size(); ++train_row)
 
708
        for (Size train_row = 0; train_row < training_input[type_nr].size(); ++train_row)
705
709
        {
706
 
          spec_gen.scaleDescriptorSet_(training_input[type_nr][train_row],lower, upper);
707
 
        }        
 
710
          spec_gen.scaleDescriptorSet_(training_input[type_nr][train_row], lower, upper);
 
711
        }
708
712
      }
709
713
    }
710
714
 
711
715
 
712
716
    // Now 2 Steps - first we train a SVM classifier to predict abundance or miss of a peak.
713
717
    // In second step we train a SVM regression model to predict intensities. This svm is trained
714
 
    // only with rows for abundant peaks    
 
718
    // only with rows for abundant peaks
715
719
 
716
720
    //Within this loop the SVR and SVC are trained for the primary ion types
717
721
    for (Size type_nr = 0; type_nr < ion_types.size(); ++type_nr)
718
722
    {
719
 
      if(!is_primary[type_nr])
 
723
      if (!is_primary[type_nr])
720
724
      {
721
725
        continue;
722
726
      }
723
727
 
724
728
      String svm_model_file_class = filename + "_" + Residue::getResidueTypeName(ion_types[type_nr].residue) + "_" +
725
 
                                            ion_types[type_nr].loss.getString() + "_" + ion_types[type_nr].charge + "_class.svm";
 
729
                                    ion_types[type_nr].loss.getString() + "_" + ion_types[type_nr].charge + "_class.svm";
726
730
      String svm_model_file_reg = filename + "_" + Residue::getResidueTypeName(ion_types[type_nr].residue) + "_" +
727
 
          ion_types[type_nr].loss.getString() + "_" + ion_types[type_nr].charge + "_reg.svm";
 
731
                                  ion_types[type_nr].loss.getString() + "_" + ion_types[type_nr].charge + "_reg.svm";
728
732
 
729
733
 
730
734
      //------------------------------------------------------------------------------------------
731
735
      //----------------------------------Training of SVR-Model-----------------------------
732
736
      //------------------------------------------------------------------------------------------
733
737
 
734
 
      std::vector<DescriptorSet>training_input_reg;
735
 
      std::vector<double>training_output_reg;
 
738
      std::vector<DescriptorSet> training_input_reg;
 
739
      std::vector<double> training_output_reg;
736
740
      training_input_reg.reserve(training_input[type_nr].size());
737
741
      training_output_reg.reserve(training_output[type_nr].size());
738
742
 
739
 
      for(Size i = 0; i < training_output[type_nr].size(); ++i)
 
743
      for (Size i = 0; i < training_output[type_nr].size(); ++i)
740
744
      {
741
745
        training_input_reg.push_back(training_input[type_nr][i]);
742
 
        training_output_reg.push_back(std::max(0.0,training_output[type_nr][i]));
 
746
        training_output_reg.push_back(std::max(0.0, training_output[type_nr][i]));
743
747
      }
744
748
 
745
 
      if(write_outfiles)
746
 
      {        
 
749
      if (write_outfiles)
 
750
      {
747
751
        writeTrainingFile_(training_input_reg, training_output_reg, String("Training_") + Residue::getResidueTypeName(ion_types[type_nr].residue) + "_" +
748
 
                             ion_types[type_nr].loss.getString() + "_" + ion_types[type_nr].charge + "_reg.dat");
 
752
                           ion_types[type_nr].loss.getString() + "_" + ion_types[type_nr].charge + "_reg.dat");
749
753
      }
750
754
      else
751
755
      {
752
756
        //std::vector<double> predictions_reg(training_input_reg.size(), 0);
753
 
        svm_node ** input_training_reg = new svm_node * [training_input_reg.size()];
 
757
        svm_node ** input_training_reg = new svm_node *[training_input_reg.size()];
754
758
        double * output_training_reg = &training_output_reg[0];
755
 
        for(Size i = 0; i < training_input_reg.size(); ++i)
 
759
        for (Size i = 0; i < training_input_reg.size(); ++i)
756
760
        {
757
761
          input_training_reg[i] = &(training_input_reg[i].descriptors[0]);
758
762
        }
760
764
        svm_problem svm_p_reg;
761
765
        svm_p_reg.l = (int) training_input_reg.size();
762
766
        svm_p_reg.y = output_training_reg;
763
 
        svm_p_reg.x = input_training_reg;        
 
767
        svm_p_reg.x = input_training_reg;
764
768
 
765
 
        if(n_fold > 1)
766
 
        {          
 
769
        if (n_fold > 1)
 
770
        {
767
771
          //perform cross validation
768
 
          std::map< SVMWrapper::SVM_parameter_type, DoubleReal > best_parameters;
 
772
          std::map<SVMWrapper::SVM_parameter_type, DoubleReal> best_parameters;
769
773
          wrap_reg.performCrossValidation(&svm_p_reg,
770
774
                                          SVMData(),
771
775
                                          false,
780
784
                                          );
781
785
 
782
786
          //use best parameters to train final svm on full dataset
783
 
          std::map< SVMWrapper::SVM_parameter_type, DoubleReal >::const_iterator best_param_iter;
784
 
          for(best_param_iter = best_parameters.begin(); best_param_iter != best_parameters.end(); ++best_param_iter)
 
787
          std::map<SVMWrapper::SVM_parameter_type, DoubleReal>::const_iterator best_param_iter;
 
788
          for (best_param_iter = best_parameters.begin(); best_param_iter != best_parameters.end(); ++best_param_iter)
785
789
          {
786
790
            wrap_reg.setParameter(best_param_iter->first, best_param_iter->second);
787
791
          }
791
795
        wrap_reg.train(&svm_p_reg);
792
796
        wrap_reg.saveModel(svm_model_file_reg.c_str());
793
797
 
794
 
        delete [] input_training_reg;
 
798
        delete[] input_training_reg;
795
799
 
796
 
      }//end of else
 
800
      } //end of else
797
801
 
798
802
 
799
803
      //------------------------------------------------------------------------------------------
801
805
      //------------------------------------------------------------------------------------------
802
806
 
803
807
      //make binary
804
 
      for(Size i=0; i<training_output[type_nr].size(); ++i)
 
808
      for (Size i = 0; i < training_output[type_nr].size(); ++i)
805
809
      {
806
 
        training_output[type_nr][i] = training_output[type_nr][i] != -1 ? 1:0 ; //if peak was abundant class 1 else 0
 
810
        training_output[type_nr][i] = training_output[type_nr][i] != -1 ? 1 : 0; //if peak was abundant class 1 else 0
807
811
      }
808
812
 
809
 
      //create balanced set      
810
 
      if(balancing)
 
813
      //create balanced set
 
814
      if (balancing)
811
815
      {
812
 
        std::vector<std::vector<Size> >training_data_by_class(2);
 
816
        std::vector<std::vector<Size> > training_data_by_class(2);
813
817
        std::vector<double>::const_iterator it;
814
 
        std::vector<DescriptorSet>tmp_training;
815
 
        std::vector<double>tmp_output;
816
 
        for(it = training_output[type_nr].begin(); it != training_output[type_nr].end(); ++it)
 
818
        std::vector<DescriptorSet> tmp_training;
 
819
        std::vector<double> tmp_output;
 
820
        for (it = training_output[type_nr].begin(); it != training_output[type_nr].end(); ++it)
817
821
        {
818
 
          training_data_by_class[*it].push_back(it-training_output[type_nr].begin());
 
822
          training_data_by_class[*it].push_back(it - training_output[type_nr].begin());
819
823
        }
820
 
        Size min_size=std::min(training_data_by_class[0].size(),training_data_by_class[1].size());
821
 
        std::random_shuffle(training_data_by_class[0].begin(),training_data_by_class[0].end() );
822
 
        std::random_shuffle(training_data_by_class[1].begin(),training_data_by_class[1].end() );
 
824
        Size min_size = std::min(training_data_by_class[0].size(), training_data_by_class[1].size());
 
825
        std::random_shuffle(training_data_by_class[0].begin(), training_data_by_class[0].end());
 
826
        std::random_shuffle(training_data_by_class[1].begin(), training_data_by_class[1].end());
823
827
 
824
 
        for(Size num = 0; num < min_size; ++num)
 
828
        for (Size num = 0; num < min_size; ++num)
825
829
        {
826
 
          for(Size intens = 0; intens < 2; ++intens)
 
830
          for (Size intens = 0; intens < 2; ++intens)
827
831
          {
828
832
            tmp_training.push_back(training_input[type_nr][training_data_by_class[intens][num]]);
829
833
            tmp_output.push_back(intens);
833
837
        training_output[type_nr] = tmp_output;
834
838
      }
835
839
 
836
 
      if(write_outfiles)
 
840
      if (write_outfiles)
837
841
      {
838
842
        writeTrainingFile_(training_input[type_nr], training_output[type_nr], String("Training_") + Residue::getResidueTypeName(ion_types[type_nr].residue) + "_" +
839
 
                             ion_types[type_nr].loss.getString() + "_" + ion_types[type_nr].charge + "_class.dat");
 
843
                           ion_types[type_nr].loss.getString() + "_" + ion_types[type_nr].charge + "_class.dat");
840
844
      }
841
 
 
842
845
      else
843
 
      {        
 
846
      {
844
847
        double * output_training_class = &training_output[type_nr][0];
845
 
        svm_node ** input_training_class = new svm_node * [training_input[type_nr].size()];
846
 
        for(Size i = 0; i < training_input[type_nr].size(); ++i)
 
848
        svm_node ** input_training_class = new svm_node *[training_input[type_nr].size()];
 
849
        for (Size i = 0; i < training_input[type_nr].size(); ++i)
847
850
        {
848
851
          input_training_class[i] = &(training_input[type_nr][i].descriptors[0]);
849
852
        }
854
857
        svm_p_class.x = input_training_class;
855
858
 
856
859
        //perform cross validation
857
 
        if(n_fold > 1)
 
860
        if (n_fold > 1)
858
861
        {
859
862
          //perform cross validation
860
 
          std::map< SVMWrapper::SVM_parameter_type, DoubleReal > best_parameters;
 
863
          std::map<SVMWrapper::SVM_parameter_type, DoubleReal> best_parameters;
861
864
          wrap_class.performCrossValidation(&svm_p_class,
862
865
                                            SVMData(),
863
866
                                            false,
872
875
                                            );
873
876
 
874
877
          //use best parameters to train final svm on full dataset
875
 
          std::map< SVMWrapper::SVM_parameter_type, DoubleReal >::const_iterator best_param_iter;
876
 
          for(best_param_iter = best_parameters.begin(); best_param_iter != best_parameters.end(); ++best_param_iter)
 
878
          std::map<SVMWrapper::SVM_parameter_type, DoubleReal>::const_iterator best_param_iter;
 
879
          for (best_param_iter = best_parameters.begin(); best_param_iter != best_parameters.end(); ++best_param_iter)
877
880
          {
878
881
            wrap_class.setParameter(best_param_iter->first, best_param_iter->second);
879
882
          }
883
886
        wrap_class.train(&svm_p_class);
884
887
        wrap_class.saveModel(svm_model_file_class);
885
888
 
886
 
        delete [] input_training_class;
 
889
        delete[] input_training_class;
887
890
 
888
891
 
889
892
        //add entries to info file
899
902
        info_outfile.push_back("</SvmModelFileReg>");
900
903
        info_outfile.push_back("</IonType>");
901
904
 
902
 
      }//end of else
 
905
      } //end of else
903
906
 
904
 
    }//End of training SVR and SVC for the primary types
 
907
    } //End of training SVR and SVC for the primary types
905
908
 
906
909
 
907
910
    //If we are only generating the outfiles we can terminare here
908
 
    if(write_outfiles)
 
911
    if (write_outfiles)
909
912
    {
910
913
      return;
911
914
    }
922
925
    info_outfile.push_back("</MaxFeatures>");
923
926
    info_outfile.push_back("<MinFeatures>");
924
927
    info_outfile.insert(info_outfile.end(), min_features.begin(), min_features.end());
925
 
    info_outfile.push_back("</MinFeatures>");    
 
928
    info_outfile.push_back("</MinFeatures>");
926
929
 
927
930
    //------------------------------------------------------------------------------------------
928
931
    //----------------------Training prob. model for secondary types------------------
929
932
    //------------------------------------------------------------------------------------------
930
933
 
931
934
    info_outfile.push_back("<SecondaryTypes>");
932
 
    if(secondary_types)
 
935
    if (secondary_types)
933
936
    {
934
937
      info_outfile.push_back("<IntensityLevels>");
935
938
      info_outfile.push_back(number_of_intensity_levels);
942
945
    info_outfile.store(info_outfile_name);
943
946
  }
944
947
 
945
 
 
946
948
  //like in Cong Zhou paper
947
 
  void SvmTheoreticalSpectrumGeneratorTrainer::normalizeIntensity(PeakSpectrum &S) const
 
949
  void SvmTheoreticalSpectrumGeneratorTrainer::normalizeIntensity(PeakSpectrum & S) const
948
950
  {
949
951
    NLargest n_larg;
950
 
    Param larg_param=n_larg.getParameters();
951
 
    larg_param.setValue("n", (Int)(S.size()*0.8));
 
952
    Param larg_param = n_larg.getParameters();
 
953
    larg_param.setValue("n", (Int)(S.size() * 0.8));
952
954
    n_larg.setParameters(larg_param);
953
955
    n_larg.filterPeakSpectrum(S);
954
956
    S.sortByPosition();
955
957
 
956
958
    Normalizer norm;
957
 
    Param norm_param=norm.getParameters();
 
959
    Param norm_param = norm.getParameters();
958
960
    norm_param.setValue("method", "to_TIC");
959
961
    norm.setParameters(norm_param);
960
962
    norm.filterPeakSpectrum(S);
961
963
 
962
 
    DoubleReal min_intens=  std::numeric_limits<double>::infinity();
 
964
    DoubleReal min_intens =  std::numeric_limits<double>::infinity();
963
965
    DoubleReal max_intens = -1 * std::numeric_limits<double>::infinity();
964
966
 
965
 
    std::vector<DoubleReal>intensities(S.size());
966
 
    for(Size i=0; i<S.size(); ++i)
 
967
    std::vector<DoubleReal> intensities(S.size());
 
968
    for (Size i = 0; i < S.size(); ++i)
967
969
    {
968
970
      //std::cerr<<"before norm: "<<S[i].getIntensity()<<std::endl;
969
 
      if(S[i].getIntensity()>0)
 
971
      if (S[i].getIntensity() > 0)
970
972
      {
971
 
        intensities[i]=log(S[i].getIntensity() * 100);
972
 
        max_intens=std::max(max_intens, intensities[i]);
973
 
        min_intens=std::min(min_intens, intensities[i]);
 
973
        intensities[i] = log(S[i].getIntensity() * 100);
 
974
        max_intens = std::max(max_intens, intensities[i]);
 
975
        min_intens = std::min(min_intens, intensities[i]);
974
976
      }
975
977
    }
976
978
 
977
979
    DoubleReal lower = 0;
978
980
    DoubleReal upper = 1;
979
981
    //normalize intensities to one
980
 
    for(Size i=0; i<S.size(); ++i)
 
982
    for (Size i = 0; i < S.size(); ++i)
981
983
    {
982
 
      if(S[i].getIntensity()>0)
 
984
      if (S[i].getIntensity() > 0)
983
985
      {
984
 
        DoubleReal intens = lower + (upper-lower) *
985
 
                             (intensities[i]-min_intens)/
986
 
                             (max_intens-min_intens);
 
986
        DoubleReal intens = lower + (upper - lower) *
 
987
                            (intensities[i] - min_intens) /
 
988
                            (max_intens - min_intens);
987
989
        S[i].setIntensity(intens);
988
990
      }
989
991
      else
994
996
    }
995
997
  }
996
998
 
997
 
 
998
 
  void SvmTheoreticalSpectrumGeneratorTrainer::trainSecondaryTypes_(TextFile &info_outfile,
 
999
  void SvmTheoreticalSpectrumGeneratorTrainer::trainSecondaryTypes_(TextFile & info_outfile,
999
1000
                                                                    Size number_of_regions,
1000
1001
                                                                    Size number_of_intensity_levels,
1001
 
                                                                    ObservedIntensMap &observed_intensities,
1002
 
                                                                    const std::vector<IonType> &ion_types,
1003
 
                                                                    const std::vector<bool> &is_primary
 
1002
                                                                    ObservedIntensMap & observed_intensities,
 
1003
                                                                    const std::vector<IonType> & ion_types,
 
1004
                                                                    const std::vector<bool> & is_primary
1004
1005
                                                                    )
1005
1006
  {
1006
 
    std::vector<DoubleReal>tmp;
1007
 
    for(Size region = 0; region < number_of_regions; ++ region)
 
1007
    std::vector<DoubleReal> tmp;
 
1008
    for (Size region = 0; region < number_of_regions; ++region)
1008
1009
    {
1009
 
    //we start by binning the intensities. We select the bin boarders such that
1010
 
    //the intensities of the primary ions are equally split
1011
 
      std::vector<DoubleReal> &observed_b=observed_intensities[std::make_pair(IonType(Residue::BIon), region)];
 
1010
      //we start by binning the intensities. We select the bin boarders such that
 
1011
      //the intensities of the primary ions are equally split
 
1012
      std::vector<DoubleReal> & observed_b = observed_intensities[std::make_pair(IonType(Residue::BIon), region)];
1012
1013
      tmp.insert(tmp.end(), observed_b.begin(), observed_b.end());
1013
 
      std::vector<DoubleReal> &observed_y=observed_intensities[std::make_pair(IonType(Residue::YIon), region)];
1014
 
      tmp.insert(tmp.end(),observed_y.begin(), observed_y.end());
 
1014
      std::vector<DoubleReal> & observed_y = observed_intensities[std::make_pair(IonType(Residue::YIon), region)];
 
1015
      tmp.insert(tmp.end(), observed_y.begin(), observed_y.end());
1015
1016
    }
1016
1017
 
1017
 
    std::vector<DoubleReal>bin_boarders(number_of_intensity_levels);
1018
 
    std::vector<DoubleReal>bin_values(number_of_intensity_levels);
 
1018
    std::vector<DoubleReal> bin_boarders(number_of_intensity_levels);
 
1019
    std::vector<DoubleReal> bin_values(number_of_intensity_levels);
1019
1020
    std::sort(tmp.begin(), tmp.end());
1020
1021
 
1021
1022
    //find the first nonzero
1022
 
    Size first_non_zero=0;
1023
 
    while(first_non_zero<tmp.size()&& tmp[first_non_zero]==0)
 
1023
    Size first_non_zero = 0;
 
1024
    while (first_non_zero < tmp.size() && tmp[first_non_zero] == 0)
1024
1025
    {
1025
1026
      ++first_non_zero;
1026
1027
    }
1027
1028
 
1028
 
    Size non_zero_size = tmp.size()-first_non_zero;
 
1029
    Size non_zero_size = tmp.size() - first_non_zero;
1029
1030
    Size prev_index = 0;
1030
 
    for(Size i=1; i<number_of_intensity_levels; ++i)
 
1031
    for (Size i = 1; i < number_of_intensity_levels; ++i)
1031
1032
    {
1032
 
      Size index = i* (DoubleReal) non_zero_size/(number_of_intensity_levels-1) + first_non_zero;
 
1033
      Size index = i * (DoubleReal) non_zero_size / (number_of_intensity_levels - 1) + first_non_zero;
1033
1034
      DoubleReal count = 0;
1034
 
      for(Size j = prev_index; j < index; ++j)
 
1035
      for (Size j = prev_index; j < index; ++j)
1035
1036
      {
1036
1037
        count += tmp[j];
1037
1038
      }
1038
 
      bin_boarders[i-1] = tmp[index-1];
1039
 
      bin_values[i-1] = count/(index-prev_index);
 
1039
      bin_boarders[i - 1] = tmp[index - 1];
 
1040
      bin_values[i - 1] = count / (index - prev_index);
1040
1041
      prev_index = index;
1041
1042
    }
1042
1043
 
1043
1044
    info_outfile.push_back("<IntensityBinBoarders>");
1044
 
    for(Size i = 0; i < number_of_intensity_levels-1; ++i)
 
1045
    for (Size i = 0; i < number_of_intensity_levels - 1; ++i)
1045
1046
    {
1046
1047
      info_outfile.push_back(bin_boarders[i]);
1047
1048
    }
1048
1049
    info_outfile.push_back("</IntensityBinBoarders>");
1049
1050
    info_outfile.push_back("<IntensityBinValues>");
1050
 
    for(Size i = 0; i < number_of_intensity_levels-1; ++i)
 
1051
    for (Size i = 0; i < number_of_intensity_levels - 1; ++i)
1051
1052
    {
1052
1053
      info_outfile.push_back(bin_values[i]);
1053
1054
    }
1054
1055
    info_outfile.push_back("</IntensityBinValues>");
1055
1056
 
1056
1057
    //use the boarder values to bin the entries
1057
 
    for(Size region = 0; region < number_of_regions; ++ region)
 
1058
    for (Size region = 0; region < number_of_regions; ++region)
1058
1059
    {
1059
 
      for(Size i=0; i<ion_types.size(); ++i)
 
1060
      for (Size i = 0; i < ion_types.size(); ++i)
1060
1061
      {
1061
1062
        std::vector<DoubleReal> & intensities = observed_intensities[std::make_pair(ion_types[i], region)];
1062
 
        for(Size j=0; j<intensities.size(); ++j)
 
1063
        for (Size j = 0; j < intensities.size(); ++j)
1063
1064
        {
1064
1065
          DoubleReal intens = intensities[j];
1065
 
          if(intens == 0.0|| intens == -1.0)
 
1066
          if (intens == 0.0 || intens == -1.0)
1066
1067
            continue;
1067
1068
 
1068
 
          Size k=1;
1069
 
          while(k<number_of_intensity_levels-1 && intens>bin_boarders[k-1] )
 
1069
          Size k = 1;
 
1070
          while (k<number_of_intensity_levels - 1 && intens> bin_boarders[k - 1])
1070
1071
            ++k;
1071
1072
 
1072
 
          intensities[j]=k;
 
1073
          intensities[j] = k;
1073
1074
        }
1074
1075
      }
1075
1076
    }
1076
1077
 
1077
 
    std::map<std::pair<IonType,Size>, std::vector<std::vector<DoubleReal> > >joint_counts;
1078
 
    std::map<std::pair<IonType,Size>,  std::vector<DoubleReal> >background_counts;
 
1078
    std::map<std::pair<IonType, Size>, std::vector<std::vector<DoubleReal> > > joint_counts;
 
1079
    std::map<std::pair<IonType, Size>, std::vector<DoubleReal> > background_counts;
1079
1080
 
1080
1081
    //count joint appearences of primary and secondary peaks
1081
 
    for(Size i=0; i<ion_types.size(); ++i)
 
1082
    for (Size i = 0; i < ion_types.size(); ++i)
1082
1083
    {
1083
1084
      const IonType & type = ion_types[i];
1084
 
      if(is_primary[i])
 
1085
      if (is_primary[i])
1085
1086
        continue;
1086
1087
 
1087
 
      IonType primary_type=IonType(Residue::BIon);
 
1088
      IonType primary_type = IonType(Residue::BIon);
1088
1089
 
1089
 
      if(type.residue == Residue::YIon ||
1090
 
         type.residue == Residue::XIon ||
1091
 
         type.residue == Residue::ZIon)
 
1090
      if (type.residue == Residue::YIon ||
 
1091
          type.residue == Residue::XIon ||
 
1092
          type.residue == Residue::ZIon)
1092
1093
      {
1093
 
        primary_type=IonType(Residue::YIon);
 
1094
        primary_type = IonType(Residue::YIon);
1094
1095
      }
1095
 
      for(Size region = 0; region < number_of_regions; ++region)
 
1096
      for (Size region = 0; region < number_of_regions; ++region)
1096
1097
      {
1097
 
        joint_counts[std::make_pair(type, region)].assign(number_of_intensity_levels, std::vector<DoubleReal>(number_of_intensity_levels,1));
 
1098
        joint_counts[std::make_pair(type, region)].assign(number_of_intensity_levels, std::vector<DoubleReal>(number_of_intensity_levels, 1));
1098
1099
        background_counts[std::make_pair(type, region)].assign(number_of_intensity_levels, number_of_intensity_levels);
1099
 
        const std::vector<DoubleReal> &secondary = observed_intensities[std::make_pair(type, region)];
1100
 
        const std::vector<DoubleReal> &primary = observed_intensities[std::make_pair(primary_type, region)];
 
1100
        const std::vector<DoubleReal> & secondary = observed_intensities[std::make_pair(type, region)];
 
1101
        const std::vector<DoubleReal> & primary = observed_intensities[std::make_pair(primary_type, region)];
1101
1102
 
1102
 
        for(Size j=0; j<primary.size(); ++j)
 
1103
        for (Size j = 0; j < primary.size(); ++j)
1103
1104
        {
1104
 
          if(secondary[j]!=-1.0)
 
1105
          if (secondary[j] != -1.0)
1105
1106
          {
1106
1107
            ++joint_counts[std::make_pair(type, region)][(Size)secondary[j]][(Size)primary[j]];
1107
1108
            ++background_counts[std::make_pair(type, region)][(Size)primary[j]];
1111
1112
    }
1112
1113
 
1113
1114
    //compute conditional probabilities and store them in the outfile
1114
 
    for(Size i=0; i<ion_types.size(); ++i)
 
1115
    for (Size i = 0; i < ion_types.size(); ++i)
1115
1116
    {
1116
1117
      const IonType & type = ion_types[i];
1117
 
      if(is_primary[i])
 
1118
      if (is_primary[i])
1118
1119
        continue;
1119
1120
 
1120
1121
      info_outfile.push_back("<IonType>");
1123
1124
      info_outfile.push_back(type.charge);
1124
1125
      info_outfile.push_back("<ConditionalProbabilities>");
1125
1126
 
1126
 
      for(Size region = 0; region < number_of_regions; ++region)
 
1127
      for (Size region = 0; region < number_of_regions; ++region)
1127
1128
      {
1128
 
        info_outfile.push_back("<Region "+ String(region) + ">");
 
1129
        info_outfile.push_back("<Region " + String(region) + ">");
1129
1130
        std::vector<DoubleReal> & back_counts = background_counts[std::make_pair(type, region)];
1130
1131
 
1131
 
        for(Size prim = 0; prim < number_of_intensity_levels; ++prim)
 
1132
        for (Size prim = 0; prim < number_of_intensity_levels; ++prim)
1132
1133
        {
1133
 
          for(Size sec = 0; sec < number_of_intensity_levels; ++sec)
 
1134
          for (Size sec = 0; sec < number_of_intensity_levels; ++sec)
1134
1135
          {
1135
 
            if(back_counts[prim]!=0)
 
1136
            if (back_counts[prim] != 0)
1136
1137
            {
1137
 
              joint_counts[std::make_pair(type, region)][sec][prim] = joint_counts[std::make_pair(type, region)][sec][prim]/back_counts[prim];
 
1138
              joint_counts[std::make_pair(type, region)][sec][prim] = joint_counts[std::make_pair(type, region)][sec][prim] / back_counts[prim];
1138
1139
              //std::cerr<<"conditional prob  "<<type.residue<<" "<<sec<<"  "<<prim<<"  "<<joint_counts[std::make_pair(type, region)][sec][prim]<<std::endl;
1139
1140
            }
1140
1141
            info_outfile.push_back(joint_counts[std::make_pair(type, region)][sec][prim]);
1144
1145
      }
1145
1146
      info_outfile.push_back("</ConditionalProbabilities>");
1146
1147
      info_outfile.push_back("</IonType>");
1147
 
      }
 
1148
    }
1148
1149
  }
1149
1150
 
1150
 
 
1151
 
  void SvmTheoreticalSpectrumGeneratorTrainer::countIntensities_(const PeakSpectrum &spectrum,
1152
 
                                           const AASequence &annotation,
1153
 
                                           IonType type,
1154
 
                                           std::map<std::pair<IonType, Size>, std::vector<DoubleReal> > & observed_intensities,
1155
 
                                           DoubleReal tolerance,
1156
 
                                           Size number_of_regions
1157
 
                                           )
 
1151
  void SvmTheoreticalSpectrumGeneratorTrainer::countIntensities_(const PeakSpectrum & spectrum,
 
1152
                                                                 const AASequence & annotation,
 
1153
                                                                 IonType type,
 
1154
                                                                 std::map<std::pair<IonType, Size>, std::vector<DoubleReal> > & observed_intensities,
 
1155
                                                                 DoubleReal tolerance,
 
1156
                                                                 Size number_of_regions
 
1157
                                                                 )
1158
1158
  {
1159
1159
    Residue::ResidueType residue = type.residue;
1160
1160
    EmpiricalFormula loss = type.loss;
1171
1171
      AASequence prefix = annotation.getPrefix(frag_pos);
1172
1172
      AASequence suffix = annotation.getSuffix(annotation.size() - frag_pos);
1173
1173
 
1174
 
      Size region = std::min(number_of_regions-1, (Size)floor(number_of_regions * prefix.getMonoWeight(Residue::Internal)/annotation.getMonoWeight()));
 
1174
      Size region = std::min(number_of_regions - 1, (Size)floor(number_of_regions * prefix.getMonoWeight(Residue::Internal) / annotation.getMonoWeight()));
1175
1175
 
1176
 
      if (annotation[frag_pos-1].hasNeutralLoss())
 
1176
      if (annotation[frag_pos - 1].hasNeutralLoss())
1177
1177
      {
1178
 
        std::vector<EmpiricalFormula> loss_formulas = annotation[frag_pos-1].getLossFormulas();
 
1178
        std::vector<EmpiricalFormula> loss_formulas = annotation[frag_pos - 1].getLossFormulas();
1179
1179
        for (Size k = 0; k != loss_formulas.size(); ++k)
1180
1180
        {
1181
1181
          possible_n_term_losses.insert(loss_formulas[k].getString());
1183
1183
      }
1184
1184
      //check for possible losses on the c-terminal ions
1185
1185
      possible_c_term_losses.clear();
1186
 
      for(Size pos = frag_pos; pos < annotation.size(); ++pos)
 
1186
      for (Size pos = frag_pos; pos < annotation.size(); ++pos)
1187
1187
      {
1188
1188
        if (annotation[pos].hasNeutralLoss())
1189
1189
        {
1193
1193
            possible_c_term_losses.insert(loss_formulas[k].getString());
1194
1194
          }
1195
1195
        }
1196
 
      }    
 
1196
      }
1197
1197
 
1198
1198
      //N-terminal fragments
1199
1199
      if (residue == Residue::AIon || residue == Residue::BIon || residue == Residue::CIon)
1200
1200
      {
1201
1201
        //if loss is not supported or no loss ions shall be generated -- continue
1202
 
        if(!loss.isEmpty() && (!possible_n_term_losses.count(loss.getString())))
 
1202
        if (!loss.isEmpty() && (!possible_n_term_losses.count(loss.getString())))
1203
1203
        {
1204
1204
          observed_intensities[std::make_pair(type, region)].push_back(-1);
1205
1205
          continue;
1211
1211
      else if (residue == Residue::XIon || residue == Residue::YIon || residue == Residue::ZIon)
1212
1212
      {
1213
1213
        //if loss is not supported or no loss ions shall be generated -- continue
1214
 
        if(!loss.isEmpty() && (!possible_c_term_losses.count(loss.getString())))
 
1214
        if (!loss.isEmpty() && (!possible_c_term_losses.count(loss.getString())))
1215
1215
        {
1216
1216
          observed_intensities[std::make_pair(type, region)].push_back(-1);
1217
1217
          continue;
1233
1233
    }
1234
1234
  }
1235
1235
 
1236
 
 
1237
 
  void SvmTheoreticalSpectrumGeneratorTrainer::writeTrainingFile_(std::vector<DescriptorSet> &training_input, std::vector<DoubleReal> &training_output, String filename)
 
1236
  void SvmTheoreticalSpectrumGeneratorTrainer::writeTrainingFile_(std::vector<DescriptorSet> & training_input, std::vector<DoubleReal> & training_output, String filename)
1238
1237
  {
1239
 
    std::cerr<<"Creating Training File.. "<< filename;
 
1238
    std::cerr << "Creating Training File.. " << filename;
1240
1239
    TextFile file;
1241
 
    for(Size i = 0; i < training_input.size(); ++i)
 
1240
    for (Size i = 0; i < training_input.size(); ++i)
1242
1241
    {
1243
1242
      std::stringstream ss;
1244
 
      ss<<training_output[i]<<" ";
 
1243
      ss << training_output[i] << " ";
1245
1244
      std::vector<svm_node>::iterator it_debug;
1246
 
      for(it_debug=training_input[i].descriptors.begin(); it_debug < training_input[i].descriptors.end() - 1; ++it_debug)
 
1245
      for (it_debug = training_input[i].descriptors.begin(); it_debug < training_input[i].descriptors.end() - 1; ++it_debug)
1247
1246
      {
1248
 
        ss<<" "<< it_debug->index << ":" << it_debug->value;
 
1247
        ss << " " << it_debug->index << ":" << it_debug->value;
1249
1248
      }
1250
 
      file.push_back (ss.str());
 
1249
      file.push_back(ss.str());
1251
1250
    }
1252
1251
    file.store(filename);
1253
 
    std::cerr<<" Done" <<std::endl;
 
1252
    std::cerr << " Done" << std::endl;
1254
1253
  }
1255
1254
 
1256
 
 
1257
 
}//Namespace
1258
 
 
1259
 
 
1260
 
 
1261
 
 
 
1255
} //Namespace