~ubuntu-branches/ubuntu/wily/openms/wily

« back to all changes in this revision

Viewing changes to source/APPLICATIONS/TOPP/SpecLibSearcher.C

  • Committer: Package Import Robot
  • Author(s): Filippo Rusconi
  • Date: 2013-12-20 11:30:16 UTC
  • mfrom: (5.1.2 sid)
  • Revision ID: package-import@ubuntu.com-20131220113016-wre5g9bteeheq6he
Tags: 1.11.1-3
* remove version number from libbost development package names;
* ensure that AUTHORS is correctly shipped in all packages.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
// -*- mode: C++; tab-width: 2; -*-
2
 
// vi: set ts=2:
3
 
//
4
 
// --------------------------------------------------------------------------
5
 
//                   OpenMS Mass Spectrometry Framework
6
 
// --------------------------------------------------------------------------
7
 
//  Copyright (C) 2003-2011 -- Oliver Kohlbacher, Knut Reinert
8
 
//
9
 
//  This library is free software; you can redistribute it and/or
10
 
//  modify it under the terms of the GNU Lesser General Public
11
 
//  License as published by the Free Software Foundation; either
12
 
//  version 2.1 of the License, or (at your option) any later version.
13
 
//
14
 
//  This library is distributed in the hope that it will be useful,
15
 
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 
//  Lesser General Public License for more details.
18
 
//
19
 
//  You should have received a copy of the GNU Lesser General Public
20
 
//  License along with this library; if not, write to the Free Software
21
 
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
1
// --------------------------------------------------------------------------
 
2
//                   OpenMS -- Open-Source Mass Spectrometry
 
3
// --------------------------------------------------------------------------
 
4
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
 
5
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
 
6
//
 
7
// This software is released under a three-clause BSD license:
 
8
//  * Redistributions of source code must retain the above copyright
 
9
//    notice, this list of conditions and the following disclaimer.
 
10
//  * Redistributions in binary form must reproduce the above copyright
 
11
//    notice, this list of conditions and the following disclaimer in the
 
12
//    documentation and/or other materials provided with the distribution.
 
13
//  * Neither the name of any author or any participating institution
 
14
//    may be used to endorse or promote products derived from this software
 
15
//    without specific prior written permission.
 
16
// For a full list of authors, refer to the file AUTHORS.
 
17
// --------------------------------------------------------------------------
 
18
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 
19
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 
20
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 
21
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
 
22
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 
23
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 
24
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 
25
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 
26
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 
27
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 
28
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22
29
//
23
30
// --------------------------------------------------------------------------
24
31
// $Maintainer: David Wojnar $
55
62
  @brief Identifies peptide MS/MS spectra by spectral matching with a searchable spectral library.
56
63
 
57
64
<CENTER>
58
 
        <table>
59
 
                <tr>
60
 
                        <td ALIGN = "center" BGCOLOR="#EBEBEB"> pot. predecessor tools </td>
61
 
                        <td VALIGN="middle" ROWSPAN=2> \f$ \longrightarrow \f$ SpecLibSearcher \f$ \longrightarrow \f$</td>
62
 
                        <td ALIGN = "center" BGCOLOR="#EBEBEB"> pot. successor tools </td>
63
 
                </tr>
64
 
                <tr>
65
 
                        <td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref UTILS_SpecLibCreator </td>
66
 
                        <td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref TOPP_IDFilter or @n any protein/peptide processing tool</td>
67
 
                </tr>
68
 
        </table>
 
65
    <table>
 
66
        <tr>
 
67
            <td ALIGN = "center" BGCOLOR="#EBEBEB"> pot. predecessor tools </td>
 
68
            <td VALIGN="middle" ROWSPAN=2> \f$ \longrightarrow \f$ SpecLibSearcher \f$ \longrightarrow \f$</td>
 
69
            <td ALIGN = "center" BGCOLOR="#EBEBEB"> pot. successor tools </td>
 
70
        </tr>
 
71
        <tr>
 
72
            <td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref UTILS_SpecLibCreator </td>
 
73
            <td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref TOPP_IDFilter or @n any protein/peptide processing tool</td>
 
74
        </tr>
 
75
    </table>
69
76
</CENTER>
70
77
 
71
 
        @experimental This TOPP-tool is not well tested and not all features might be properly implemented and tested.
 
78
    @experimental This TOPP-tool is not well tested and not all features might be properly implemented and tested.
72
79
 
73
 
        <B>The command line parameters of this tool are:</B>
74
 
        @verbinclude TOPP_SpecLibSearcher.cli
 
80
    <B>The command line parameters of this tool are:</B>
 
81
    @verbinclude TOPP_SpecLibSearcher.cli
 
82
    <B>INI file documentation of this tool:</B>
 
83
    @htmlinclude TOPP_SpecLibSearcher.html
75
84
*/
76
85
 
77
86
// We do not want this class to show up in the docu:
78
87
/// @cond TOPPCLASSES
79
88
 
80
 
class TOPPSpecLibSearcher
81
 
        : public TOPPBase
82
 
        {
83
 
        public:
84
 
                TOPPSpecLibSearcher()
85
 
                : TOPPBase("SpecLibSearcher","Identifies peptide MS/MS spectra by spectral matching with a searchable spectral library.")
86
 
                {
87
 
                }
88
 
 
89
 
        protected:
90
 
                void registerOptionsAndFlags_()
91
 
                {
92
 
                        registerInputFileList_("in","<files>",StringList::create(""),"Input files");
93
 
      setValidFormats_("in",StringList::create("mzML"));
94
 
                        registerInputFile_("lib","<file>","","searchable spectral library(MSP format)");
95
 
                        registerOutputFileList_("out","<files>",StringList::create(""),"Output files. Have to be as many as input files");
96
 
                        setValidFormats_("out",StringList::create("idXML"));
97
 
                        registerDoubleOption_("precursor_mass_tolerance","<tolerance>",3,"Precursor mass tolerance, (Th)",false);
98
 
                        registerIntOption_("round_precursor_to_integer","<number>",10,"many precursor m/z multipling number lead to the same number; are packed in the same vector for faster search.Should be higher for high-resolution data",false,true);
99
 
                //      registerDoubleOption_("fragment_mass_tolerance","<tolerance>",0.3,"Fragment mass error",false);
100
 
 
101
 
   //   registerStringOption_("precursor_error_units", "<unit>", "Da", "parent monoisotopic mass error units", false);
102
 
     // registerStringOption_("fragment_error_units", "<unit>", "Da", "fragment monoisotopic mass error units", false);
103
 
     // vector<String> valid_strings;
104
 
     // valid_strings.push_back("Da");
105
 
     // setValidStrings_("precursor_error_units", valid_strings);
106
 
     // setValidStrings_("fragment_error_units", valid_strings);
107
 
                //      registerIntOption_("min_precursor_charge", "<charge>", 1, "minimum precursor ion charge", false);
108
 
     // registerIntOption_("max_precursor_charge", "<charge>", 3, "maximum precursor ion charge", false);
109
 
     registerStringOption_("compare_function","<string>","ZhangSimilarityScore","function for similarity comparisson",false);
110
 
     PeakSpectrumCompareFunctor::registerChildren();
111
 
     setValidStrings_("compare_function",Factory<PeakSpectrumCompareFunctor>::registeredProducts());
112
 
                 registerIntOption_("top_hits","<number>",10,"save the first <number> top hits. For all type -1",false);
113
 
     addEmptyLine_();
114
 
     addText_("Filtering options. Most are especially useful when the query spectra are raw.");
115
 
     registerIntOption_("min_peaks","<number>",5, "required mininum number of peaks for a query spectrum",false);
116
 
     registerDoubleOption_("remove_peaks_below_threshold","<threshold>",2.01,"All peaks of a query spectrum with intensities below <threshold> will be zeroed.",false);
117
 
     registerIntOption_("max_peaks","<number>",150,"Use only the top <number> of peaks.",false);
118
 
     registerIntOption_("cut_peaks_below","<number>",1000,"Remove all peaks which are lower than 1/<number> of the highest peaks. Default equals all peaks which are lower than 0.001 of the maximum intensity peak",false);
119
 
 
120
 
                        vector<String> all_mods;
121
 
                        ModificationsDB::getInstance()->getAllSearchModifications(all_mods);
122
 
        registerStringList_("fixed_modifications", "<mods>", StringList::create(""), "fixed modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'", false);
123
 
                        setValidStrings_("fixed_modifications", all_mods);
124
 
 
125
 
                        registerStringList_("variable_modifications", "<mods>", StringList::create(""), "variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'", false);
126
 
                        setValidStrings_("variable_modifications", all_mods);
127
 
                        addEmptyLine_();
128
 
                        addText_("");
129
 
                }
130
 
 
131
 
                ExitCodes main_(int , const char**)
132
 
                {
133
 
                        //-------------------------------------------------------------
134
 
                        // parameter handling
135
 
                        //-------------------------------------------------------------
136
 
 
137
 
                        StringList in_spec = getStringList_("in");
138
 
                        StringList out = getStringList_("out");
139
 
                        String in_lib = getStringOption_("lib");
140
 
                        String compare_function = getStringOption_("compare_function");
141
 
                        Int precursor_mass_multiplier = getIntOption_("round_precursor_to_integer");
142
 
                        Real precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance");
143
 
                        //Int min_precursor_charge = getIntOption_("min_precursor_charge");
144
 
                        //Int max_precursor_charge = getIntOption_("max_precursor_charge");
145
 
                        Real remove_peaks_below_threshold = getDoubleOption_("remove_peaks_below_threshold");
146
 
                        UInt min_peaks = getIntOption_("min_peaks");
147
 
                        UInt max_peaks= getIntOption_("max_peaks");
148
 
                        Int cut_peaks_below = getIntOption_("cut_peaks_below");
149
 
                        StringList fixed_modifications = getStringList_("fixed_modifications");
150
 
                        StringList variable_modifications = getStringList_("variable_modifications");
151
 
                        Int top_hits  = getIntOption_("top_hits");
152
 
                        if(top_hits < -1 )
153
 
                        {
154
 
                                writeLog_("top_hits (should be  >= -1 )");
155
 
                                return ILLEGAL_PARAMETERS;
156
 
                        }
157
 
                        //-------------------------------------------------------------
158
 
                        // loading input
159
 
                        //-------------------------------------------------------------
160
 
                        if(out.size() != in_spec.size())
161
 
                        {
162
 
                                writeLog_("out (should be as many as input files)");
163
 
                                return ILLEGAL_PARAMETERS;
164
 
                        }
165
 
 
166
 
                        time_t prog_time = time(NULL);
167
 
                        MSPFile spectral_library;
168
 
                        RichPeakMap query, library;
169
 
                        //spectrum which will be identified
170
 
      MzMLFile spectra;
171
 
                        spectra.setLogType(log_type_);
172
 
 
173
 
                        time_t start_build_time = time(NULL);
174
 
                        //-------------------------------------------------------------
175
 
                        //building map for faster search
176
 
                        //-------------------------------------------------------------
177
 
 
178
 
                        //library containing already identified peptide spectra
179
 
                        vector< PeptideIdentification > ids;
180
 
                        spectral_library.load(in_lib,ids,library);
181
 
 
182
 
                        map<Size, vector<PeakSpectrum> > MSLibrary;
183
 
                        {
184
 
                        RichPeakMap::iterator s;
185
 
                        vector<PeptideIdentification>::iterator i;
186
 
                        ModificationsDB* mdb = ModificationsDB::getInstance();
187
 
                        for(s = library.begin(), i = ids.begin() ; s < library.end();++s,++i)
188
 
                        {
189
 
                                DoubleReal precursor_MZ = (*s).getPrecursors()[0].getMZ();
190
 
                                Size MZ_multi = (Size)precursor_MZ*precursor_mass_multiplier;
191
 
                                map<Size,vector<PeakSpectrum> >::iterator found;
192
 
                                found = MSLibrary.find(MZ_multi);
193
 
 
194
 
                                PeakSpectrum librar;
195
 
                                bool variable_modifications_ok = true;
196
 
                                bool fixed_modifications_ok = true;
197
 
                                const AASequence& aaseq= i->getHits()[0].getSequence();
198
 
                                //variable fixed modifications
199
 
                                if(!fixed_modifications.empty())
200
 
                                {
201
 
                                        for(Size i = 0; i< aaseq.size(); ++i)
202
 
                                        {
203
 
                                                const   Residue& mod  = aaseq.getResidue(i);
204
 
                                                for(Size s = 0; s < fixed_modifications.size();++s)
205
 
                                                {
206
 
                                                 if(mod.getOneLetterCode() ==mdb->getModification(fixed_modifications[s]).getOrigin() && fixed_modifications[s] != mod.getModification())
207
 
                                                 {
208
 
                                                         fixed_modifications_ok = false;
209
 
                                                                break;
210
 
                                                        }
211
 
                                                }
212
 
                                        }
213
 
                                }
214
 
                                //variable modifications
215
 
                                if(aaseq.isModified() && (!variable_modifications.empty()))
216
 
                                {
217
 
                                        for(Size i = 0; i< aaseq.size(); ++i)
218
 
                                        {
219
 
                                                if(aaseq.isModified(i))
220
 
                                                {
221
 
                                                        const   Residue& mod  = aaseq.getResidue(i);
222
 
                                                        for(Size s = 0; s < variable_modifications.size();++s)
223
 
                                                        {
224
 
                                                                if(mod.getOneLetterCode() ==mdb->getModification(variable_modifications[s]).getOrigin() && variable_modifications[s] != mod.getModification())
225
 
                                                                {
226
 
                                                                        variable_modifications_ok = false;
227
 
                                                                        break;
228
 
                                                                }
229
 
                                                        }
230
 
                                                }
231
 
                                        }
232
 
                                }
233
 
                                if(variable_modifications_ok && fixed_modifications_ok)
234
 
                                {
235
 
                                        PeptideIdentification& translocate_pid = *i;
236
 
                                        librar.getPeptideIdentifications().push_back(translocate_pid);
237
 
                                        librar.setPrecursors(s->getPrecursors());
238
 
                                        //library entry transformation
239
 
                                        for(UInt l = 0; l< s->size(); ++l)
240
 
                                        {
241
 
                                                Peak1D peak;
242
 
                                                if((*s)[l].getIntensity() >  remove_peaks_below_threshold)
243
 
                                                {
244
 
                                                        const String& info = (*s)[l].getMetaValue("MSPPeakInfo");
245
 
                                                        if(info[0] == '?')
246
 
                                                        {
247
 
                                                                peak.setIntensity(sqrt(0.2*(*s)[l].getIntensity()));
248
 
                                                        }
249
 
                                                        else
250
 
                                                        {
251
 
                                                                peak.setIntensity(sqrt((*s)[l].getIntensity()));
252
 
                                                        }
253
 
 
254
 
                                                        peak.setMZ((*s)[l].getMZ());
255
 
                                                        peak.setPosition((*s)[l].getPosition());
256
 
                                                        librar.push_back(peak);
257
 
                                                }
258
 
                                        }
259
 
                                        if(found != MSLibrary.end())
260
 
                                        {
261
 
                                                found->second.push_back(librar);
262
 
                                        }
263
 
                                        else
264
 
                                        {
265
 
                                                vector<PeakSpectrum> tmp;
266
 
                                                tmp.push_back(librar);
267
 
                                                MSLibrary.insert(make_pair(MZ_multi,tmp));
268
 
                                        }
269
 
                                }
270
 
                        }
271
 
                        }
272
 
                        time_t end_build_time = time(NULL);
273
 
                  cout<<"Time needed for preprocessing data: "<<(end_build_time - start_build_time)<<"\n";
274
 
                        //compare function
275
 
                        PeakSpectrumCompareFunctor* comparor = Factory<PeakSpectrumCompareFunctor>::create(compare_function);
276
 
                        //-------------------------------------------------------------
277
 
                        // calculations
278
 
                        //-------------------------------------------------------------
279
 
                        DoubleReal score;
280
 
                        StringList::iterator in, out_file;
281
 
                        for(in  = in_spec.begin() , out_file  = out.begin(); in < in_spec.end(); ++in,++out_file)
282
 
                        {
283
 
                                time_t start_time = time(NULL);
284
 
                                spectra.load(*in,query);
285
 
                                //Will hold valuable hits
286
 
                                vector<PeptideIdentification> peptide_ids;
287
 
                                vector<ProteinIdentification> protein_ids;
288
 
                                // Write parameters to ProteinIdentifcation
289
 
                                ProteinIdentification prot_id;
290
 
                                //Parameters of identificaion
291
 
                                prot_id.setIdentifier("test");
292
 
                                prot_id.setSearchEngineVersion("SpecLibSearcher");
293
 
                                prot_id.setDateTime(DateTime::now());
294
 
                                prot_id.setScoreType(compare_function);
295
 
                                ProteinIdentification::SearchParameters searchparam;
296
 
                                searchparam.precursor_tolerance = precursor_mass_tolerance;
297
 
                                prot_id.setSearchParameters(searchparam);
298
 
                                /***********SEARCH**********/
299
 
                                for(UInt j = 0; j < query.size(); ++j)
300
 
                                {
301
 
                                        //Set identifier for each identifications
302
 
                                        PeptideIdentification pid;
303
 
                                        pid.setIdentifier("test");
304
 
                                        pid.setScoreType(compare_function);
305
 
                                        ProteinHit pr_hit;
306
 
                                        pr_hit.setAccession(j);
307
 
                                        prot_id.insertHit(pr_hit);
308
 
                                        //RichPeak1D to Peak1D transformation for the compare function query
309
 
                                        PeakSpectrum quer;
310
 
                                        bool peak_ok = true;
311
 
                                        query[j].sortByIntensity(true);
312
 
                                        DoubleReal min_high_intensity = 0;
313
 
 
314
 
          if(query[j].empty() || query[j].getMSLevel()!=2 )
315
 
          {
316
 
            continue;
317
 
          }
318
 
          if(query[j].getPrecursors().empty())
319
 
          {
320
 
            writeLog_("Warning MS2 spectrum without precursor information");
321
 
            continue;
322
 
          }
323
 
 
324
 
          min_high_intensity = (1/cut_peaks_below)*query[j][0].getIntensity();
325
 
 
326
 
                                        query[j].sortByPosition();
327
 
                                        for(UInt k = 0; k < query[j].size() && k < max_peaks; ++k)
328
 
                                        {
329
 
                                                if(query[j][k].getIntensity() >  remove_peaks_below_threshold && query[j][k].getIntensity() >= min_high_intensity )
330
 
                                                {
331
 
                                                        Peak1D peak;
332
 
                                                        peak.setIntensity(sqrt(query[j][k].getIntensity()));
333
 
                                                        peak.setMZ(query[j][k].getMZ());
334
 
                                                        peak.setPosition(query[j][k].getPosition());
335
 
                                                        quer.push_back(peak);
336
 
                                                }
337
 
                                        }
338
 
                                        if(quer.size() >= min_peaks)
339
 
                                        {
340
 
                                                peak_ok = true;
341
 
                                        }
342
 
                                        else
343
 
                                        {
344
 
                                                peak_ok = false;
345
 
                                        }          
346
 
                                        DoubleReal query_MZ = query[j].getPrecursors()[0].getMZ();
347
 
                                        if(peak_ok)
348
 
                                        {
349
 
                                                bool charge_one = false;
350
 
                                                Int percent = (Int)Math::round((query[j].size()/100.0) * 3.0);
351
 
                                                Int margin  = (Int)Math::round((query[j].size()/100.0)* 1.0);
352
 
                                                for(vector<RichPeak1D>::iterator peak = query[j].end()-1; percent >= 0; --peak , --percent)
353
 
                                                {
354
 
                                                        if(peak->getMZ() < query_MZ)
355
 
                                                        {
356
 
                                                                break;
357
 
                                                        }
358
 
                                                }
359
 
                                                if(percent > margin)
360
 
                                                {
361
 
                                                        charge_one = true;
362
 
                                                }
363
 
                                                Real min_MZ = (query_MZ - precursor_mass_tolerance) *precursor_mass_multiplier;
364
 
                                                Real max_MZ = (query_MZ + precursor_mass_tolerance) *precursor_mass_multiplier;
365
 
                                                for(Size mz = (Size)min_MZ; mz <= ((Size)max_MZ)+1; ++mz)
366
 
                                                {
367
 
                                                        map<Size, vector<PeakSpectrum> >::iterator found;
368
 
                                                                found = MSLibrary.find(mz);
369
 
                                                                if(found != MSLibrary.end())
370
 
                                                                {
371
 
                                                                        vector<PeakSpectrum>& library = found->second;
372
 
                                                                        for(Size i = 0; i < library.size(); ++i)
373
 
                                                                        {
374
 
                                                                                Real this_MZ  = library[i].getPrecursors()[0].getMZ()*precursor_mass_multiplier;
375
 
                                                                                if(this_MZ >= min_MZ && max_MZ >= this_MZ && ((charge_one == true && library[i].getPeptideIdentifications()[0].getHits()[0].getCharge() == 1) ||charge_one == false ))
376
 
                                                                                {
377
 
                                                                                        PeptideHit hit = library[i].getPeptideIdentifications()[0].getHits()[0];
378
 
                                                                                        PeakSpectrum& librar = library[i];
379
 
                                                                                        //Special treatment for SpectraST score as it computes a score based on the whole library
380
 
                                                                                        if(compare_function == "SpectraSTSimilarityScore")
381
 
                                                                                        {
382
 
                                                                                                SpectraSTSimilarityScore* sp= static_cast<SpectraSTSimilarityScore*>(comparor);
383
 
                                                                                                BinnedSpectrum quer_bin = sp->transform(quer);
384
 
                                                                                                BinnedSpectrum librar_bin = sp->transform(librar);
385
 
                                                                                                score = (*sp)(quer,librar);//(*sp)(quer_bin,librar_bin);
386
 
                                                                                                double dot_bias = sp->dot_bias(quer_bin,librar_bin,score);
387
 
                                                                                                hit.setMetaValue("DOTBIAS",dot_bias);
388
 
                                                                                        }
389
 
                                                                                        else
390
 
                                                                                        {
391
 
                                                                                                if(compare_function =="CompareFouriertransform")
392
 
                                                                                                {
393
 
                                                                                                        CompareFouriertransform* ft = static_cast<CompareFouriertransform*>(comparor);
394
 
                                                                                                        ft->transform(quer);
395
 
                                                                                                        ft->transform(librar);
396
 
                                                                                                }
397
 
                                                                                                score = (*comparor)(quer,librar);
398
 
                                                                                        }
399
 
 
400
 
                                                                                        DataValue RT(library[i].getRT());
401
 
                                                                                        DataValue MZ(library[i].getPrecursors()[0].getMZ());
402
 
                                                                                        hit.setMetaValue("RT",RT);
403
 
                                                                                        hit.setMetaValue("MZ",MZ);
404
 
                                                                                        hit.setScore(score);
405
 
                                                                                        hit.addProteinAccession(pr_hit.getAccession());
406
 
                                                                                        pid.insertHit(hit);
407
 
                                                                                }
408
 
                                                                        }
409
 
                                                                }
410
 
                                                        }
411
 
                                                }
412
 
                                                pid.setHigherScoreBetter(true);
413
 
                                                pid.sort();
414
 
                                                if(compare_function =="SpectraSTSimilarityScore")
415
 
                                                {
416
 
                                                        if(!pid.empty() && !pid.getHits().empty())
417
 
                                                        {
418
 
                                                                vector<PeptideHit> final_hits;
419
 
                                                                final_hits.resize(pid.getHits().size());
420
 
                                                                SpectraSTSimilarityScore* sp= static_cast<SpectraSTSimilarityScore*>(comparor);
421
 
                                                                Size runner_up = 1;
422
 
                                                                for( ; runner_up < pid.getHits().size() ; ++runner_up)
423
 
                                                                {
424
 
                                                                        if(pid.getHits()[0].getSequence().toUnmodifiedString() != pid.getHits()[runner_up].getSequence().toUnmodifiedString()|| runner_up > 5)
425
 
                                                                        {
426
 
                                                                                break;
427
 
                                                                        }
428
 
                                                                }
429
 
                                                                double delta_D = sp->delta_D(pid.getHits()[0].getScore(), pid.getHits()[runner_up].getScore());
430
 
                                                                for(Size s = 0; s < pid.getHits().size();++s)
431
 
                                                                {
432
 
                                                                        final_hits[s] = pid.getHits()[s];
433
 
                                                                        final_hits[s].setMetaValue("delta D",delta_D);
434
 
                                                                        final_hits[s].setMetaValue("dot product",pid.getHits()[s].getScore());
435
 
                                                                        final_hits[s].setScore(sp->compute_F(pid.getHits()[s].getScore(),delta_D,pid.getHits()[s].getMetaValue("DOTBIAS")));
436
 
 
437
 
                                                                        //final_hits[s].removeMetaValue("DOTBIAS");
438
 
                                                        }
439
 
                                                        pid.setHits(final_hits);
440
 
                                                        pid.sort();
441
 
                                                        pid.setMetaValue("MZ",query[j].getPrecursors()[0].getMZ());
442
 
                                                        pid.setMetaValue("RT",query_MZ);
443
 
                                                }
444
 
                                        }
445
 
                                        if(top_hits != -1 && (UInt)top_hits < pid.getHits().size())
446
 
                                        {
447
 
                                                vector<PeptideHit> hits;
448
 
                                                hits.resize(top_hits);
449
 
                                                for(Size i = 0; i < (UInt)top_hits; ++i)
450
 
                                                {
451
 
                                                        hits[i] = pid.getHits()[i];
452
 
                                                }
453
 
                                                pid.setHits(hits);
454
 
                                        }
455
 
                                        peptide_ids.push_back(pid);
456
 
                                }
457
 
                                protein_ids.push_back(prot_id);
458
 
                                //-------------------------------------------------------------
459
 
                                // writing output
460
 
                                //-------------------------------------------------------------
461
 
                                IdXMLFile id_xml_file;
462
 
                                id_xml_file.store(*out_file,protein_ids,peptide_ids);
463
 
                                time_t end_time = time(NULL);
464
 
                                cout<<"Search time: "<<difftime(end_time,start_time)<<" seconds for "<<*in<<"\n";
465
 
                        }
466
 
                        time_t end_time = time(NULL);
467
 
                        cout<<"Total time: "<<difftime(end_time,prog_time)<<" secconds\n";
468
 
                        return EXECUTION_OK;
469
 
                }
470
 
 
471
 
        };
472
 
 
473
 
 
474
 
 
475
 
 
476
 
int main( int argc, const char** argv )
477
 
{
478
 
    TOPPSpecLibSearcher tool;
479
 
    return tool.main(argc,argv);
 
89
class TOPPSpecLibSearcher :
 
90
  public TOPPBase
 
91
{
 
92
public:
 
93
  TOPPSpecLibSearcher() :
 
94
    TOPPBase("SpecLibSearcher", "Identifies peptide MS/MS spectra by spectral matching with a searchable spectral library.")
 
95
  {
 
96
  }
 
97
 
 
98
protected:
 
99
  void registerOptionsAndFlags_()
 
100
  {
 
101
    registerInputFileList_("in", "<files>", StringList::create(""), "Input files");
 
102
    setValidFormats_("in", StringList::create("mzML"));
 
103
    registerInputFile_("lib", "<file>", "", "searchable spectral library (MSP format)");
 
104
    setValidFormats_("lib", StringList::create("msp"));
 
105
    registerOutputFileList_("out", "<files>", StringList::create(""), "Output files. Have to be as many as input files");
 
106
    setValidFormats_("out", StringList::create("idXML"));
 
107
    registerDoubleOption_("precursor_mass_tolerance", "<tolerance>", 3, "Precursor mass tolerance, (Th)", false);
 
108
    registerIntOption_("round_precursor_to_integer", "<number>", 10, "many precursor m/z multipling number lead to the same number; are packed in the same vector for faster search.Should be higher for high-resolution data", false, true);
 
109
    // registerDoubleOption_("fragment_mass_tolerance","<tolerance>",0.3,"Fragment mass error",false);
 
110
 
 
111
    // registerStringOption_("precursor_error_units", "<unit>", "Da", "parent monoisotopic mass error units", false);
 
112
    // registerStringOption_("fragment_error_units", "<unit>", "Da", "fragment monoisotopic mass error units", false);
 
113
    // vector<String> valid_strings;
 
114
    // valid_strings.push_back("Da");
 
115
    // setValidStrings_("precursor_error_units", valid_strings);
 
116
    // setValidStrings_("fragment_error_units", valid_strings);
 
117
    // registerIntOption_("min_precursor_charge", "<charge>", 1, "minimum precursor ion charge", false);
 
118
    // registerIntOption_("max_precursor_charge", "<charge>", 3, "maximum precursor ion charge", false);
 
119
    registerStringOption_("compare_function", "<string>", "ZhangSimilarityScore", "function for similarity comparisson", false);
 
120
    PeakSpectrumCompareFunctor::registerChildren();
 
121
    setValidStrings_("compare_function", Factory<PeakSpectrumCompareFunctor>::registeredProducts());
 
122
    registerIntOption_("top_hits", "<number>", 10, "save the first <number> top hits. For all type -1", false);
 
123
 
 
124
    addEmptyLine_();
 
125
    registerTOPPSubsection_("filter", "Filtering options. Most are especially useful when the query spectra are raw.");
 
126
    registerDoubleOption_("filter:remove_peaks_below_threshold", "<threshold>", 2.01, "All peaks of a query spectrum with intensities below <threshold> will be zeroed.", false);
 
127
    registerIntOption_("filter:min_peaks", "<number>", 5, "required mininum number of peaks for a query spectrum", false);
 
128
    registerIntOption_("filter:max_peaks", "<number>", 150, "Use only the top <number> of peaks.", false);
 
129
    registerIntOption_("filter:cut_peaks_below", "<number>", 1000, "Remove all peaks which are lower than 1/<number> of the highest peaks. Default equals all peaks which are lower than 0.001 of the maximum intensity peak", false);
 
130
 
 
131
    vector<String> all_mods;
 
132
    ModificationsDB::getInstance()->getAllSearchModifications(all_mods);
 
133
    registerStringList_("fixed_modifications", "<mods>", StringList::create(""), "fixed modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'", false);
 
134
    setValidStrings_("fixed_modifications", all_mods);
 
135
 
 
136
    registerStringList_("variable_modifications", "<mods>", StringList::create(""), "variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'", false);
 
137
    setValidStrings_("variable_modifications", all_mods);
 
138
    addEmptyLine_();
 
139
  }
 
140
 
 
141
  ExitCodes main_(int, const char **)
 
142
  {
 
143
    //-------------------------------------------------------------
 
144
    // parameter handling
 
145
    //-------------------------------------------------------------
 
146
 
 
147
    StringList in_spec = getStringList_("in");
 
148
    StringList out = getStringList_("out");
 
149
    String in_lib = getStringOption_("lib");
 
150
    String compare_function = getStringOption_("compare_function");
 
151
    Int precursor_mass_multiplier = getIntOption_("round_precursor_to_integer");
 
152
    Real precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance");
 
153
    //Int min_precursor_charge = getIntOption_("min_precursor_charge");
 
154
    //Int max_precursor_charge = getIntOption_("max_precursor_charge");
 
155
    Real remove_peaks_below_threshold = getDoubleOption_("filter:remove_peaks_below_threshold");
 
156
    UInt min_peaks = getIntOption_("filter:min_peaks");
 
157
    UInt max_peaks = getIntOption_("filter:max_peaks");
 
158
    Int cut_peaks_below = getIntOption_("filter:cut_peaks_below");
 
159
    StringList fixed_modifications = getStringList_("fixed_modifications");
 
160
    StringList variable_modifications = getStringList_("variable_modifications");
 
161
    Int top_hits  = getIntOption_("top_hits");
 
162
    if (top_hits < -1)
 
163
    {
 
164
      writeLog_("top_hits (should be  >= -1 )");
 
165
      return ILLEGAL_PARAMETERS;
 
166
    }
 
167
    //-------------------------------------------------------------
 
168
    // loading input
 
169
    //-------------------------------------------------------------
 
170
    if (out.size() != in_spec.size())
 
171
    {
 
172
      writeLog_("out (should be as many as input files)");
 
173
      return ILLEGAL_PARAMETERS;
 
174
    }
 
175
 
 
176
    time_t prog_time = time(NULL);
 
177
    MSPFile spectral_library;
 
178
    RichPeakMap query, library;
 
179
    //spectrum which will be identified
 
180
    MzMLFile spectra;
 
181
    spectra.setLogType(log_type_);
 
182
 
 
183
    time_t start_build_time = time(NULL);
 
184
    //-------------------------------------------------------------
 
185
    //building map for faster search
 
186
    //-------------------------------------------------------------
 
187
 
 
188
    //library containing already identified peptide spectra
 
189
    vector<PeptideIdentification> ids;
 
190
    spectral_library.load(in_lib, ids, library);
 
191
 
 
192
    map<Size, vector<PeakSpectrum> > MSLibrary;
 
193
    {
 
194
      RichPeakMap::iterator s;
 
195
      vector<PeptideIdentification>::iterator i;
 
196
      ModificationsDB * mdb = ModificationsDB::getInstance();
 
197
      for (s = library.begin(), i = ids.begin(); s < library.end(); ++s, ++i)
 
198
      {
 
199
        DoubleReal precursor_MZ = (*s).getPrecursors()[0].getMZ();
 
200
        Size MZ_multi = (Size)precursor_MZ * precursor_mass_multiplier;
 
201
        map<Size, vector<PeakSpectrum> >::iterator found;
 
202
        found = MSLibrary.find(MZ_multi);
 
203
 
 
204
        PeakSpectrum librar;
 
205
        bool variable_modifications_ok = true;
 
206
        bool fixed_modifications_ok = true;
 
207
        const AASequence & aaseq = i->getHits()[0].getSequence();
 
208
        //variable fixed modifications
 
209
        if (!fixed_modifications.empty())
 
210
        {
 
211
          for (Size i = 0; i < aaseq.size(); ++i)
 
212
          {
 
213
            const   Residue & mod  = aaseq.getResidue(i);
 
214
            for (Size s = 0; s < fixed_modifications.size(); ++s)
 
215
            {
 
216
              if (mod.getOneLetterCode() == mdb->getModification(fixed_modifications[s]).getOrigin() && fixed_modifications[s] != mod.getModification())
 
217
              {
 
218
                fixed_modifications_ok = false;
 
219
                break;
 
220
              }
 
221
            }
 
222
          }
 
223
        }
 
224
        //variable modifications
 
225
        if (aaseq.isModified() && (!variable_modifications.empty()))
 
226
        {
 
227
          for (Size i = 0; i < aaseq.size(); ++i)
 
228
          {
 
229
            if (aaseq.isModified(i))
 
230
            {
 
231
              const   Residue & mod  = aaseq.getResidue(i);
 
232
              for (Size s = 0; s < variable_modifications.size(); ++s)
 
233
              {
 
234
                if (mod.getOneLetterCode() == mdb->getModification(variable_modifications[s]).getOrigin() && variable_modifications[s] != mod.getModification())
 
235
                {
 
236
                  variable_modifications_ok = false;
 
237
                  break;
 
238
                }
 
239
              }
 
240
            }
 
241
          }
 
242
        }
 
243
        if (variable_modifications_ok && fixed_modifications_ok)
 
244
        {
 
245
          PeptideIdentification & translocate_pid = *i;
 
246
          librar.getPeptideIdentifications().push_back(translocate_pid);
 
247
          librar.setPrecursors(s->getPrecursors());
 
248
          //library entry transformation
 
249
          for (UInt l = 0; l < s->size(); ++l)
 
250
          {
 
251
            Peak1D peak;
 
252
            if ((*s)[l].getIntensity() >  remove_peaks_below_threshold)
 
253
            {
 
254
              const String & info = (*s)[l].getMetaValue("MSPPeakInfo");
 
255
              if (info[0] == '?')
 
256
              {
 
257
                peak.setIntensity(sqrt(0.2 * (*s)[l].getIntensity()));
 
258
              }
 
259
              else
 
260
              {
 
261
                peak.setIntensity(sqrt((*s)[l].getIntensity()));
 
262
              }
 
263
 
 
264
              peak.setMZ((*s)[l].getMZ());
 
265
              peak.setPosition((*s)[l].getPosition());
 
266
              librar.push_back(peak);
 
267
            }
 
268
          }
 
269
          if (found != MSLibrary.end())
 
270
          {
 
271
            found->second.push_back(librar);
 
272
          }
 
273
          else
 
274
          {
 
275
            vector<PeakSpectrum> tmp;
 
276
            tmp.push_back(librar);
 
277
            MSLibrary.insert(make_pair(MZ_multi, tmp));
 
278
          }
 
279
        }
 
280
      }
 
281
    }
 
282
    time_t end_build_time = time(NULL);
 
283
    cout << "Time needed for preprocessing data: " << (end_build_time - start_build_time) << "\n";
 
284
    //compare function
 
285
    PeakSpectrumCompareFunctor * comparor = Factory<PeakSpectrumCompareFunctor>::create(compare_function);
 
286
    //-------------------------------------------------------------
 
287
    // calculations
 
288
    //-------------------------------------------------------------
 
289
    DoubleReal score;
 
290
    StringList::iterator in, out_file;
 
291
    for (in  = in_spec.begin(), out_file  = out.begin(); in < in_spec.end(); ++in, ++out_file)
 
292
    {
 
293
      time_t start_time = time(NULL);
 
294
      spectra.load(*in, query);
 
295
      //Will hold valuable hits
 
296
      vector<PeptideIdentification> peptide_ids;
 
297
      vector<ProteinIdentification> protein_ids;
 
298
      // Write parameters to ProteinIdentifcation
 
299
      ProteinIdentification prot_id;
 
300
      //Parameters of identificaion
 
301
      prot_id.setIdentifier("test");
 
302
      prot_id.setSearchEngineVersion("SpecLibSearcher");
 
303
      prot_id.setDateTime(DateTime::now());
 
304
      prot_id.setScoreType(compare_function);
 
305
      ProteinIdentification::SearchParameters searchparam;
 
306
      searchparam.precursor_tolerance = precursor_mass_tolerance;
 
307
      prot_id.setSearchParameters(searchparam);
 
308
      /***********SEARCH**********/
 
309
      for (UInt j = 0; j < query.size(); ++j)
 
310
      {
 
311
        //Set identifier for each identifications
 
312
        PeptideIdentification pid;
 
313
        pid.setIdentifier("test");
 
314
        pid.setScoreType(compare_function);
 
315
        ProteinHit pr_hit;
 
316
        pr_hit.setAccession(j);
 
317
        prot_id.insertHit(pr_hit);
 
318
        //RichPeak1D to Peak1D transformation for the compare function query
 
319
        PeakSpectrum quer;
 
320
        bool peak_ok = true;
 
321
        query[j].sortByIntensity(true);
 
322
        DoubleReal min_high_intensity = 0;
 
323
 
 
324
        if (query[j].empty() || query[j].getMSLevel() != 2)
 
325
        {
 
326
          continue;
 
327
        }
 
328
        if (query[j].getPrecursors().empty())
 
329
        {
 
330
          writeLog_("Warning MS2 spectrum without precursor information");
 
331
          continue;
 
332
        }
 
333
 
 
334
        min_high_intensity = (1 / cut_peaks_below) * query[j][0].getIntensity();
 
335
 
 
336
        query[j].sortByPosition();
 
337
        for (UInt k = 0; k < query[j].size() && k < max_peaks; ++k)
 
338
        {
 
339
          if (query[j][k].getIntensity() >  remove_peaks_below_threshold && query[j][k].getIntensity() >= min_high_intensity)
 
340
          {
 
341
            Peak1D peak;
 
342
            peak.setIntensity(sqrt(query[j][k].getIntensity()));
 
343
            peak.setMZ(query[j][k].getMZ());
 
344
            peak.setPosition(query[j][k].getPosition());
 
345
            quer.push_back(peak);
 
346
          }
 
347
        }
 
348
        if (quer.size() >= min_peaks)
 
349
        {
 
350
          peak_ok = true;
 
351
        }
 
352
        else
 
353
        {
 
354
          peak_ok = false;
 
355
        }
 
356
        DoubleReal query_MZ = query[j].getPrecursors()[0].getMZ();
 
357
        if (peak_ok)
 
358
        {
 
359
          bool charge_one = false;
 
360
          Int percent = (Int) Math::round((query[j].size() / 100.0) * 3.0);
 
361
          Int margin  = (Int) Math::round((query[j].size() / 100.0) * 1.0);
 
362
          for (vector<RichPeak1D>::iterator peak = query[j].end() - 1; percent >= 0; --peak, --percent)
 
363
          {
 
364
            if (peak->getMZ() < query_MZ)
 
365
            {
 
366
              break;
 
367
            }
 
368
          }
 
369
          if (percent > margin)
 
370
          {
 
371
            charge_one = true;
 
372
          }
 
373
          Real min_MZ = (query_MZ - precursor_mass_tolerance) * precursor_mass_multiplier;
 
374
          Real max_MZ = (query_MZ + precursor_mass_tolerance) * precursor_mass_multiplier;
 
375
          for (Size mz = (Size)min_MZ; mz <= ((Size)max_MZ) + 1; ++mz)
 
376
          {
 
377
            map<Size, vector<PeakSpectrum> >::iterator found;
 
378
            found = MSLibrary.find(mz);
 
379
            if (found != MSLibrary.end())
 
380
            {
 
381
              vector<PeakSpectrum> & library = found->second;
 
382
              for (Size i = 0; i < library.size(); ++i)
 
383
              {
 
384
                Real this_MZ  = library[i].getPrecursors()[0].getMZ() * precursor_mass_multiplier;
 
385
                if (this_MZ >= min_MZ && max_MZ >= this_MZ && ((charge_one == true && library[i].getPeptideIdentifications()[0].getHits()[0].getCharge() == 1) || charge_one == false))
 
386
                {
 
387
                  PeptideHit hit = library[i].getPeptideIdentifications()[0].getHits()[0];
 
388
                  PeakSpectrum & librar = library[i];
 
389
                  //Special treatment for SpectraST score as it computes a score based on the whole library
 
390
                  if (compare_function == "SpectraSTSimilarityScore")
 
391
                  {
 
392
                    SpectraSTSimilarityScore * sp = static_cast<SpectraSTSimilarityScore *>(comparor);
 
393
                    BinnedSpectrum quer_bin = sp->transform(quer);
 
394
                    BinnedSpectrum librar_bin = sp->transform(librar);
 
395
                    score = (*sp)(quer, librar);                           //(*sp)(quer_bin,librar_bin);
 
396
                    double dot_bias = sp->dot_bias(quer_bin, librar_bin, score);
 
397
                    hit.setMetaValue("DOTBIAS", dot_bias);
 
398
                  }
 
399
                  else
 
400
                  {
 
401
                    if (compare_function == "CompareFouriertransform")
 
402
                    {
 
403
                      CompareFouriertransform * ft = static_cast<CompareFouriertransform *>(comparor);
 
404
                      ft->transform(quer);
 
405
                      ft->transform(librar);
 
406
                    }
 
407
                    score = (*comparor)(quer, librar);
 
408
                  }
 
409
 
 
410
                  DataValue RT(library[i].getRT());
 
411
                  DataValue MZ(library[i].getPrecursors()[0].getMZ());
 
412
                  hit.setMetaValue("RT", RT);
 
413
                  hit.setMetaValue("MZ", MZ);
 
414
                  hit.setScore(score);
 
415
                  hit.addProteinAccession(pr_hit.getAccession());
 
416
                  pid.insertHit(hit);
 
417
                }
 
418
              }
 
419
            }
 
420
          }
 
421
        }
 
422
        pid.setHigherScoreBetter(true);
 
423
        pid.sort();
 
424
        if (compare_function == "SpectraSTSimilarityScore")
 
425
        {
 
426
          if (!pid.empty() && !pid.getHits().empty())
 
427
          {
 
428
            vector<PeptideHit> final_hits;
 
429
            final_hits.resize(pid.getHits().size());
 
430
            SpectraSTSimilarityScore * sp = static_cast<SpectraSTSimilarityScore *>(comparor);
 
431
            Size runner_up = 1;
 
432
            for (; runner_up < pid.getHits().size(); ++runner_up)
 
433
            {
 
434
              if (pid.getHits()[0].getSequence().toUnmodifiedString() != pid.getHits()[runner_up].getSequence().toUnmodifiedString() || runner_up > 5)
 
435
              {
 
436
                break;
 
437
              }
 
438
            }
 
439
            double delta_D = sp->delta_D(pid.getHits()[0].getScore(), pid.getHits()[runner_up].getScore());
 
440
            for (Size s = 0; s < pid.getHits().size(); ++s)
 
441
            {
 
442
              final_hits[s] = pid.getHits()[s];
 
443
              final_hits[s].setMetaValue("delta D", delta_D);
 
444
              final_hits[s].setMetaValue("dot product", pid.getHits()[s].getScore());
 
445
              final_hits[s].setScore(sp->compute_F(pid.getHits()[s].getScore(), delta_D, pid.getHits()[s].getMetaValue("DOTBIAS")));
 
446
 
 
447
              //final_hits[s].removeMetaValue("DOTBIAS");
 
448
            }
 
449
            pid.setHits(final_hits);
 
450
            pid.sort();
 
451
            pid.setMetaValue("MZ", query[j].getPrecursors()[0].getMZ());
 
452
            pid.setMetaValue("RT", query_MZ);
 
453
          }
 
454
        }
 
455
        if (top_hits != -1 && (UInt)top_hits < pid.getHits().size())
 
456
        {
 
457
          vector<PeptideHit> hits;
 
458
          hits.resize(top_hits);
 
459
          for (Size i = 0; i < (UInt)top_hits; ++i)
 
460
          {
 
461
            hits[i] = pid.getHits()[i];
 
462
          }
 
463
          pid.setHits(hits);
 
464
        }
 
465
        peptide_ids.push_back(pid);
 
466
      }
 
467
      protein_ids.push_back(prot_id);
 
468
      //-------------------------------------------------------------
 
469
      // writing output
 
470
      //-------------------------------------------------------------
 
471
      IdXMLFile id_xml_file;
 
472
      id_xml_file.store(*out_file, protein_ids, peptide_ids);
 
473
      time_t end_time = time(NULL);
 
474
      cout << "Search time: " << difftime(end_time, start_time) << " seconds for " << *in << "\n";
 
475
    }
 
476
    time_t end_time = time(NULL);
 
477
    cout << "Total time: " << difftime(end_time, prog_time) << " secconds\n";
 
478
    return EXECUTION_OK;
 
479
  }
 
480
 
 
481
};
 
482
 
 
483
 
 
484
 
 
485
 
 
486
int main(int argc, const char ** argv)
 
487
{
 
488
  TOPPSpecLibSearcher tool;
 
489
  return tool.main(argc, argv);
480
490
}
481
491
 
482
492
/// @endcond