~ubuntu-branches/ubuntu/wily/openms/wily

« back to all changes in this revision

Viewing changes to source/APPLICATIONS/UTILS/DigestorMotif.C

  • Committer: Package Import Robot
  • Author(s): Filippo Rusconi
  • Date: 2012-11-12 15:58:12 UTC
  • Revision ID: package-import@ubuntu.com-20121112155812-vr15wtg9b50cuesg
Tags: upstream-1.9.0
ImportĀ upstreamĀ versionĀ 1.9.0

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
// -*- mode: C++; tab-width: 2; -*-
 
2
// vi: set ts=2:
 
3
//
 
4
// --------------------------------------------------------------------------
 
5
//                   OpenMS Mass Spectrometry Framework
 
6
// --------------------------------------------------------------------------
 
7
//  Copyright (C) 2003-2008 -- Oliver Kohlbacher, Knut Reinert
 
8
//
 
9
//  This library is free software; you can redistribute it and/or
 
10
//  modify it under the terms of the GNU Lesser General Public
 
11
//  License as published by the Free Software Foundation; either
 
12
//  version 2.1 of the License, or (at your option) any later version.
 
13
//
 
14
//  This library is distributed in the hope that it will be useful,
 
15
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
 
16
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 
17
//  Lesser General Public License for more details.
 
18
//
 
19
//  You should have received a copy of the GNU Lesser General Public
 
20
//  License along with this library; if not, write to the Free Software
 
21
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
22
//
 
23
// --------------------------------------------------------------------------
 
24
// $Maintainer: Sven Nahnsen $
 
25
// $Author: Sven Nahnsen $
 
26
// --------------------------------------------------------------------------
 
27
 
 
28
#include <OpenMS/FORMAT/IdXMLFile.h>
 
29
#include <OpenMS/FORMAT/FASTAFile.h>
 
30
#include <OpenMS/METADATA/ProteinIdentification.h>
 
31
#include <OpenMS/APPLICATIONS/TOPPBase.h>
 
32
#include <OpenMS/CHEMISTRY/EnzymaticDigestion.h>
 
33
 
 
34
#include <map>
 
35
 
 
36
using namespace OpenMS;
 
37
using namespace std;
 
38
 
 
39
#define SEP "\t"
 
40
 
 
41
//-------------------------------------------------------------
 
42
//Doxygen docu
 
43
//-------------------------------------------------------------
 
44
 
 
45
/**
 
46
        @page UTILS_DigestorMotif DigestorMotif
 
47
 
 
48
        @brief This application is used to digest a protein database to get all peptides given a cleavage enzyme. It will also produce peptide statistics given the mass 
 
49
        accuracy of the instrument. You can extract peptides with specific motifs,e.g. onyl cysteine containing peptides for ICAT experiments. At the moment only trypsin is supported.
 
50
 
 
51
        <B>The command line parameters of this tool are:</B>
 
52
        @verbinclude UTILS_DigestorMotif.cli
 
53
 
 
54
*/
 
55
 
 
56
// We do not want this class to show up in the docu:
 
57
/// @cond TOPPCLASSES
 
58
 
 
59
class TOPPDigestorMotif
 
60
        : public TOPPBase
 
61
{
 
62
        public:
 
63
                TOPPDigestorMotif()
 
64
                        : TOPPBase("DigestorMotif","digests a protein database in-silico",false)
 
65
                {
 
66
 
 
67
                }
 
68
 
 
69
        protected:
 
70
                void registerOptionsAndFlags_()
 
71
                {
 
72
                        registerInputFile_("in","<file>","","input file");
 
73
                        registerOutputFile_("out","<file>","","output file (peptides)\n");
 
74
                        registerIntOption_("missed_cleavages","<number>",1,"the number of allowed missed cleavages", false);
 
75
                        registerIntOption_("mass_accuracy","<number>",1000,"give your mass accuracy in ppb", false);
 
76
                        registerIntOption_("min_length","<number>",6,"minimum length of peptide", false);
 
77
                        registerIntOption_("out_option","<number>",1,"indicate 1 (peptide table only), 2 (statistics only) or (both peptide table + statistics)", false);
 
78
                        registerStringOption_("enzyme","<string>","Trypsin","the digestion enzyme", false);
 
79
                        registerStringOption_("motif","<string>","M","the motif for the restricted peptidome", false);
 
80
                        setMinInt_("missed_cleavages", 0);
 
81
                }
 
82
 
 
83
                ExitCodes main_(int , const char**)
 
84
                {
 
85
                        vector<ProteinIdentification> protein_identifications;
 
86
                        vector<PeptideIdentification> identifications;
 
87
                        std::vector<FASTAFile::FASTAEntry> protein_data;
 
88
                        FASTAFile file;
 
89
                        EnzymaticDigestion digestor;
 
90
                        vector<AASequence> temp_peptides;
 
91
                        PeptideIdentification peptide_identification;
 
92
                        ProteinIdentification protein_identification;
 
93
                        PeptideHit temp_peptide_hit;
 
94
                        ProteinHit temp_protein_hit;
 
95
                        vector<String> protein_accessions;
 
96
                        vector<String> parts;
 
97
                        String inputfile_name = "";
 
98
                        String outputfile_name = "";
 
99
                        UInt min_size = 0, counter = 0;
 
100
                        UInt missed_cleavages = 0;
 
101
                        DoubleReal accurate_mass, min_mass, max_mass;
 
102
                        UInt mass_acc = 1000, out_opt;
 
103
                        EmpiricalFormula EF;
 
104
                        UInt zero_count;
 
105
                        ProteinIdentification::SearchParameters search_parameters;
 
106
 
 
107
                        protein_identifications.push_back(ProteinIdentification());
 
108
                        //-------------------------------------------------------------
 
109
                        // parsing parameters
 
110
                        //-------------------------------------------------------------
 
111
                        inputfile_name = getStringOption_("in");
 
112
                        outputfile_name = getStringOption_("out");
 
113
                        min_size = getIntOption_("min_length");
 
114
                        mass_acc = getIntOption_("mass_accuracy");
 
115
                        out_opt = getIntOption_("out_option");
 
116
                        missed_cleavages = getIntOption_("missed_cleavages");
 
117
                        AASequence M = getStringOption_("motif");
 
118
 
 
119
                        //-------------------------------------------------------------
 
120
                        // reading input
 
121
                        //-------------------------------------------------------------
 
122
 
 
123
 
 
124
                        file.load(inputfile_name, protein_data);
 
125
                        //-------------------------------------------------------------
 
126
                        // calculations
 
127
                        //-------------------------------------------------------------
 
128
 
 
129
                        // This should be updated if more cleavage enzymes are available
 
130
                        digestor.setEnzyme(EnzymaticDigestion::TRYPSIN);
 
131
                        search_parameters.enzyme = ProteinIdentification::TRYPSIN;
 
132
                        digestor.setMissedCleavages(missed_cleavages);
 
133
 
 
134
                        protein_accessions.resize(1, String(""));
 
135
                        for(UInt i = 0; i < protein_data.size(); ++i)
 
136
                        {
 
137
                                protein_accessions[0] = protein_data[i].identifier;
 
138
                                temp_protein_hit.setSequence(protein_data[i].sequence);
 
139
                                temp_protein_hit.setAccession(protein_accessions[0]);
 
140
 
 
141
                                digestor.digest(AASequence(protein_data[i].sequence), temp_peptides);
 
142
                                temp_peptide_hit.setProteinAccessions(protein_accessions);
 
143
                                for(UInt j = 0; j < temp_peptides.size(); ++j)
 
144
                                {
 
145
                                        if (temp_peptides[j].size() >= min_size)
 
146
                                        {
 
147
                                                if (temp_peptides[j].hasSubsequence(M) == TRUE)
 
148
                                                {
 
149
                                                        temp_peptide_hit.setSequence(temp_peptides[j]);
 
150
                                                        peptide_identification.insertHit(temp_peptide_hit);
 
151
                                                }
 
152
                                        }
 
153
                                }
 
154
                                protein_identifications[0].insertHit(temp_protein_hit);
 
155
                        }
 
156
                        DateTime date_time;
 
157
                        String date_time_string = "";
 
158
                        date_time.now();
 
159
 
 
160
                        date_time_string = date_time.get();
 
161
                        protein_identifications[0].setSearchParameters(search_parameters);
 
162
                        protein_identifications[0].setDateTime(date_time);
 
163
                        protein_identifications[0].setSearchEngine("In-silico digestion");
 
164
                        protein_identifications[0].setIdentifier("In-silico_digestion" + date_time_string);
 
165
                        peptide_identification.setIdentifier("In-silico_digestion" + date_time_string);
 
166
                        identifications.push_back(peptide_identification);
 
167
 
 
168
                        //-------------------------------------------------------------
 
169
                        // writing output
 
170
                        //-------------------------------------------------------------
 
171
 
 
172
                  ofstream fp_out(outputfile_name.c_str());
 
173
                  if(out_opt==2)
 
174
      {
 
175
                    fp_out<<"mass_error"<<SEP<<"#proteins in database"<<SEP<<"# tryptic peptides"<<SEP<<"# unique peptide weights"<<SEP<<"# identifiable proteins"<<SEP<<"average window_size"<<"\n";
 
176
      }
 
177
                  UInt mass_iter = mass_acc;
 
178
                  while(mass_iter>0)
 
179
                  {
 
180
                          vector<DoubleReal> MIN, MAX;
 
181
                          vector< String > peptides, protein_names, PROTEINS;
 
182
                          vector<vector<DoubleReal> > B, Y;
 
183
                          vector<UInt> OVER;
 
184
                          vector<UInt> IonCounter;
 
185
                          UInt total=0;
 
186
                          if(out_opt==1 || out_opt==3)
 
187
        {
 
188
                            fp_out<<"counter"<<SEP<<"ProteinID"<<SEP<<"PeptideLocation"<<SEP<<"PeptideSequence"<<SEP<<"C"<<SEP<<"H"<<SEP<<"N"<<SEP<<"O"<<SEP<<"S"<<SEP<<"length"<<SEP<<"weight"<<SEP<<"min_weight"<<SEP<<"max_weight"<<SEP<<"Formula"<<SEP<<"D"<<SEP<<"E"<<SEP<<"K"<<SEP<<"R"<<SEP<<"H"<<SEP<<"Y"<<SEP<<"W"<<SEP<<"F"<<SEP<<"C"<<SEP<<"M"<<SEP<<"S"<<SEP<<"T"<<SEP<<"N"<<SEP<<"Q"<<SEP<<"G"<<SEP<<"A"<<SEP<<"V"<<SEP<<"L"<<SEP<<"I"<<SEP<<"P"<<SEP<<"hydrophobicity"<<"\n";
 
189
        }
 
190
 
 
191
                          for(UInt i = 0; i < protein_data.size(); ++i)
 
192
                          {
 
193
                                  protein_accessions[0] = protein_data[i].identifier;
 
194
                                  temp_protein_hit.setAccession(protein_accessions[0]);
 
195
                                  digestor.digest(AASequence(protein_data[i].sequence), temp_peptides);
 
196
                                  temp_peptide_hit.setProteinAccessions(protein_accessions);
 
197
                                  for(UInt j = 0; j < temp_peptides.size(); ++j)
 
198
                                  {
 
199
                                          //vector<DoubleReal> B_peptide, Y_peptide;
 
200
                                          vector<DoubleReal> peptide_ions;
 
201
                                          accurate_mass = temp_peptides[j].getMonoWeight();
 
202
                                          min_mass = accurate_mass - mass_iter*accurate_mass/1000000000;
 
203
                                          max_mass = accurate_mass + mass_iter*accurate_mass/1000000000;
 
204
                                          EF=temp_peptides[j].getFormula();
 
205
                                          for(UInt r=1;r<=temp_peptides[j].size();++r)
 
206
                                          {
 
207
                                                  //B_peptide.push_back(temp_peptides[j].getPrefix(r).getMonoWeight());
 
208
                                                  peptide_ions.push_back(temp_peptides[j].getPrefix(r).getMonoWeight());
 
209
                                                  peptide_ions.push_back(temp_peptides[j].getSuffix(r).getMonoWeight());
 
210
                                                  //Y_peptide.push_back(temp_peptides[j].getSuffix(r).getMonoWeight());
 
211
                                          }
 
212
                                          if (temp_peptides[j].size() >= min_size)
 
213
                                          {
 
214
                                                  if (temp_peptides[j].hasSubsequence(M) == TRUE)
 
215
                                                  {
 
216
                                                          OVER.push_back((-1)); //because the increment of the first will always be counted;
 
217
                                                          //IonCounter.push_back(0);
 
218
                                                          MIN.push_back(min_mass);
 
219
                                                          MAX.push_back(max_mass);
 
220
                                                          Y.push_back(peptide_ions);
 
221
                                                          //B.push_back(B_peptide);
 
222
                                                          protein_names.push_back(protein_accessions[0]);
 
223
                                                          temp_peptide_hit.setSequence(temp_peptides[j]);
 
224
                                                          peptide_identification.insertHit(temp_peptide_hit);
 
225
                                                          if(out_opt==1 || out_opt==3)
 
226
                {
 
227
                                                            fp_out <<counter<<SEP<<">"<<protein_accessions[0]<<SEP<<j<<SEP<<temp_peptides[j]<<SEP<<EF.getNumberOf("C")<<SEP<<EF.getNumberOf("H")<<SEP<<EF.getNumberOf("N")<<SEP<<EF.getNumberOf("O")<<SEP<<EF.getNumberOf("S")<<SEP<<temp_peptides[j].size()<<SEP<<precisionWrapper(temp_peptides[j].getMonoWeight())<<SEP<<precisionWrapper(min_mass)<<SEP<<precisionWrapper(max_mass)<<SEP<<temp_peptides[j].getFormula()<<SEP<<temp_peptides[j].getNumberOf("D")<<SEP<<temp_peptides[j].getNumberOf("E")<<SEP<<temp_peptides[j].getNumberOf("K")<<SEP<<temp_peptides[j].getNumberOf("R")<<SEP<<temp_peptides[j].getNumberOf("H")<<SEP<<temp_peptides[j].getNumberOf("Y")<<SEP<<temp_peptides[j].getNumberOf("W")<<SEP<<temp_peptides[j].getNumberOf("F")<<SEP<<temp_peptides[j].getNumberOf("C")<<SEP<<temp_peptides[j].getNumberOf("M")<<SEP<<temp_peptides[j].getNumberOf("S")<<SEP<<temp_peptides[j].getNumberOf("T")<<SEP<<temp_peptides[j].getNumberOf("N")<<SEP<<temp_peptides[j].getNumberOf("Q")<<SEP<<temp_peptides[j].getNumberOf("G")<<SEP<<temp_peptides[j].getNumberOf("A")<<SEP<<temp_peptides[j].getNumberOf("V")<<SEP<<temp_peptides[j].getNumberOf("L")<<SEP<<temp_peptides[j].getNumberOf("I")<<SEP<<temp_peptides[j].getNumberOf("P")<<SEP<<temp_peptides[j].getNumberOf("D")*(-3.5) + temp_peptides[j].getNumberOf("E")*(-3.5) + temp_peptides[j].getNumberOf("K")*(-3.9) + temp_peptides[j].getNumberOf("R")*(-4.5) + temp_peptides[j].getNumberOf("H")*(-3.2) + temp_peptides[j].getNumberOf("Y")*(-1.3) + temp_peptides[j].getNumberOf("W")*(-0.9) + temp_peptides[j].getNumberOf("F")*(2.8) + temp_peptides[j].getNumberOf("C")*(2.5) + temp_peptides[j].getNumberOf("M")*(1.9) + temp_peptides[j].getNumberOf("S")*(-0.8) + temp_peptides[j].getNumberOf("T")*(-0.7) + temp_peptides[j].getNumberOf("N")*(-3.5) + temp_peptides[j].getNumberOf("Q")*(-3.5) + temp_peptides[j].getNumberOf("G")*(-0.4) + temp_peptides[j].getNumberOf("A")*(1.8) + temp_peptides[j].getNumberOf("V")*(4.2) + temp_peptides[j].getNumberOf("L")*(4.5) + temp_peptides[j].getNumberOf("I")*(4.5) + temp_peptides[j].getNumberOf("P")*(-1.6)<<"\n";
 
228
                }
 
229
                                                          counter++;
 
230
                                                  }
 
231
                                          }
 
232
                                  }
 
233
                                  protein_identifications[0].insertHit(temp_protein_hit);
 
234
                          }
 
235
        if(out_opt!=2)
 
236
        {
 
237
          fp_out<<"MW_count"<<SEP;
 
238
        }
 
239
 
 
240
                          for(UInt r=1;r<=100;++r)
 
241
                          {
 
242
                                  fp_out<<"y"<<r<<SEP<<"b"<<r<<SEP;
 
243
                          }
 
244
                          fp_out<<"\n";
 
245
                          fp_out<<"MW_count"<<SEP<<"Overlapping ions in search space"<<"\n";
 
246
                          for(UInt x=0; x < MAX.size(); ++x)
 
247
                          {
 
248
                                  /*for(UInt it = 0; it < ions.size(); ++it)
 
249
                                  {
 
250
                                          ions[it] = -1; //all ions from the same peptide all be counted
 
251
                                  }
 
252
                                  */
 
253
                                  cout<<"2nd loop"<<SEP<<MAX.size() - x<<endl;
 
254
                                  vector<UInt> IonCounter;
 
255
                                  for(UInt y=0; y < MAX.size(); ++y)
 
256
                                  {
 
257
                                          if((MIN[y]<MIN[x] && MAX[y]>MIN[x]) || (MAX[y]>MAX[x] && MIN[y]<MAX[x]) || (MIN[x]==MIN[y]))
 
258
                                          {
 
259
                                                  OVER[x]=OVER[x]+1;
 
260
                                                  //find overlapping tandem ions
 
261
                                                  vector<DoubleReal> X_temp, Y_temp;
 
262
                                                  X_temp = Y[x];
 
263
                                                  Y_temp = Y[y];
 
264
                                                  UInt ions = 0;
 
265
                                                  for(UInt xx=0; xx<X_temp.size();++xx)
 
266
                                                  {
 
267
                                                          for(UInt yy=0; yy<Y_temp.size();++yy)
 
268
                                                          {
 
269
                                                                  if(fabs(X_temp[xx]-Y_temp[yy])<=1)
 
270
                                                                  {
 
271
                                                                          ions+=1;
 
272
                                                                  }
 
273
                                                          }
 
274
                                                  }
 
275
                                                  IonCounter.push_back(ions);
 
276
                                          }
 
277
                                  }
 
278
                                  if(out_opt==3)
 
279
                                  {
 
280
                                          fp_out<<OVER[x]<<SEP;
 
281
                                          if(MAX[x]<3500)
 
282
            {
 
283
                                            for(UInt it = 0; it<IonCounter.size(); ++it)
 
284
                                            {
 
285
                                                    fp_out<<IonCounter[it]<<SEP;
 
286
                                            }
 
287
                                          }
 
288
                                          fp_out<<"\n";
 
289
                                          cout<<OVER[x];
 
290
                                  }
 
291
                                  total = total + OVER[x];
 
292
                                  if(OVER[x]==0)
 
293
                                  {
 
294
                                          ++zero_count;
 
295
                                          PROTEINS.push_back(protein_names[x]);
 
296
                                  }
 
297
                    }
 
298
                    UInt pro_count =0;
 
299
                    for(UInt a = 0; a < PROTEINS.size()-1; ++a)
 
300
                    {
 
301
                            if(PROTEINS[a]==PROTEINS[a+1])
 
302
                            {
 
303
                                    ++pro_count;
 
304
                            }
 
305
                            cout<<PROTEINS.size()<<endl<<pro_count<<endl;
 
306
                    }
 
307
 
 
308
                          if(out_opt != 2)
 
309
        {
 
310
                            mass_iter = 0;
 
311
        }
 
312
                          else
 
313
        {
 
314
                            mass_iter = mass_iter - 1;
 
315
        }
 
316
 
 
317
                          if(out_opt>1)
 
318
        {
 
319
                            fp_out<<mass_iter<<SEP<<protein_data.size()<<SEP<<MAX.size()<<SEP<<zero_count<<SEP<<PROTEINS.size()-pro_count<<SEP<<total<<endl;
 
320
        }
 
321
                          pro_count = 0;
 
322
                          zero_count = 0;
 
323
                  }
 
324
 
 
325
                        return EXECUTION_OK;
 
326
                }
 
327
};
 
328
 
 
329
 
 
330
int main( int argc, const char** argv )
 
331
{
 
332
        TOPPDigestorMotif tool;
 
333
        return tool.main(argc,argv);
 
334
}
 
335
 
 
336
/// @endcond
 
337
 
 
338
 
 
339
 
 
340
 
 
341