1
// -*- mode: C++; tab-width: 2; -*-
4
// --------------------------------------------------------------------------
5
// OpenMS Mass Spectrometry Framework
6
// --------------------------------------------------------------------------
7
// Copyright (C) 2003-2008 -- Oliver Kohlbacher, Knut Reinert
9
// This library is free software; you can redistribute it and/or
10
// modify it under the terms of the GNU Lesser General Public
11
// License as published by the Free Software Foundation; either
12
// version 2.1 of the License, or (at your option) any later version.
14
// This library is distributed in the hope that it will be useful,
15
// but WITHOUT ANY WARRANTY; without even the implied warranty of
16
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
// Lesser General Public License for more details.
19
// You should have received a copy of the GNU Lesser General Public
20
// License along with this library; if not, write to the Free Software
21
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
// --------------------------------------------------------------------------
24
// $Maintainer: Sven Nahnsen $
25
// $Author: Sven Nahnsen $
26
// --------------------------------------------------------------------------
28
#include <OpenMS/FORMAT/IdXMLFile.h>
29
#include <OpenMS/FORMAT/FASTAFile.h>
30
#include <OpenMS/METADATA/ProteinIdentification.h>
31
#include <OpenMS/APPLICATIONS/TOPPBase.h>
32
#include <OpenMS/CHEMISTRY/EnzymaticDigestion.h>
36
using namespace OpenMS;
41
//-------------------------------------------------------------
43
//-------------------------------------------------------------
46
@page UTILS_DigestorMotif DigestorMotif
48
@brief This application is used to digest a protein database to get all peptides given a cleavage enzyme. It will also produce peptide statistics given the mass
49
accuracy of the instrument. You can extract peptides with specific motifs,e.g. onyl cysteine containing peptides for ICAT experiments. At the moment only trypsin is supported.
51
<B>The command line parameters of this tool are:</B>
52
@verbinclude UTILS_DigestorMotif.cli
56
// We do not want this class to show up in the docu:
59
class TOPPDigestorMotif
64
: TOPPBase("DigestorMotif","digests a protein database in-silico",false)
70
void registerOptionsAndFlags_()
72
registerInputFile_("in","<file>","","input file");
73
registerOutputFile_("out","<file>","","output file (peptides)\n");
74
registerIntOption_("missed_cleavages","<number>",1,"the number of allowed missed cleavages", false);
75
registerIntOption_("mass_accuracy","<number>",1000,"give your mass accuracy in ppb", false);
76
registerIntOption_("min_length","<number>",6,"minimum length of peptide", false);
77
registerIntOption_("out_option","<number>",1,"indicate 1 (peptide table only), 2 (statistics only) or (both peptide table + statistics)", false);
78
registerStringOption_("enzyme","<string>","Trypsin","the digestion enzyme", false);
79
registerStringOption_("motif","<string>","M","the motif for the restricted peptidome", false);
80
setMinInt_("missed_cleavages", 0);
83
ExitCodes main_(int , const char**)
85
vector<ProteinIdentification> protein_identifications;
86
vector<PeptideIdentification> identifications;
87
std::vector<FASTAFile::FASTAEntry> protein_data;
89
EnzymaticDigestion digestor;
90
vector<AASequence> temp_peptides;
91
PeptideIdentification peptide_identification;
92
ProteinIdentification protein_identification;
93
PeptideHit temp_peptide_hit;
94
ProteinHit temp_protein_hit;
95
vector<String> protein_accessions;
97
String inputfile_name = "";
98
String outputfile_name = "";
99
UInt min_size = 0, counter = 0;
100
UInt missed_cleavages = 0;
101
DoubleReal accurate_mass, min_mass, max_mass;
102
UInt mass_acc = 1000, out_opt;
105
ProteinIdentification::SearchParameters search_parameters;
107
protein_identifications.push_back(ProteinIdentification());
108
//-------------------------------------------------------------
109
// parsing parameters
110
//-------------------------------------------------------------
111
inputfile_name = getStringOption_("in");
112
outputfile_name = getStringOption_("out");
113
min_size = getIntOption_("min_length");
114
mass_acc = getIntOption_("mass_accuracy");
115
out_opt = getIntOption_("out_option");
116
missed_cleavages = getIntOption_("missed_cleavages");
117
AASequence M = getStringOption_("motif");
119
//-------------------------------------------------------------
121
//-------------------------------------------------------------
124
file.load(inputfile_name, protein_data);
125
//-------------------------------------------------------------
127
//-------------------------------------------------------------
129
// This should be updated if more cleavage enzymes are available
130
digestor.setEnzyme(EnzymaticDigestion::TRYPSIN);
131
search_parameters.enzyme = ProteinIdentification::TRYPSIN;
132
digestor.setMissedCleavages(missed_cleavages);
134
protein_accessions.resize(1, String(""));
135
for(UInt i = 0; i < protein_data.size(); ++i)
137
protein_accessions[0] = protein_data[i].identifier;
138
temp_protein_hit.setSequence(protein_data[i].sequence);
139
temp_protein_hit.setAccession(protein_accessions[0]);
141
digestor.digest(AASequence(protein_data[i].sequence), temp_peptides);
142
temp_peptide_hit.setProteinAccessions(protein_accessions);
143
for(UInt j = 0; j < temp_peptides.size(); ++j)
145
if (temp_peptides[j].size() >= min_size)
147
if (temp_peptides[j].hasSubsequence(M) == TRUE)
149
temp_peptide_hit.setSequence(temp_peptides[j]);
150
peptide_identification.insertHit(temp_peptide_hit);
154
protein_identifications[0].insertHit(temp_protein_hit);
157
String date_time_string = "";
160
date_time_string = date_time.get();
161
protein_identifications[0].setSearchParameters(search_parameters);
162
protein_identifications[0].setDateTime(date_time);
163
protein_identifications[0].setSearchEngine("In-silico digestion");
164
protein_identifications[0].setIdentifier("In-silico_digestion" + date_time_string);
165
peptide_identification.setIdentifier("In-silico_digestion" + date_time_string);
166
identifications.push_back(peptide_identification);
168
//-------------------------------------------------------------
170
//-------------------------------------------------------------
172
ofstream fp_out(outputfile_name.c_str());
175
fp_out<<"mass_error"<<SEP<<"#proteins in database"<<SEP<<"# tryptic peptides"<<SEP<<"# unique peptide weights"<<SEP<<"# identifiable proteins"<<SEP<<"average window_size"<<"\n";
177
UInt mass_iter = mass_acc;
180
vector<DoubleReal> MIN, MAX;
181
vector< String > peptides, protein_names, PROTEINS;
182
vector<vector<DoubleReal> > B, Y;
184
vector<UInt> IonCounter;
186
if(out_opt==1 || out_opt==3)
188
fp_out<<"counter"<<SEP<<"ProteinID"<<SEP<<"PeptideLocation"<<SEP<<"PeptideSequence"<<SEP<<"C"<<SEP<<"H"<<SEP<<"N"<<SEP<<"O"<<SEP<<"S"<<SEP<<"length"<<SEP<<"weight"<<SEP<<"min_weight"<<SEP<<"max_weight"<<SEP<<"Formula"<<SEP<<"D"<<SEP<<"E"<<SEP<<"K"<<SEP<<"R"<<SEP<<"H"<<SEP<<"Y"<<SEP<<"W"<<SEP<<"F"<<SEP<<"C"<<SEP<<"M"<<SEP<<"S"<<SEP<<"T"<<SEP<<"N"<<SEP<<"Q"<<SEP<<"G"<<SEP<<"A"<<SEP<<"V"<<SEP<<"L"<<SEP<<"I"<<SEP<<"P"<<SEP<<"hydrophobicity"<<"\n";
191
for(UInt i = 0; i < protein_data.size(); ++i)
193
protein_accessions[0] = protein_data[i].identifier;
194
temp_protein_hit.setAccession(protein_accessions[0]);
195
digestor.digest(AASequence(protein_data[i].sequence), temp_peptides);
196
temp_peptide_hit.setProteinAccessions(protein_accessions);
197
for(UInt j = 0; j < temp_peptides.size(); ++j)
199
//vector<DoubleReal> B_peptide, Y_peptide;
200
vector<DoubleReal> peptide_ions;
201
accurate_mass = temp_peptides[j].getMonoWeight();
202
min_mass = accurate_mass - mass_iter*accurate_mass/1000000000;
203
max_mass = accurate_mass + mass_iter*accurate_mass/1000000000;
204
EF=temp_peptides[j].getFormula();
205
for(UInt r=1;r<=temp_peptides[j].size();++r)
207
//B_peptide.push_back(temp_peptides[j].getPrefix(r).getMonoWeight());
208
peptide_ions.push_back(temp_peptides[j].getPrefix(r).getMonoWeight());
209
peptide_ions.push_back(temp_peptides[j].getSuffix(r).getMonoWeight());
210
//Y_peptide.push_back(temp_peptides[j].getSuffix(r).getMonoWeight());
212
if (temp_peptides[j].size() >= min_size)
214
if (temp_peptides[j].hasSubsequence(M) == TRUE)
216
OVER.push_back((-1)); //because the increment of the first will always be counted;
217
//IonCounter.push_back(0);
218
MIN.push_back(min_mass);
219
MAX.push_back(max_mass);
220
Y.push_back(peptide_ions);
221
//B.push_back(B_peptide);
222
protein_names.push_back(protein_accessions[0]);
223
temp_peptide_hit.setSequence(temp_peptides[j]);
224
peptide_identification.insertHit(temp_peptide_hit);
225
if(out_opt==1 || out_opt==3)
227
fp_out <<counter<<SEP<<">"<<protein_accessions[0]<<SEP<<j<<SEP<<temp_peptides[j]<<SEP<<EF.getNumberOf("C")<<SEP<<EF.getNumberOf("H")<<SEP<<EF.getNumberOf("N")<<SEP<<EF.getNumberOf("O")<<SEP<<EF.getNumberOf("S")<<SEP<<temp_peptides[j].size()<<SEP<<precisionWrapper(temp_peptides[j].getMonoWeight())<<SEP<<precisionWrapper(min_mass)<<SEP<<precisionWrapper(max_mass)<<SEP<<temp_peptides[j].getFormula()<<SEP<<temp_peptides[j].getNumberOf("D")<<SEP<<temp_peptides[j].getNumberOf("E")<<SEP<<temp_peptides[j].getNumberOf("K")<<SEP<<temp_peptides[j].getNumberOf("R")<<SEP<<temp_peptides[j].getNumberOf("H")<<SEP<<temp_peptides[j].getNumberOf("Y")<<SEP<<temp_peptides[j].getNumberOf("W")<<SEP<<temp_peptides[j].getNumberOf("F")<<SEP<<temp_peptides[j].getNumberOf("C")<<SEP<<temp_peptides[j].getNumberOf("M")<<SEP<<temp_peptides[j].getNumberOf("S")<<SEP<<temp_peptides[j].getNumberOf("T")<<SEP<<temp_peptides[j].getNumberOf("N")<<SEP<<temp_peptides[j].getNumberOf("Q")<<SEP<<temp_peptides[j].getNumberOf("G")<<SEP<<temp_peptides[j].getNumberOf("A")<<SEP<<temp_peptides[j].getNumberOf("V")<<SEP<<temp_peptides[j].getNumberOf("L")<<SEP<<temp_peptides[j].getNumberOf("I")<<SEP<<temp_peptides[j].getNumberOf("P")<<SEP<<temp_peptides[j].getNumberOf("D")*(-3.5) + temp_peptides[j].getNumberOf("E")*(-3.5) + temp_peptides[j].getNumberOf("K")*(-3.9) + temp_peptides[j].getNumberOf("R")*(-4.5) + temp_peptides[j].getNumberOf("H")*(-3.2) + temp_peptides[j].getNumberOf("Y")*(-1.3) + temp_peptides[j].getNumberOf("W")*(-0.9) + temp_peptides[j].getNumberOf("F")*(2.8) + temp_peptides[j].getNumberOf("C")*(2.5) + temp_peptides[j].getNumberOf("M")*(1.9) + temp_peptides[j].getNumberOf("S")*(-0.8) + temp_peptides[j].getNumberOf("T")*(-0.7) + temp_peptides[j].getNumberOf("N")*(-3.5) + temp_peptides[j].getNumberOf("Q")*(-3.5) + temp_peptides[j].getNumberOf("G")*(-0.4) + temp_peptides[j].getNumberOf("A")*(1.8) + temp_peptides[j].getNumberOf("V")*(4.2) + temp_peptides[j].getNumberOf("L")*(4.5) + temp_peptides[j].getNumberOf("I")*(4.5) + temp_peptides[j].getNumberOf("P")*(-1.6)<<"\n";
233
protein_identifications[0].insertHit(temp_protein_hit);
237
fp_out<<"MW_count"<<SEP;
240
for(UInt r=1;r<=100;++r)
242
fp_out<<"y"<<r<<SEP<<"b"<<r<<SEP;
245
fp_out<<"MW_count"<<SEP<<"Overlapping ions in search space"<<"\n";
246
for(UInt x=0; x < MAX.size(); ++x)
248
/*for(UInt it = 0; it < ions.size(); ++it)
250
ions[it] = -1; //all ions from the same peptide all be counted
253
cout<<"2nd loop"<<SEP<<MAX.size() - x<<endl;
254
vector<UInt> IonCounter;
255
for(UInt y=0; y < MAX.size(); ++y)
257
if((MIN[y]<MIN[x] && MAX[y]>MIN[x]) || (MAX[y]>MAX[x] && MIN[y]<MAX[x]) || (MIN[x]==MIN[y]))
260
//find overlapping tandem ions
261
vector<DoubleReal> X_temp, Y_temp;
265
for(UInt xx=0; xx<X_temp.size();++xx)
267
for(UInt yy=0; yy<Y_temp.size();++yy)
269
if(fabs(X_temp[xx]-Y_temp[yy])<=1)
275
IonCounter.push_back(ions);
280
fp_out<<OVER[x]<<SEP;
283
for(UInt it = 0; it<IonCounter.size(); ++it)
285
fp_out<<IonCounter[it]<<SEP;
291
total = total + OVER[x];
295
PROTEINS.push_back(protein_names[x]);
299
for(UInt a = 0; a < PROTEINS.size()-1; ++a)
301
if(PROTEINS[a]==PROTEINS[a+1])
305
cout<<PROTEINS.size()<<endl<<pro_count<<endl;
314
mass_iter = mass_iter - 1;
319
fp_out<<mass_iter<<SEP<<protein_data.size()<<SEP<<MAX.size()<<SEP<<zero_count<<SEP<<PROTEINS.size()-pro_count<<SEP<<total<<endl;
330
int main( int argc, const char** argv )
332
TOPPDigestorMotif tool;
333
return tool.main(argc,argv);