~ubuntu-branches/ubuntu/wily/openms/wily

« back to all changes in this revision

Viewing changes to source/FILTERING/ID/IDFilter.C

  • Committer: Package Import Robot
  • Author(s): Filippo Rusconi
  • Date: 2012-11-12 15:58:12 UTC
  • Revision ID: package-import@ubuntu.com-20121112155812-vr15wtg9b50cuesg
Tags: upstream-1.9.0
ImportĀ upstreamĀ versionĀ 1.9.0

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
// -*- mode: C++; tab-width: 2; -*-
 
2
// vi: set ts=2:
 
3
//
 
4
// --------------------------------------------------------------------------
 
5
//                   OpenMS Mass Spectrometry Framework
 
6
// --------------------------------------------------------------------------
 
7
//  Copyright (C) 2003-2011 -- Oliver Kohlbacher, Knut Reinert
 
8
//
 
9
//  This library is free software; you can redistribute it and/or
 
10
//  modify it under the terms of the GNU Lesser General Public
 
11
//  License as published by the Free Software Foundation; either
 
12
//  version 2.1 of the License, or (at your option) any later version.
 
13
// 
 
14
//  This library is distributed in the hope that it will be useful,
 
15
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
 
16
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 
17
//  Lesser General Public License for more details.
 
18
//
 
19
//  You should have received a copy of the GNU Lesser General Public
 
20
//  License along with this library; if not, write to the Free Software
 
21
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
22
//
 
23
// --------------------------------------------------------------------------
 
24
// $Maintainer: Nico Pfeifer $
 
25
// $Authors: Nico Pfeifer $
 
26
// --------------------------------------------------------------------------
 
27
 
 
28
#include <OpenMS/FILTERING/ID/IDFilter.h>
 
29
#include <OpenMS/CONCEPT/LogStream.h>
 
30
 
 
31
#include <cmath>
 
32
 
 
33
using namespace std;
 
34
 
 
35
namespace OpenMS 
 
36
{
 
37
  IDFilter::IDFilter()
 
38
  {
 
39
  }
 
40
   
 
41
  IDFilter::~IDFilter()
 
42
  {
 
43
  }
 
44
  
 
45
  void IDFilter::filterIdentificationsUnique(const PeptideIdentification& identification,
 
46
                                                                                                                                                                         PeptideIdentification&                         filtered_identification)
 
47
        {
 
48
                vector<PeptideHit> hits;
 
49
                filtered_identification = identification;               
 
50
                vector<PeptideHit> temp_hits = identification.getHits();
 
51
                
 
52
                for(vector<PeptideHit>::iterator it = temp_hits.begin();
 
53
                                it != temp_hits.end();
 
54
                                ++it)
 
55
                {
 
56
                        if (find(hits.begin(), hits.end(), *it) == hits.end())
 
57
                        {
 
58
                                hits.push_back(*it);
 
59
                        }
 
60
                }
 
61
                filtered_identification.setHits(hits);          
 
62
        }  
 
63
 
 
64
        void IDFilter::filterIdentificationsByBestHits(const PeptideIdentification& identification,
 
65
                                                                                                                                                                                                 PeptideIdentification& filtered_identification, 
 
66
                                                                                                                                                                                                 bool strict)
 
67
        {
 
68
                vector<PeptideHit> filtered_peptide_hits;
 
69
                PeptideHit temp_peptide_hit;
 
70
                vector<Size> new_peptide_indices;               
 
71
 
 
72
                filtered_identification = identification;               
 
73
                filtered_identification.setHits(vector<PeptideHit>());
 
74
                
 
75
    if ( !identification.getHits().empty() )
 
76
                {
 
77
                        Real optimal_value = identification.getHits()[0].getScore();
 
78
                        new_peptide_indices.push_back(0);
 
79
                        
 
80
                        // searching for peptide(s) with maximal score                  
 
81
                        for (Size i = 1; i < identification.getHits().size(); i++)
 
82
                        {
 
83
                                Real temp_score = identification.getHits()[i].getScore();
 
84
        bool new_leader = false;
 
85
                                if (   ( identification.isHigherScoreBetter() && (temp_score > optimal_value))
 
86
            || (!identification.isHigherScoreBetter() && (temp_score < optimal_value)) ) new_leader=true;
 
87
          
 
88
        if (new_leader)
 
89
                                {
 
90
                                        optimal_value = temp_score;
 
91
                                        new_peptide_indices.clear();
 
92
                                        new_peptide_indices.push_back(i);
 
93
                                }                               
 
94
                                else if (temp_score == optimal_value)
 
95
                                {
 
96
                                        new_peptide_indices.push_back(i);
 
97
                                }
 
98
                        }                                               
 
99
                        if (!strict || new_peptide_indices.size() == 1)
 
100
                        {
 
101
                                for (Size i = 0; i < new_peptide_indices.size(); i++)
 
102
                                {
 
103
                                        filtered_peptide_hits.push_back(identification.getHits()[new_peptide_indices[i]]);
 
104
                                }
 
105
                        }
 
106
                }
 
107
 
 
108
    if ( !filtered_peptide_hits.empty() )
 
109
                {
 
110
                filtered_identification.setHits(filtered_peptide_hits);
 
111
                        filtered_identification.assignRanks();                                                                                                                                                                                                  
 
112
                }
 
113
        }
 
114
 
 
115
        void IDFilter::filterIdentificationsByLength(const PeptideIdentification&       identification,
 
116
                                                                                                                                                                                         Size                                                                           min_length,
 
117
                                                                                                                                                                                         PeptideIdentification&                                 filtered_identification)
 
118
        {
 
119
                vector<Size> new_peptide_indices;               
 
120
                vector<PeptideHit> filtered_peptide_hits;
 
121
                
 
122
                filtered_identification = identification;               
 
123
                filtered_identification.setHits(vector<PeptideHit>());
 
124
 
 
125
                const vector<PeptideHit>& temp_peptide_hits = identification.getHits();
 
126
 
 
127
                for (Size i = 0; i < temp_peptide_hits.size(); i++)
 
128
                {
 
129
                if (temp_peptide_hits[i].getSequence().size() >= min_length)
 
130
                {
 
131
                        new_peptide_indices.push_back(i);
 
132
                        }                               
 
133
                }               
 
134
                
 
135
                for (Size i = 0; i < new_peptide_indices.size(); i++)
 
136
                {
 
137
                        filtered_peptide_hits.push_back(identification.getHits()[new_peptide_indices[i]]);
 
138
                }
 
139
    if ( !filtered_peptide_hits.empty() )
 
140
                {
 
141
                filtered_identification.setHits(filtered_peptide_hits);
 
142
                        filtered_identification.assignRanks();                                                                                                                                                                                                  
 
143
                }               
 
144
        }
 
145
 
 
146
        void IDFilter::filterIdentificationsByProteins(const PeptideIdentification& identification, 
 
147
                                                                                                                                                                                                 const vector< FASTAFile::FASTAEntry >& proteins,
 
148
                                                                                                                                                                                                 PeptideIdentification& filtered_identification,
 
149
                                                                                                                                                                                                 bool no_protein_identifiers)
 
150
        {
 
151
    // TODO: this is highly inefficient! the Protein-Index should be build once for all peptide-identifications instead of
 
152
    //       doing this once for every ID. Furthermore the index itself is inefficient (use seqan instead)
 
153
                String protein_sequences;
 
154
                String accession_sequences;
 
155
                vector<PeptideHit> filtered_peptide_hits;
 
156
                PeptideHit temp_peptide_hit;
 
157
                
 
158
                filtered_identification = identification;
 
159
                filtered_identification.setHits(vector<PeptideHit>());
 
160
                
 
161
                for (Size i = 0; i < proteins.size(); i++)
 
162
                {
 
163
                        if (proteins[i].identifier!="")
 
164
                        {
 
165
                                accession_sequences.append("*" + proteins[i].identifier);
 
166
                        }
 
167
                        if (proteins[i].sequence!="")
 
168
                        {
 
169
                                protein_sequences.append("*" + proteins[i].sequence);
 
170
                        }
 
171
                }
 
172
                accession_sequences.append("*");
 
173
                protein_sequences.append("*");
 
174
                
 
175
                for (Size i = 0; i < identification.getHits().size(); i++)
 
176
                {
 
177
                        if (no_protein_identifiers || accession_sequences=="*")
 
178
                        { // filter by sequence alone if no protein accesssions are available
 
179
                        if (protein_sequences.find(identification.getHits()[i].getSequence().toUnmodifiedString()) != String::npos)
 
180
                        {
 
181
                                filtered_peptide_hits.push_back(identification.getHits()[i]);
 
182
                        }
 
183
                        }
 
184
                        else
 
185
                        { // filter by protein accessions
 
186
                                for(vector<String>::const_iterator ac_it = identification.getHits()[i].getProteinAccessions().begin();
 
187
                                                ac_it != identification.getHits()[i].getProteinAccessions().end();
 
188
                                                ++ac_it)
 
189
                                {
 
190
                                if (accession_sequences.find("*" + *ac_it) != String::npos)
 
191
                                {
 
192
                                        filtered_peptide_hits.push_back(identification.getHits()[i]);
 
193
            break; // we found a matching protein, the peptide is valid -> exit
 
194
                                }
 
195
                        }
 
196
                        }
 
197
                }
 
198
 
 
199
    filtered_identification.setHits(filtered_peptide_hits);
 
200
                filtered_identification.assignRanks();                                                                                                                                                  
 
201
        }
 
202
        
 
203
        void IDFilter::filterIdentificationsByProteins(const ProteinIdentification& identification, 
 
204
                                                                                                                                                                                                 const vector< FASTAFile::FASTAEntry >& proteins,
 
205
                                                 ProteinIdentification& filtered_identification)
 
206
        {
 
207
                String protein_sequences;
 
208
                String accession_sequences;
 
209
                vector<ProteinHit> filtered_protein_hits;
 
210
                ProteinHit temp_protein_hit;
 
211
                
 
212
                filtered_identification=identification;
 
213
                filtered_identification.setHits(vector<ProteinHit>());
 
214
                
 
215
                for (Size i = 0; i < proteins.size(); i++)
 
216
                {
 
217
                        accession_sequences.append("*" + proteins[i].identifier);
 
218
                }
 
219
                accession_sequences.append("*");
 
220
                
 
221
                for (Size i = 0; i < identification.getHits().size(); i++)
 
222
                {
 
223
                if (accession_sequences.find("*" + identification.getHits()[i].getAccession()) != String::npos)
 
224
                {
 
225
                        filtered_protein_hits.push_back(identification.getHits()[i]);
 
226
                }
 
227
                }
 
228
 
 
229
    filtered_identification.setHits(filtered_protein_hits);
 
230
                filtered_identification.assignRanks();                                                                                  
 
231
        }
 
232
        
 
233
        void IDFilter::filterIdentificationsByExclusionPeptides(const PeptideIdentification& identification,
 
234
                                                                                                                                                                                                                                        const set<String>&                              peptides,
 
235
                                                                                                                                                                                                                                        PeptideIdentification& filtered_identification)
 
236
        {
 
237
                String protein_sequences;
 
238
                String accession_sequences;
 
239
                vector<PeptideHit> filtered_peptide_hits;
 
240
                PeptideHit temp_peptide_hit;
 
241
                
 
242
                filtered_identification=identification;
 
243
                filtered_identification.setHits(vector<PeptideHit>());
 
244
                
 
245
                for (Size i = 0; i < identification.getHits().size(); i++)
 
246
                {
 
247
                if (find(peptides.begin(), peptides.end(), identification.getHits()[i].getSequence().toString()) == peptides.end())
 
248
                {
 
249
                        filtered_peptide_hits.push_back(identification.getHits()[i]);
 
250
                }
 
251
                }
 
252
    if ( !filtered_peptide_hits.empty() )
 
253
                {
 
254
                filtered_identification.setHits(filtered_peptide_hits);
 
255
                        filtered_identification.assignRanks();
 
256
                }
 
257
        }
 
258
        
 
259
        void IDFilter::filterIdentificationsByRTFirstDimPValues(const PeptideIdentification&    identification,
 
260
                                                                                                                                                                                                                                        PeptideIdentification&                          filtered_identification,
 
261
                                                                                                                                                                                                                                        DoubleReal                                                                              p_value)
 
262
        {
 
263
                DoubleReal border = 1 - p_value;
 
264
                vector< Size > new_peptide_indices;             
 
265
                vector<PeptideHit> filtered_peptide_hits;
 
266
                PeptideHit temp_peptide_hit;
 
267
                
 
268
                filtered_identification=identification;
 
269
                filtered_identification.setHits(vector<PeptideHit>());
 
270
                
 
271
    Size missing_meta_value=0;
 
272
 
 
273
    for ( Size i = 0; i < identification.getHits().size(); ++i )
 
274
    {
 
275
                        if (identification.getHits()[i].metaValueExists("predicted_RT_p_value_first_dim"))
 
276
      {
 
277
                          if ((DoubleReal)(identification.getHits()[i].getMetaValue("predicted_RT_p_value_first_dim")) <= border )
 
278
                          {
 
279
                          filtered_peptide_hits.push_back(identification.getHits()[i]);
 
280
                          }
 
281
      }
 
282
      else ++missing_meta_value;
 
283
                }
 
284
    if (missing_meta_value>0) LOG_WARN << "Filtering identifications by p-value did not work on " << missing_meta_value << " of " << identification.getHits().size() << " hits. Your data is missing a meta-value ('predicted_RT_p_value_first_dim') from RTPredict!\n";
 
285
 
 
286
    if ( !filtered_peptide_hits.empty() )
 
287
                {
 
288
                filtered_identification.setHits(filtered_peptide_hits);         
 
289
                filtered_identification.assignRanks();                                                                                  
 
290
                }
 
291
        }                                                                                                                                                                                                                                                                                                                                                                
 
292
 
 
293
        void IDFilter::filterIdentificationsByRTPValues(const PeptideIdentification&    identification,
 
294
                                                                                                                                                                                                        PeptideIdentification&                          filtered_identification,
 
295
                                                                                                                                                                                                        DoubleReal                                                                              p_value)
 
296
        {
 
297
                DoubleReal border = 1 - p_value;
 
298
                vector< Size > new_peptide_indices;             
 
299
                vector<PeptideHit> filtered_peptide_hits;
 
300
                PeptideHit temp_peptide_hit;
 
301
                
 
302
                filtered_identification=identification;
 
303
                filtered_identification.setHits(vector<PeptideHit>());
 
304
                
 
305
    Size missing_meta_value=0;
 
306
 
 
307
                for (Size i = 0; i < identification.getHits().size(); i++)
 
308
                {
 
309
                        if (identification.getHits()[i].metaValueExists("predicted_RT_p_value"))
 
310
      {
 
311
                          if ((DoubleReal)(identification.getHits()[i].getMetaValue("predicted_RT_p_value")) <= border )
 
312
                          {
 
313
                          filtered_peptide_hits.push_back(identification.getHits()[i]);
 
314
                          }             
 
315
      }
 
316
      else ++missing_meta_value;
 
317
                }
 
318
    if (missing_meta_value>0) LOG_WARN << "Filtering identifications by p-value did not work on " << missing_meta_value << " of " << identification.getHits().size() << " hits. Your data is missing a meta-value ('predicted_RT_p_value') from RTPredict!\n";
 
319
 
 
320
    if ( !filtered_peptide_hits.empty() )
 
321
                {
 
322
                filtered_identification.setHits(filtered_peptide_hits);         
 
323
                filtered_identification.assignRanks();                                                                                  
 
324
                }
 
325
        }
 
326
        
 
327
        void IDFilter::removeUnreferencedProteinHits(const ProteinIdentification&       identification, 
 
328
                                                                                                                                                                                        const vector<PeptideIdentification> peptide_identifications, 
 
329
                                                                                                                                                                                        ProteinIdentification&  filtered_identification)
 
330
        {
 
331
                vector<ProteinHit> filtered_protein_hits;
 
332
                const vector<ProteinHit>& temp_protein_hits = identification.getHits();
 
333
                vector<PeptideHit> temp_peptide_hits;
 
334
 
 
335
                filtered_identification=identification;
 
336
                filtered_identification.setHits(vector<ProteinHit>());
 
337
                String identifier = identification.getIdentifier();
 
338
                
 
339
                Size i = 0;             
 
340
                for (Size j = 0; j < temp_protein_hits.size(); ++j)
 
341
                {
 
342
                        bool found = false;
 
343
                        i = 0;
 
344
                        while(i < peptide_identifications.size() && !found) 
 
345
                        {
 
346
                                if (identifier == peptide_identifications[i].getIdentifier())
 
347
                                {
 
348
                                        temp_peptide_hits.clear();
 
349
                                        peptide_identifications[i].getReferencingHits(temp_protein_hits[j].getAccession(), temp_peptide_hits);
 
350
          if ( !temp_peptide_hits.empty() )
 
351
                                        {
 
352
                                                filtered_protein_hits.push_back(temp_protein_hits[j]);
 
353
                                                found = true;
 
354
                                        }
 
355
                                }
 
356
                                ++i;
 
357
                        }
 
358
                }
 
359
                filtered_identification.setHits(filtered_protein_hits);
 
360
        }                                                                                                                                                                                                                                                                                                                                                                
 
361
                
 
362
} // namespace OpenMS