38
45
using namespace std;
42
InspectOutfile::InspectOutfile() {}
45
InspectOutfile::InspectOutfile(const InspectOutfile&) {}
48
InspectOutfile::~InspectOutfile() {}
50
/// assignment operator
51
InspectOutfile& InspectOutfile::operator=(const InspectOutfile& inspect_outfile)
53
if (this == &inspect_outfile) return *this;
58
bool InspectOutfile::operator==(const InspectOutfile&) const
63
vector<Size> InspectOutfile::load(const String& result_filename, vector<PeptideIdentification>& peptide_identifications,
64
ProteinIdentification& protein_identification, const DoubleReal p_value_threshold, const String& database_filename)
66
// check whether the p_value is correct
67
if ( (p_value_threshold < 0) || (p_value_threshold > 1) )
69
throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "The parameters 'p_value_threshold' must be >= 0 and <=1 !");
72
ifstream result_file(result_filename.c_str());
75
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
91
vector< String > substrings;
92
vector< Size > corrupted_lines;
94
PeptideIdentification peptide_identification;
96
if ( !getline(result_file, line) ) // the header is read in a special function, so it can be skipped
100
throw Exception::FileEmpty(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
102
if ( !line.empty() && (line[line.length()-1] < 33) ) line.resize(line.length()-1);
106
DateTime datetime = DateTime::now();
107
if ( protein_identification.getSearchEngine().empty() ) identifier = "InsPecT_" + datetime.getDate();
108
else protein_identification.getSearchEngine() + "_" + datetime.getDate();
110
// to get the precursor retention time and mz values later, save the filename and the numbers of the scans
111
vector< pair< String, vector< pair< Size, Size > > > > files_and_peptide_identification_with_scan_number;
112
// the record number is mapped to the position in the protein hits, to retrieve their sequences
113
map<Size, Size> rn_position_map;
117
spectrum_file_column(-1),
124
record_number_column(-1),
125
DB_file_pos_column(-1),
126
spec_file_pos_column(-1);
128
String::size_type start(0), end(0);
132
readOutHeader(result_filename, line, spectrum_file_column, scan_column, peptide_column, protein_column, charge_column, MQ_score_column, p_value_column, record_number_column, DB_file_pos_column, spec_file_pos_column, number_of_columns);
134
catch( Exception::ParseError & p_e )
49
InspectOutfile::InspectOutfile()
54
InspectOutfile::InspectOutfile(const InspectOutfile &)
59
InspectOutfile::~InspectOutfile()
63
/// assignment operator
64
InspectOutfile & InspectOutfile::operator=(const InspectOutfile & inspect_outfile)
66
if (this == &inspect_outfile)
73
bool InspectOutfile::operator==(const InspectOutfile &) const
78
vector<Size> InspectOutfile::load(const String & result_filename, vector<PeptideIdentification> & peptide_identifications,
79
ProteinIdentification & protein_identification, const DoubleReal p_value_threshold, const String & database_filename)
81
// check whether the p_value is correct
82
if ((p_value_threshold < 0) || (p_value_threshold > 1))
84
throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "The parameters 'p_value_threshold' must be >= 0 and <=1 !");
87
ifstream result_file(result_filename.c_str());
90
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
104
number_of_columns(0);
106
vector<String> substrings;
107
vector<Size> corrupted_lines;
109
PeptideIdentification peptide_identification;
111
if (!getline(result_file, line)) // the header is read in a special function, so it can be skipped
115
throw Exception::FileEmpty(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
117
if (!line.empty() && (line[line.length() - 1] < 33))
118
line.resize(line.length() - 1);
122
DateTime datetime = DateTime::now();
123
if (protein_identification.getSearchEngine().empty())
124
identifier = "InsPecT_" + datetime.getDate();
126
protein_identification.getSearchEngine() + "_" + datetime.getDate();
128
// to get the precursor retention time and mz values later, save the filename and the numbers of the scans
129
vector<pair<String, vector<pair<Size, Size> > > > files_and_peptide_identification_with_scan_number;
130
// the record number is mapped to the position in the protein hits, to retrieve their sequences
131
map<Size, Size> rn_position_map;
135
spectrum_file_column(-1),
142
record_number_column(-1),
143
DB_file_pos_column(-1),
144
spec_file_pos_column(-1);
146
String::size_type start(0), end(0);
150
readOutHeader(result_filename, line, spectrum_file_column, scan_column, peptide_column, protein_column, charge_column, MQ_score_column, p_value_column, record_number_column, DB_file_pos_column, spec_file_pos_column, number_of_columns);
152
catch (Exception::ParseError & p_e)
138
156
LOG_WARN << "ParseError (" << p_e.getMessage() << ") caught in " << __FILE__ << "\n";
142
while ( getline(result_file, line) )
145
if ( !line.empty() && (line[line.length()-1] < 33) ) line.resize(line.length()-1);
147
if ( line.empty() ) continue;
149
// check whether the line has enough columns
150
line.split('\t', substrings);
151
if ( substrings.size() != number_of_columns )
153
corrupted_lines.push_back(line_number);
157
// if the pvalue is too small, skip the line
158
if ( substrings[p_value_column].toFloat() > p_value_threshold ) continue;
161
ProteinHit protein_hit;
162
// get accession number and type
163
getACAndACType(substrings[protein_column], accession, accession_type);
164
protein_hit.setAccession(accession);
165
// protein_hit.setScore(0.0);
167
// the database position of the protein (the i-th protein)
168
record_number = substrings[record_number_column].toInt();
170
// map the database position of the protein to its position in the protein hits and insert it, if it's a new protein
171
if ( rn_position_map.find(record_number) == rn_position_map.end() )
173
rn_position_map[record_number] = protein_identification.getHits().size();
174
protein_identification.insertHit(protein_hit);
177
// if a new scan is found (new file or new scan), insert it into the vector (the first time the condition is fullfilled because spectrum_file is "")
178
if ( (substrings[spectrum_file_column] != spectrum_file) || ((Size) substrings[scan_column].toInt() != scan_number) )
180
if ( substrings[spectrum_file_column] != spectrum_file ) // if it's a new file, insert it into the vector (used to retrieve RT and MT later)
182
// if it's the first file or if hits have been found in the file before, insert a new file
183
if ( files_and_peptide_identification_with_scan_number.empty() || !files_and_peptide_identification_with_scan_number.back().second.empty() )
185
files_and_peptide_identification_with_scan_number.push_back(make_pair(substrings[spectrum_file_column], vector< pair<Size , Size> >()));
187
// otherwise change the name of the last file entry (the one without hits)
188
else files_and_peptide_identification_with_scan_number.back().first = substrings[spectrum_file_column];
191
spectrum_file = substrings[spectrum_file_column];
192
scan_number = substrings[scan_column].toInt();
194
// if it's not the first scan and if hits have been found, insert the peptide identification
195
if ( !peptide_identification.empty() && !peptide_identification.getHits().empty() )
197
files_and_peptide_identification_with_scan_number.back().second.push_back(make_pair(peptide_identifications.size(), scan_number));
198
peptide_identifications.push_back(peptide_identification);
200
peptide_identification = PeptideIdentification();
202
peptide_identification.setIdentifier(identifier);
203
peptide_identification.setSignificanceThreshold(p_value_threshold);
204
peptide_identification.setScoreType(score_type_);
207
// get the peptide infos from the new peptide and insert it
208
PeptideHit peptide_hit;
209
peptide_hit.setCharge(substrings[charge_column].toInt());
210
peptide_hit.setScore(substrings[MQ_score_column].toFloat());
211
peptide_hit.setRank(0); // all ranks are set to zero and assigned later
213
// get the sequence and the amino acid before and after
214
String sequence, sequence_with_mods;
215
sequence_with_mods = substrings[peptide_column];
216
start = sequence_with_mods.find('.') + 1;
217
end = sequence_with_mods.find_last_of('.');
218
if ( start >= 2 ) peptide_hit.setAABefore(sequence_with_mods[start - 2]);
219
if ( end< sequence_with_mods.length() + 1 ) peptide_hit.setAAAfter(sequence_with_mods[end + 1]);
221
//remove modifications (small characters and anything that's not in the alphabet)
222
sequence_with_mods = substrings[peptide_column].substr(start, end-start);
223
for ( String::ConstIterator c_i = sequence_with_mods.begin(); c_i != sequence_with_mods.end(); ++c_i )
225
if ( (bool) isalpha(*c_i) && (bool) isupper(*c_i) ) sequence.append(1, *c_i);
228
peptide_hit.setSequence(sequence);
229
peptide_hit.addProteinAccession(accession);
231
peptide_identification.insertHit(peptide_hit);
238
// if it's not the first scan and if hits have been found, insert the peptide identification
239
if ( !peptide_identification.empty() && !peptide_identification.getHits().empty() )
241
files_and_peptide_identification_with_scan_number.back().second.push_back(make_pair(peptide_identifications.size(), scan_number));
242
peptide_identifications.push_back(peptide_identification);
245
// if the last file had no hits, delete it
246
if ( !files_and_peptide_identification_with_scan_number.empty() && files_and_peptide_identification_with_scan_number.back().second.empty() )
248
files_and_peptide_identification_with_scan_number.pop_back();
251
if ( !peptide_identifications.empty() ) peptide_identifications.back().assignRanks();
253
// search the sequence of the proteins
254
if ( !protein_identification.getHits().empty() && !database_filename.empty() )
256
vector< ProteinHit > protein_hits = protein_identification.getHits();
257
vector< String > sequences;
258
getSequences(database_filename, rn_position_map, sequences);
260
// set the retrieved sequences
261
vector< String >::const_iterator s_i = sequences.begin();
262
for ( map< Size, Size >::const_iterator rn_i = rn_position_map.begin(); rn_i != rn_position_map.end(); ++rn_i, ++s_i ) protein_hits[rn_i->second].setSequence(*s_i);
265
rn_position_map.clear();
266
protein_identification.setHits(protein_hits);
267
protein_hits.clear();
270
// get the precursor retention times and mz values
271
getPrecursorRTandMZ(files_and_peptide_identification_with_scan_number, peptide_identifications);
272
protein_identification.setDateTime(datetime);
273
protein_identification.setIdentifier(identifier);
275
return corrupted_lines;
278
// < record number, number of protein in a vector >
280
InspectOutfile::getSequences(
281
const String& database_filename,
282
const map< Size, Size >& wanted_records,
283
vector< String >& sequences)
285
ifstream database(database_filename.c_str());
288
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);
291
vector< Size > not_found;
292
Size seen_records(0);
294
database.seekg(0, ios::end);
295
streampos sp = database.tellg();
296
database.seekg(0, ios::beg);
298
for ( map< Size, Size >::const_iterator wr_i = wanted_records.begin(); wr_i != wanted_records.end(); ++wr_i )
300
for ( ; seen_records < wr_i->first; ++seen_records )
302
database.ignore(sp, trie_delimiter_);
304
database.get(sequence, trie_delimiter_);
305
sequences.push_back(sequence.str());
306
if ( sequences.back().empty() ) not_found.push_back(wr_i->first);
310
// close the filestreams
318
InspectOutfile::getACAndACType(
321
String& accession_type)
323
String swissprot_prefixes = "JLOPQUX";
324
/// @todo replace this by general FastA implementation? (Martin)
326
accession_type.clear();
327
pair< String, String > p;
328
// if it's a FASTA line
329
if ( line.hasPrefix(">") ) line.erase(0,1);
330
if ( !line.empty() && (line[line.length()-1] < 33) ) line.resize(line.length()-1);
333
// if it's a swissprot accession
334
if ( line.hasPrefix("tr") || line.hasPrefix("sp") )
336
accession = line.substr(3, line.find('|', 3)-3);
337
accession_type = "SwissProt";
339
else if ( line.hasPrefix("gi") )
341
String::size_type snd(line.find('|', 3));
342
String::size_type third(0);
343
if ( snd != String::npos )
345
third = line.find('|', ++snd) + 1;
347
accession = line.substr(third, line.find('|', third)-third);
348
accession_type = line.substr(snd, third-1-snd);
350
if ( accession_type == "gb" ) accession_type = "GenBank";
351
else if ( accession_type == "emb" ) accession_type = "EMBL";
352
else if ( accession_type == "dbj" ) accession_type = "DDBJ";
353
else if ( accession_type == "ref" ) accession_type = "NCBI";
354
else if ( (accession_type == "sp") || (accession_type == "tr") ) accession_type = "SwissProt";
355
else if ( accession_type == "gnl" )
357
accession_type = accession;
358
snd = line.find('|', third);
359
third = line.find('|', ++snd);
360
if ( third != String::npos ) accession = line.substr(snd, third-snd);
363
third = line.find(' ', snd);
364
if ( third != String::npos ) accession = line.substr(snd, third-snd);
365
else accession = line.substr(snd);
370
String::size_type pos1(line.find('(', 0));
371
String::size_type pos2(0);
372
if ( pos1 != String::npos )
374
pos2 = line.find(')', ++pos1);
375
if ( pos2 != String::npos )
377
accession = line.substr(pos1, pos2 - pos1);
378
if ( (accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos) ) accession_type = "SwissProt";
379
else accession.clear();
382
if ( accession.empty() )
384
accession_type = "gi";
385
if ( snd != String::npos ) accession = line.substr(3, snd-4);
388
if ( snd == String::npos ) snd = line.find(' ', 3);
389
if ( snd != String::npos ) accession = line.substr(3, snd-3);
390
else accession = line.substr(3);
395
else if ( line.hasPrefix("ref") )
397
accession = line.substr(4, line.find('|', 4) - 4);
398
accession_type = "NCBI";
400
else if ( line.hasPrefix("gnl") )
403
accession_type = line.substr(0, line.find('|', 0));
404
accession = line.substr(accession_type.length()+1);
406
else if ( line.hasPrefix("lcl") )
409
accession_type = "lcl";
414
String::size_type pos1(line.find('(', 0));
415
String::size_type pos2(0);
416
if ( pos1 != String::npos )
418
pos2 = line.find(')', ++pos1);
419
if ( pos2 != String::npos )
421
accession = line.substr(pos1, pos2 - pos1);
422
if ( (accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos) ) accession_type = "SwissProt";
423
else accession.clear();
426
if ( accession.empty() )
428
pos1 = line.find('|');
429
accession = line.substr(0, pos1);
430
if ( (accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos) ) accession_type = "SwissProt";
433
pos1 = line.find(' ');
434
accession = line.substr(0, pos1);
435
if ( (accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos) ) accession_type = "SwissProt";
438
accession = line.substr(0, 6);
439
if ( String(swissprot_prefixes).find(accession[0], 0) != String::npos ) accession_type = "SwissProt";
440
else accession.clear();
445
if ( accession.empty() )
447
accession = line.trim();
448
accession_type = "unknown";
453
InspectOutfile::getPrecursorRTandMZ(
454
const vector< pair< String, vector< pair < Size, Size > > > >& files_and_peptide_identification_with_scan_number,
455
vector< PeptideIdentification >& ids)
457
MSExperiment<> experiment;
460
for ( vector< pair< String, vector< pair< Size, Size > > > >::const_iterator fs_i = files_and_peptide_identification_with_scan_number.begin(); fs_i != files_and_peptide_identification_with_scan_number.end(); ++fs_i )
462
getExperiment(experiment, type, fs_i->first); // may throw an exception if the filetype could not be determined
464
if ( experiment.size() < fs_i->second.back().second )
466
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Not enought scans in file! (" + String(experiment.size()) + " available, should be at least " + String(fs_i->second.back().second) + ")", fs_i->first);
469
for ( vector< pair< Size, Size > >::const_iterator pi_scan_i = fs_i->second.begin(); pi_scan_i != fs_i->second.end(); ++pi_scan_i )
471
ids[pi_scan_i->first].setMetaValue("MZ", experiment[pi_scan_i->second - 1].getPrecursors()[0].getMZ());
472
ids[pi_scan_i->first].setMetaValue("RT", experiment[pi_scan_i->second - 1].getRT());
478
InspectOutfile::compressTrieDB(
479
const String& database_filename,
480
const String& index_filename,
481
vector< Size >& wanted_records,
482
const String& snd_database_filename,
483
const String& snd_index_filename,
486
if ( database_filename == snd_database_filename )
488
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Same filename can not be used for original and second database!", database_filename);
490
if ( index_filename == snd_index_filename )
492
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Same filename can not be used for original and second database!", index_filename);
494
ifstream database( database_filename.c_str());
497
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);
500
ifstream index(index_filename.c_str(), ios::in | ios::binary);
505
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, index_filename);
508
// determine the length of the index file
509
index.seekg(0, ios::end);
510
streampos index_length = index.tellg();
511
index.seekg(0, ios::beg);
512
bool empty_records = wanted_records.empty();
513
if ( wanted_records.empty() )
515
for ( Size i = 0; i < index_length / record_length_; ++i ) wanted_records.push_back(i);
518
// take the wanted records, copy their sequences to the new db and write the index file accordingly
519
ofstream snd_database;
520
if ( append ) snd_database.open(snd_database_filename.c_str(), std::ios::out | std::ios::app);
521
else snd_database.open(snd_database_filename.c_str(), std::ios::out | std::ios::trunc);
528
throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, snd_database_filename);
532
if ( append ) snd_index.open(snd_index_filename.c_str(), std::ios::out | std::ios::binary | std::ios::app);
533
else snd_index.open(snd_index_filename.c_str(), std::ios::out | std::ios::binary | std::ios::trunc);
540
snd_database.close();
541
snd_database.clear();
542
throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, snd_index_filename);
545
char* index_record = new char[record_length_]; // to copy one record from the index file
546
Size database_pos(0), snd_database_pos(0); // their sizes HAVE TO BE 4 bytes
548
streampos index_pos(0);
550
for ( vector< Size >::const_iterator wr_i = wanted_records.begin(); wr_i != wanted_records.end(); ++wr_i )
552
// get the according record in the index file
553
if ( index_length < Int((*wr_i + 1) * record_length_) ) // if the file is too short
555
delete [] index_record;
560
snd_database.close();
561
snd_database.clear();
564
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "index file is too short!", index_filename);
566
index.seekg((*wr_i) * record_length_);
567
index.read(index_record, record_length_);
569
// all but the first sequence are preceded by an asterisk
570
if ( append ) snd_database.put(trie_delimiter_);
573
// check if we have to reverse the database_pos part (which is saved in little endian)
574
if (OPENMS_IS_BIG_ENDIAN)
577
for (Size i = 0; i < trie_db_pos_length_ / 2; i++)
579
tmp = index_record[db_pos_length_ + i];
580
index_record[db_pos_length_ + i] = index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i];
581
index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;
585
// go to the beginning of the sequence
587
// whoever wrote this code - please don't ever do this again.
588
// x86 does *not* have a monopoly, nor does little endian.
589
memcpy(&database_pos, index_record + db_pos_length_, trie_db_pos_length_);
590
database.seekg(database_pos);
592
// store the corresponding index for the second database
593
snd_database_pos = snd_database.tellp(); // get the position in the second database
595
memcpy(index_record + db_pos_length_, &snd_database_pos, trie_db_pos_length_); // and copy to its place in the index record
597
// fixing the above "suboptimal" code
598
if (OPENMS_IS_BIG_ENDIAN)
601
for (Size i = 0; i < trie_db_pos_length_ / 2; i++)
603
tmp = index_record[db_pos_length_ + i];
604
index_record[db_pos_length_ + i] = index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i];
605
index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;
609
snd_index.write((char*) index_record, record_length_); // because only the trie-db position changed, not the position in the original database, nor the protein name
611
// store the sequence
612
database.get(sequence, trie_delimiter_);
613
snd_database << sequence.str();
618
if ( empty_records ) wanted_records.clear();
619
delete [] index_record;
624
snd_database.close();
625
snd_database.clear();
631
InspectOutfile::generateTrieDB(
632
const String& source_database_filename,
633
const String& database_filename,
634
const String& index_filename,
636
const String species)
638
ifstream source_database(source_database_filename.c_str());
639
if ( !source_database )
641
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, source_database_filename);
645
String ac_label, sequence_start_label, sequence_end_label, comment_label, species_label;
646
getLabels(source_database_filename, ac_label, sequence_start_label, sequence_end_label, comment_label, species_label);
649
if ( append ) database.open(database_filename.c_str(), ios::app | ios::out );
650
else database.open(database_filename.c_str());
653
source_database.close();
654
source_database.clear();
655
throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);
658
if ( append ) index.open(index_filename.c_str(), ios::app | ios::out | ios::binary );
659
else index.open(index_filename.c_str(), ios::out | ios::binary );
662
source_database.close();
663
source_database.clear();
666
throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, index_filename);
669
// using flags to mark what has already been read
671
unsigned char ac_flag = 1;
672
unsigned char species_flag = !species.empty()*2; // if no species is given, take all proteins
673
unsigned char sequence_flag = 4;
675
unsigned char record_flags = 0;
677
String::size_type pos(0); // the position in a line
678
unsigned long long source_database_pos = source_database.tellg(); // the start of a protein in the source database
679
unsigned long long source_database_pos_buffer = 0; // because you don't know whether a new protein starts unless the line is read, the actual position is buffered before any new getline
680
Size database_pos(0);
681
String line, sequence, protein_name;
682
char* record = new char[record_length_]; // a record in the index file
683
char* protein_name_pos = record + db_pos_length_ + trie_db_pos_length_;
685
while ( getline(source_database, line) )
687
if ( !line.empty() && (line[line.length()-1] < 33) ) line.resize(line.length()-1);
690
// empty and comment lines are skipped
691
if ( line.empty() || line.hasPrefix(comment_label) )
693
source_database_pos_buffer = source_database.tellg();
697
// read the sequence if the accession and the species have been read already
698
if ( record_flags == (ac_flag | species_flag | sequence_flag) )
700
if ( !line.hasPrefix(sequence_end_label) ) // if it is still the same protein, append the sequence
702
line.trim(); // erase all whitespaces from the sequence
703
line.remove(trie_delimiter_);
704
// save this part of the sequence
705
sequence.append(line);
707
else // if a new protein is found, write down the old one
709
// if the sequence is not empty, the record has the correct form
710
if ( !sequence.empty() )
712
// all but the first record in the database are preceded by an asterisk (if in append mode an asterisk has to be put at any time)
713
if ( append ) database.put('*');
714
database_pos = database.tellp();
717
memcpy(record, &source_database_pos, db_pos_length_); // source database position
718
if (OPENMS_IS_BIG_ENDIAN)
721
for (Size i = 0; i < db_pos_length_ / 2; i++)
724
record[i] = record[db_pos_length_ - 1 - i];
725
record[db_pos_length_ - 1 - i] = tmp;
729
// whoever wrote this code - please don't ever do this again.
730
// x86 does *not* have a monopoly, nor does little endian.
731
memcpy(record + db_pos_length_, &database_pos, trie_db_pos_length_); // database position
733
// fix the above "suboptimal" code
734
if (OPENMS_IS_BIG_ENDIAN)
737
for (Size i = 0; i < trie_db_pos_length_ / 2; i++)
739
tmp = record[db_pos_length_ + i];
740
record[db_pos_length_ + i] = record[db_pos_length_ + trie_db_pos_length_ - 1 - i];
741
record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;
745
index.write(record, record_length_);
746
// protein name / accession has already been written
747
database << sequence;
748
source_database_pos = source_database_pos_buffer; // the position of the start of the new protein
753
// set back the record flags for a new record
758
// if not reading the sequence
759
if ( !(record_flags & sequence_flag) )
761
if ( line.hasPrefix(ac_label) )
763
pos = ac_label.length(); // find the beginning of the accession
765
while ( (line.length() > pos) && (line[pos] < 33) ) ++pos; // discard the whitespaces after the label
766
if ( pos != line.length() ) // if no accession is found, skip this protein
768
memset(protein_name_pos, 0, protein_name_length_); // clear the protein name
769
// read at most protein_name_length_ characters from the record name and write them to the record
770
protein_name = line.substr(pos, protein_name_length_);
771
protein_name.substitute('>', '}');
772
memcpy(protein_name_pos, protein_name.c_str(), protein_name.length());
774
record_flags |= ac_flag; // set the ac flag
776
else record_flags = 0;
778
// if a species line is found and an accession has already been found, check whether this record is from the wanted species, if not, skip it
779
if ( species_flag && line.hasPrefix(species_label) && (record_flags == ac_flag) )
781
pos = species_label.length();
782
if ( line.find(species, pos) != String::npos ) record_flags |= species_flag;
783
else record_flags = 0;
785
// if the beginning of the sequence is found and accession and correct species have been found
786
if ( line.hasPrefix(sequence_start_label) && ((record_flags & (ac_flag | species_flag)) == (ac_flag | species_flag)) ) record_flags |= sequence_flag;
788
source_database_pos_buffer = source_database.tellg();
791
source_database.close();
792
source_database.clear();
794
// if the last record has no sequence end label, the sequence has to be appended nevertheless (e.g. FASTA)
795
if ( record_flags == (ac_flag | species_flag | sequence_flag) && !sequence.empty() )
797
// all but the first record in the database are preceded by an asterisk (if in append mode an asterisk has to be put at any time)
798
if ( append ) database.put('*');
799
database_pos = database.tellp();
802
// whoever wrote this code - please don't ever do this again.
803
// x86 does *not* have a monopoly, nor does little endian.
804
memcpy(record, &source_database_pos, db_pos_length_); // source database position
805
if (OPENMS_IS_BIG_ENDIAN)
808
for (Size i = 0; i < db_pos_length_ / 2; i++)
811
record[i] = record[db_pos_length_ - 1 - i];
812
record[db_pos_length_ - 1 - i] = tmp;
816
memcpy(record + db_pos_length_, &database_pos, trie_db_pos_length_); // database position
818
// fix the above "suboptimal" code
819
if (OPENMS_IS_BIG_ENDIAN)
822
for (Size i = 0; i < trie_db_pos_length_ / 2; i++)
824
tmp = record[db_pos_length_ + i];
825
record[db_pos_length_ + i] = record[db_pos_length_ + trie_db_pos_length_ - 1 - i];
826
record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;
830
index.write(record, record_length_);
831
// protein name / accession has already been written
832
database << sequence;
838
// close the filestreams
845
void InspectOutfile::getLabels(
846
const String& source_database_filename,
848
String& sequence_start_label,
849
String& sequence_end_label,
850
String& comment_label,
851
String& species_label)
853
ac_label = sequence_start_label = sequence_end_label = comment_label = species_label = "";
854
ifstream source_database(source_database_filename.c_str());
855
if ( !source_database )
857
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, source_database_filename);
861
while ( getline(source_database, line) && (sequence_start_label.empty()) )
863
if ( !line.empty() && (line[line.length()-1] < 33) ) line.resize(line.length()-1);
864
if ( line.trim().empty() ) continue;
866
else if ( line.hasPrefix(">") )
869
sequence_start_label = ">";
870
sequence_end_label = ">";
874
else if ( line.hasPrefix("SQ") )
877
sequence_start_label = "SQ";
878
sequence_end_label = "//";
879
comment_label = "CC";
880
species_label = "OS";
883
source_database.close();
884
source_database.clear();
886
// if no known start separator is found
887
if (sequence_start_label.empty())
889
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "database has unknown file format (neither trie nor FASTA nor swissprot)" , source_database_filename);
893
vector<Size> InspectOutfile::getWantedRecords(const String& result_filename, DoubleReal p_value_threshold)
895
// check whether the p_value is correct
896
if ( (p_value_threshold < 0) || (p_value_threshold > 1) )
898
throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the parameters 'p_value_threshold' must be >= 0 and <=1 !");
901
ifstream result_file(result_filename.c_str());
904
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
908
vector< String > substrings;
910
set< Size > wanted_records_set;
920
spectrum_file_column(-1),
927
record_number_column(-1),
928
DB_file_pos_column(-1),
929
spec_file_pos_column(-1);
931
Size number_of_columns(0);
933
if ( !getline(result_file, line) )
937
throw Exception::FileEmpty(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
940
readOutHeader(result_filename, line, spectrum_file_column, scan_column, peptide_column, protein_column, charge_column, MQ_score_column, p_value_column, record_number_column, DB_file_pos_column, spec_file_pos_column, number_of_columns);
942
while (getline(result_file, line))
945
if (!line.empty() && (line[line.length()-1] < 33)) line.resize(line.length() - 1);
947
if ( line.empty() ) continue;
948
line.split('\t', substrings);
950
// check whether the line has enough columns
951
if ( substrings.size() != number_of_columns )
953
corrupted_lines.push_back(line_number);
957
// check whether the line has enough columns
958
if (substrings.size() != number_of_columns) continue;
960
// take only those peptides whose p-value is less or equal the given threshold
961
if (substrings[p_value_column].toFloat() > p_value_threshold) continue;
963
wanted_records_set.insert(substrings[record_number_column].toInt());
969
for ( set< Size >::const_iterator rn_i = wanted_records_set.begin(); rn_i != wanted_records_set.end(); ++rn_i )
971
wanted_records.push_back(*rn_i);
974
return wanted_records;
978
InspectOutfile::getSearchEngineAndVersion(
979
const String& cmd_output,
980
ProteinIdentification& protein_identification)
982
protein_identification.setSearchEngine("InsPecT");
983
protein_identification.setSearchEngineVersion("unknown");
984
// searching for something like this: InsPecT version 20060907, InsPecT version 20100331
985
QString response(cmd_output.toQString());
160
while (getline(result_file, line))
163
if (!line.empty() && (line[line.length() - 1] < 33))
164
line.resize(line.length() - 1);
169
// check whether the line has enough columns
170
line.split('\t', substrings);
171
if (substrings.size() != number_of_columns)
173
corrupted_lines.push_back(line_number);
177
// if the pvalue is too small, skip the line
178
if (substrings[p_value_column].toFloat() > p_value_threshold)
182
ProteinHit protein_hit;
183
// get accession number and type
184
getACAndACType(substrings[protein_column], accession, accession_type);
185
protein_hit.setAccession(accession);
186
// protein_hit.setScore(0.0);
188
// the database position of the protein (the i-th protein)
189
record_number = substrings[record_number_column].toInt();
191
// map the database position of the protein to its position in the protein hits and insert it, if it's a new protein
192
if (rn_position_map.find(record_number) == rn_position_map.end())
194
rn_position_map[record_number] = protein_identification.getHits().size();
195
protein_identification.insertHit(protein_hit);
198
// if a new scan is found (new file or new scan), insert it into the vector (the first time the condition is fullfilled because spectrum_file is "")
199
if ((substrings[spectrum_file_column] != spectrum_file) || ((Size) substrings[scan_column].toInt() != scan_number))
201
if (substrings[spectrum_file_column] != spectrum_file) // if it's a new file, insert it into the vector (used to retrieve RT and MT later)
203
// if it's the first file or if hits have been found in the file before, insert a new file
204
if (files_and_peptide_identification_with_scan_number.empty() || !files_and_peptide_identification_with_scan_number.back().second.empty())
206
files_and_peptide_identification_with_scan_number.push_back(make_pair(substrings[spectrum_file_column], vector<pair<Size, Size> >()));
208
// otherwise change the name of the last file entry (the one without hits)
210
files_and_peptide_identification_with_scan_number.back().first = substrings[spectrum_file_column];
213
spectrum_file = substrings[spectrum_file_column];
214
scan_number = substrings[scan_column].toInt();
216
// if it's not the first scan and if hits have been found, insert the peptide identification
217
if (!peptide_identification.empty() && !peptide_identification.getHits().empty())
219
files_and_peptide_identification_with_scan_number.back().second.push_back(make_pair(peptide_identifications.size(), scan_number));
220
peptide_identifications.push_back(peptide_identification);
222
peptide_identification = PeptideIdentification();
224
peptide_identification.setIdentifier(identifier);
225
peptide_identification.setSignificanceThreshold(p_value_threshold);
226
peptide_identification.setScoreType(score_type_);
229
// get the peptide infos from the new peptide and insert it
230
PeptideHit peptide_hit;
231
peptide_hit.setCharge(substrings[charge_column].toInt());
232
peptide_hit.setScore(substrings[MQ_score_column].toFloat());
233
peptide_hit.setRank(0); // all ranks are set to zero and assigned later
235
// get the sequence and the amino acid before and after
236
String sequence, sequence_with_mods;
237
sequence_with_mods = substrings[peptide_column];
238
start = sequence_with_mods.find('.') + 1;
239
end = sequence_with_mods.find_last_of('.');
241
peptide_hit.setAABefore(sequence_with_mods[start - 2]);
242
if (end < sequence_with_mods.length() + 1)
243
peptide_hit.setAAAfter(sequence_with_mods[end + 1]);
245
//remove modifications (small characters and anything that's not in the alphabet)
246
sequence_with_mods = substrings[peptide_column].substr(start, end - start);
247
for (String::ConstIterator c_i = sequence_with_mods.begin(); c_i != sequence_with_mods.end(); ++c_i)
249
if ((bool) isalpha(*c_i) && (bool) isupper(*c_i))
250
sequence.append(1, *c_i);
253
peptide_hit.setSequence(sequence);
254
peptide_hit.addProteinAccession(accession);
256
peptide_identification.insertHit(peptide_hit);
263
// if it's not the first scan and if hits have been found, insert the peptide identification
264
if (!peptide_identification.empty() && !peptide_identification.getHits().empty())
266
files_and_peptide_identification_with_scan_number.back().second.push_back(make_pair(peptide_identifications.size(), scan_number));
267
peptide_identifications.push_back(peptide_identification);
270
// if the last file had no hits, delete it
271
if (!files_and_peptide_identification_with_scan_number.empty() && files_and_peptide_identification_with_scan_number.back().second.empty())
273
files_and_peptide_identification_with_scan_number.pop_back();
276
if (!peptide_identifications.empty())
277
peptide_identifications.back().assignRanks();
279
// search the sequence of the proteins
280
if (!protein_identification.getHits().empty() && !database_filename.empty())
282
vector<ProteinHit> protein_hits = protein_identification.getHits();
283
vector<String> sequences;
284
getSequences(database_filename, rn_position_map, sequences);
286
// set the retrieved sequences
287
vector<String>::const_iterator s_i = sequences.begin();
288
for (map<Size, Size>::const_iterator rn_i = rn_position_map.begin(); rn_i != rn_position_map.end(); ++rn_i, ++s_i)
289
protein_hits[rn_i->second].setSequence(*s_i);
292
rn_position_map.clear();
293
protein_identification.setHits(protein_hits);
294
protein_hits.clear();
297
// get the precursor retention times and mz values
298
getPrecursorRTandMZ(files_and_peptide_identification_with_scan_number, peptide_identifications);
299
protein_identification.setDateTime(datetime);
300
protein_identification.setIdentifier(identifier);
302
return corrupted_lines;
305
// < record number, number of protein in a vector >
307
InspectOutfile::getSequences(
308
const String & database_filename,
309
const map<Size, Size> & wanted_records,
310
vector<String> & sequences)
312
ifstream database(database_filename.c_str());
315
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);
318
vector<Size> not_found;
319
Size seen_records(0);
321
database.seekg(0, ios::end);
322
streampos sp = database.tellg();
323
database.seekg(0, ios::beg);
325
for (map<Size, Size>::const_iterator wr_i = wanted_records.begin(); wr_i != wanted_records.end(); ++wr_i)
327
for (; seen_records < wr_i->first; ++seen_records)
329
database.ignore(sp, trie_delimiter_);
331
database.get(sequence, trie_delimiter_);
332
sequences.push_back(sequence.str());
333
if (sequences.back().empty())
334
not_found.push_back(wr_i->first);
338
// close the filestreams
346
InspectOutfile::getACAndACType(
349
String & accession_type)
351
String swissprot_prefixes = "JLOPQUX";
352
/// @todo replace this by general FastA implementation? (Martin)
354
accession_type.clear();
355
pair<String, String> p;
356
// if it's a FASTA line
357
if (line.hasPrefix(">"))
359
if (!line.empty() && (line[line.length() - 1] < 33))
360
line.resize(line.length() - 1);
363
// if it's a swissprot accession
364
if (line.hasPrefix("tr") || line.hasPrefix("sp"))
366
accession = line.substr(3, line.find('|', 3) - 3);
367
accession_type = "SwissProt";
369
else if (line.hasPrefix("gi"))
371
String::size_type snd(line.find('|', 3));
372
String::size_type third(0);
373
if (snd != String::npos)
375
third = line.find('|', ++snd) + 1;
377
accession = line.substr(third, line.find('|', third) - third);
378
accession_type = line.substr(snd, third - 1 - snd);
380
if (accession_type == "gb")
381
accession_type = "GenBank";
382
else if (accession_type == "emb")
383
accession_type = "EMBL";
384
else if (accession_type == "dbj")
385
accession_type = "DDBJ";
386
else if (accession_type == "ref")
387
accession_type = "NCBI";
388
else if ((accession_type == "sp") || (accession_type == "tr"))
389
accession_type = "SwissProt";
390
else if (accession_type == "gnl")
392
accession_type = accession;
393
snd = line.find('|', third);
394
third = line.find('|', ++snd);
395
if (third != String::npos)
396
accession = line.substr(snd, third - snd);
399
third = line.find(' ', snd);
400
if (third != String::npos)
401
accession = line.substr(snd, third - snd);
403
accession = line.substr(snd);
408
String::size_type pos1(line.find('(', 0));
409
String::size_type pos2(0);
410
if (pos1 != String::npos)
412
pos2 = line.find(')', ++pos1);
413
if (pos2 != String::npos)
415
accession = line.substr(pos1, pos2 - pos1);
416
if ((accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos))
417
accession_type = "SwissProt";
422
if (accession.empty())
424
accession_type = "gi";
425
if (snd != String::npos)
426
accession = line.substr(3, snd - 4);
429
if (snd == String::npos)
430
snd = line.find(' ', 3);
431
if (snd != String::npos)
432
accession = line.substr(3, snd - 3);
434
accession = line.substr(3);
439
else if (line.hasPrefix("ref"))
441
accession = line.substr(4, line.find('|', 4) - 4);
442
accession_type = "NCBI";
444
else if (line.hasPrefix("gnl"))
447
accession_type = line.substr(0, line.find('|', 0));
448
accession = line.substr(accession_type.length() + 1);
450
else if (line.hasPrefix("lcl"))
453
accession_type = "lcl";
458
String::size_type pos1(line.find('(', 0));
459
String::size_type pos2(0);
460
if (pos1 != String::npos)
462
pos2 = line.find(')', ++pos1);
463
if (pos2 != String::npos)
465
accession = line.substr(pos1, pos2 - pos1);
466
if ((accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos))
467
accession_type = "SwissProt";
472
if (accession.empty())
474
pos1 = line.find('|');
475
accession = line.substr(0, pos1);
476
if ((accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos))
477
accession_type = "SwissProt";
480
pos1 = line.find(' ');
481
accession = line.substr(0, pos1);
482
if ((accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos))
483
accession_type = "SwissProt";
486
accession = line.substr(0, 6);
487
if (String(swissprot_prefixes).find(accession[0], 0) != String::npos)
488
accession_type = "SwissProt";
495
if (accession.empty())
497
accession = line.trim();
498
accession_type = "unknown";
503
InspectOutfile::getPrecursorRTandMZ(
504
const vector<pair<String, vector<pair<Size, Size> > > > & files_and_peptide_identification_with_scan_number,
505
vector<PeptideIdentification> & ids)
507
MSExperiment<> experiment;
510
for (vector<pair<String, vector<pair<Size, Size> > > >::const_iterator fs_i = files_and_peptide_identification_with_scan_number.begin(); fs_i != files_and_peptide_identification_with_scan_number.end(); ++fs_i)
512
getExperiment(experiment, type, fs_i->first); // may throw an exception if the filetype could not be determined
514
if (experiment.size() < fs_i->second.back().second)
516
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Not enought scans in file! (" + String(experiment.size()) + " available, should be at least " + String(fs_i->second.back().second) + ")", fs_i->first);
519
for (vector<pair<Size, Size> >::const_iterator pi_scan_i = fs_i->second.begin(); pi_scan_i != fs_i->second.end(); ++pi_scan_i)
521
ids[pi_scan_i->first].setMetaValue("MZ", experiment[pi_scan_i->second - 1].getPrecursors()[0].getMZ());
522
ids[pi_scan_i->first].setMetaValue("RT", experiment[pi_scan_i->second - 1].getRT());
528
InspectOutfile::compressTrieDB(
529
const String & database_filename,
530
const String & index_filename,
531
vector<Size> & wanted_records,
532
const String & snd_database_filename,
533
const String & snd_index_filename,
536
if (database_filename == snd_database_filename)
538
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Same filename can not be used for original and second database!", database_filename);
540
if (index_filename == snd_index_filename)
542
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Same filename can not be used for original and second database!", index_filename);
544
ifstream database(database_filename.c_str());
547
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);
550
ifstream index(index_filename.c_str(), ios::in | ios::binary);
555
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, index_filename);
558
// determine the length of the index file
559
index.seekg(0, ios::end);
560
streampos index_length = index.tellg();
561
index.seekg(0, ios::beg);
562
bool empty_records = wanted_records.empty();
563
if (wanted_records.empty())
565
for (Size i = 0; i < index_length / record_length_; ++i)
566
wanted_records.push_back(i);
569
// take the wanted records, copy their sequences to the new db and write the index file accordingly
570
ofstream snd_database;
572
snd_database.open(snd_database_filename.c_str(), std::ios::out | std::ios::app);
574
snd_database.open(snd_database_filename.c_str(), std::ios::out | std::ios::trunc);
581
throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, snd_database_filename);
586
snd_index.open(snd_index_filename.c_str(), std::ios::out | std::ios::binary | std::ios::app);
588
snd_index.open(snd_index_filename.c_str(), std::ios::out | std::ios::binary | std::ios::trunc);
595
snd_database.close();
596
snd_database.clear();
597
throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, snd_index_filename);
600
char * index_record = new char[record_length_]; // to copy one record from the index file
601
Size database_pos(0), snd_database_pos(0); // their sizes HAVE TO BE 4 bytes
603
streampos index_pos(0);
605
for (vector<Size>::const_iterator wr_i = wanted_records.begin(); wr_i != wanted_records.end(); ++wr_i)
607
// get the according record in the index file
608
if (index_length < Int((*wr_i + 1) * record_length_)) // if the file is too short
610
delete[] index_record;
615
snd_database.close();
616
snd_database.clear();
619
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "index file is too short!", index_filename);
621
index.seekg((*wr_i) * record_length_);
622
index.read(index_record, record_length_);
624
// all but the first sequence are preceded by an asterisk
626
snd_database.put(trie_delimiter_);
629
// check if we have to reverse the database_pos part (which is saved in little endian)
630
if (OPENMS_IS_BIG_ENDIAN)
633
for (Size i = 0; i < trie_db_pos_length_ / 2; i++)
635
tmp = index_record[db_pos_length_ + i];
636
index_record[db_pos_length_ + i] = index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i];
637
index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;
641
// go to the beginning of the sequence
643
// whoever wrote this code - please don't ever do this again.
644
// x86 does *not* have a monopoly, nor does little endian.
645
memcpy(&database_pos, index_record + db_pos_length_, trie_db_pos_length_);
646
database.seekg(database_pos);
648
// store the corresponding index for the second database
649
snd_database_pos = snd_database.tellp(); // get the position in the second database
651
memcpy(index_record + db_pos_length_, &snd_database_pos, trie_db_pos_length_); // and copy to its place in the index record
653
// fixing the above "suboptimal" code
654
if (OPENMS_IS_BIG_ENDIAN)
657
for (Size i = 0; i < trie_db_pos_length_ / 2; i++)
659
tmp = index_record[db_pos_length_ + i];
660
index_record[db_pos_length_ + i] = index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i];
661
index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;
665
snd_index.write((char *) index_record, record_length_); // because only the trie-db position changed, not the position in the original database, nor the protein name
667
// store the sequence
668
database.get(sequence, trie_delimiter_);
669
snd_database << sequence.str();
675
wanted_records.clear();
676
delete[] index_record;
681
snd_database.close();
682
snd_database.clear();
688
InspectOutfile::generateTrieDB(
689
const String & source_database_filename,
690
const String & database_filename,
691
const String & index_filename,
693
const String species)
695
ifstream source_database(source_database_filename.c_str());
696
if (!source_database)
698
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, source_database_filename);
702
String ac_label, sequence_start_label, sequence_end_label, comment_label, species_label;
703
getLabels(source_database_filename, ac_label, sequence_start_label, sequence_end_label, comment_label, species_label);
707
database.open(database_filename.c_str(), ios::app | ios::out);
709
database.open(database_filename.c_str());
712
source_database.close();
713
source_database.clear();
714
throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);
718
index.open(index_filename.c_str(), ios::app | ios::out | ios::binary);
720
index.open(index_filename.c_str(), ios::out | ios::binary);
723
source_database.close();
724
source_database.clear();
727
throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, index_filename);
730
// using flags to mark what has already been read
732
unsigned char ac_flag = 1;
733
unsigned char species_flag = !species.empty() * 2; // if no species is given, take all proteins
734
unsigned char sequence_flag = 4;
736
unsigned char record_flags = 0;
738
String::size_type pos(0); // the position in a line
739
unsigned long long source_database_pos = source_database.tellg(); // the start of a protein in the source database
740
unsigned long long source_database_pos_buffer = 0; // because you don't know whether a new protein starts unless the line is read, the actual position is buffered before any new getline
741
Size database_pos(0);
742
String line, sequence, protein_name;
743
char * record = new char[record_length_]; // a record in the index file
744
char * protein_name_pos = record + db_pos_length_ + trie_db_pos_length_;
746
while (getline(source_database, line))
748
if (!line.empty() && (line[line.length() - 1] < 33))
749
line.resize(line.length() - 1);
752
// empty and comment lines are skipped
753
if (line.empty() || line.hasPrefix(comment_label))
755
source_database_pos_buffer = source_database.tellg();
759
// read the sequence if the accession and the species have been read already
760
if (record_flags == (ac_flag | species_flag | sequence_flag))
762
if (!line.hasPrefix(sequence_end_label)) // if it is still the same protein, append the sequence
764
line.trim(); // erase all whitespaces from the sequence
765
line.remove(trie_delimiter_);
766
// save this part of the sequence
767
sequence.append(line);
769
else // if a new protein is found, write down the old one
771
// if the sequence is not empty, the record has the correct form
772
if (!sequence.empty())
774
// all but the first record in the database are preceded by an asterisk (if in append mode an asterisk has to be put at any time)
777
database_pos = database.tellp();
780
memcpy(record, &source_database_pos, db_pos_length_); // source database position
781
if (OPENMS_IS_BIG_ENDIAN)
784
for (Size i = 0; i < db_pos_length_ / 2; i++)
787
record[i] = record[db_pos_length_ - 1 - i];
788
record[db_pos_length_ - 1 - i] = tmp;
792
// whoever wrote this code - please don't ever do this again.
793
// x86 does *not* have a monopoly, nor does little endian.
794
memcpy(record + db_pos_length_, &database_pos, trie_db_pos_length_); // database position
796
// fix the above "suboptimal" code
797
if (OPENMS_IS_BIG_ENDIAN)
800
for (Size i = 0; i < trie_db_pos_length_ / 2; i++)
802
tmp = record[db_pos_length_ + i];
803
record[db_pos_length_ + i] = record[db_pos_length_ + trie_db_pos_length_ - 1 - i];
804
record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;
808
index.write(record, record_length_);
809
// protein name / accession has already been written
810
database << sequence;
811
source_database_pos = source_database_pos_buffer; // the position of the start of the new protein
816
// set back the record flags for a new record
821
// if not reading the sequence
822
if (!(record_flags & sequence_flag))
824
if (line.hasPrefix(ac_label))
826
pos = ac_label.length(); // find the beginning of the accession
828
while ((line.length() > pos) && (line[pos] < 33))
829
++pos; // discard the whitespaces after the label
830
if (pos != line.length()) // if no accession is found, skip this protein
832
memset(protein_name_pos, 0, protein_name_length_); // clear the protein name
833
// read at most protein_name_length_ characters from the record name and write them to the record
834
protein_name = line.substr(pos, protein_name_length_);
835
protein_name.substitute('>', '}');
836
memcpy(protein_name_pos, protein_name.c_str(), protein_name.length());
838
record_flags |= ac_flag; // set the ac flag
843
// if a species line is found and an accession has already been found, check whether this record is from the wanted species, if not, skip it
844
if (species_flag && line.hasPrefix(species_label) && (record_flags == ac_flag))
846
pos = species_label.length();
847
if (line.find(species, pos) != String::npos)
848
record_flags |= species_flag;
852
// if the beginning of the sequence is found and accession and correct species have been found
853
if (line.hasPrefix(sequence_start_label) && ((record_flags & (ac_flag | species_flag)) == (ac_flag | species_flag)))
854
record_flags |= sequence_flag;
856
source_database_pos_buffer = source_database.tellg();
859
source_database.close();
860
source_database.clear();
862
// if the last record has no sequence end label, the sequence has to be appended nevertheless (e.g. FASTA)
863
if (record_flags == (ac_flag | species_flag | sequence_flag) && !sequence.empty())
865
// all but the first record in the database are preceded by an asterisk (if in append mode an asterisk has to be put at any time)
868
database_pos = database.tellp();
871
// whoever wrote this code - please don't ever do this again.
872
// x86 does *not* have a monopoly, nor does little endian.
873
memcpy(record, &source_database_pos, db_pos_length_); // source database position
874
if (OPENMS_IS_BIG_ENDIAN)
877
for (Size i = 0; i < db_pos_length_ / 2; i++)
880
record[i] = record[db_pos_length_ - 1 - i];
881
record[db_pos_length_ - 1 - i] = tmp;
885
memcpy(record + db_pos_length_, &database_pos, trie_db_pos_length_); // database position
887
// fix the above "suboptimal" code
888
if (OPENMS_IS_BIG_ENDIAN)
891
for (Size i = 0; i < trie_db_pos_length_ / 2; i++)
893
tmp = record[db_pos_length_ + i];
894
record[db_pos_length_ + i] = record[db_pos_length_ + trie_db_pos_length_ - 1 - i];
895
record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;
899
index.write(record, record_length_);
900
// protein name / accession has already been written
901
database << sequence;
907
// close the filestreams
914
void InspectOutfile::getLabels(
915
const String & source_database_filename,
917
String & sequence_start_label,
918
String & sequence_end_label,
919
String & comment_label,
920
String & species_label)
922
ac_label = sequence_start_label = sequence_end_label = comment_label = species_label = "";
923
ifstream source_database(source_database_filename.c_str());
924
if (!source_database)
926
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, source_database_filename);
930
while (getline(source_database, line) && (sequence_start_label.empty()))
932
if (!line.empty() && (line[line.length() - 1] < 33))
933
line.resize(line.length() - 1);
934
if (line.trim().empty())
937
else if (line.hasPrefix(">"))
940
sequence_start_label = ">";
941
sequence_end_label = ">";
945
else if (line.hasPrefix("SQ"))
948
sequence_start_label = "SQ";
949
sequence_end_label = "//";
950
comment_label = "CC";
951
species_label = "OS";
954
source_database.close();
955
source_database.clear();
957
// if no known start separator is found
958
if (sequence_start_label.empty())
960
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "database has unknown file format (neither trie nor FASTA nor swissprot)", source_database_filename);
964
vector<Size> InspectOutfile::getWantedRecords(const String & result_filename, DoubleReal p_value_threshold)
966
// check whether the p_value is correct
967
if ((p_value_threshold < 0) || (p_value_threshold > 1))
969
throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the parameters 'p_value_threshold' must be >= 0 and <=1 !");
972
ifstream result_file(result_filename.c_str());
975
throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
979
vector<String> substrings;
981
set<Size> wanted_records_set;
991
spectrum_file_column(-1),
998
record_number_column(-1),
999
DB_file_pos_column(-1),
1000
spec_file_pos_column(-1);
1002
Size number_of_columns(0);
1004
if (!getline(result_file, line))
1006
result_file.close();
1007
result_file.clear();
1008
throw Exception::FileEmpty(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
1011
readOutHeader(result_filename, line, spectrum_file_column, scan_column, peptide_column, protein_column, charge_column, MQ_score_column, p_value_column, record_number_column, DB_file_pos_column, spec_file_pos_column, number_of_columns);
1013
while (getline(result_file, line))
1016
if (!line.empty() && (line[line.length() - 1] < 33))
1017
line.resize(line.length() - 1);
1021
line.split('\t', substrings);
1023
// check whether the line has enough columns
1024
if (substrings.size() != number_of_columns)
1026
corrupted_lines.push_back(line_number);
1030
// check whether the line has enough columns
1031
if (substrings.size() != number_of_columns)
1034
// take only those peptides whose p-value is less or equal the given threshold
1035
if (substrings[p_value_column].toFloat() > p_value_threshold)
1038
wanted_records_set.insert(substrings[record_number_column].toInt());
1041
result_file.close();
1042
result_file.clear();
1044
for (set<Size>::const_iterator rn_i = wanted_records_set.begin(); rn_i != wanted_records_set.end(); ++rn_i)
1046
wanted_records.push_back(*rn_i);
1049
return wanted_records;
1053
InspectOutfile::getSearchEngineAndVersion(
1054
const String & cmd_output,
1055
ProteinIdentification & protein_identification)
1057
protein_identification.setSearchEngine("InsPecT");
1058
protein_identification.setSearchEngineVersion("unknown");
1059
// searching for something like this: InsPecT version 20060907, InsPecT version 20100331
1060
QString response(cmd_output.toQString());
986
1061
QRegExp rx("InsPecT (version|vesrion) (\\d+)"); // older versions of InsPecT have typo...
987
if (rx.indexIn(response) == -1) return false;
988
protein_identification.setSearchEngineVersion(String(rx.cap(2)));
1062
if (rx.indexIn(response) == -1)
1065
protein_identification.setSearchEngineVersion(String(rx.cap(2)));
993
InspectOutfile::readOutHeader(
994
const String& filename,
995
const String& header_line,
996
Int& spectrum_file_column,
1001
Int& MQ_score_column,
1002
Int& p_value_column,
1003
Int& record_number_column,
1004
Int& DB_file_pos_column,
1005
Int& spec_file_pos_column,
1006
Size& number_of_columns)
1008
spectrum_file_column = scan_column = peptide_column = protein_column = charge_column = MQ_score_column = p_value_column = record_number_column = DB_file_pos_column = spec_file_pos_column = -1;
1010
vector< String > substrings;
1011
header_line.split('\t', substrings);
1013
// #SpectrumFile Scan# Annotation Protein Charge MQScore Length TotalPRMScore MedianPRMScore FractionY FractionB Intensity NTT p-value F-Score DeltaScore DeltaScoreOther RecordNumber DBFilePos SpecFilePos
1014
for ( vector< String >::const_iterator s_i = substrings.begin(); s_i != substrings.end(); ++s_i )
1016
if ( (*s_i) == "#SpectrumFile" ) spectrum_file_column = s_i - substrings.begin();
1017
else if ( (*s_i) == "Scan#" ) scan_column = s_i - substrings.begin();
1018
else if ( (*s_i) == "Annotation" ) peptide_column = s_i - substrings.begin();
1019
else if ( (*s_i) == "Protein" ) protein_column = s_i - substrings.begin();
1020
else if ( (*s_i) == "Charge" ) charge_column = s_i - substrings.begin();
1021
else if ( (*s_i) == "MQScore" ) MQ_score_column = s_i - substrings.begin();
1022
else if ( (*s_i) == "p-value" ) p_value_column = s_i - substrings.begin();
1023
else if ( (*s_i) == "RecordNumber" ) record_number_column = s_i - substrings.begin();
1024
else if ( (*s_i) == "DBFilePos" ) DB_file_pos_column = s_i - substrings.begin();
1025
else if ( (*s_i) == "SpecFilePos" ) spec_file_pos_column = s_i - substrings.begin();
1028
if ( (spectrum_file_column == -1) || (scan_column == -1) || (peptide_column == -1) || (protein_column == -1) || (charge_column == -1) || (MQ_score_column == -1) || (p_value_column == -1) || (record_number_column == -1) || (DB_file_pos_column == -1) || (spec_file_pos_column == -1) )
1030
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "at least one of the columns '#SpectrumFile', 'Scan#', 'Annotation', 'Protein', 'Charge', 'MQScore', 'p-value', 'RecordNumber', 'DBFilePos' or 'SpecFilePos' is missing!", filename);
1032
number_of_columns = substrings.size();
1035
const Size InspectOutfile::db_pos_length_ = 8;
1036
const Size InspectOutfile::trie_db_pos_length_ = 4;
1037
const Size InspectOutfile::protein_name_length_ = 80;
1038
const Size InspectOutfile::record_length_ = db_pos_length_ + trie_db_pos_length_ + protein_name_length_;
1039
const char InspectOutfile::trie_delimiter_ = '*';
1040
const String InspectOutfile::score_type_ = "Inspect";
1070
InspectOutfile::readOutHeader(
1071
const String & filename,
1072
const String & header_line,
1073
Int & spectrum_file_column,
1075
Int & peptide_column,
1076
Int & protein_column,
1077
Int & charge_column,
1078
Int & MQ_score_column,
1079
Int & p_value_column,
1080
Int & record_number_column,
1081
Int & DB_file_pos_column,
1082
Int & spec_file_pos_column,
1083
Size & number_of_columns)
1085
spectrum_file_column = scan_column = peptide_column = protein_column = charge_column = MQ_score_column = p_value_column = record_number_column = DB_file_pos_column = spec_file_pos_column = -1;
1087
vector<String> substrings;
1088
header_line.split('\t', substrings);
1090
// #SpectrumFile Scan# Annotation Protein Charge MQScore Length TotalPRMScore MedianPRMScore FractionY FractionB Intensity NTT p-value F-Score DeltaScore DeltaScoreOther RecordNumber DBFilePos SpecFilePos
1091
for (vector<String>::const_iterator s_i = substrings.begin(); s_i != substrings.end(); ++s_i)
1093
if ((*s_i) == "#SpectrumFile")
1094
spectrum_file_column = s_i - substrings.begin();
1095
else if ((*s_i) == "Scan#")
1096
scan_column = s_i - substrings.begin();
1097
else if ((*s_i) == "Annotation")
1098
peptide_column = s_i - substrings.begin();
1099
else if ((*s_i) == "Protein")
1100
protein_column = s_i - substrings.begin();
1101
else if ((*s_i) == "Charge")
1102
charge_column = s_i - substrings.begin();
1103
else if ((*s_i) == "MQScore")
1104
MQ_score_column = s_i - substrings.begin();
1105
else if ((*s_i) == "p-value")
1106
p_value_column = s_i - substrings.begin();
1107
else if ((*s_i) == "RecordNumber")
1108
record_number_column = s_i - substrings.begin();
1109
else if ((*s_i) == "DBFilePos")
1110
DB_file_pos_column = s_i - substrings.begin();
1111
else if ((*s_i) == "SpecFilePos")
1112
spec_file_pos_column = s_i - substrings.begin();
1115
if ((spectrum_file_column == -1) || (scan_column == -1) || (peptide_column == -1) || (protein_column == -1) || (charge_column == -1) || (MQ_score_column == -1) || (p_value_column == -1) || (record_number_column == -1) || (DB_file_pos_column == -1) || (spec_file_pos_column == -1))
1117
throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "at least one of the columns '#SpectrumFile', 'Scan#', 'Annotation', 'Protein', 'Charge', 'MQScore', 'p-value', 'RecordNumber', 'DBFilePos' or 'SpecFilePos' is missing!", filename);
1119
number_of_columns = substrings.size();
1122
const Size InspectOutfile::db_pos_length_ = 8;
1123
const Size InspectOutfile::trie_db_pos_length_ = 4;
1124
const Size InspectOutfile::protein_name_length_ = 80;
1125
const Size InspectOutfile::record_length_ = db_pos_length_ + trie_db_pos_length_ + protein_name_length_;
1126
const char InspectOutfile::trie_delimiter_ = '*';
1127
const String InspectOutfile::score_type_ = "Inspect";
1042
1129
} //namespace OpenMS