~ubuntu-branches/ubuntu/wily/openms/wily

readOutHeader(result_filename, line, spectrum_file_column, scan_column, peptide_column, protein_column, charge_column, MQ_score_column, p_value_column, record_number_column, DB_file_pos_column, spec_file_pos_column, number_of_columns);

133

}

134

catch( Exception::ParseError & p_e )

135

{

136

result_file.close();

137

result_file.clear();

/// default constructor; the body is empty and no member is initialised explicitly
InspectOutfile::InspectOutfile() {}

/// copy constructor; the source object is ignored and nothing is copied in this body
InspectOutfile::InspectOutfile(const InspectOutfile & /* unused */) {}

/// destructor; performs no explicit clean-up
InspectOutfile::~InspectOutfile() {}

/// assignment operator
///
/// No state is copied from @p inspect_outfile; the operator only has to
/// hand back a reference to this object. The previous version returned a
/// value only on self-assignment and fell off the end of the function
/// otherwise, which is undefined behaviour for a non-void function.
///
/// @param inspect_outfile the object assigned from (its content is not read)
/// @return reference to this object
InspectOutfile & InspectOutfile::operator=(const InspectOutfile & inspect_outfile)
{
  if (this == &inspect_outfile)
  {
    return *this;
  }
  return *this;
}

/// equality operator; the argument is not inspected, the result is always true
bool InspectOutfile::operator==(const InspectOutfile & /* unused */) const
{
  const bool always_equal = true;
  return always_equal;
}

/// Reads an InsPecT result file and fills peptide and protein identifications.
///
/// The first line is treated as the header and handed to readOutHeader(), which
/// is expected to determine the column indices and the number of columns
/// (presumably throwing Exception::ParseError on a malformed header - confirm
/// against readOutHeader). Every following non-empty line is split at tabs;
/// lines whose number of fields differs from the header are reported as
/// corrupted, lines whose p-value exceeds @p p_value_threshold are skipped.
///
/// @param result_filename          path of the InsPecT result file
/// @param peptide_identifications  receives one entry per (spectrum file, scan) that has hits
/// @param protein_identification   receives one protein hit per distinct record number, plus date and identifier
/// @param p_value_threshold        maximal accepted p-value, must lie in [0, 1]
/// @param database_filename        if not empty, protein sequences are looked up there via getSequences()
/// @return the (1-based) numbers of the lines that had an unexpected number of columns
/// @throw Exception::IllegalArgument if @p p_value_threshold is outside [0, 1]
/// @throw Exception::FileNotFound    if the result file cannot be opened
/// @throw Exception::FileEmpty       if the result file has no header line
/// @throw Exception::ParseError      rethrown when readOutHeader() reports one
vector<Size> InspectOutfile::load(const String & result_filename, vector<PeptideIdentification> & peptide_identifications,
                                  ProteinIdentification & protein_identification, const DoubleReal p_value_threshold, const String & database_filename)
{
  // check whether the p_value is correct
  if ((p_value_threshold < 0) || (p_value_threshold > 1))
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "The parameters 'p_value_threshold' must be >= 0 and <=1 !");
  }

  // the stream closes itself when it goes out of scope, also while an exception propagates
  ifstream result_file(result_filename.c_str());
  if (!result_file)
  {
    throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
  }

  String line, accession, accession_type, spectrum_file, identifier;
  Size record_number(0), scan_number(0), line_number(0), number_of_columns(0);

  vector<String> substrings;
  vector<Size> corrupted_lines;

  PeptideIdentification peptide_identification;

  // read the header line; it is interpreted by readOutHeader() below
  if (!getline(result_file, line))
  {
    throw Exception::FileEmpty(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
  }
  // drop a trailing control character (e.g. '\r' of a DOS line ending)
  if (!line.empty() && (line[line.length() - 1] < 33))
  {
    line.resize(line.length() - 1);
  }
  line.trim();
  ++line_number;

  DateTime datetime = DateTime::now();
  if (protein_identification.getSearchEngine().empty())
  {
    identifier = "InsPecT_" + datetime.getDate();
  }
  else
  {
    // the result of this expression used to be discarded, leaving the identifier empty
    identifier = protein_identification.getSearchEngine() + "_" + datetime.getDate();
  }

  // to get the precursor retention time and mz values later, save the filename and the numbers of the scans
  vector<pair<String, vector<pair<Size, Size> > > > files_and_peptide_identification_with_scan_number;
  // the record number is mapped to the position in the protein hits, to retrieve their sequences
  map<Size, Size> rn_position_map;

  // column indices; -1 means "not determined yet"
  Int spectrum_file_column(-1), scan_column(-1), peptide_column(-1), protein_column(-1), charge_column(-1),
      MQ_score_column(-1), p_value_column(-1), record_number_column(-1), DB_file_pos_column(-1), spec_file_pos_column(-1);

  String::size_type start(0), end(0);

  // get the header; without this call all column indices stay at -1 and every line is flagged as corrupted
  try
  {
    readOutHeader(result_filename, line, spectrum_file_column, scan_column, peptide_column, protein_column, charge_column, MQ_score_column, p_value_column, record_number_column, DB_file_pos_column, spec_file_pos_column, number_of_columns);
  }
  catch (Exception::ParseError & p_e)
  {
    LOG_WARN << "ParseError (" << p_e.getMessage() << ") caught in " << __FILE__ << "\n";
    throw;
  }

  while (getline(result_file, line))
  {
    ++line_number;
    if (!line.empty() && (line[line.length() - 1] < 33))
    {
      line.resize(line.length() - 1);
    }
    line.trim();
    if (line.empty())
    {
      continue;
    }

    // check whether the line has the same number of columns as the header
    line.split('\t', substrings);
    if (substrings.size() != number_of_columns)
    {
      corrupted_lines.push_back(line_number);
      continue;
    }

    // if the p-value is too large, skip the line
    if (substrings[p_value_column].toFloat() > p_value_threshold)
    {
      continue;
    }

    // the protein
    ProteinHit protein_hit;
    // get accession number and type
    getACAndACType(substrings[protein_column], accession, accession_type);
    protein_hit.setAccession(accession);

    // the database position of the protein (the i-th protein)
    record_number = substrings[record_number_column].toInt();

    // map the database position of the protein to its position in the protein hits and insert it, if it's a new protein
    if (rn_position_map.find(record_number) == rn_position_map.end())
    {
      rn_position_map[record_number] = protein_identification.getHits().size();
      protein_identification.insertHit(protein_hit);
    }

    // if a new scan is found (new file or new scan), insert it into the vector
    // (the first time the condition is fulfilled because spectrum_file is "")
    if ((substrings[spectrum_file_column] != spectrum_file) || (static_cast<Size>(substrings[scan_column].toInt()) != scan_number))
    {
      if (substrings[spectrum_file_column] != spectrum_file) // a new file: remember it (used to retrieve RT and MZ later)
      {
        // if it's the first file or if hits have been found in the file before, insert a new file
        if (files_and_peptide_identification_with_scan_number.empty() || !files_and_peptide_identification_with_scan_number.back().second.empty())
        {
          files_and_peptide_identification_with_scan_number.push_back(make_pair(substrings[spectrum_file_column], vector<pair<Size, Size> >()));
        }
        // otherwise change the name of the last file entry (the one without hits)
        else
        {
          files_and_peptide_identification_with_scan_number.back().first = substrings[spectrum_file_column];
        }
      }

      spectrum_file = substrings[spectrum_file_column];
      scan_number = substrings[scan_column].toInt();

      // if it's not the first scan and if hits have been found, insert the peptide identification
      // NOTE(review): at this point scan_number (and possibly the last file entry) already describe the
      // NEW scan, while peptide_identification still holds the hits of the previous one - confirm that
      // this pairing is intended before relying on the stored scan numbers.
      if (!peptide_identification.empty() && !peptide_identification.getHits().empty())
      {
        files_and_peptide_identification_with_scan_number.back().second.push_back(make_pair(peptide_identifications.size(), scan_number));
        peptide_identifications.push_back(peptide_identification);
      }
      peptide_identification = PeptideIdentification();

      peptide_identification.setIdentifier(identifier);
      peptide_identification.setSignificanceThreshold(p_value_threshold);
      peptide_identification.setScoreType(score_type_);
    }

    // get the peptide infos from the new peptide and insert it
    PeptideHit peptide_hit;
    peptide_hit.setCharge(substrings[charge_column].toInt());
    peptide_hit.setScore(substrings[MQ_score_column].toFloat());
    peptide_hit.setRank(0); // all ranks are set to zero and assigned later

    // get the sequence and the amino acid before and after (format "X.SEQUENCE.Y")
    String sequence, sequence_with_mods;
    sequence_with_mods = substrings[peptide_column];
    start = sequence_with_mods.find('.') + 1; // 0 if there is no '.'
    end = sequence_with_mods.find_last_of('.');
    if (start >= 2)
    {
      peptide_hit.setAABefore(sequence_with_mods[start - 2]);
    }
    // only read the residue after the last '.' if there is one (the old test allowed an index past the end)
    if ((end != String::npos) && (end + 1 < sequence_with_mods.length()))
    {
      peptide_hit.setAAAfter(sequence_with_mods[end + 1]);
    }

    // remove modifications (small characters and anything that's not in the alphabet)
    sequence_with_mods = substrings[peptide_column].substr(start, end - start);
    for (String::ConstIterator c_i = sequence_with_mods.begin(); c_i != sequence_with_mods.end(); ++c_i)
    {
      // <cctype> functions require a value representable as unsigned char
      const unsigned char residue = static_cast<unsigned char>(*c_i);
      if (isalpha(residue) && isupper(residue))
      {
        sequence.append(1, *c_i);
      }
    }

    peptide_hit.setSequence(sequence);
    peptide_hit.addProteinAccession(accession);

    peptide_identification.insertHit(peptide_hit);
  }

  // result file read
  result_file.close();
  result_file.clear();

  // if hits have been found for the last scan, insert its peptide identification
  if (!peptide_identification.empty() && !peptide_identification.getHits().empty())
  {
    files_and_peptide_identification_with_scan_number.back().second.push_back(make_pair(peptide_identifications.size(), scan_number));
    peptide_identifications.push_back(peptide_identification);
  }

  // if the last file had no hits, delete it
  if (!files_and_peptide_identification_with_scan_number.empty() && files_and_peptide_identification_with_scan_number.back().second.empty())
  {
    files_and_peptide_identification_with_scan_number.pop_back();
  }

  if (!peptide_identifications.empty())
  {
    peptide_identifications.back().assignRanks();
  }

  // search the sequence of the proteins
  if (!protein_identification.getHits().empty() && !database_filename.empty())
  {
    vector<ProteinHit> protein_hits = protein_identification.getHits();
    vector<String> sequences;
    getSequences(database_filename, rn_position_map, sequences);

    // set the retrieved sequences (getSequences appends one entry per wanted record, in map order)
    vector<String>::const_iterator s_i = sequences.begin();
    for (map<Size, Size>::const_iterator rn_i = rn_position_map.begin(); rn_i != rn_position_map.end(); ++rn_i, ++s_i)
    {
      protein_hits[rn_i->second].setSequence(*s_i);
    }

    sequences.clear();
    rn_position_map.clear();
    protein_identification.setHits(protein_hits);
    protein_hits.clear();
  }

  // get the precursor retention times and mz values
  getPrecursorRTandMZ(files_and_peptide_identification_with_scan_number, peptide_identifications);
  protein_identification.setDateTime(datetime);
  protein_identification.setIdentifier(identifier);

  return corrupted_lines;
}

277

278

// < record number, number of protein in a vector >

279

vector< Size >

280

InspectOutfile::getSequences(

281

const String& database_filename,

282

const map< Size, Size >& wanted_records,

283

vector< String >& sequences)

284

{

285

ifstream database(database_filename.c_str());

286

if (!database)

287

{

288

throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);

289

}

290

291

vector< Size > not_found;

292

Size seen_records(0);

293

stringbuf sequence;

294

database.seekg(0, ios::end);

295

streampos sp = database.tellg();

296

database.seekg(0, ios::beg);

297

298

for ( map< Size, Size >::const_iterator wr_i = wanted_records.begin(); wr_i != wanted_records.end(); ++wr_i )

299

{

300

for ( ; seen_records < wr_i->first; ++seen_records )

301

{

302

database.ignore(sp, trie_delimiter_);

303

}

304

database.get(sequence, trie_delimiter_);

305

sequences.push_back(sequence.str());

306

if ( sequences.back().empty() ) not_found.push_back(wr_i->first);

307

sequence.str("");

308

}

309

310

// close the filestreams

311

database.close();

312

database.clear();

313

314

return not_found;

315

}

316

317

void

318

InspectOutfile::getACAndACType(

319

String line,

320

String& accession,

321

String& accession_type)

322

{

323

String swissprot_prefixes = "JLOPQUX";

324

/// @todo replace this by general FastA implementation? (Martin)

325

accession.clear();

326

accession_type.clear();

327

pair< String, String > p;

328

// if it's a FASTA line

329

if ( line.hasPrefix(">") ) line.erase(0,1);

330

if ( !line.empty() && (line[line.length()-1] < 33) ) line.resize(line.length()-1);

331

line.trim();

332

333

// if it's a swissprot accession

334

if ( line.hasPrefix("tr") || line.hasPrefix("sp") )

335

{

336

accession = line.substr(3, line.find('|', 3)-3);

337

accession_type = "SwissProt";

338

}

339

else if ( line.hasPrefix("gi") )

340

{

341

String::size_type snd(line.find('|', 3));

342

String::size_type third(0);

343

if ( snd != String::npos )

344

{

345

third = line.find('|', ++snd) + 1;

346

347

accession = line.substr(third, line.find('|', third)-third);

348

accession_type = line.substr(snd, third-1-snd);

349

}

350

if ( accession_type == "gb" ) accession_type = "GenBank";

351

else if ( accession_type == "emb" ) accession_type = "EMBL";

352

else if ( accession_type == "dbj" ) accession_type = "DDBJ";

353

else if ( accession_type == "ref" ) accession_type = "NCBI";

354

else if ( (accession_type == "sp") || (accession_type == "tr") ) accession_type = "SwissProt";

355

else if ( accession_type == "gnl" )

356

{

357

accession_type = accession;

358

snd = line.find('|', third);

359

third = line.find('|', ++snd);

360

if ( third != String::npos ) accession = line.substr(snd, third-snd);

361

else

362

{

363

third = line.find(' ', snd);

364

if ( third != String::npos ) accession = line.substr(snd, third-snd);

365

else accession = line.substr(snd);

366

}

367

}

368

else

369

{

370

String::size_type pos1(line.find('(', 0));

371

String::size_type pos2(0);

372

if ( pos1 != String::npos )

373

{

374

pos2 = line.find(')', ++pos1);

375

if ( pos2 != String::npos )

376

{

377

accession = line.substr(pos1, pos2 - pos1);

378

if ( (accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos) ) accession_type = "SwissProt";

379

else accession.clear();

380

}

381

}

382

if ( accession.empty() )

383

{

384

accession_type = "gi";

385

if ( snd != String::npos ) accession = line.substr(3, snd-4);

386

else

387

{

388

if ( snd == String::npos ) snd = line.find(' ', 3);

389

if ( snd != String::npos ) accession = line.substr(3, snd-3);

390

else accession = line.substr(3);

391

}

392

}

393

}

394

}

395

else if ( line.hasPrefix("ref") )

396

{

397

accession = line.substr(4, line.find('|', 4) - 4);

398

accession_type = "NCBI";

399

}

400

else if ( line.hasPrefix("gnl") )

401

{

402

line.erase(0,3);

403

accession_type = line.substr(0, line.find('|', 0));

404

accession = line.substr(accession_type.length()+1);

405

}

406

else if ( line.hasPrefix("lcl") )

407

{

408

line.erase(0,4);

409

accession_type = "lcl";

410

accession = line;

411

}

412

else

413

{

414

String::size_type pos1(line.find('(', 0));

415

String::size_type pos2(0);

416

if ( pos1 != String::npos )

417

{

418

pos2 = line.find(')', ++pos1);

419

if ( pos2 != String::npos )

420

{

421

accession = line.substr(pos1, pos2 - pos1);

422

if ( (accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos) ) accession_type = "SwissProt";

423

else accession.clear();

424

}

425

}

426

if ( accession.empty() )

427

{

428

pos1 = line.find('|');

429

accession = line.substr(0, pos1);

430

if ( (accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos) ) accession_type = "SwissProt";

431

else

432

{

433

pos1 = line.find(' ');

434

accession = line.substr(0, pos1);

435

if ( (accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos) ) accession_type = "SwissProt";

436

else

437

{

438

accession = line.substr(0, 6);

439

if ( String(swissprot_prefixes).find(accession[0], 0) != String::npos ) accession_type = "SwissProt";

440

else accession.clear();

441

}

442

}

443

}

444

}

445

if ( accession.empty() )

446

{

447

accession = line.trim();

448

accession_type = "unknown";

449

}

450

}

451

452

void

453

InspectOutfile::getPrecursorRTandMZ(

454

const vector< pair< String, vector< pair < Size, Size > > > >& files_and_peptide_identification_with_scan_number,

455

vector< PeptideIdentification >& ids)

456

{

457

MSExperiment<> experiment;

458

String type;

459

460

for ( vector< pair< String, vector< pair< Size, Size > > > >::const_iterator fs_i = files_and_peptide_identification_with_scan_number.begin(); fs_i != files_and_peptide_identification_with_scan_number.end(); ++fs_i )

461

{

462

getExperiment(experiment, type, fs_i->first); // may throw an exception if the filetype could not be determined

463

464

if ( experiment.size() < fs_i->second.back().second )

465

{

466

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Not enought scans in file! (" + String(experiment.size()) + " available, should be at least " + String(fs_i->second.back().second) + ")", fs_i->first);

467

}

468

469

for ( vector< pair< Size, Size > >::const_iterator pi_scan_i = fs_i->second.begin(); pi_scan_i != fs_i->second.end(); ++pi_scan_i )

470

{

471

ids[pi_scan_i->first].setMetaValue("MZ", experiment[pi_scan_i->second - 1].getPrecursors()[0].getMZ());

472

ids[pi_scan_i->first].setMetaValue("RT", experiment[pi_scan_i->second - 1].getRT());

473

}

474

}

475

}

476

477

void

478

InspectOutfile::compressTrieDB(

479

const String& database_filename,

480

const String& index_filename,

481

vector< Size >& wanted_records,

482

const String& snd_database_filename,

483

const String& snd_index_filename,

484

bool append)

485

{

486

if ( database_filename == snd_database_filename )

487

{

488

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Same filename can not be used for original and second database!", database_filename);

489

}

490

if ( index_filename == snd_index_filename )

491

{

492

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Same filename can not be used for original and second database!", index_filename);

493

}

494

ifstream database( database_filename.c_str());

495

if ( !database )

496

{

497

throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);

498

}

499

500

ifstream index(index_filename.c_str(), ios::in | ios::binary);

501

if ( !index )

502

{

503

database.close();

504

database.clear();

505

throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, index_filename);

506

}

507

508

// determine the length of the index file

509

index.seekg(0, ios::end);

510

streampos index_length = index.tellg();

511

index.seekg(0, ios::beg);

512

bool empty_records = wanted_records.empty();

513

if ( wanted_records.empty() )

514

{

515

for ( Size i = 0; i < index_length / record_length_; ++i ) wanted_records.push_back(i);

516

}

517

518

// take the wanted records, copy their sequences to the new db and write the index file accordingly

519

ofstream snd_database;

520

if ( append ) snd_database.open(snd_database_filename.c_str(), std::ios::out | std::ios::app);

521

else snd_database.open(snd_database_filename.c_str(), std::ios::out | std::ios::trunc);

522

if ( !snd_database )

523

{

524

database.close();

525

database.clear();

526

index.close();

527

index.clear();

528

throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, snd_database_filename);

529

}

530

531

ofstream snd_index;

532

if ( append ) snd_index.open(snd_index_filename.c_str(), std::ios::out | std::ios::binary | std::ios::app);

533

else snd_index.open(snd_index_filename.c_str(), std::ios::out | std::ios::binary | std::ios::trunc);

534

if ( !snd_index )

535

{

536

database.close();

537

database.clear();

538

index.close();

539

index.clear();

540

snd_database.close();

541

snd_database.clear();

542

throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, snd_index_filename);

543

}

544

545

char* index_record = new char[record_length_]; // to copy one record from the index file

546

Size database_pos(0), snd_database_pos(0); // their sizes HAVE TO BE 4 bytes

547

stringbuf sequence;

548

streampos index_pos(0);

549

550

for ( vector< Size >::const_iterator wr_i = wanted_records.begin(); wr_i != wanted_records.end(); ++wr_i )

551

{

552

// get the according record in the index file

553

if ( index_length < Int((*wr_i + 1) * record_length_) ) // if the file is too short

554

{

555

delete [] index_record;

556

database.close();

557

database.clear();

558

index.close();

559

index.clear();

560

snd_database.close();

561

snd_database.clear();

562

snd_index.close();

563

snd_index.clear();

564

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "index file is too short!", index_filename);

565

}

566

index.seekg((*wr_i) * record_length_);

567

index.read(index_record, record_length_);

568

569

// all but the first sequence are preceded by an asterisk

570

if ( append ) snd_database.put(trie_delimiter_);

571

append = true;

572

573

// check if we have to reverse the database_pos part (which is saved in little endian)

574

if (OPENMS_IS_BIG_ENDIAN)

575

{

576

char tmp;

577

for (Size i = 0; i < trie_db_pos_length_ / 2; i++)

578

{

579

tmp = index_record[db_pos_length_ + i];

580

index_record[db_pos_length_ + i] = index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i];

581

index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;

582

}

583

}

584

585

// go to the beginning of the sequence

586

587

// whoever wrote this code - please don't ever do this again.

588

// x86 does *not* have a monopoly, nor does little endian.

589

memcpy(&database_pos, index_record + db_pos_length_, trie_db_pos_length_);

590

database.seekg(database_pos);

591

592

// store the corresponding index for the second database

593

snd_database_pos = snd_database.tellp(); // get the position in the second database

594

595

memcpy(index_record + db_pos_length_, &snd_database_pos, trie_db_pos_length_); // and copy to its place in the index record

596

597

// fixing the above "suboptimal" code

598

if (OPENMS_IS_BIG_ENDIAN)

599

{

600

char tmp;

601

for (Size i = 0; i < trie_db_pos_length_ / 2; i++)

602

{

603

tmp = index_record[db_pos_length_ + i];

604

index_record[db_pos_length_ + i] = index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i];

605

index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;

606

}

607

}

608

609

snd_index.write((char*) index_record, record_length_); // because only the trie-db position changed, not the position in the original database, nor the protein name

610

611

// store the sequence

612

database.get(sequence, trie_delimiter_);

613

snd_database << sequence.str();

614

sequence.str("");

615

}

616

617

618

if ( empty_records ) wanted_records.clear();

619

delete [] index_record;

620

database.close();

621

database.clear();

622

index.close();

623

index.clear();

624

snd_database.close();

625

snd_database.clear();

626

snd_index.close();

627

snd_index.clear();

628

}

629

630

void

631

InspectOutfile::generateTrieDB(

632

const String& source_database_filename,

633

const String& database_filename,

634

const String& index_filename,

635

bool append,

636

const String species)

637

{

638

ifstream source_database(source_database_filename.c_str());

639

if ( !source_database )

640

{

641

throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, source_database_filename);

642

}

643

644

// get the labels

645

String ac_label, sequence_start_label, sequence_end_label, comment_label, species_label;

646

getLabels(source_database_filename, ac_label, sequence_start_label, sequence_end_label, comment_label, species_label);

647

648

ofstream database;

649

if ( append ) database.open(database_filename.c_str(), ios::app | ios::out );

650

else database.open(database_filename.c_str());

651

if ( !database )

652

{

653

source_database.close();

654

source_database.clear();

655

throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);

656

}

657

ofstream index;

658

if ( append ) index.open(index_filename.c_str(), ios::app | ios::out | ios::binary );

659

else index.open(index_filename.c_str(), ios::out | ios::binary );

660

if ( !index )

661

{

662

source_database.close();

663

source_database.clear();

664

database.close();

665

database.clear();

666

throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, index_filename);

667

}

668

669

// using flags to mark what has already been read

670

// the flags

671

unsigned char ac_flag = 1;

672

unsigned char species_flag = !species.empty()*2; // if no species is given, take all proteins

673

unsigned char sequence_flag = 4;

674

// the value

675

unsigned char record_flags = 0;

676

677

String::size_type pos(0); // the position in a line

678

unsigned long long source_database_pos = source_database.tellg(); // the start of a protein in the source database

679

unsigned long long source_database_pos_buffer = 0; // because you don't know whether a new protein starts unless the line is read, the actual position is buffered before any new getline

680

Size database_pos(0);

681

String line, sequence, protein_name;

682

char* record = new char[record_length_]; // a record in the index file

683

char* protein_name_pos = record + db_pos_length_ + trie_db_pos_length_;

684

685

while ( getline(source_database, line) )

686

{

687

if ( !line.empty() && (line[line.length()-1] < 33) ) line.resize(line.length()-1);

688

line.trim();

689

690

// empty and comment lines are skipped

691

if ( line.empty() || line.hasPrefix(comment_label) )

692

{

693

source_database_pos_buffer = source_database.tellg();

694

continue;

695

}

696

697

// read the sequence if the accession and the species have been read already

698

if ( record_flags == (ac_flag | species_flag | sequence_flag) )

699

{

700

if ( !line.hasPrefix(sequence_end_label) ) // if it is still the same protein, append the sequence

701

{

702

line.trim(); // erase all whitespaces from the sequence

703

line.remove(trie_delimiter_);

704

// save this part of the sequence

705

sequence.append(line);

706

}

707

else // if a new protein is found, write down the old one

708

{

709

// if the sequence is not empty, the record has the correct form

710

if ( !sequence.empty() )

711

{

712

// all but the first record in the database are preceded by an asterisk (if in append mode an asterisk has to be put at any time)

713

if ( append ) database.put('*');

714

database_pos = database.tellp();

715

716

// write the record

717

memcpy(record, &source_database_pos, db_pos_length_); // source database position

718

if (OPENMS_IS_BIG_ENDIAN)

719

{

720

char tmp;

721

for (Size i = 0; i < db_pos_length_ / 2; i++)

722

{

723

tmp = record[i];

724

record[i] = record[db_pos_length_ - 1 - i];

725

record[db_pos_length_ - 1 - i] = tmp;

726

}

727

}

728

729

// whoever wrote this code - please don't ever do this again.

730

// x86 does *not* have a monopoly, nor does little endian.

731

memcpy(record + db_pos_length_, &database_pos, trie_db_pos_length_); // database position

732

733

// fix the above "suboptimal" code

734

if (OPENMS_IS_BIG_ENDIAN)

735

{

736

char tmp;

737

for (Size i = 0; i < trie_db_pos_length_ / 2; i++)

738

{

739

tmp = record[db_pos_length_ + i];

740

record[db_pos_length_ + i] = record[db_pos_length_ + trie_db_pos_length_ - 1 - i];

741

record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;

742

}

743

}

744

745

index.write(record, record_length_);

746

// protein name / accession has already been written

747

database << sequence;

748

source_database_pos = source_database_pos_buffer; // the position of the start of the new protein

749

append = true;

750

}

751

sequence.clear();

752

753

// set back the record flags for a new record

754

record_flags = 0;

755

}

756

}

757

758

// if not reading the sequence

759

if ( !(record_flags & sequence_flag) )

760

{

761

if ( line.hasPrefix(ac_label) )

762

{

763

pos = ac_label.length(); // find the beginning of the accession

764

765

while ( (line.length() > pos) && (line[pos] < 33) ) ++pos; // discard the whitespaces after the label

766

if ( pos != line.length() ) // if no accession is found, skip this protein

767

{

768

memset(protein_name_pos, 0, protein_name_length_); // clear the protein name

769

// read at most protein_name_length_ characters from the record name and write them to the record

770

protein_name = line.substr(pos, protein_name_length_);

771

protein_name.substitute('>', '}');

772

memcpy(protein_name_pos, protein_name.c_str(), protein_name.length());

773

774

record_flags |= ac_flag; // set the ac flag

775

}

776

else record_flags = 0;

777

}

778

// if a species line is found and an accession has already been found, check whether this record is from the wanted species, if not, skip it

779

if ( species_flag && line.hasPrefix(species_label) && (record_flags == ac_flag) )

780

{

781

pos = species_label.length();

782

if ( line.find(species, pos) != String::npos ) record_flags |= species_flag;

783

else record_flags = 0;

784

}

785

// if the beginning of the sequence is found and accession and correct species have been found

786

if ( line.hasPrefix(sequence_start_label) && ((record_flags & (ac_flag | species_flag)) == (ac_flag | species_flag)) ) record_flags |= sequence_flag;

787

}

788

source_database_pos_buffer = source_database.tellg();

789

}

790

// source file read

791

source_database.close();

792

source_database.clear();

793

794

// if the last record has no sequence end label, the sequence has to be appended nevertheless (e.g. FASTA)

795

if ( record_flags == (ac_flag | species_flag | sequence_flag) && !sequence.empty() )

796

{

797

// all but the first record in the database are preceded by an asterisk (if in append mode an asterisk has to be put at any time)

798

if ( append ) database.put('*');

799

database_pos = database.tellp();

800

801

// write the record

802

// whoever wrote this code - please don't ever do this again.

803

// x86 does *not* have a monopoly, nor does little endian.

804

memcpy(record, &source_database_pos, db_pos_length_); // source database position

805

if (OPENMS_IS_BIG_ENDIAN)

806

{

807

char tmp;

808

for (Size i = 0; i < db_pos_length_ / 2; i++)

809

{

810

tmp = record[i];

811

record[i] = record[db_pos_length_ - 1 - i];

812

record[db_pos_length_ - 1 - i] = tmp;

813

}

814

}

815

816

memcpy(record + db_pos_length_, &database_pos, trie_db_pos_length_); // database position

817

818

// fix the above "suboptimal" code

819

if (OPENMS_IS_BIG_ENDIAN)

820

{

821

char tmp;

822

for (Size i = 0; i < trie_db_pos_length_ / 2; i++)

823

{

824

tmp = record[db_pos_length_ + i];

825

record[db_pos_length_ + i] = record[db_pos_length_ + trie_db_pos_length_ - 1 - i];

826

record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;

827

}

828

}

829

830

index.write(record, record_length_);

831

// protein name / accession has already been written

832

database << sequence;

833

append = true;

834

}

835

836

delete [] record;

837

838

// close the filestreams

839

database.close();

840

database.clear();

841

index.close();

842

index.clear();

843

}

844

845

void InspectOutfile::getLabels(

846

const String& source_database_filename,

847

String& ac_label,

848

String& sequence_start_label,

849

String& sequence_end_label,

850

String& comment_label,

851

String& species_label)

852

{

853

ac_label = sequence_start_label = sequence_end_label = comment_label = species_label = "";

854

ifstream source_database(source_database_filename.c_str());

855

if ( !source_database )

856

{

857

throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, source_database_filename);

858

}

859

860

String line;

861

while ( getline(source_database, line) && (sequence_start_label.empty()) )

862

{

863

if ( !line.empty() && (line[line.length()-1] < 33) ) line.resize(line.length()-1);

864

if ( line.trim().empty() ) continue;

865

866

else if ( line.hasPrefix(">") )

867

{

868

ac_label = ">";

869

sequence_start_label = ">";

870

sequence_end_label = ">";

871

comment_label = ";";

872

species_label = ">";

873

}

874

else if ( line.hasPrefix("SQ") )

875

{

876

ac_label = "AC";

877

sequence_start_label = "SQ";

878

sequence_end_label = "//";

879

comment_label = "CC";

880

species_label = "OS";

881

}

882

}

883

source_database.close();

884

source_database.clear();

885

886

// if no known start separator is found

887

if (sequence_start_label.empty())

888

{

889

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "database has unknown file format (neither trie nor FASTA nor swissprot)" , source_database_filename);

890

}

891

}

892

893

/**
  @brief Collects the record numbers of all result lines whose p-value passes the threshold.

  The first line of the result file is handed to readOutHeader() to obtain
  the column layout. Every following non-empty line with the expected number
  of tab-separated columns contributes its record number if its p-value is
  not greater than @p p_value_threshold.

  @param result_filename path of the InsPecT result file
  @param p_value_threshold largest accepted p-value, must lie in [0, 1]

  @return the distinct record numbers in ascending order

  @throw Exception::IllegalArgument if @p p_value_threshold is outside [0, 1]
  @throw Exception::FileNotFound if the result file cannot be opened
  @throw Exception::FileEmpty if the result file contains no line at all
  @throw Exception::ParseError if the needed columns are not identified in the header
         (readOutHeader() may throw as well; load() guards the same call
         with a catch for Exception::ParseError)
*/
vector<Size> InspectOutfile::getWantedRecords(const String& result_filename, DoubleReal p_value_threshold)
{
  // check whether the p_value is correct
  if ((p_value_threshold < 0) || (p_value_threshold > 1))
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the parameters 'p_value_threshold' must be >= 0 and <=1 !");
  }

  // the stream is closed by its destructor on every exit path
  ifstream result_file(result_filename.c_str());
  if (!result_file)
  {
    throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
  }

  String line;
  if (!getline(result_file, line))
  {
    throw Exception::FileEmpty(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
  }

  // get the header: without this call the column indices stay at -1 and
  // number_of_columns stays at 0, so no data line could ever be accepted
  Int
    spectrum_file_column(-1),
    scan_column(-1),
    peptide_column(-1),
    protein_column(-1),
    charge_column(-1),
    MQ_score_column(-1),
    p_value_column(-1),
    record_number_column(-1),
    DB_file_pos_column(-1),
    spec_file_pos_column(-1);
  Size number_of_columns(0);
  readOutHeader(result_filename, line, spectrum_file_column, scan_column, peptide_column, protein_column, charge_column, MQ_score_column, p_value_column, record_number_column, DB_file_pos_column, spec_file_pos_column, number_of_columns);

  // never index a line with a column that was not found
  if ((p_value_column < 0) || (record_number_column < 0))
  {
    throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "p-value or record number column not found in the header", result_filename);
  }

  vector<String> substrings;
  set<Size> wanted_records_set; // removes duplicates and keeps the numbers sorted

  while (getline(result_file, line))
  {
    if (!line.empty() && (line[line.length() - 1] < 33))
    {
      line.resize(line.length() - 1);
    }
    line.trim();
    if (line.empty())
    {
      continue;
    }
    line.split('\t', substrings);

    // lines with a wrong number of columns are corrupted and skipped
    if (substrings.size() != number_of_columns)
    {
      continue;
    }

    // take only those peptides whose p-value is less or equal the given threshold
    if (substrings[p_value_column].toFloat() > p_value_threshold)
    {
      continue;
    }

    wanted_records_set.insert(substrings[record_number_column].toInt());
  }

  return vector<Size>(wanted_records_set.begin(), wanted_records_set.end());
}

976

977

bool

978

InspectOutfile::getSearchEngineAndVersion(

979

const String& cmd_output,

980

ProteinIdentification& protein_identification)

981

{

982

protein_identification.setSearchEngine("InsPecT");

983

protein_identification.setSearchEngineVersion("unknown");

984

// searching for something like this: InsPecT version 20060907, InsPecT version 20100331

985

QString response(cmd_output.toQString());

158

}

159

160

while (getline(result_file, line))

161

{

162

++line_number;

163

if (!line.empty() && (line[line.length() - 1] < 33))

164

line.resize(line.length() - 1);

165

line.trim();

166

if (line.empty())

167

continue;

168

169

// check whether the line has enough columns

170

line.split('\t', substrings);

171

if (substrings.size() != number_of_columns)

172

{

173

corrupted_lines.push_back(line_number);

174

continue;

175

}

176

177

// if the pvalue is too small, skip the line

178

if (substrings[p_value_column].toFloat() > p_value_threshold)

179

continue;

180

181

// the protein

182

ProteinHit protein_hit;

183

// get accession number and type

184

getACAndACType(substrings[protein_column], accession, accession_type);

185

protein_hit.setAccession(accession);

186

// protein_hit.setScore(0.0);

187

188

// the database position of the protein (the i-th protein)

189

record_number = substrings[record_number_column].toInt();

190

191

// map the database position of the protein to its position in the protein hits and insert it, if it's a new protein

192

if (rn_position_map.find(record_number) == rn_position_map.end())

193

{

194

rn_position_map[record_number] = protein_identification.getHits().size();

195

protein_identification.insertHit(protein_hit);

196

}

197

198

// if a new scan is found (new file or new scan), insert it into the vector (the first time the condition is fullfilled because spectrum_file is "")

199

if ((substrings[spectrum_file_column] != spectrum_file) || ((Size) substrings[scan_column].toInt() != scan_number))

200

{

201

if (substrings[spectrum_file_column] != spectrum_file) // if it's a new file, insert it into the vector (used to retrieve RT and MT later)

202

{

203

// if it's the first file or if hits have been found in the file before, insert a new file

204

if (files_and_peptide_identification_with_scan_number.empty() || !files_and_peptide_identification_with_scan_number.back().second.empty())

205

{

206

files_and_peptide_identification_with_scan_number.push_back(make_pair(substrings[spectrum_file_column], vector<pair<Size, Size> >()));

207

}

208

// otherwise change the name of the last file entry (the one without hits)

209

else

210

files_and_peptide_identification_with_scan_number.back().first = substrings[spectrum_file_column];

211

}

212

213

spectrum_file = substrings[spectrum_file_column];

214

scan_number = substrings[scan_column].toInt();

215

216

// if it's not the first scan and if hits have been found, insert the peptide identification

217

if (!peptide_identification.empty() && !peptide_identification.getHits().empty())

218

{

219

files_and_peptide_identification_with_scan_number.back().second.push_back(make_pair(peptide_identifications.size(), scan_number));

220

peptide_identifications.push_back(peptide_identification);

221

}

222

peptide_identification = PeptideIdentification();

223

224

peptide_identification.setIdentifier(identifier);

225

peptide_identification.setSignificanceThreshold(p_value_threshold);

226

peptide_identification.setScoreType(score_type_);

227

}

228

229

// get the peptide infos from the new peptide and insert it

230

PeptideHit peptide_hit;

231

peptide_hit.setCharge(substrings[charge_column].toInt());

232

peptide_hit.setScore(substrings[MQ_score_column].toFloat());

233

peptide_hit.setRank(0); // all ranks are set to zero and assigned later

234

235

// get the sequence and the amino acid before and after

236

String sequence, sequence_with_mods;

237

sequence_with_mods = substrings[peptide_column];

238

start = sequence_with_mods.find('.') + 1;

239

end = sequence_with_mods.find_last_of('.');

240

if (start >= 2)

241

peptide_hit.setAABefore(sequence_with_mods[start - 2]);

242

if (end < sequence_with_mods.length() + 1)

243

peptide_hit.setAAAfter(sequence_with_mods[end + 1]);

244

245

//remove modifications (small characters and anything that's not in the alphabet)

246

sequence_with_mods = substrings[peptide_column].substr(start, end - start);

247

for (String::ConstIterator c_i = sequence_with_mods.begin(); c_i != sequence_with_mods.end(); ++c_i)

248

{

249

if ((bool) isalpha(*c_i) && (bool) isupper(*c_i))

250

sequence.append(1, *c_i);

251

}

252

253

peptide_hit.setSequence(sequence);

254

peptide_hit.addProteinAccession(accession);

255

256

peptide_identification.insertHit(peptide_hit);

257

}

258

259

// result file read

260

result_file.close();

261

result_file.clear();

262

263

// if it's not the first scan and if hits have been found, insert the peptide identification

264

if (!peptide_identification.empty() && !peptide_identification.getHits().empty())

265

{

266

files_and_peptide_identification_with_scan_number.back().second.push_back(make_pair(peptide_identifications.size(), scan_number));

267

peptide_identifications.push_back(peptide_identification);

268

}

269

270

// if the last file had no hits, delete it

271

if (!files_and_peptide_identification_with_scan_number.empty() && files_and_peptide_identification_with_scan_number.back().second.empty())

272

{

273

files_and_peptide_identification_with_scan_number.pop_back();

274

}

275

276

if (!peptide_identifications.empty())

277

peptide_identifications.back().assignRanks();

278

279

// search the sequence of the proteins

280

if (!protein_identification.getHits().empty() && !database_filename.empty())

281

{

282

vector<ProteinHit> protein_hits = protein_identification.getHits();

283

vector<String> sequences;

284

getSequences(database_filename, rn_position_map, sequences);

285

286

// set the retrieved sequences

287

vector<String>::const_iterator s_i = sequences.begin();

288

for (map<Size, Size>::const_iterator rn_i = rn_position_map.begin(); rn_i != rn_position_map.end(); ++rn_i, ++s_i)

289

protein_hits[rn_i->second].setSequence(*s_i);

290

291

sequences.clear();

292

rn_position_map.clear();

293

protein_identification.setHits(protein_hits);

294

protein_hits.clear();

295

}

296

297

// get the precursor retention times and mz values

298

getPrecursorRTandMZ(files_and_peptide_identification_with_scan_number, peptide_identifications);

299

protein_identification.setDateTime(datetime);

300

protein_identification.setIdentifier(identifier);

301

302

return corrupted_lines;

303

}

304

305

// < record number, number of protein in a vector >

306

/**
  @brief Reads the sequences of the wanted records from a trie database.

  The database is one stream of sequences separated by trie_delimiter_.
  Records are visited in the (ascending) key order of @p wanted_records;
  the records in between are skipped delimiter by delimiter.

  @param database_filename path of the trie database
  @param wanted_records map whose keys are the record numbers to fetch
  @param sequences [out] one entry is appended per wanted record, in key
         order; the entry is empty if no sequence could be read

  @return the record numbers for which no (non-empty) sequence was found

  @throw Exception::FileNotFound if the database cannot be opened
*/
vector<Size>
InspectOutfile::getSequences(
  const String & database_filename,
  const map<Size, Size> & wanted_records,
  vector<String> & sequences)
{
  ifstream database(database_filename.c_str());
  if (!database)
  {
    throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);
  }

  vector<Size> not_found;
  Size seen_records(0);
  stringbuf sequence;

  // the file size serves as an upper bound for the number of characters to skip
  database.seekg(0, ios::end);
  streampos sp = database.tellg();
  database.seekg(0, ios::beg);

  for (map<Size, Size>::const_iterator wr_i = wanted_records.begin(); wr_i != wanted_records.end(); ++wr_i)
  {
    // skip everything up to and including the delimiter in front of the wanted record
    for (; seen_records < wr_i->first; ++seen_records)
    {
      database.ignore(sp, trie_delimiter_);
    }
    database.get(sequence, trie_delimiter_);

    // get() sets failbit when it extracts nothing, e.g. for an empty record.
    // If this is the only error, reset it; otherwise every later ignore()/get()
    // would be a no-op and all remaining records would be reported missing.
    // End of file and hard errors are left untouched.
    if (database.rdstate() == ios::failbit)
    {
      database.clear();
    }

    sequences.push_back(sequence.str());
    if (sequences.back().empty())
    {
      not_found.push_back(wr_i->first);
    }
    sequence.str("");
  }

  // close the filestreams
  database.close();
  database.clear();

  return not_found;
}

344

345

void

346

InspectOutfile::getACAndACType(

347

String line,

348

String & accession,

349

String & accession_type)

350

{

351

String swissprot_prefixes = "JLOPQUX";

352

/// @todo replace this by general FastA implementation? (Martin)

353

accession.clear();

354

accession_type.clear();

355

pair<String, String> p;

356

// if it's a FASTA line

357

if (line.hasPrefix(">"))

358

line.erase(0, 1);

359

if (!line.empty() && (line[line.length() - 1] < 33))

360

line.resize(line.length() - 1);

361

line.trim();

362

363

// if it's a swissprot accession

364

if (line.hasPrefix("tr") || line.hasPrefix("sp"))

365

{

366

accession = line.substr(3, line.find('|', 3) - 3);

367

accession_type = "SwissProt";

368

}

369

else if (line.hasPrefix("gi"))

370

{

371

String::size_type snd(line.find('|', 3));

372

String::size_type third(0);

373

if (snd != String::npos)

374

{

375

third = line.find('|', ++snd) + 1;

376

377

accession = line.substr(third, line.find('|', third) - third);

378

accession_type = line.substr(snd, third - 1 - snd);

379

}

380

if (accession_type == "gb")

381

accession_type = "GenBank";

382

else if (accession_type == "emb")

383

accession_type = "EMBL";

384

else if (accession_type == "dbj")

385

accession_type = "DDBJ";

386

else if (accession_type == "ref")

387

accession_type = "NCBI";

388

else if ((accession_type == "sp") || (accession_type == "tr"))

389

accession_type = "SwissProt";

390

else if (accession_type == "gnl")

391

{

392

accession_type = accession;

393

snd = line.find('|', third);

394

third = line.find('|', ++snd);

395

if (third != String::npos)

396

accession = line.substr(snd, third - snd);

397

else

398

{

399

third = line.find(' ', snd);

400

if (third != String::npos)

401

accession = line.substr(snd, third - snd);

402

else

403

accession = line.substr(snd);

404

}

405

}

406

else

407

{

408

String::size_type pos1(line.find('(', 0));

409

String::size_type pos2(0);

410

if (pos1 != String::npos)

411

{

412

pos2 = line.find(')', ++pos1);

413

if (pos2 != String::npos)

414

{

415

accession = line.substr(pos1, pos2 - pos1);

416

if ((accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos))

417

accession_type = "SwissProt";

418

else

419

accession.clear();

420

}

421

}

422

if (accession.empty())

423

{

424

accession_type = "gi";

425

if (snd != String::npos)

426

accession = line.substr(3, snd - 4);

427

else

428

{

429

if (snd == String::npos)

430

snd = line.find(' ', 3);

431

if (snd != String::npos)

432

accession = line.substr(3, snd - 3);

433

else

434

accession = line.substr(3);

435

}

436

}

437

}

438

}

439

else if (line.hasPrefix("ref"))

440

{

441

accession = line.substr(4, line.find('|', 4) - 4);

442

accession_type = "NCBI";

443

}

444

else if (line.hasPrefix("gnl"))

445

{

446

line.erase(0, 3);

447

accession_type = line.substr(0, line.find('|', 0));

448

accession = line.substr(accession_type.length() + 1);

449

}

450

else if (line.hasPrefix("lcl"))

451

{

452

line.erase(0, 4);

453

accession_type = "lcl";

454

accession = line;

455

}

456

else

457

{

458

String::size_type pos1(line.find('(', 0));

459

String::size_type pos2(0);

460

if (pos1 != String::npos)

461

{

462

pos2 = line.find(')', ++pos1);

463

if (pos2 != String::npos)

464

{

465

accession = line.substr(pos1, pos2 - pos1);

466

if ((accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos))

467

accession_type = "SwissProt";

468

else

469

accession.clear();

470

}

471

}

472

if (accession.empty())

473

{

474

pos1 = line.find('|');

475

accession = line.substr(0, pos1);

476

if ((accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos))

477

accession_type = "SwissProt";

478

else

479

{

480

pos1 = line.find(' ');

481

accession = line.substr(0, pos1);

482

if ((accession.size() == 6) && (String(swissprot_prefixes).find(accession[0], 0) != String::npos))

483

accession_type = "SwissProt";

484

else

485

{

486

accession = line.substr(0, 6);

487

if (String(swissprot_prefixes).find(accession[0], 0) != String::npos)

488

accession_type = "SwissProt";

489

else

490

accession.clear();

491

}

492

}

493

}

494

}

495

if (accession.empty())

496

{

497

accession = line.trim();

498

accession_type = "unknown";

499

}

500

}

501

502

void

503

InspectOutfile::getPrecursorRTandMZ(

504

const vector<pair<String, vector<pair<Size, Size> > > > & files_and_peptide_identification_with_scan_number,

505

vector<PeptideIdentification> & ids)

506

{

507

MSExperiment<> experiment;

508

String type;

509

510

for (vector<pair<String, vector<pair<Size, Size> > > >::const_iterator fs_i = files_and_peptide_identification_with_scan_number.begin(); fs_i != files_and_peptide_identification_with_scan_number.end(); ++fs_i)

511

{

512

getExperiment(experiment, type, fs_i->first); // may throw an exception if the filetype could not be determined

513

514

if (experiment.size() < fs_i->second.back().second)

515

{

516

517

}

518

519

for (vector<pair<Size, Size> >::const_iterator pi_scan_i = fs_i->second.begin(); pi_scan_i != fs_i->second.end(); ++pi_scan_i)

520

{

521

ids[pi_scan_i->first].setMetaValue("MZ", experiment[pi_scan_i->second - 1].getPrecursors()[0].getMZ());

522

ids[pi_scan_i->first].setMetaValue("RT", experiment[pi_scan_i->second - 1].getRT());

523

}

524

}

525

}

526

527

void

528

InspectOutfile::compressTrieDB(

529

const String & database_filename,

530

const String & index_filename,

531

vector<Size> & wanted_records,

532

const String & snd_database_filename,

533

const String & snd_index_filename,

534

bool append)

535

{

536

if (database_filename == snd_database_filename)

537

{

538

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Same filename can not be used for original and second database!", database_filename);

539

}

540

if (index_filename == snd_index_filename)

541

{

542

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Same filename can not be used for original and second database!", index_filename);

543

}

544

ifstream database(database_filename.c_str());

545

if (!database)

546

{

547

throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);

548

}

549

550

ifstream index(index_filename.c_str(), ios::in | ios::binary);

551

if (!index)

552

{

553

database.close();

554

database.clear();

555

throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, index_filename);

556

}

557

558

// determine the length of the index file

559

index.seekg(0, ios::end);

560

streampos index_length = index.tellg();

561

index.seekg(0, ios::beg);

562

bool empty_records = wanted_records.empty();

563

if (wanted_records.empty())

564

{

565

for (Size i = 0; i < index_length / record_length_; ++i)

566

wanted_records.push_back(i);

567

}

568

569

// take the wanted records, copy their sequences to the new db and write the index file accordingly

570

ofstream snd_database;

571

if (append)

572

snd_database.open(snd_database_filename.c_str(), std::ios::out | std::ios::app);

573

else

574

snd_database.open(snd_database_filename.c_str(), std::ios::out | std::ios::trunc);

575

if (!snd_database)

576

{

577

database.close();

578

database.clear();

579

index.close();

580

index.clear();

581

throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, snd_database_filename);

582

}

583

584

ofstream snd_index;

585

if (append)

586

snd_index.open(snd_index_filename.c_str(), std::ios::out | std::ios::binary | std::ios::app);

587

else

588

snd_index.open(snd_index_filename.c_str(), std::ios::out | std::ios::binary | std::ios::trunc);

589

if (!snd_index)

590

{

591

database.close();

592

database.clear();

593

index.close();

594

index.clear();

595

snd_database.close();

596

snd_database.clear();

597

throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, snd_index_filename);

598

}

599

600

char * index_record = new char[record_length_]; // to copy one record from the index file

601

Size database_pos(0), snd_database_pos(0); // their sizes HAVE TO BE 4 bytes

602

stringbuf sequence;

603

streampos index_pos(0);

604

605

for (vector<Size>::const_iterator wr_i = wanted_records.begin(); wr_i != wanted_records.end(); ++wr_i)

606

{

607

// get the according record in the index file

608

if (index_length < Int((*wr_i + 1) * record_length_)) // if the file is too short

609

{

610

delete[] index_record;

611

database.close();

612

database.clear();

613

index.close();

614

index.clear();

615

snd_database.close();

616

snd_database.clear();

617

snd_index.close();

618

snd_index.clear();

619

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "index file is too short!", index_filename);

620

}

621

index.seekg((*wr_i) * record_length_);

622

index.read(index_record, record_length_);

623

624

// all but the first sequence are preceded by an asterisk

625

if (append)

626

snd_database.put(trie_delimiter_);

627

append = true;

628

629

// check if we have to reverse the database_pos part (which is saved in little endian)

630

if (OPENMS_IS_BIG_ENDIAN)

631

{

632

char tmp;

633

for (Size i = 0; i < trie_db_pos_length_ / 2; i++)

634

{

635

tmp = index_record[db_pos_length_ + i];

636

index_record[db_pos_length_ + i] = index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i];

637

index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;

638

}

639

}

640

641

// go to the beginning of the sequence

642

643

// whoever wrote this code - please don't ever do this again.

644

// x86 does *not* have a monopoly, nor does little endian.

645

memcpy(&database_pos, index_record + db_pos_length_, trie_db_pos_length_);

646

database.seekg(database_pos);

647

648

// store the corresponding index for the second database

649

snd_database_pos = snd_database.tellp(); // get the position in the second database

650

651

memcpy(index_record + db_pos_length_, &snd_database_pos, trie_db_pos_length_); // and copy to its place in the index record

652

653

// fixing the above "suboptimal" code

654

if (OPENMS_IS_BIG_ENDIAN)

655

{

656

char tmp;

657

for (Size i = 0; i < trie_db_pos_length_ / 2; i++)

658

{

659

tmp = index_record[db_pos_length_ + i];

660

index_record[db_pos_length_ + i] = index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i];

661

index_record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;

662

}

663

}

664

665

snd_index.write((char *) index_record, record_length_); // because only the trie-db position changed, not the position in the original database, nor the protein name

666

667

// store the sequence

668

database.get(sequence, trie_delimiter_);

669

snd_database << sequence.str();

670

sequence.str("");

671

}

672

673

674

if (empty_records)

675

wanted_records.clear();

676

delete[] index_record;

677

database.close();

678

database.clear();

679

index.close();

680

index.clear();

681

snd_database.close();

682

snd_database.clear();

683

snd_index.close();

684

snd_index.clear();

685

}

686

687

void

688

InspectOutfile::generateTrieDB(

689

const String & source_database_filename,

690

const String & database_filename,

691

const String & index_filename,

692

bool append,

693

const String species)

694

{

695

ifstream source_database(source_database_filename.c_str());

696

if (!source_database)

697

{

698

throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, source_database_filename);

699

}

700

701

// get the labels

702

String ac_label, sequence_start_label, sequence_end_label, comment_label, species_label;

703

getLabels(source_database_filename, ac_label, sequence_start_label, sequence_end_label, comment_label, species_label);

704

705

ofstream database;

706

if (append)

707

database.open(database_filename.c_str(), ios::app | ios::out);

708

else

709

database.open(database_filename.c_str());

710

if (!database)

711

{

712

source_database.close();

713

source_database.clear();

714

throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, database_filename);

715

}

716

ofstream index;

717

if (append)

718

index.open(index_filename.c_str(), ios::app | ios::out | ios::binary);

719

else

720

index.open(index_filename.c_str(), ios::out | ios::binary);

721

if (!index)

722

{

723

source_database.close();

724

source_database.clear();

725

database.close();

726

database.clear();

727

throw Exception::UnableToCreateFile(__FILE__, __LINE__, __PRETTY_FUNCTION__, index_filename);

728

}

729

730

// using flags to mark what has already been read

731

// the flags

732

unsigned char ac_flag = 1;

733

unsigned char species_flag = !species.empty() * 2; // if no species is given, take all proteins

734

unsigned char sequence_flag = 4;

735

// the value

736

unsigned char record_flags = 0;

737

738

String::size_type pos(0); // the position in a line

739

unsigned long long source_database_pos = source_database.tellg(); // the start of a protein in the source database

740

unsigned long long source_database_pos_buffer = 0; // because you don't know whether a new protein starts unless the line is read, the actual position is buffered before any new getline

741

Size database_pos(0);

742

String line, sequence, protein_name;

743

char * record = new char[record_length_]; // a record in the index file

744

char * protein_name_pos = record + db_pos_length_ + trie_db_pos_length_;

745

746

while (getline(source_database, line))

747

{

748

if (!line.empty() && (line[line.length() - 1] < 33))

749

line.resize(line.length() - 1);

750

line.trim();

751

752

// empty and comment lines are skipped

753

if (line.empty() || line.hasPrefix(comment_label))

754

{

755

source_database_pos_buffer = source_database.tellg();

756

continue;

757

}

758

759

// read the sequence if the accession and the species have been read already

760

if (record_flags == (ac_flag | species_flag | sequence_flag))

761

{

762

if (!line.hasPrefix(sequence_end_label)) // if it is still the same protein, append the sequence

763

{

764

line.trim(); // erase all whitespaces from the sequence

765

line.remove(trie_delimiter_);

766

// save this part of the sequence

767

sequence.append(line);

768

}

769

else // if a new protein is found, write down the old one

770

{

771

// if the sequence is not empty, the record has the correct form

772

if (!sequence.empty())

773

{

774

// all but the first record in the database are preceded by an asterisk (if in append mode an asterisk has to be put at any time)

775

if (append)

776

database.put('*');

777

database_pos = database.tellp();

778

779

// write the record

780

memcpy(record, &source_database_pos, db_pos_length_); // source database position

781

if (OPENMS_IS_BIG_ENDIAN)

782

{

783

char tmp;

784

for (Size i = 0; i < db_pos_length_ / 2; i++)

785

{

786

tmp = record[i];

787

record[i] = record[db_pos_length_ - 1 - i];

788

record[db_pos_length_ - 1 - i] = tmp;

789

}

790

}

791

792

// whoever wrote this code - please don't ever do this again.

793

// x86 does *not* have a monopoly, nor does little endian.

794

memcpy(record + db_pos_length_, &database_pos, trie_db_pos_length_); // database position

795

796

// fix the above "suboptimal" code

797

if (OPENMS_IS_BIG_ENDIAN)

798

{

799

char tmp;

800

for (Size i = 0; i < trie_db_pos_length_ / 2; i++)

801

{

802

tmp = record[db_pos_length_ + i];

803

record[db_pos_length_ + i] = record[db_pos_length_ + trie_db_pos_length_ - 1 - i];

804

record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;

805

}

806

}

807

808

index.write(record, record_length_);

809

// protein name / accession has already been written

810

database << sequence;

811

source_database_pos = source_database_pos_buffer; // the position of the start of the new protein

812

append = true;

813

}

814

sequence.clear();

815

816

// set back the record flags for a new record

817

record_flags = 0;

818

}

819

}

820

821

// if not reading the sequence

822

if (!(record_flags & sequence_flag))

823

{

824

if (line.hasPrefix(ac_label))

825

{

826

pos = ac_label.length(); // find the beginning of the accession

827

828

while ((line.length() > pos) && (line[pos] < 33))

829

++pos; // discard the whitespaces after the label

830

if (pos != line.length()) // if no accession is found, skip this protein

831

{

832

memset(protein_name_pos, 0, protein_name_length_); // clear the protein name

833

// read at most protein_name_length_ characters from the record name and write them to the record

834

protein_name = line.substr(pos, protein_name_length_);

835

protein_name.substitute('>', '}');

836

memcpy(protein_name_pos, protein_name.c_str(), protein_name.length());

837

838

record_flags |= ac_flag; // set the ac flag

839

}

840

else

841

record_flags = 0;

842

}

843

// if a species line is found and an accession has already been found, check whether this record is from the wanted species, if not, skip it

844

if (species_flag && line.hasPrefix(species_label) && (record_flags == ac_flag))

845

{

846

pos = species_label.length();

847

if (line.find(species, pos) != String::npos)

848

record_flags |= species_flag;

849

else

850

record_flags = 0;

851

}

852

// if the beginning of the sequence is found and accession and correct species have been found

853

if (line.hasPrefix(sequence_start_label) && ((record_flags & (ac_flag | species_flag)) == (ac_flag | species_flag)))

854

record_flags |= sequence_flag;

855

}

856

source_database_pos_buffer = source_database.tellg();

857

}

858

// source file read

859

source_database.close();

860

source_database.clear();

861

862

// if the last record has no sequence end label, the sequence has to be appended nevertheless (e.g. FASTA)

863

if (record_flags == (ac_flag | species_flag | sequence_flag) && !sequence.empty())

864

{

865

// all but the first record in the database are preceded by an asterisk (if in append mode an asterisk has to be put at any time)

866

if (append)

867

database.put('*');

868

database_pos = database.tellp();

869

870

// write the record

871

// whoever wrote this code - please don't ever do this again.

872

// x86 does *not* have a monopoly, nor does little endian.

873

memcpy(record, &source_database_pos, db_pos_length_); // source database position

874

if (OPENMS_IS_BIG_ENDIAN)

875

{

876

char tmp;

877

for (Size i = 0; i < db_pos_length_ / 2; i++)

878

{

879

tmp = record[i];

880

record[i] = record[db_pos_length_ - 1 - i];

881

record[db_pos_length_ - 1 - i] = tmp;

882

}

883

}

884

885

memcpy(record + db_pos_length_, &database_pos, trie_db_pos_length_); // database position

886

887

// fix the above "suboptimal" code

888

if (OPENMS_IS_BIG_ENDIAN)

889

{

890

char tmp;

891

for (Size i = 0; i < trie_db_pos_length_ / 2; i++)

892

{

893

tmp = record[db_pos_length_ + i];

894

record[db_pos_length_ + i] = record[db_pos_length_ + trie_db_pos_length_ - 1 - i];

895

record[db_pos_length_ + trie_db_pos_length_ - 1 - i] = tmp;

896

}

897

}

898

899

index.write(record, record_length_);

900

// protein name / accession has already been written

901

database << sequence;

902

append = true;

903

}

904

905

delete[] record;

906

907

// close the filestreams

908

database.close();

909

database.clear();

910

index.close();

911

index.clear();

912

}

913

914

void InspectOutfile::getLabels(

915

const String & source_database_filename,

916

String & ac_label,

917

String & sequence_start_label,

918

String & sequence_end_label,

919

String & comment_label,

920

String & species_label)

921

{

922

ac_label = sequence_start_label = sequence_end_label = comment_label = species_label = "";

923

ifstream source_database(source_database_filename.c_str());

924

if (!source_database)

925

{

926

throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, source_database_filename);

927

}

928

929

String line;

930

while (getline(source_database, line) && (sequence_start_label.empty()))

931

{

932

if (!line.empty() && (line[line.length() - 1] < 33))

933

line.resize(line.length() - 1);

934

if (line.trim().empty())

935

continue;

936

937

else if (line.hasPrefix(">"))

938

{

939

ac_label = ">";

940

sequence_start_label = ">";

941

sequence_end_label = ">";

942

comment_label = ";";

943

species_label = ">";

944

}

945

else if (line.hasPrefix("SQ"))

946

{

947

ac_label = "AC";

948

sequence_start_label = "SQ";

949

sequence_end_label = "//";

950

comment_label = "CC";

951

species_label = "OS";

952

}

953

}

954

source_database.close();

955

source_database.clear();

956

957

// if no known start separator is found

958

if (sequence_start_label.empty())

959

{

960

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "database has unknown file format (neither trie nor FASTA nor swissprot)", source_database_filename);

961

}

962

}

963

964

/**
  @brief Collects the record numbers of all result lines whose p-value does
         not exceed the given threshold.

  The first line of the result file is parsed as the header (via
  readOutHeader()) to find the positions of the 'p-value' and 'RecordNumber'
  columns and the expected number of columns. Data lines with a different
  number of columns are skipped.

  @param result_filename    path of the Inspect result file
  @param p_value_threshold  upper bound for the p-value, must lie in [0, 1]

  @return the distinct record numbers in ascending order

  @throws Exception::IllegalArgument if p_value_threshold is outside [0, 1]
  @throws Exception::FileNotFound if the result file cannot be opened
  @throws Exception::FileEmpty if the result file contains no line at all
  @throws Exception::ParseError (from readOutHeader()) if a required column
          is missing in the header
*/
vector<Size> InspectOutfile::getWantedRecords(const String & result_filename, DoubleReal p_value_threshold)
{
  // check whether the p_value is correct
  if ((p_value_threshold < 0) || (p_value_threshold > 1))
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the parameters 'p_value_threshold' must be >= 0 and <=1 !");
  }

  ifstream result_file(result_filename.c_str());
  if (!result_file)
  {
    throw Exception::FileNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
  }

  String line;
  if (!getline(result_file, line))
  {
    result_file.close();
    result_file.clear();
    throw Exception::FileEmpty(__FILE__, __LINE__, __PRETTY_FUNCTION__, result_filename);
  }

  // normalise the header exactly like the data lines below, so that a
  // trailing '\r' does not hide the last column title
  if (!line.empty() && (line[line.length() - 1] < 33))
  {
    line.resize(line.length() - 1);
  }
  line.trim();

  // get the header; without this the column indices would stay at -1 and
  // number_of_columns at 0, i.e. every data line would be rejected
  Int
    spectrum_file_column(-1),
    scan_column(-1),
    peptide_column(-1),
    protein_column(-1),
    charge_column(-1),
    MQ_score_column(-1),
    p_value_column(-1),
    record_number_column(-1),
    DB_file_pos_column(-1),
    spec_file_pos_column(-1);
  Size number_of_columns(0);

  // on a ParseError the stream is closed by its destructor during unwinding
  readOutHeader(result_filename, line, spectrum_file_column, scan_column, peptide_column, protein_column, charge_column, MQ_score_column, p_value_column, record_number_column, DB_file_pos_column, spec_file_pos_column, number_of_columns);

  vector<String> substrings;
  set<Size> wanted_records_set; // removes duplicates and keeps the numbers sorted

  while (getline(result_file, line))
  {
    if (!line.empty() && (line[line.length() - 1] < 33))
    {
      line.resize(line.length() - 1);
    }
    line.trim();
    if (line.empty())
    {
      continue;
    }
    line.split('\t', substrings);

    // skip lines that do not have the same number of columns as the header
    if (substrings.size() != number_of_columns)
    {
      continue;
    }

    // take only those peptides whose p-value is less or equal the given threshold
    if (substrings[p_value_column].toFloat() > p_value_threshold)
    {
      continue;
    }

    wanted_records_set.insert(substrings[record_number_column].toInt());
  }

  result_file.close();
  result_file.clear();

  return vector<Size>(wanted_records_set.begin(), wanted_records_set.end());
}

1051

1052

bool

1053

InspectOutfile::getSearchEngineAndVersion(

1054

const String & cmd_output,

1055

ProteinIdentification & protein_identification)

1056

{

1057

protein_identification.setSearchEngine("InsPecT");

1058

protein_identification.setSearchEngineVersion("unknown");

1059

// searching for something like this: InsPecT version 20060907, InsPecT version 20100331

1060

QString response(cmd_output.toQString());

986

1061

QRegExp rx("InsPecT (version|vesrion) (\\d+)"); // older versions of InsPecT have typo...

987

if (rx.indexIn(response) == -1) return false;

988

protein_identification.setSearchEngineVersion(String(rx.cap(2)));

1062

if (rx.indexIn(response) == -1)

1063

return false;

1064

1065

protein_identification.setSearchEngineVersion(String(rx.cap(2)));

989

1066

return true;

990

}

991

992

void

993

InspectOutfile::readOutHeader(

994

const String& filename,

995

const String& header_line,

996

Int& spectrum_file_column,

997

Int& scan_column,

998

Int& peptide_column,

999

Int& protein_column,

1000

Int& charge_column,

1001

Int& MQ_score_column,

1002

Int& p_value_column,

1003

Int& record_number_column,

1004

Int& DB_file_pos_column,

1005

Int& spec_file_pos_column,

1006

Size& number_of_columns)

1007

{

1008

spectrum_file_column = scan_column = peptide_column = protein_column = charge_column = MQ_score_column = p_value_column = record_number_column = DB_file_pos_column = spec_file_pos_column = -1;

1009

1010

vector< String > substrings;

1011

header_line.split('\t', substrings);

1012

1013

// #SpectrumFile Scan# Annotation Protein Charge MQScore Length TotalPRMScore MedianPRMScore FractionY FractionB Intensity NTT p-value F-Score DeltaScore DeltaScoreOther RecordNumber DBFilePos SpecFilePos

1014

for ( vector< String >::const_iterator s_i = substrings.begin(); s_i != substrings.end(); ++s_i )

1015

{

1016

if ( (*s_i) == "#SpectrumFile" ) spectrum_file_column = s_i - substrings.begin();

1017

else if ( (*s_i) == "Scan#" ) scan_column = s_i - substrings.begin();

1018

else if ( (*s_i) == "Annotation" ) peptide_column = s_i - substrings.begin();

1019

else if ( (*s_i) == "Protein" ) protein_column = s_i - substrings.begin();

1020

else if ( (*s_i) == "Charge" ) charge_column = s_i - substrings.begin();

1021

else if ( (*s_i) == "MQScore" ) MQ_score_column = s_i - substrings.begin();

1022

else if ( (*s_i) == "p-value" ) p_value_column = s_i - substrings.begin();

1023

else if ( (*s_i) == "RecordNumber" ) record_number_column = s_i - substrings.begin();

1024

else if ( (*s_i) == "DBFilePos" ) DB_file_pos_column = s_i - substrings.begin();

1025

else if ( (*s_i) == "SpecFilePos" ) spec_file_pos_column = s_i - substrings.begin();

1026

}

1027

1028

if ( (spectrum_file_column == -1) || (scan_column == -1) || (peptide_column == -1) || (protein_column == -1) || (charge_column == -1) || (MQ_score_column == -1) || (p_value_column == -1) || (record_number_column == -1) || (DB_file_pos_column == -1) || (spec_file_pos_column == -1) )

1029

{

1030

throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "at least one of the columns '#SpectrumFile', 'Scan#', 'Annotation', 'Protein', 'Charge', 'MQScore', 'p-value', 'RecordNumber', 'DBFilePos' or 'SpecFilePos' is missing!", filename);

1031

}

1032

number_of_columns = substrings.size();

1033

}

1034

1035

const Size InspectOutfile::db_pos_length_ = 8;

1036

const Size InspectOutfile::trie_db_pos_length_ = 4;

1037

const Size InspectOutfile::protein_name_length_ = 80;

1038

const Size InspectOutfile::record_length_ = db_pos_length_ + trie_db_pos_length_ + protein_name_length_;

1039

const char InspectOutfile::trie_delimiter_ = '*';

1040

const String InspectOutfile::score_type_ = "Inspect";

1041

1067

}

1068

1069

void

1070

InspectOutfile::readOutHeader(

1071

const String & filename,

1072

const String & header_line,

1073

Int & spectrum_file_column,

1074

Int & scan_column,

1075

Int & peptide_column,

1076

Int & protein_column,

1077

Int & charge_column,

1078

Int & MQ_score_column,

1079

Int & p_value_column,

1080

Int & record_number_column,

1081

Int & DB_file_pos_column,

1082

Int & spec_file_pos_column,

1083

Size & number_of_columns)

1084

{

1085

spectrum_file_column = scan_column = peptide_column = protein_column = charge_column = MQ_score_column = p_value_column = record_number_column = DB_file_pos_column = spec_file_pos_column = -1;

1086

1087

vector<String> substrings;

1088

header_line.split('\t', substrings);

1089

1090

1091

for (vector<String>::const_iterator s_i = substrings.begin(); s_i != substrings.end(); ++s_i)

1092

{

1093

if ((*s_i) == "#SpectrumFile")

1094

spectrum_file_column = s_i - substrings.begin();

1095

else if ((*s_i) == "Scan#")

1096

scan_column = s_i - substrings.begin();

1097

else if ((*s_i) == "Annotation")

1098

peptide_column = s_i - substrings.begin();

1099

else if ((*s_i) == "Protein")

1100

protein_column = s_i - substrings.begin();

1101

else if ((*s_i) == "Charge")

1102

charge_column = s_i - substrings.begin();

1103

else if ((*s_i) == "MQScore")

1104

MQ_score_column = s_i - substrings.begin();

1105

else if ((*s_i) == "p-value")

1106

p_value_column = s_i - substrings.begin();

1107

else if ((*s_i) == "RecordNumber")

1108

record_number_column = s_i - substrings.begin();

1109

else if ((*s_i) == "DBFilePos")

1110

DB_file_pos_column = s_i - substrings.begin();

1111

else if ((*s_i) == "SpecFilePos")

1112

spec_file_pos_column = s_i - substrings.begin();

1113

}

1114

1115

if ((spectrum_file_column == -1) || (scan_column == -1) || (peptide_column == -1) || (protein_column == -1) || (charge_column == -1) || (MQ_score_column == -1) || (p_value_column == -1) || (record_number_column == -1) || (DB_file_pos_column == -1) || (spec_file_pos_column == -1))

1116

{

1117

1118

}

1119

number_of_columns = substrings.size();

1120

}

1121

1122

const Size InspectOutfile::db_pos_length_ = 8;

1123

const Size InspectOutfile::trie_db_pos_length_ = 4;

1124

const Size InspectOutfile::protein_name_length_ = 80;

1125

const Size InspectOutfile::record_length_ = db_pos_length_ + trie_db_pos_length_ + protein_name_length_;

1126

const char InspectOutfile::trie_delimiter_ = '*';

1127

const String InspectOutfile::score_type_ = "Inspect";

1128

1042

1129

} //namespace OpenMS

Older »