~ubuntu-branches/ubuntu/utopic/amap-align/utopic : revision 6

1

/////////////////////////////////////////////////////////////////

2

// Amap.cc

3

//

4

// Main routines for AMAP program.

5

/////////////////////////////////////////////////////////////////

6

7

#include "SafeVector.h"

8

#include "MultiSequence.h"

9

#include "MultiSequenceDag.h"

10

#include "Defaults.h"

11

#include "ScoreType.h"

12

#include "ProbabilisticModel.h"

13

#include "EvolutionaryTree.h"

14

#include "SparseMatrix.h"

15

#include <string>

16

#include <sstream>

17

#include <iomanip>

18

#include <iostream>

19

#include <list>

20

#include <set>

21

#include <algorithm>

22

#include <cstdio>

23

#include <cstdlib>

24

#include <cerrno>

25

#include <iomanip>

26

27

string parametersInputFilename = "";

28

string parametersOutputFilename = "no training";

29

string annotationFilename = "";

30

31

bool enableTraining = false;

32

bool enableVerbose = false;

33

bool enableAllPairs = false;

34

bool enableAnnotation = false;

35

bool enableViterbi = false;

36

bool enableClustalWOutput = false;

37

bool enableTrainEmissions = false;

38

bool enableAlignOrder = false;

39

bool enableDagAlignment = true;

40

bool enableEdgeReordering = true;

41

bool useTgf = true;

42

bool onlyPrintPosteriors = false;

43

bool outputForGUI = false;

44

45

int numConsistencyReps = 0;

46

int numPreTrainingReps = 0;

47

int numIterativeRefinementReps = 0;

48

int guiStepSize = 1;

49

50

float guiStartWeight = std::numeric_limits<float>::max();

51

float cutoff = 0;

52

float gapOpenPenalty = 0;

53

float gapContinuePenalty = 0;

54

VF initDistrib (NumMatrixTypes);

55

VF gapOpen (2*NumInsertStates);

56

VF gapExtend (2*NumInsertStates);

57

VVF emitPairs (256, VF (256, 1e-10));

58

VF emitSingle (256, 1e-5);

59

string alphabet = alphabetDefault;

60

float gapFactor = gapFactorDefault;

61

float edgeWeightThreshold = 0;

62

63

const int MIN_PRETRAINING_REPS = 0;

64

const int MAX_PRETRAINING_REPS = 20;

65

const int MIN_CONSISTENCY_REPS = 0;

66

const int MAX_CONSISTENCY_REPS = 5;

67

const int MIN_ITERATIVE_REFINEMENT_REPS = 0;

68

const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;

69

70

/////////////////////////////////////////////////////////////////

71

// Function prototypes

72

/////////////////////////////////////////////////////////////////

73

74

void PrintHeading();

75

void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,

76

const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);

77

MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,

78

VVF &emitPairs, VF &emitSingle);

79

SafeVector<string> ParseParams (int argc, char **argv);

80

void ReadParameters ();

81

MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,

82

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

83

const ProbabilisticModel &model);

84

MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,

85

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

86

const ProbabilisticModel &model);

87

SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,

88

SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);

89

void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);

90

91

set<int> GetSubtree (const TreeNode *tree);

92

void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

93

const ProbabilisticModel &model, MultiSequence* &alignment,

94

const TreeNode *tree);

95

void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

96

const ProbabilisticModel &model, MultiSequence* &alignment);

97

void WriteAnnotation (MultiSequence *alignment,

98

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);

99

int ComputeScore (const SafeVector<pair<int, int> > &active,

100

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);

101

102

103

/////////////////////////////////////////////////////////////////

104

// main()

105

//

106

// Calls all initialization routines and runs the AMAP

107

// aligner.

108

/////////////////////////////////////////////////////////////////

109

110

int main (int argc, char **argv){

111

112

// print AMAP heading

113

PrintHeading();

114

115

// parse program parameters

116

SafeVector<string> sequenceNames = ParseParams (argc, argv);

117

ReadParameters();

118

PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);

119

120

// now, we'll process all the files given as input. If we are given

121

// several filenames as input, then we'll load all of those sequences

122

// simultaneously, as long as we're not training. On the other hand,

123

// if we are training, then we'll treat each file as a separate

124

// training instance

125

126

// if we are training

127

if (enableTraining){

128

129

// build new model for aligning

130

ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);

131

132

// prepare to average parameters

133

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;

134

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;

135

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;

136

if (enableTrainEmissions){

137

for (int i = 0; i < (int) emitPairs.size(); i++)

138

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;

139

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;

140

}

141

142

// align each file individually

143

for (int i = 0; i < (int) sequenceNames.size(); i++){

144

145

VF thisInitDistrib (NumMatrixTypes);

146

VF thisGapOpen (2*NumInsertStates);

147

VF thisGapExtend (2*NumInsertStates);

148

VVF thisEmitPairs (256, VF (256, 1e-10));

149

VF thisEmitSingle (256, 1e-5);

150

151

// load sequence file

152

MultiSequence *sequences = new MultiSequence(); assert (sequences);

153

cerr << "Loading sequence file: " << sequenceNames[i] << endl;

154

sequences->LoadMFA (sequenceNames[i], true);

155

156

// align sequences

157

MultiSequence *tmpMsa = DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);

158

delete tmpMsa;

159

160

// add in contribution of the derived parameters

161

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];

162

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];

163

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];

164

if (enableTrainEmissions){

165

for (int i = 0; i < (int) emitPairs.size(); i++)

166

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];

167

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];

168

}

169

170

delete sequences;

171

}

172

173

// compute new parameters and print them out

174

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();

175

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();

176

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();

177

if (enableTrainEmissions){

178

for (int i = 0; i < (int) emitPairs.size(); i++)

179

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();

180

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();

181

}

182

183

PrintParameters ("Trained parameter set:",

184

initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,

185

parametersOutputFilename.c_str());

186

}

187

188

// if we are not training, we must simply want to align some sequences

189

else {

190

191

bool allStartWithM = true;

192

// load all files together

193

MultiSequence *sequences = new MultiSequence(); assert (sequences);

194

for (int i = 0; i < (int) sequenceNames.size(); i++){

195

cerr << "Loading sequence file: " << sequenceNames[i] << endl;

196

sequences->LoadMFA (sequenceNames[i], true);

197

}

198

199

for (int i = 0; i < sequences->GetNumSequences(); i++){

200

char firstChar = *(++(sequences->GetSequence(i)->GetDataPtr()));

201

allStartWithM = firstChar == 'M' ? allStartWithM : false;

202

}

203

204

if (allStartWithM) {

205

for (int i = 0; i < sequences->GetNumSequences(); i++){

206

int seqLength = sequences->GetSequence(0)->GetLength();

207

Sequence* newSeq = sequences->GetSequence(0)->GetRange(2,seqLength);

208

sequences->RemoveSequence(0);

209

sequences->AddSequence(newSeq);

210

}

211

}

212

213

// do all "pre-training" repetitions first

214

for (int ct = 0; ct < numPreTrainingReps; ct++){

215

enableTraining = true;

216

217

// build new model for aligning

218

ProbabilisticModel model (initDistrib, gapOpen, gapExtend,

219

emitPairs, emitSingle);

220

221

// do initial alignments

222

DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);

223

224

// print new parameters

225

PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);

226

227

enableTraining = false;

228

}

229

230

// now, we can perform the alignments and write them out

231

MultiSequence *alignment = DoAlign (sequences,

232

ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),

233

initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);

234

235

if (allStartWithM) {

236

for (int i = 0; i < alignment->GetNumSequences(); i++){

237

SafeVector<char>* newData = new SafeVector<char>;

238

Sequence* currSeq = alignment->GetSequence(0);

239

SafeVector<char>::iterator dataItr = currSeq->GetDataPtr();

240

newData->push_back(*dataItr++);

241

newData->push_back('M');

242

for (int j = 0; j < currSeq->GetLength(); j++) {

243

newData->push_back(*dataItr++);

244

}

245

Sequence *newSeq = new Sequence(newData, currSeq->GetHeader(), currSeq->GetLength() + 1, currSeq->GetSortLabel(), currSeq->GetLabel());

246

alignment->AddSequence(newSeq);

247

alignment->RemoveSequence(0);

248

}

249

}

250

251

if (onlyPrintPosteriors)

252

return 0;

253

if (!enableAllPairs && !outputForGUI){

254

if (enableClustalWOutput)

255

alignment->WriteALN (cout);

256

else

257

alignment->WriteMFA (cout);

258

}

259

delete alignment;

260

delete sequences;

261

}

262

}

263

264

/////////////////////////////////////////////////////////////////

265

// PrintHeading()

266

//

267

// Prints heading for AMAP program.

268

/////////////////////////////////////////////////////////////////

269

270

void PrintHeading (){

271

cerr << endl

272

<< "AMAP version " << VERSION << " - align multiple protein sequences and print to standard output" << endl

273

<< "PROBCONS Written by Chuong Do" << endl

274

<< "AMAP algorithm implemented by Ariel Schwartz" << endl

275

<< endl;

276

}

277

278

/////////////////////////////////////////////////////////////////

279

// PrintParameters()

280

//

281

// Prints AMAP parameters to STDERR. If a filename is

282

// specified, then the parameters are also written to the file.

283

/////////////////////////////////////////////////////////////////

284

285

void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,

286

const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){

287

288

// print parameters to the screen

289

cerr << message << endl

290

<< " initDistrib[] = { ";

291

for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";

292

cerr << "}" << endl

293

<< " gapOpen[] = { ";

294

for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";

295

cerr << "}" << endl

296

<< " gapExtend[] = { ";

297

for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";

298

cerr << "}" << endl

299

<< endl;

300

301

/*

302

for (int i = 0; i < 5; i++){

303

for (int j = 0; j <= i; j++){

304

cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";

305

}

306

cerr << endl;

307

}*/

308

309

// if a file name is specified

310

if (filename){

311

312

// attempt to open the file for writing

313

FILE *file = fopen (filename, "w");

314

if (!file){

315

cerr << "ERROR: Unable to write parameter file: " << filename << endl;

316

exit (1);

317

}

318

319

// if successful, then write the parameters to the file

320

for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");

321

for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");

322

for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");

323

fprintf (file, "%s\n", alphabet.c_str());

324

for (int i = 0; i < (int) alphabet.size(); i++){

325

for (int j = 0; j <= i; j++)

326

fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);

327

fprintf (file, "\n");

328

}

329

for (int i = 0; i < (int) alphabet.size(); i++)

330

fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);

331

fprintf (file, "\n");

332

fclose (file);

333

}

334

}

335

336

/////////////////////////////////////////////////////////////////

337

// DoAlign()

338

//

339

// First computes all pairwise posterior probability matrices.

340

// Then, computes new parameters if training, or a final

341

// alignment, otherwise.

342

/////////////////////////////////////////////////////////////////

343

344

MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,

345

VF &gapExtend, VVF &emitPairs, VF &emitSingle){

346

347

assert (sequences);

348

349

const int numSeqs = sequences->GetNumSequences();

350

VVF distances (numSeqs, VF (numSeqs, 0));

351

SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));

352

SafeVector<SafeVector<SparseMatrix *> > originalSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));

353

354

if (enableTraining){

355

// prepare to average parameters

356

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;

357

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;

358

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;

359

if (enableTrainEmissions){

360

for (int i = 0; i < (int) emitPairs.size(); i++)

361

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;

362

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;

363

}

364

}

365

366

367

// skip posterior calculations if we just want to do Viterbi alignments

368

if (!enableViterbi){

369

cerr << "Computing posterior matrices" << endl;

370

// do all pairwise alignments for posterior probability matrices

371

for (int a = 0; a < numSeqs-1; a++){

372

for (int b = a+1; b < numSeqs; b++){

373

Sequence *seq1 = sequences->GetSequence (a);

374

Sequence *seq2 = sequences->GetSequence (b);

375

376

// verbose output

377

if (enableVerbose)

378

cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "

379

<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";

380

381

// compute forward and backward probabilities

382

VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);

383

VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);

384

385

// if we are training, then we'll simply want to compute the

386

// expected counts for each region within the matrix separately;

387

// otherwise, we'll need to put all of the regions together and

388

// assemble a posterior probability match matrix

389

390

// so, if we're training

391

if (enableTraining){

392

393

// compute new parameters

394

VF thisInitDistrib (NumMatrixTypes);

395

VF thisGapOpen (2*NumInsertStates);

396

VF thisGapExtend (2*NumInsertStates);

397

VVF thisEmitPairs (256, VF (256, 1e-10));

398

VF thisEmitSingle (256, 1e-5);

399

400

model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);

401

402

// add in contribution of the derived parameters

403

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];

404

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];

405

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];

406

if (enableTrainEmissions){

407

for (int i = 0; i < (int) emitPairs.size(); i++)

408

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];

409

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];

410

}

411

412

// let us know that we're done.

413

if (enableVerbose) cerr << "done." << endl;

414

}

415

else {

416

// compute posterior probability matrix

417

VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);

418

419

// compute sparse representations

420

originalSparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);

421

originalSparseMatrices[b][a] = originalSparseMatrices[a][b]->ComputeTranspose();

422

423

if (!enableAllPairs){

424

if (!enableDagAlignment) {

425

// perform the pairwise sequence alignment

426

pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),

427

seq2->GetLength(),

428

*posterior);

429

430

// compute "expected accuracy" distance for evolutionary tree computation

431

float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());

432

//float distance = (gapFactor == 0) ? alignment.second / min (seq1->GetLength(), seq2->GetLength()):

433

// alignment.second / (seq1->GetLength() + seq2->GetLength()) * gapFactor;

434

distances[a][b] = distances[b][a] = distance;

435

436

if (enableVerbose) {

437

cerr << setprecision (10) << distance << endl;

438

// if (distance == 1)

439

// cerr << setprecision(10) << (*posterior)[4 * (seq1->GetLength() + 1) + 4] << endl;

440

// originalSparseMatrices[a][b]->Print(cerr);

441

}

442

delete alignment.first;

443

}

444

}

445

else {

446

// let us know that we're done.

447

if (enableVerbose) cerr << "done." << endl;

448

}

449

450

delete posterior;

451

}

452

453

delete forward;

454

delete backward;

455

}

456

}

457

}

458

459

// now average out parameters derived

460

if (enableTraining){

461

462

// compute new parameters

463

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;

464

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;

465

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;

466

467

if (enableTrainEmissions){

468

for (int i = 0; i < (int) emitPairs.size(); i++)

469

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;

470

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;

471

}

472

}

473

474

// see if we still want to do some alignments

475

else {

476

477

if (!enableViterbi){

478

479

sparseMatrices = originalSparseMatrices;

480

481

// perform the consistency transformation the desired number of times

482

for (int r = 0; r < numConsistencyReps; r++){

483

SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);

484

485

// now replace the old posterior matrices

486

for (int i = 0; i < numSeqs; i++){

487

for (int j = 0; j < numSeqs; j++){

488

489

// don't delete the original sparse matrices

490

if (r > 0) delete sparseMatrices[i][j];

491

sparseMatrices[i][j] = newSparseMatrices[i][j];

492

}

493

}

494

}

495

}

496

497

MultiSequence *finalAlignment = NULL;

498

499

if (onlyPrintPosteriors) {

500

for (int i = 0; i < numSeqs; i++){

501

string seq1name = sequences->GetSequence(i)->GetHeader();

502

for (int j = i + 1; j < numSeqs; j++){

503

cout << "Sparse Matrix: " << i << "," << j << endl;

504

cout << "Sequence names: " << seq1name << ", " << sequences->GetSequence(j)->GetHeader() << endl;

505

sparseMatrices[i][j]->Print(cout);

506

}

507

}

508

}

509

else if (enableAllPairs){

510

for (int a = 0; a < numSeqs-1; a++){

511

for (int b = a+1; b < numSeqs; b++){

512

Sequence *seq1 = sequences->GetSequence (a);

513

Sequence *seq2 = sequences->GetSequence (b);

514

515

if (enableVerbose)

516

cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "

517

<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";

518

519

520

// perform the pairwise sequence alignment

521

pair<SafeVector<char> *, float> alignment;

522

if (enableViterbi)

523

alignment = model.ComputeViterbiAlignment (seq1, seq2);

524

else {

525

526

// build posterior matrix

527

VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);

528

int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);

529

for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;

530

531

alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior, gapFactor);

532

delete posterior;

533

}

534

535

// write pairwise alignments

536

string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");

537

ofstream outfile (name.c_str());

538

539

MultiSequence *result = new MultiSequence();

540

result->AddSequence (seq1->AddGaps(alignment.first, 'X'));

541

result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));

542

if (enableClustalWOutput)

543

result->WriteALN (outfile);

544

else

545

result->WriteMFA (outfile);

546

547

outfile.close();

548

549

delete alignment.first;

550

}

551

}

552

}

553

554

// now if we still need to do a final multiple alignment

555

else {

556

557

if (enableVerbose)

558

cerr << endl;

559

560

if (!enableDagAlignment) {

561

// compute the evolutionary tree

562

TreeNode *tree = TreeNode::ComputeTree (distances);

563

564

tree->Print (cerr, sequences);

565

cerr << endl;

566

567

// make the final alignment

568

finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);

569

570

delete tree;

571

} else {

572

cerr << "Building DAG" << endl;

573

MultiSequenceDag mds(sequences,false,outputForGUI,guiStartWeight,guiStepSize);

574

cerr << "Starting the sequence annealing process" << endl;

575

finalAlignment = mds.AlignDag(sparseMatrices, gapFactor, enableVerbose, enableEdgeReordering, useTgf, edgeWeightThreshold);

576

}

577

// build annotation

578

if (enableAnnotation){

579

WriteAnnotation (finalAlignment, originalSparseMatrices);

580

}

581

582

}

583

584

if (!enableViterbi){

585

// delete sparse matrices

586

for (int a = 0; a < numSeqs-1; a++){

587

for (int b = a+1; b < numSeqs; b++){

588

delete originalSparseMatrices[a][b];

589

delete originalSparseMatrices[b][a];

590

591

if (numConsistencyReps > 0){

592

delete sparseMatrices[a][b];

593

delete sparseMatrices[b][a];

594

}

595

}

596

}

597

}

598

599

return finalAlignment;

600

}

601

602

return NULL;

603

}

604

605

/////////////////////////////////////////////////////////////////

606

// GetInteger()

607

//

608

// Attempts to parse an integer from the character string given.

609

// Returns true only if no parsing error occurs.

610

/////////////////////////////////////////////////////////////////

611

612

bool GetInteger (char *data, int *val){

613

char *endPtr;

614

long int retVal;

615

616

assert (val);

617

618

errno = 0;

619

retVal = strtol (data, &endPtr, 0);

620

if (retVal == 0 && (errno != 0 || data == endPtr)) return false;

621

if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;

622

if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;

623

*val = (int) retVal;

624

return true;

625

}

626

627

/////////////////////////////////////////////////////////////////

628

// GetFloat()

629

//

630

// Attempts to parse a float from the character string given.

631

// Returns true only if no parsing error occurs.

632

/////////////////////////////////////////////////////////////////

633

634

bool GetFloat (char *data, float *val){

635

char *endPtr;

636

double retVal;

637

638

assert (val);

639

640

errno = 0;

641

retVal = strtod (data, &endPtr);

642

if (retVal == 0 && (errno != 0 || data == endPtr)) return false;

643

if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;

644

*val = (float) retVal;

645

return true;

646

}

647

648

/////////////////////////////////////////////////////////////////

649

// ParseParams()

650

//

651

// Parse all command-line options.

652

/////////////////////////////////////////////////////////////////

653

654

SafeVector<string> ParseParams (int argc, char **argv){

655

656

if (argc < 2){

657

658

cerr << "AMAP comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl

659

<< "you are welcome to redistribute it under certain conditions. See the" << endl

660

<< "files README and README.PROBCONS for details." << endl

661

<< endl

662

<< "Usage:" << endl

663

<< " amap [OPTION]... [MFAFILE]..." << endl

664

<< endl

665

<< "Description:" << endl

666

<< " Align sequences in MFAFILE(s) and print result to standard output" << endl

667

<< endl

668

<< " -clustalw" << endl

669

<< " use CLUSTALW output format instead of MFA" << endl

670

<< endl

671

<< " -c, --consistency REPS" << endl

672

<< " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS

673

<< " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl

674

<< endl

675

<< " -ir, --iterative-refinement REPS" << endl

676

<< " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS

677

<< " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl

678

<< endl

679

<< " -pre, --pre-training REPS" << endl

680

<< " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS

681

<< " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl

682

<< endl

683

<< " -pairs" << endl

684

<< " generate all-pairs pairwise alignments" << endl

685

<< endl

686

<< " -viterbi" << endl

687

<< " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl

688

<< endl

689

<< " -v, --verbose" << endl

690

<< " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl

691

<< endl

692

<< " -annot FILENAME" << endl

693

<< " write annotation for multiple alignment to FILENAME" << endl

694

<< endl

695

<< " -t, --train FILENAME" << endl

696

<< " compute EM transition probabilities, store in FILENAME (default: "

697

<< parametersOutputFilename << ")" << endl

698

<< endl

699

<< " -e, --emissions" << endl

700

<< " also reestimate emission probabilities (default: "

701

<< (enableTrainEmissions ? "on" : "off") << ")" << endl

702

<< endl

703

<< " -p, --paramfile FILENAME" << endl

704

<< " read parameters from FILENAME (default: "

705

<< parametersInputFilename << ")" << endl

706

<< endl

707

<< " -a, --alignment-order" << endl

708

<< " print sequences in alignment order rather than input order (default: "

709

<< (enableAlignOrder ? "on" : "off") << ")" << endl

710

<< endl

711

<< " -g, --gap-factor GF" << endl

712

<< " use GF as the gap-factor parameter, set to 0 for best sensitivity, higher values for better specificity (default: "

713

<< gapFactor << ")" << endl

714

<< endl

715

<< " -w, --edge-weight-threshold W" << endl

716

<< " stop the sequence annealing process when best edge has lower weight than W," << endl

717

<< " set to 0 for best sensitivity, higher values for better specificity (default: "

718

<< edgeWeightThreshold << ")" << endl

719

<< endl

720

<< " -prog, --progressive" << endl

721

<< " use progresive alignment instead of sequence annealing alignment (default: "

722

<< (!enableDagAlignment ? "on" : "off") << ")" << endl

723

<< endl

724

<< " -noreorder, --no-edge-reordering" << endl

725

<< " disable reordring of edges during sequence annealing alignment (default: "

726

<< (!enableEdgeReordering ? "on" : "off") << ")" << endl

727

<< endl

728

<< " -maxstep, --use-max-stepsize" << endl

729

<< " use maximum improvement step size instead of tGf edge ranking (default: "

730

<< (!useTgf ? "on" : "off") << ")" << endl

731

<< endl

732

<< " -print, --print-posteriors" << endl

733

<< " only print the posterior probability matrices (default: "

734

<< (onlyPrintPosteriors ? "on" : "off") << ")" << endl

735

<< endl

736

<< " -gui [START] [STEP]" << endl

737

<< " print output for the AMAP Display Java based GUI (default: "

738

<< (outputForGUI ? "on" : "off") << ") " << endl

739

<< " starting at weight START (default: infinity) with step size STEP (default: " << guiStepSize << ")" << endl

740

<< endl;

741

exit (1);

742

}

743

744

SafeVector<string> sequenceNames;

745

int tempInt;

746

float tempFloat;

747

748

for (int i = 1; i < argc; i++){

749

if (argv[i][0] == '-'){

750

751

// training

752

if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){

753

enableTraining = true;

754

if (i < argc - 1)

755

parametersOutputFilename = string (argv[++i]);

756

else {

757

cerr << "ERROR: Filename expected for option " << argv[i] << endl;

758

exit (1);

759

}

760

}

761

762

// emission training

763

else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){

764

enableTrainEmissions = true;

765

}

766

767

// parameter file

768

else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){

769

if (i < argc - 1)

770

parametersInputFilename = string (argv[++i]);

771

else {

772

cerr << "ERROR: Filename expected for option " << argv[i] << endl;

773

exit (1);

774

}

775

}

776

777

// number of consistency transformations

778

else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){

779

if (i < argc - 1){

780

if (!GetInteger (argv[++i], &tempInt)){

781

cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;

782

exit (1);

783

}

784

else {

785

if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){

786

cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "

787

<< MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;

788

exit (1);

789

}

790

else

791

numConsistencyReps = tempInt;

792

}

793

}

794

else {

795

cerr << "ERROR: Integer expected for option " << argv[i] << endl;

796

exit (1);

797

}

798

}

799

800

// number of randomized partitioning iterative refinement passes

801

else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){

802

if (i < argc - 1){

803

if (!GetInteger (argv[++i], &tempInt)){

804

cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;

805

exit (1);

806

}

807

else {

808

if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){

809

cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "

810

<< MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;

811

exit (1);

812

}

813

else

814

numIterativeRefinementReps = tempInt;

815

}

816

}

817

else {

818

cerr << "ERROR: Integer expected for option " << argv[i] << endl;

819

exit (1);

820

}

821

}

822

823

// number of EM pre-training rounds

824

else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){

825

if (i < argc - 1){

826

if (!GetInteger (argv[++i], &tempInt)){

827

cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;

828

exit (1);

829

}

830

else {

831

if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){

832

cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "

833

<< MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;

834

exit (1);

835

}

836

else

837

numPreTrainingReps = tempInt;

838

}

839

}

840

else {

841

cerr << "ERROR: Integer expected for option " << argv[i] << endl;

842

exit (1);

843

}

844

}

845

846

// gap open penalty

847

else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){

848

if (i < argc - 1){

849

if (!GetFloat (argv[++i], &tempFloat)){

850

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

851

exit (1);

852

}

853

else {

854

if (tempFloat > 0){

855

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;

856

exit (1);

857

}

858

else

859

gapOpenPenalty = tempFloat;

860

}

861

}

862

else {

863

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

864

exit (1);

865

}

866

}

867

868

// gap extension penalty

869

else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){

870

if (i < argc - 1){

871

if (!GetFloat (argv[++i], &tempFloat)){

872

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

873

exit (1);

874

}

875

else {

876

if (tempFloat > 0){

877

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;

878

exit (1);

879

}

880

else

881

gapContinuePenalty = tempFloat;

882

}

883

}

884

else {

885

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

886

exit (1);

887

}

888

}

889

890

// all-pairs pairwise alignments

891

else if (!strcmp (argv[i], "-pairs")){

892

enableAllPairs = true;

893

}

894

895

// all-pairs pairwise Viterbi alignments

896

else if (!strcmp (argv[i], "-viterbi")){

897

enableAllPairs = true;

898

enableViterbi = true;

899

}

900

901

// annotation files

902

else if (!strcmp (argv[i], "-annot")){

903

enableAnnotation = true;

904

if (i < argc - 1)

905

annotationFilename = argv[++i];

906

else {

907

cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;

908

exit (1);

909

}

910

}

911

912

// clustalw output format

913

else if (!strcmp (argv[i], "-clustalw")){

914

enableClustalWOutput = true;

915

}

916

917

// cutoff

918

else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){

919

if (i < argc - 1){

920

if (!GetFloat (argv[++i], &tempFloat)){

921

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

922

exit (1);

923

}

924

else {

925

if (tempFloat < 0 || tempFloat > 1){

926

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;

927

exit (1);

928

}

929

else

930

cutoff = tempFloat;

931

}

932

}

933

else {

934

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

935

exit (1);

936

}

937

}

938

939

// verbose reporting

940

else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){

941

enableVerbose = true;

942

}

943

944

// output for AMAP Display

945

else if (!strcmp (argv[i], "-gui")) {

946

outputForGUI = true;

947

if (i < argc - 1){

948

if (GetFloat (argv[i + 1], &tempFloat)){

949

if (tempFloat < 0){

950

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be non-negative." << endl;

951

exit (1);

952

}

953

else {

954

guiStartWeight = tempFloat;

955

i++;

956

if (i < argc - 1){

957

if (GetInteger (argv[i + 1], &tempInt)){

958

if (tempInt < 1){

959

cerr << "ERROR: For option " << argv[i-1] << ", integer must be positive." << endl;

960

exit (1);

961

}

962

else {

963

guiStepSize = tempInt;

964

i++;

965

}

966

}

967

}

968

}

969

}

970

}

971

}

972

973

// alignment order

974

else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){

975

enableAlignOrder = true;

976

}

977

978

// progressive

979

else if (!strcmp (argv[i], "-prog") || !strcmp (argv[i], "--progressive")){

980

enableDagAlignment = false;

981

}

982

983

// edge reordering

984

else if (!strcmp (argv[i], "-noreorder") || !strcmp (argv[i], "--no-edge-reordering")){

985

enableEdgeReordering = false;

986

}

987

988

// edge ranking method

989

else if (!strcmp (argv[i], "-maxstep") || !strcmp (argv[i], "--use-max-stepsize")){

990

useTgf = false;

991

}

992

993

// print posteriors

994

else if (!strcmp (argv[i], "-print") || !strcmp (argv[i], "--print-posteriors")){

995

onlyPrintPosteriors = true;

996

}

997

998

// gap factor

999

else if (!strcmp (argv[i], "-g") || !strcmp (argv[i], "--gap-factor")){

1000

if (i < argc - 1){

1001

if (!GetFloat (argv[++i], &tempFloat)){

1002

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

1003

exit (1);

1004

}

1005

else {

1006

if (tempFloat < 0){

1007

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be negative." << endl;

1008

exit (1);

1009

}

1010

else

1011

gapFactor = tempFloat;

1012

}

1013

}

1014

else {

1015

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

1016

exit (1);

1017

}

1018

}

1019

1020

// edge weight threshold

1021

else if (!strcmp (argv[i], "-w") || !strcmp (argv[i], "--edge-weight-threshold")){

1022

if (i < argc - 1){

1023

if (!GetFloat (argv[++i], &tempFloat)){

1024

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

1025

exit (1);

1026

}

1027

else {

1028

if (tempFloat < 0){

1029

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be negative." << endl;

1030

exit (1);

1031

}

1032

else

1033

edgeWeightThreshold = tempFloat;

1034

}

1035

}

1036

else {

1037

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

1038

exit (1);

1039

}

1040

}

1041

1042

// bad arguments

1043

else {

1044

cerr << "ERROR: Unrecognized option: " << argv[i] << endl;

1045

exit (1);

1046

}

1047

}

1048

else {

1049

sequenceNames.push_back (string (argv[i]));

1050

}

1051

}

1052

1053

if (enableTrainEmissions && !enableTraining){

1054

cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;

1055

exit (1);

1056

}

1057

1058

return sequenceNames;

1059

}

1060

1061

/////////////////////////////////////////////////////////////////

1062

// ReadParameters()

1063

//

1064

// Read initial distribution, transition, and emission

1065

// parameters from a file.

1066

/////////////////////////////////////////////////////////////////

1067

1068

void ReadParameters (){

1069

1070

ifstream data;

1071

1072

emitPairs = VVF (256, VF (256, 1e-10));

1073

emitSingle = VF (256, 1e-5);

1074

1075

// read initial state distribution and transition parameters

1076

1077

if (NumInsertStates == 1){

1078

for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];

1079

for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];

1080

for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];

1081

}

1082

else if (NumInsertStates == 2){

1083

for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];

1084

for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];

1085

for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];

1086

}

1087

else {

1088

cerr << "ERROR: No default initial distribution/parameter settings exist" << endl

1089

<< " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;

1090

exit (1);

1091

}

1092

1093

alphabet = alphabetDefault;

1094

1095

for (int i = 0; i < (int) alphabet.length(); i++){

1096

emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];

1097

emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];

1098

for (int j = 0; j <= i; j++){

1099

emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];

1100

emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];

1101

emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];

1102

emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];

1103

emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];

1104

emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];

1105

emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];

1106

emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];

1107

}

1108

}

1109

1110

if (parametersInputFilename != string ("")){

1111

data.open (parametersInputFilename.c_str());

1112

if (data.fail()){

1113

cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;

1114

exit (1);

1115

}

1116

1117

string line[3];

1118

for (int i = 0; i < 3; i++){

1119

if (!getline (data, line[i])){

1120

cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;

1121

exit (1);

1122

}

1123

}

1124

istringstream data2;

1125

data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];

1126

data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];

1127

data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];

1128

1129

if (!getline (data, line[0])){

1130

return;

1131

cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;

1132

exit (1);

1133

}

1134

1135

// read alphabet as concatenation of all characters on alphabet line

1136

alphabet = "";

1137

string token;

1138

data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;

1139

1140

for (int i = 0; i < (int) alphabet.size(); i++){

1141

for (int j = 0; j <= i; j++){

1142

float val;

1143

data >> val;

1144

emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;

1145

emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;

1146

emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;

1147

emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;

1148

emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;

1149

emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;

1150

emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;

1151

emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;

1152

}

1153

}

1154

1155

for (int i = 0; i < (int) alphabet.size(); i++){

1156

float val;

1157

data >> val;

1158

emitSingle[(unsigned char) tolower(alphabet[i])] = val;

1159

emitSingle[(unsigned char) toupper(alphabet[i])] = val;

1160

}

1161

data.close();

1162

}

1163

}

1164

1165

/////////////////////////////////////////////////////////////////

1166

// ProcessTree()

1167

//

1168

// Process the tree recursively. Returns the aligned sequences

1169

// corresponding to a node or leaf of the tree.

1170

/////////////////////////////////////////////////////////////////

1171

1172

MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,

1173

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1174

const ProbabilisticModel &model){

1175

MultiSequence *result;

1176

1177

// check if this is a node of the alignment tree

1178

if (tree->GetSequenceLabel() == -1){

1179

MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);

1180

MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);

1181

1182

assert (alignLeft);

1183

assert (alignRight);

1184

1185

result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);

1186

assert (result);

1187

1188

delete alignLeft;

1189

delete alignRight;

1190

}

1191

1192

// otherwise, this is a leaf of the alignment tree

1193

else {

1194

result = new MultiSequence(); assert (result);

1195

result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());

1196

}

1197

1198

return result;

1199

}

1200

1201

/////////////////////////////////////////////////////////////////

1202

// ComputeFinalAlignment()

1203

//

1204

// Compute the final alignment by calling ProcessTree(), then

1205

// performing iterative refinement as needed.

1206

/////////////////////////////////////////////////////////////////

1207

1208

MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,

1209

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1210

const ProbabilisticModel &model){

1211

1212

MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);

1213

1214

if (enableAlignOrder){

1215

alignment->SaveOrdering();

1216

enableAlignOrder = false;

1217

}

1218

1219

// tree-based refinement

1220

// TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);

1221

1222

// iterative refinement

1223

for (int i = 0; i < numIterativeRefinementReps; i++)

1224

DoIterativeRefinement (sparseMatrices, model, alignment);

1225

1226

cerr << endl;

1227

1228

// return final alignment

1229

return alignment;

1230

}

1231

1232

/////////////////////////////////////////////////////////////////

1233

// AlignAlignments()

1234

//

1235

// Returns the alignment of two MultiSequence objects.

1236

/////////////////////////////////////////////////////////////////

1237

1238

MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,

1239

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1240

const ProbabilisticModel &model){

1241

1242

// print some info about the alignment

1243

if (enableVerbose){

1244

for (int i = 0; i < align1->GetNumSequences(); i++)

1245

cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();

1246

cerr << "] vs. ";

1247

for (int i = 0; i < align2->GetNumSequences(); i++)

1248

cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();

1249

cerr << "]: ";

1250

}

1251

1252

VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff, gapFactor);

1253

pair<SafeVector<char> *, float> alignment;

1254

1255

// choose the alignment routine depending on the "cosmetic" gap penalties used

1256

if (gapOpenPenalty == 0 && gapContinuePenalty == 0)

1257

alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior, gapFactor);

1258

else

1259

alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,

1260

*posterior, align1->GetNumSequences(), align2->GetNumSequences(),

1261

gapOpenPenalty, gapContinuePenalty);

1262

// if (enableVerbose)

1263

// cerr << "finished computing alignment\n";

1264

1265

delete posterior;

1266

1267

if (enableVerbose){

1268

1269

// compute total length of sequences

1270

int totLength = 0;

1271

for (int i = 0; i < align1->GetNumSequences(); i++)

1272

for (int j = 0; j < align2->GetNumSequences(); j++)

1273

totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());

1274

1275

// give an "accuracy" measure for the alignment

1276

cerr << alignment.second / totLength << endl;

1277

}

1278

1279

// now build final alignment

1280

MultiSequence *result = new MultiSequence();

1281

for (int i = 0; i < align1->GetNumSequences(); i++)

1282

result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));

1283

for (int i = 0; i < align2->GetNumSequences(); i++)

1284

result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));

1285

if (!enableAlignOrder)

1286

result->SortByLabel();

1287

1288

// free temporary alignment

1289

delete alignment.first;

1290

1291

return result;

1292

}

1293

1294

/////////////////////////////////////////////////////////////////

1295

// DoRelaxation()

1296

//

1297

// Performs one round of the consistency transformation. The

1298

// formula used is:

1299

// 1

1300

// P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])

1301

// |S| z in S k

1302

//

1303

// where S = {x, y, all other sequences...}

1304

//

1305

/////////////////////////////////////////////////////////////////

1306

1307

SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,

1308

SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){

1309

const int numSeqs = sequences->GetNumSequences();

1310

1311

SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));

1312

1313

// for every pair of sequences

1314

for (int i = 0; i < numSeqs; i++){

1315

for (int j = i+1; j < numSeqs; j++){

1316

Sequence *seq1 = sequences->GetSequence (i);

1317

Sequence *seq2 = sequences->GetSequence (j);

1318

1319

if (enableVerbose)

1320

cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "

1321

<< "(" << j+1 << ") " << seq2->GetHeader() << ": ";

1322

1323

// get the original posterior matrix

1324

VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);

1325

VF &posterior = *posteriorPtr;

1326

1327

const int seq1Length = seq1->GetLength();

1328

const int seq2Length = seq2->GetLength();

1329

1330

VF *oldSumsPtr = new VF(seq1Length + seq2Length + 2,0);

1331

VF &oldSums = *oldSumsPtr;

1332

VF *newSumsPtr = new VF(seq1Length + seq2Length + 2,0);

1333

VF &newSums = *newSumsPtr;

1334

1335

for (int k = 0, kl = 0; k <= seq1Length; k++) {

1336

for (int l = 0; l <= seq2Length; l++) {

1337

oldSums[k] += posterior[kl];

1338

oldSums[seq1Length + 1 + l] += posterior[kl++];

1339

}

1340

}

1341

1342

// contribution from the summation where z = x and z = y

1343

for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];

1344

1345

if (enableVerbose)

1346

cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";

1347

1348

// contribution from all other sequences

1349

for (int k = 0; k < numSeqs; k++) if (k != i && k != j){

1350

Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);

1351

}

1352

1353

// now renormalization

1354

for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;

1355

1356

for (int k = 0, kl = 0; k <= seq1Length; k++) {

1357

for (int l = 0; l <= seq2Length; l++) {

1358

newSums[k] += posterior[kl];

1359

newSums[seq1Length + 1 + l] += posterior[kl++];

1360

}

1361

}

1362

1363

int gapPostBase = (seq1Length+1) * (seq2Length+1);

1364

for (int k = 0; k < seq1Length + seq2Length + 2; k++) {

1365

if (oldSums[k] < POSTERIOR_CUTOFF) {

1366

if (newSums[k] > 1)

1367

cerr << "negative new gap posterior!\n";

1368

else {

1369

if (enableVerbose)

1370

cerr << setprecision(5) << posterior[gapPostBase + k] << "->" << setprecision(5) << 1 - newSums[k] << ", ";

1371

posterior[gapPostBase + k] = 1 - newSums[k];

1372

}

1373

}

1374

else {

1375

posterior[gapPostBase + k] *= newSums[k] / oldSums[k];

1376

if (enableVerbose && newSums[k] > oldSums[k])

1377

cerr << setprecision(5) << newSums[k] / oldSums[k] << ", ";

1378

}

1379

}

1380

1381

if (enableVerbose)

1382

cerr << endl;

1383

1384

// save the new posterior matrix

1385

newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);

1386

newSparseMatrices[j][i] = newSparseMatrices[i][j]->ComputeTranspose();

1387

1388

if (enableVerbose)

1389

cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";

1390

1391

delete posteriorPtr;

1392

delete oldSumsPtr;

1393

delete newSumsPtr;

1394

1395

if (enableVerbose)

1396

cerr << "done." << endl;

1397

}

1398

}

1399

1400

return newSparseMatrices;

1401

}

1402

1403

/////////////////////////////////////////////////////////////////

1404

// Relax()

1405

//

1406

// Computes the consistency transformation for a single sequence

1407

// z, and adds the transformed matrix to "posterior".

1408

/////////////////////////////////////////////////////////////////

1409

1410

void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){

1411

1412

assert (matXZ);

1413

assert (matZY);

1414

1415

int lengthX = matXZ->GetSeq1Length();

1416

int lengthY = matZY->GetSeq2Length();

1417

assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());

1418

1419

// for every x[i]

1420

for (int i = 1; i <= lengthX; i++){

1421

SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);

1422

SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);

1423

1424

VF::iterator base = posterior.begin() + i * (lengthY + 1);

1425

1426

// iterate through all x[i]-z[k]

1427

while (XZptr != XZend){

1428

SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);

1429

SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);

1430

const float XZval = XZptr->second;

1431

1432

// iterate through all z[k]-y[j]

1433

while (ZYptr != ZYend){

1434

base[ZYptr->first] += XZval * ZYptr->second;

1435

ZYptr++;

1436

}

1437

XZptr++;

1438

}

1439

}

1440

}

1441

1442

/////////////////////////////////////////////////////////////////

1443

// GetSubtree

1444

//

1445

// Returns set containing all leaf labels of the current subtree.

1446

/////////////////////////////////////////////////////////////////

1447

1448

set<int> GetSubtree (const TreeNode *tree){

1449

set<int> s;

1450

1451

if (tree->GetSequenceLabel() == -1){

1452

s = GetSubtree (tree->GetLeftChild());

1453

set<int> t = GetSubtree (tree->GetRightChild());

1454

1455

for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)

1456

s.insert (*iter);

1457

}

1458

else {

1459

s.insert (tree->GetSequenceLabel());

1460

}

1461

1462

return s;

1463

}

1464

1465

/////////////////////////////////////////////////////////////////

1466

// TreeBasedBiPartitioning

1467

//

1468

// Uses the iterative refinement scheme from MUSCLE.

1469

/////////////////////////////////////////////////////////////////

1470

1471

void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1472

const ProbabilisticModel &model, MultiSequence* &alignment,

1473

const TreeNode *tree){

1474

// check if this is a node of the alignment tree

1475

if (tree->GetSequenceLabel() == -1){

1476

TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());

1477

TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());

1478

1479

set<int> leftSubtree = GetSubtree (tree->GetLeftChild());

1480

set<int> rightSubtree = GetSubtree (tree->GetRightChild());

1481

set<int> leftSubtreeComplement, rightSubtreeComplement;

1482

1483

// calculate complement of each subtree

1484

for (int i = 0; i < alignment->GetNumSequences(); i++){

1485

if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);

1486

if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);

1487

}

1488

1489

// perform realignments for edge to left child

1490

if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){

1491

MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);

1492

MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);

1493

delete alignment;

1494

alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);

1495

}

1496

1497

// perform realignments for edge to right child

1498

if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){

1499

MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);

1500

MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);

1501

delete alignment;

1502

alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);

1503

}

1504

}

1505

}

1506

1507

/////////////////////////////////////////////////////////////////

1508

// DoIterativeRefinement()

1509

//

1510

// Performs a single round of randomized partionining iterative

1511

// refinement.

1512

/////////////////////////////////////////////////////////////////

1513

1514

void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1515

const ProbabilisticModel &model, MultiSequence* &alignment){

1516

set<int> groupOne, groupTwo;

1517

1518

// create two separate groups

1519

for (int i = 0; i < alignment->GetNumSequences(); i++){

1520

if (rand() % 2)

1521

groupOne.insert (i);

1522

else

1523

groupTwo.insert (i);

1524

}

1525

1526

if (groupOne.empty() || groupTwo.empty()) return;

1527

1528

// project into the two groups

1529

MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);

1530

MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);

1531

delete alignment;

1532

1533

// realign

1534

alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);

1535

}

1536

1537

/////////////////////////////////////////////////////////////////

1538

// WriteAnnotation()

1539

//

1540

// Computes annotation for multiple alignment and write values

1541

// to a file.

1542

/////////////////////////////////////////////////////////////////

1543

1544

void WriteAnnotation (MultiSequence *alignment,

1545

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){

1546

ofstream outfile (annotationFilename.c_str());

1547

1548

if (outfile.fail()){

1549

cerr << "ERROR: Unable to write annotation file." << endl;

1550

exit (1);

1551

}

1552

1553

const int alignLength = alignment->GetSequence(0)->GetLength();

1554

const int numSeqs = alignment->GetNumSequences();

1555

1556

SafeVector<int> position (numSeqs, 0);

1557

SafeVector<SafeVector<char>::iterator> seqs (numSeqs);

1558

for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();

1559

SafeVector<pair<int,int> > active;

1560

active.reserve (numSeqs);

1561

1562

// for every column

1563

for (int i = 1; i <= alignLength; i++){

1564

1565

// find all aligned residues in this particular column

1566

active.clear();

1567

for (int j = 0; j < numSeqs; j++){

1568

if (seqs[j][i] != '-'){

1569

active.push_back (make_pair(j, ++position[j]));

1570

}

1571

}

1572

1573

outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;

1574

}

1575

1576

outfile.close();

1577

}

1578

1579

/////////////////////////////////////////////////////////////////

1580

// ComputeScore()

1581

//

1582

// Computes the annotation score for a particular column.

1583

/////////////////////////////////////////////////////////////////

1584

1585

int ComputeScore (const SafeVector<pair<int, int> > &active,

1586

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){

1587

1588

if (active.size() <= 1) return 0;

1589

1590

// ALTERNATIVE #1: Compute the average alignment score.

1591

1592

float val = 0;

1593

for (int i = 0; i < (int) active.size(); i++){

1594

for (int j = i+1; j < (int) active.size(); j++){

1595

val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);

1596

}

1597

}

1598

1599

return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));

1600

1601

}