~ubuntu-branches/ubuntu/trusty/amap-align/trusty-proposed : revision 1

1

/////////////////////////////////////////////////////////////////

2

// Amap.cc

3

//

4

// Main routines for AMAP program.

5

/////////////////////////////////////////////////////////////////

6

7

#include "SafeVector.h"

8

#include "MultiSequence.h"

9

#include "MultiSequenceDag.h"

10

#include "Defaults.h"

11

#include "ScoreType.h"

12

#include "ProbabilisticModel.h"

13

#include "EvolutionaryTree.h"

14

#include "SparseMatrix.h"

15

#include <string>

16

#include <sstream>

17

#include <iomanip>

18

#include <iostream>

19

#include <list>

20

#include <set>

21

#include <algorithm>

22

#include <cstdio>

23

#include <cstdlib>

24

#include <cerrno>

25

#include <iomanip>

26

27

string parametersInputFilename = "";

28

string parametersOutputFilename = "no training";

29

string annotationFilename = "";

30

31

bool enableTraining = false;

32

bool enableVerbose = false;

33

bool enableAllPairs = false;

34

bool enableAnnotation = false;

35

bool enableViterbi = false;

36

bool enableClustalWOutput = false;

37

bool enableTrainEmissions = false;

38

bool enableAlignOrder = false;

39

bool enableDagAlignment = true;

40

bool enableEdgeReordering = true;

41

bool useTgf = true;

42

bool onlyPrintPosteriors = false;

43

int numConsistencyReps = 0;

44

int numPreTrainingReps = 0;

45

int numIterativeRefinementReps = 0;

46

47

float cutoff = 0;

48

float gapOpenPenalty = 0;

49

float gapContinuePenalty = 0;

50

VF initDistrib (NumMatrixTypes);

51

VF gapOpen (2*NumInsertStates);

52

VF gapExtend (2*NumInsertStates);

53

VVF emitPairs (256, VF (256, 1e-10));

54

VF emitSingle (256, 1e-5);

55

string alphabet = alphabetDefault;

56

float gapFactor = gapFactorDefault;

57

float edgeWeightThreshold = 0;

58

59

const int MIN_PRETRAINING_REPS = 0;

60

const int MAX_PRETRAINING_REPS = 20;

61

const int MIN_CONSISTENCY_REPS = 0;

62

const int MAX_CONSISTENCY_REPS = 5;

63

const int MIN_ITERATIVE_REFINEMENT_REPS = 0;

64

const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;

65

66

/////////////////////////////////////////////////////////////////

67

// Function prototypes

68

/////////////////////////////////////////////////////////////////

69

70

void PrintHeading();

71

void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,

72

const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);

73

MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,

74

VVF &emitPairs, VF &emitSingle);

75

SafeVector<string> ParseParams (int argc, char **argv);

76

void ReadParameters ();

77

MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,

78

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

79

const ProbabilisticModel &model);

80

MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,

81

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

82

const ProbabilisticModel &model);

83

SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,

84

SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);

85

void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);

86

87

set<int> GetSubtree (const TreeNode *tree);

88

void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

89

const ProbabilisticModel &model, MultiSequence* &alignment,

90

const TreeNode *tree);

91

void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

92

const ProbabilisticModel &model, MultiSequence* &alignment);

93

void WriteAnnotation (MultiSequence *alignment,

94

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);

95

int ComputeScore (const SafeVector<pair<int, int> > &active,

96

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);

97

98

99

/////////////////////////////////////////////////////////////////

100

// main()

101

//

102

// Calls all initialization routines and runs the AMAP

103

// aligner.

104

/////////////////////////////////////////////////////////////////

105

106

int main (int argc, char **argv){

107

108

// print AMAP heading

109

PrintHeading();

110

111

// parse program parameters

112

SafeVector<string> sequenceNames = ParseParams (argc, argv);

113

ReadParameters();

114

PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);

115

116

// now, we'll process all the files given as input. If we are given

117

// several filenames as input, then we'll load all of those sequences

118

// simultaneously, as long as we're not training. On the other hand,

119

// if we are training, then we'll treat each file as a separate

120

// training instance

121

122

// if we are training

123

if (enableTraining){

124

125

// build new model for aligning

126

ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);

127

128

// prepare to average parameters

129

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;

130

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;

131

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;

132

if (enableTrainEmissions){

133

for (int i = 0; i < (int) emitPairs.size(); i++)

134

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;

135

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;

136

}

137

138

// align each file individually

139

for (int i = 0; i < (int) sequenceNames.size(); i++){

140

141

VF thisInitDistrib (NumMatrixTypes);

142

VF thisGapOpen (2*NumInsertStates);

143

VF thisGapExtend (2*NumInsertStates);

144

VVF thisEmitPairs (256, VF (256, 1e-10));

145

VF thisEmitSingle (256, 1e-5);

146

147

// load sequence file

148

MultiSequence *sequences = new MultiSequence(); assert (sequences);

149

cerr << "Loading sequence file: " << sequenceNames[i] << endl;

150

sequences->LoadMFA (sequenceNames[i], true);

151

152

// align sequences

153

MultiSequence *tmpMsa = DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);

154

delete tmpMsa;

155

156

// add in contribution of the derived parameters

157

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];

158

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];

159

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];

160

if (enableTrainEmissions){

161

for (int i = 0; i < (int) emitPairs.size(); i++)

162

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];

163

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];

164

}

165

166

delete sequences;

167

}

168

169

// compute new parameters and print them out

170

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();

171

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();

172

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();

173

if (enableTrainEmissions){

174

for (int i = 0; i < (int) emitPairs.size(); i++)

175

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();

176

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();

177

}

178

179

PrintParameters ("Trained parameter set:",

180

initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,

181

parametersOutputFilename.c_str());

182

}

183

184

// if we are not training, we must simply want to align some sequences

185

else {

186

187

// load all files together

188

MultiSequence *sequences = new MultiSequence(); assert (sequences);

189

for (int i = 0; i < (int) sequenceNames.size(); i++){

190

cerr << "Loading sequence file: " << sequenceNames[i] << endl;

191

sequences->LoadMFA (sequenceNames[i], true);

192

}

193

194

// do all "pre-training" repetitions first

195

for (int ct = 0; ct < numPreTrainingReps; ct++){

196

enableTraining = true;

197

198

// build new model for aligning

199

ProbabilisticModel model (initDistrib, gapOpen, gapExtend,

200

emitPairs, emitSingle);

201

202

// do initial alignments

203

DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);

204

205

// print new parameters

206

PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);

207

208

enableTraining = false;

209

}

210

211

// now, we can perform the alignments and write them out

212

MultiSequence *alignment = DoAlign (sequences,

213

ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),

214

initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);

215

216

if (onlyPrintPosteriors)

217

return 0;

218

if (!enableAllPairs){

219

if (enableClustalWOutput)

220

alignment->WriteALN (cout);

221

else

222

alignment->WriteMFA (cout);

223

}

224

delete alignment;

225

delete sequences;

226

}

227

}

228

229

/////////////////////////////////////////////////////////////////

230

// PrintHeading()

231

//

232

// Prints heading for AMAP program.

233

/////////////////////////////////////////////////////////////////

234

235

void PrintHeading (){

236

cerr << endl

237

<< "AMAP version " << VERSION << " - align multiple protein sequences and print to standard output" << endl

238

<< "PROBCONS Written by Chuong Do" << endl

239

<< "AMAP algorithm implemented by Ariel Schwartz" << endl

240

<< endl;

241

}

242

243

/////////////////////////////////////////////////////////////////

244

// PrintParameters()

245

//

246

// Prints AMAP parameters to STDERR. If a filename is

247

// specified, then the parameters are also written to the file.

248

/////////////////////////////////////////////////////////////////

249

250

void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,

251

const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){

252

253

// print parameters to the screen

254

cerr << message << endl

255

<< " initDistrib[] = { ";

256

for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";

257

cerr << "}" << endl

258

<< " gapOpen[] = { ";

259

for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";

260

cerr << "}" << endl

261

<< " gapExtend[] = { ";

262

for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";

263

cerr << "}" << endl

264

<< endl;

265

266

/*

267

for (int i = 0; i < 5; i++){

268

for (int j = 0; j <= i; j++){

269

cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";

270

}

271

cerr << endl;

272

}*/

273

274

// if a file name is specified

275

if (filename){

276

277

// attempt to open the file for writing

278

FILE *file = fopen (filename, "w");

279

if (!file){

280

cerr << "ERROR: Unable to write parameter file: " << filename << endl;

281

exit (1);

282

}

283

284

// if successful, then write the parameters to the file

285

for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");

286

for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");

287

for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");

288

fprintf (file, "%s\n", alphabet.c_str());

289

for (int i = 0; i < (int) alphabet.size(); i++){

290

for (int j = 0; j <= i; j++)

291

fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);

292

fprintf (file, "\n");

293

}

294

for (int i = 0; i < (int) alphabet.size(); i++)

295

fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);

296

fprintf (file, "\n");

297

fclose (file);

298

}

299

}

300

301

/////////////////////////////////////////////////////////////////

302

// DoAlign()

303

//

304

// First computes all pairwise posterior probability matrices.

305

// Then, computes new parameters if training, or a final

306

// alignment, otherwise.

307

/////////////////////////////////////////////////////////////////

308

309

MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,

310

VF &gapExtend, VVF &emitPairs, VF &emitSingle){

311

312

assert (sequences);

313

314

const int numSeqs = sequences->GetNumSequences();

315

VVF distances (numSeqs, VF (numSeqs, 0));

316

SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));

317

SafeVector<SafeVector<SparseMatrix *> > originalSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));

318

319

if (enableTraining){

320

// prepare to average parameters

321

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;

322

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;

323

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;

324

if (enableTrainEmissions){

325

for (int i = 0; i < (int) emitPairs.size(); i++)

326

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;

327

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;

328

}

329

}

330

331

332

// skip posterior calculations if we just want to do Viterbi alignments

333

if (!enableViterbi){

334

cerr << "Computing posterior matrices" << endl;

335

// do all pairwise alignments for posterior probability matrices

336

for (int a = 0; a < numSeqs-1; a++){

337

for (int b = a+1; b < numSeqs; b++){

338

Sequence *seq1 = sequences->GetSequence (a);

339

Sequence *seq2 = sequences->GetSequence (b);

340

341

// verbose output

342

if (enableVerbose)

343

cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "

344

<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";

345

346

// compute forward and backward probabilities

347

VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);

348

VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);

349

350

// if we are training, then we'll simply want to compute the

351

// expected counts for each region within the matrix separately;

352

// otherwise, we'll need to put all of the regions together and

353

// assemble a posterior probability match matrix

354

355

// so, if we're training

356

if (enableTraining){

357

358

// compute new parameters

359

VF thisInitDistrib (NumMatrixTypes);

360

VF thisGapOpen (2*NumInsertStates);

361

VF thisGapExtend (2*NumInsertStates);

362

VVF thisEmitPairs (256, VF (256, 1e-10));

363

VF thisEmitSingle (256, 1e-5);

364

365

model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);

366

367

// add in contribution of the derived parameters

368

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];

369

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];

370

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];

371

if (enableTrainEmissions){

372

for (int i = 0; i < (int) emitPairs.size(); i++)

373

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];

374

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];

375

}

376

377

// let us know that we're done.

378

if (enableVerbose) cerr << "done." << endl;

379

}

380

else {

381

// compute posterior probability matrix

382

VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);

383

384

// compute sparse representations

385

originalSparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);

386

originalSparseMatrices[b][a] = originalSparseMatrices[a][b]->ComputeTranspose();

387

388

if (!enableAllPairs){

389

if (!enableDagAlignment) {

390

// perform the pairwise sequence alignment

391

pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),

392

seq2->GetLength(),

393

*posterior);

394

395

// compute "expected accuracy" distance for evolutionary tree computation

396

float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());

397

//float distance = (gapFactor == 0) ? alignment.second / min (seq1->GetLength(), seq2->GetLength()):

398

// alignment.second / (seq1->GetLength() + seq2->GetLength()) * gapFactor;

399

distances[a][b] = distances[b][a] = distance;

400

401

if (enableVerbose) {

402

cerr << setprecision (10) << distance << endl;

403

// if (distance == 1)

404

// cerr << setprecision(10) << (*posterior)[4 * (seq1->GetLength() + 1) + 4] << endl;

405

// originalSparseMatrices[a][b]->Print(cerr);

406

}

407

delete alignment.first;

408

}

409

}

410

else {

411

// let us know that we're done.

412

if (enableVerbose) cerr << "done." << endl;

413

}

414

415

delete posterior;

416

}

417

418

delete forward;

419

delete backward;

420

}

421

}

422

}

423

424

// now average out parameters derived

425

if (enableTraining){

426

427

// compute new parameters

428

for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;

429

for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;

430

for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;

431

432

if (enableTrainEmissions){

433

for (int i = 0; i < (int) emitPairs.size(); i++)

434

for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;

435

for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;

436

}

437

}

438

439

// see if we still want to do some alignments

440

else {

441

442

if (!enableViterbi){

443

444

sparseMatrices = originalSparseMatrices;

445

446

// perform the consistency transformation the desired number of times

447

for (int r = 0; r < numConsistencyReps; r++){

448

SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);

449

450

// now replace the old posterior matrices

451

for (int i = 0; i < numSeqs; i++){

452

for (int j = 0; j < numSeqs; j++){

453

454

// don't delete the original sparse matrices

455

if (r > 0) delete sparseMatrices[i][j];

456

sparseMatrices[i][j] = newSparseMatrices[i][j];

457

}

458

}

459

}

460

}

461

462

MultiSequence *finalAlignment = NULL;

463

464

if (onlyPrintPosteriors) {

465

for (int i = 0; i < numSeqs; i++){

466

string seq1name = sequences->GetSequence(i)->GetHeader();

467

for (int j = i + 1; j < numSeqs; j++){

468

cout << "Sparse Matrix: " << i << "," << j << endl;

469

cout << "Sequence names: " << seq1name << ", " << sequences->GetSequence(j)->GetHeader() << endl;

470

sparseMatrices[i][j]->Print(cout);

471

}

472

}

473

}

474

else if (enableAllPairs){

475

for (int a = 0; a < numSeqs-1; a++){

476

for (int b = a+1; b < numSeqs; b++){

477

Sequence *seq1 = sequences->GetSequence (a);

478

Sequence *seq2 = sequences->GetSequence (b);

479

480

if (enableVerbose)

481

cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "

482

<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";

483

484

485

// perform the pairwise sequence alignment

486

pair<SafeVector<char> *, float> alignment;

487

if (enableViterbi)

488

alignment = model.ComputeViterbiAlignment (seq1, seq2);

489

else {

490

491

// build posterior matrix

492

VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);

493

int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);

494

for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;

495

496

alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior, gapFactor);

497

delete posterior;

498

}

499

500

// write pairwise alignments

501

string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");

502

ofstream outfile (name.c_str());

503

504

MultiSequence *result = new MultiSequence();

505

result->AddSequence (seq1->AddGaps(alignment.first, 'X'));

506

result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));

507

if (enableClustalWOutput)

508

result->WriteALN (outfile);

509

else

510

result->WriteMFA (outfile);

511

512

outfile.close();

513

514

delete alignment.first;

515

}

516

}

517

}

518

519

// now if we still need to do a final multiple alignment

520

else {

521

522

if (enableVerbose)

523

cerr << endl;

524

525

if (!enableDagAlignment) {

526

// compute the evolutionary tree

527

TreeNode *tree = TreeNode::ComputeTree (distances);

528

529

tree->Print (cerr, sequences);

530

cerr << endl;

531

532

// make the final alignment

533

finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);

534

535

delete tree;

536

} else {

537

cerr << "Building DAG" << endl;

538

MultiSequenceDag mds(sequences,false);

539

cerr << "Aligning sequences with DAG alignment" << endl;

540

finalAlignment = mds.AlignDag(sparseMatrices, gapFactor, enableVerbose, enableEdgeReordering, useTgf, edgeWeightThreshold);

541

}

542

// build annotation

543

if (enableAnnotation){

544

WriteAnnotation (finalAlignment, originalSparseMatrices);

545

}

546

547

}

548

549

if (!enableViterbi){

550

// delete sparse matrices

551

for (int a = 0; a < numSeqs-1; a++){

552

for (int b = a+1; b < numSeqs; b++){

553

delete originalSparseMatrices[a][b];

554

delete originalSparseMatrices[b][a];

555

556

if (numConsistencyReps > 0){

557

delete sparseMatrices[a][b];

558

delete sparseMatrices[b][a];

559

}

560

}

561

}

562

}

563

564

return finalAlignment;

565

}

566

567

return NULL;

568

}

569

570

/////////////////////////////////////////////////////////////////

571

// GetInteger()

572

//

573

// Attempts to parse an integer from the character string given.

574

// Returns true only if no parsing error occurs.

575

/////////////////////////////////////////////////////////////////

576

577

bool GetInteger (char *data, int *val){

578

char *endPtr;

579

long int retVal;

580

581

assert (val);

582

583

errno = 0;

584

retVal = strtol (data, &endPtr, 0);

585

if (retVal == 0 && (errno != 0 || data == endPtr)) return false;

586

if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;

587

if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;

588

*val = (int) retVal;

589

return true;

590

}

591

592

/////////////////////////////////////////////////////////////////

593

// GetFloat()

594

//

595

// Attempts to parse a float from the character string given.

596

// Returns true only if no parsing error occurs.

597

/////////////////////////////////////////////////////////////////

598

599

bool GetFloat (char *data, float *val){

600

char *endPtr;

601

double retVal;

602

603

assert (val);

604

605

errno = 0;

606

retVal = strtod (data, &endPtr);

607

if (retVal == 0 && (errno != 0 || data == endPtr)) return false;

608

if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;

609

*val = (float) retVal;

610

return true;

611

}

612

613

/////////////////////////////////////////////////////////////////

614

// ParseParams()

615

//

616

// Parse all command-line options.

617

/////////////////////////////////////////////////////////////////

618

619

SafeVector<string> ParseParams (int argc, char **argv){

620

621

if (argc < 2){

622

623

cerr << "AMAP comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl

624

<< "you are welcome to redistribute it under certain conditions. See the" << endl

625

<< "files README and README.PROBCONS for details." << endl

626

<< endl

627

<< "Usage:" << endl

628

<< " amap [OPTION]... [MFAFILE]..." << endl

629

<< endl

630

<< "Description:" << endl

631

<< " Align sequences in MFAFILE(s) and print result to standard output" << endl

632

<< endl

633

<< " -clustalw" << endl

634

<< " use CLUSTALW output format instead of MFA" << endl

635

<< endl

636

<< " -c, --consistency REPS" << endl

637

<< " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS

638

<< " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl

639

<< endl

640

<< " -ir, --iterative-refinement REPS" << endl

641

<< " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS

642

<< " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl

643

<< endl

644

<< " -pre, --pre-training REPS" << endl

645

<< " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS

646

<< " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl

647

<< endl

648

<< " -pairs" << endl

649

<< " generate all-pairs pairwise alignments" << endl

650

<< endl

651

<< " -viterbi" << endl

652

<< " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl

653

<< endl

654

<< " -v, --verbose" << endl

655

<< " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl

656

<< endl

657

<< " -annot FILENAME" << endl

658

<< " write annotation for multiple alignment to FILENAME" << endl

659

<< endl

660

<< " -t, --train FILENAME" << endl

661

<< " compute EM transition probabilities, store in FILENAME (default: "

662

<< parametersOutputFilename << ")" << endl

663

<< endl

664

<< " -e, --emissions" << endl

665

<< " also reestimate emission probabilities (default: "

666

<< (enableTrainEmissions ? "on" : "off") << ")" << endl

667

<< endl

668

<< " -p, --paramfile FILENAME" << endl

669

<< " read parameters from FILENAME (default: "

670

<< parametersInputFilename << ")" << endl

671

<< endl

672

<< " -a, --alignment-order" << endl

673

<< " print sequences in alignment order rather than input order (default: "

674

<< (enableAlignOrder ? "on" : "off") << ")" << endl

675

<< endl

676

<< " -g, --gap-factor GF" << endl

677

<< " use GF as the gap-factor parameter, set to 0 for best sensitivity, higher values for better specificity (default: "

678

<< gapFactor << ")" << endl

679

<< endl

680

<< " -w, --edge-weight-threshold W" << endl

681

<< " stop the sequence annealing process when best edge has lower weight than W," << endl

682

<< " set to 0 for best sensitivity, higher values for better specificity (default: "

683

<< edgeWeightThreshold << ")" << endl

684

<< endl

685

<< " -prog, --progressive" << endl

686

<< " use progresive alignment instead of sequence annealing alignment (default: "

687

<< (!enableDagAlignment ? "on" : "off") << ")" << endl

688

<< endl

689

<< " -noreorder, --no-edge-reordering" << endl

690

<< " disable reordring of edges during sequence annealing alignment (default: "

691

<< (!enableEdgeReordering ? "on" : "off") << ")" << endl

692

<< endl

693

<< " -maxstep, --use-max-stepsize" << endl

694

<< " use maximum improvement step size instead of tGf edge ranking (default: "

695

<< (!useTgf ? "on" : "off") << ")" << endl

696

<< endl

697

<< " -print, --print-posteriors" << endl

698

<< " only print the posterior probability matrices (default: "

699

<< (onlyPrintPosteriors ? "on" : "off") << ")" << endl

700

<< endl;

701

exit (1);

702

}

703

704

SafeVector<string> sequenceNames;

705

int tempInt;

706

float tempFloat;

707

708

for (int i = 1; i < argc; i++){

709

if (argv[i][0] == '-'){

710

711

// training

712

if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){

713

enableTraining = true;

714

if (i < argc - 1)

715

parametersOutputFilename = string (argv[++i]);

716

else {

717

cerr << "ERROR: Filename expected for option " << argv[i] << endl;

718

exit (1);

719

}

720

}

721

722

// emission training

723

else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){

724

enableTrainEmissions = true;

725

}

726

727

// parameter file

728

else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){

729

if (i < argc - 1)

730

parametersInputFilename = string (argv[++i]);

731

else {

732

cerr << "ERROR: Filename expected for option " << argv[i] << endl;

733

exit (1);

734

}

735

}

736

737

// number of consistency transformations

738

else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){

739

if (i < argc - 1){

740

if (!GetInteger (argv[++i], &tempInt)){

741

cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;

742

exit (1);

743

}

744

else {

745

if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){

746

cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "

747

<< MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;

748

exit (1);

749

}

750

else

751

numConsistencyReps = tempInt;

752

}

753

}

754

else {

755

cerr << "ERROR: Integer expected for option " << argv[i] << endl;

756

exit (1);

757

}

758

}

759

760

// number of randomized partitioning iterative refinement passes

761

else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){

762

if (i < argc - 1){

763

if (!GetInteger (argv[++i], &tempInt)){

764

cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;

765

exit (1);

766

}

767

else {

768

if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){

769

cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "

770

<< MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;

771

exit (1);

772

}

773

else

774

numIterativeRefinementReps = tempInt;

775

}

776

}

777

else {

778

cerr << "ERROR: Integer expected for option " << argv[i] << endl;

779

exit (1);

780

}

781

}

782

783

// number of EM pre-training rounds

784

else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){

785

if (i < argc - 1){

786

if (!GetInteger (argv[++i], &tempInt)){

787

cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;

788

exit (1);

789

}

790

else {

791

if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){

792

cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "

793

<< MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;

794

exit (1);

795

}

796

else

797

numPreTrainingReps = tempInt;

798

}

799

}

800

else {

801

cerr << "ERROR: Integer expected for option " << argv[i] << endl;

802

exit (1);

803

}

804

}

805

806

// gap open penalty

807

else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){

808

if (i < argc - 1){

809

if (!GetFloat (argv[++i], &tempFloat)){

810

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

811

exit (1);

812

}

813

else {

814

if (tempFloat > 0){

815

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;

816

exit (1);

817

}

818

else

819

gapOpenPenalty = tempFloat;

820

}

821

}

822

else {

823

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

824

exit (1);

825

}

826

}

827

828

// gap extension penalty

829

else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){

830

if (i < argc - 1){

831

if (!GetFloat (argv[++i], &tempFloat)){

832

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

833

exit (1);

834

}

835

else {

836

if (tempFloat > 0){

837

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;

838

exit (1);

839

}

840

else

841

gapContinuePenalty = tempFloat;

842

}

843

}

844

else {

845

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

846

exit (1);

847

}

848

}

849

850

// all-pairs pairwise alignments

851

else if (!strcmp (argv[i], "-pairs")){

852

enableAllPairs = true;

853

}

854

855

// all-pairs pairwise Viterbi alignments

856

else if (!strcmp (argv[i], "-viterbi")){

857

enableAllPairs = true;

858

enableViterbi = true;

859

}

860

861

// annotation files

862

else if (!strcmp (argv[i], "-annot")){

863

enableAnnotation = true;

864

if (i < argc - 1)

865

annotationFilename = argv[++i];

866

else {

867

cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;

868

exit (1);

869

}

870

}

871

872

// clustalw output format

873

else if (!strcmp (argv[i], "-clustalw")){

874

enableClustalWOutput = true;

875

}

876

877

// cutoff

878

else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){

879

if (i < argc - 1){

880

if (!GetFloat (argv[++i], &tempFloat)){

881

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

882

exit (1);

883

}

884

else {

885

if (tempFloat < 0 || tempFloat > 1){

886

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;

887

exit (1);

888

}

889

else

890

cutoff = tempFloat;

891

}

892

}

893

else {

894

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

895

exit (1);

896

}

897

}

898

899

// verbose reporting

900

else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){

901

enableVerbose = true;

902

}

903

904

// alignment order

905

else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){

906

enableAlignOrder = true;

907

}

908

909

// progressive

910

else if (!strcmp (argv[i], "-prog") || !strcmp (argv[i], "--progressive")){

911

enableDagAlignment = false;

912

}

913

914

// edge reordering

915

else if (!strcmp (argv[i], "-noreorder") || !strcmp (argv[i], "--no-edge-reordering")){

916

enableEdgeReordering = false;

917

}

918

919

// edge ranking method

920

else if (!strcmp (argv[i], "-maxstep") || !strcmp (argv[i], "--use-max-stepsize")){

921

useTgf = false;

922

}

923

924

// print posteriors

925

else if (!strcmp (argv[i], "-print") || !strcmp (argv[i], "--print-posteriors")){

926

onlyPrintPosteriors = true;

927

}

928

929

// gap factor

930

else if (!strcmp (argv[i], "-g") || !strcmp (argv[i], "--gap-factor")){

931

if (i < argc - 1){

932

if (!GetFloat (argv[++i], &tempFloat)){

933

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

934

exit (1);

935

}

936

else {

937

if (tempFloat < 0){

938

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be negative." << endl;

939

exit (1);

940

}

941

else

942

gapFactor = tempFloat;

943

}

944

}

945

else {

946

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

947

exit (1);

948

}

949

}

950

951

// edge weight threshold

952

else if (!strcmp (argv[i], "-w") || !strcmp (argv[i], "--edge-weight-threshold")){

953

if (i < argc - 1){

954

if (!GetFloat (argv[++i], &tempFloat)){

955

cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;

956

exit (1);

957

}

958

else {

959

if (tempFloat < 0){

960

cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be negative." << endl;

961

exit (1);

962

}

963

else

964

edgeWeightThreshold = tempFloat;

965

}

966

}

967

else {

968

cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;

969

exit (1);

970

}

971

}

972

973

// bad arguments

974

else {

975

cerr << "ERROR: Unrecognized option: " << argv[i] << endl;

976

exit (1);

977

}

978

}

979

else {

980

sequenceNames.push_back (string (argv[i]));

981

}

982

}

983

984

if (enableTrainEmissions && !enableTraining){

985

cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;

986

exit (1);

987

}

988

989

return sequenceNames;

990

}

991

992

/////////////////////////////////////////////////////////////////

993

// ReadParameters()

994

//

995

// Read initial distribution, transition, and emission

996

// parameters from a file.

997

/////////////////////////////////////////////////////////////////

998

999

void ReadParameters (){

1000

1001

ifstream data;

1002

1003

emitPairs = VVF (256, VF (256, 1e-10));

1004

emitSingle = VF (256, 1e-5);

1005

1006

// read initial state distribution and transition parameters

1007

1008

if (NumInsertStates == 1){

1009

for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];

1010

for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];

1011

for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];

1012

}

1013

else if (NumInsertStates == 2){

1014

for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];

1015

for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];

1016

for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];

1017

}

1018

else {

1019

cerr << "ERROR: No default initial distribution/parameter settings exist" << endl

1020

<< " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;

1021

exit (1);

1022

}

1023

1024

alphabet = alphabetDefault;

1025

1026

for (int i = 0; i < (int) alphabet.length(); i++){

1027

emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];

1028

emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];

1029

for (int j = 0; j <= i; j++){

1030

emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];

1031

emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];

1032

emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];

1033

emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];

1034

emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];

1035

emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];

1036

emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];

1037

emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];

1038

}

1039

}

1040

1041

if (parametersInputFilename != string ("")){

1042

data.open (parametersInputFilename.c_str());

1043

if (data.fail()){

1044

cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;

1045

exit (1);

1046

}

1047

1048

string line[3];

1049

for (int i = 0; i < 3; i++){

1050

if (!getline (data, line[i])){

1051

cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;

1052

exit (1);

1053

}

1054

}

1055

istringstream data2;

1056

data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];

1057

data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];

1058

data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];

1059

1060

if (!getline (data, line[0])){

1061

return;

1062

cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;

1063

exit (1);

1064

}

1065

1066

// read alphabet as concatenation of all characters on alphabet line

1067

alphabet = "";

1068

string token;

1069

data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;

1070

1071

for (int i = 0; i < (int) alphabet.size(); i++){

1072

for (int j = 0; j <= i; j++){

1073

float val;

1074

data >> val;

1075

emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;

1076

emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;

1077

emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;

1078

emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;

1079

emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;

1080

emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;

1081

emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;

1082

emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;

1083

}

1084

}

1085

1086

for (int i = 0; i < (int) alphabet.size(); i++){

1087

float val;

1088

data >> val;

1089

emitSingle[(unsigned char) tolower(alphabet[i])] = val;

1090

emitSingle[(unsigned char) toupper(alphabet[i])] = val;

1091

}

1092

data.close();

1093

}

1094

}

1095

1096

/////////////////////////////////////////////////////////////////

1097

// ProcessTree()

1098

//

1099

// Process the tree recursively. Returns the aligned sequences

1100

// corresponding to a node or leaf of the tree.

1101

/////////////////////////////////////////////////////////////////

1102

1103

MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,

1104

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1105

const ProbabilisticModel &model){

1106

MultiSequence *result;

1107

1108

// check if this is a node of the alignment tree

1109

if (tree->GetSequenceLabel() == -1){

1110

MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);

1111

MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);

1112

1113

assert (alignLeft);

1114

assert (alignRight);

1115

1116

result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);

1117

assert (result);

1118

1119

delete alignLeft;

1120

delete alignRight;

1121

}

1122

1123

// otherwise, this is a leaf of the alignment tree

1124

else {

1125

result = new MultiSequence(); assert (result);

1126

result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());

1127

}

1128

1129

return result;

1130

}

1131

1132

/////////////////////////////////////////////////////////////////

1133

// ComputeFinalAlignment()

1134

//

1135

// Compute the final alignment by calling ProcessTree(), then

1136

// performing iterative refinement as needed.

1137

/////////////////////////////////////////////////////////////////

1138

1139

MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,

1140

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1141

const ProbabilisticModel &model){

1142

1143

MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);

1144

1145

if (enableAlignOrder){

1146

alignment->SaveOrdering();

1147

enableAlignOrder = false;

1148

}

1149

1150

// tree-based refinement

1151

// TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);

1152

1153

// iterative refinement

1154

for (int i = 0; i < numIterativeRefinementReps; i++)

1155

DoIterativeRefinement (sparseMatrices, model, alignment);

1156

1157

cerr << endl;

1158

1159

// return final alignment

1160

return alignment;

1161

}

1162

1163

/////////////////////////////////////////////////////////////////

1164

// AlignAlignments()

1165

//

1166

// Returns the alignment of two MultiSequence objects.

1167

/////////////////////////////////////////////////////////////////

1168

1169

MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,

1170

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1171

const ProbabilisticModel &model){

1172

1173

// print some info about the alignment

1174

if (enableVerbose){

1175

for (int i = 0; i < align1->GetNumSequences(); i++)

1176

cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();

1177

cerr << "] vs. ";

1178

for (int i = 0; i < align2->GetNumSequences(); i++)

1179

cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();

1180

cerr << "]: ";

1181

}

1182

1183

VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff, gapFactor);

1184

pair<SafeVector<char> *, float> alignment;

1185

1186

// choose the alignment routine depending on the "cosmetic" gap penalties used

1187

if (gapOpenPenalty == 0 && gapContinuePenalty == 0)

1188

alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior, gapFactor);

1189

else

1190

alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,

1191

*posterior, align1->GetNumSequences(), align2->GetNumSequences(),

1192

gapOpenPenalty, gapContinuePenalty);

1193

// if (enableVerbose)

1194

// cerr << "finished computing alignment\n";

1195

1196

delete posterior;

1197

1198

if (enableVerbose){

1199

1200

// compute total length of sequences

1201

int totLength = 0;

1202

for (int i = 0; i < align1->GetNumSequences(); i++)

1203

for (int j = 0; j < align2->GetNumSequences(); j++)

1204

totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());

1205

1206

// give an "accuracy" measure for the alignment

1207

cerr << alignment.second / totLength << endl;

1208

}

1209

1210

// now build final alignment

1211

MultiSequence *result = new MultiSequence();

1212

for (int i = 0; i < align1->GetNumSequences(); i++)

1213

result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));

1214

for (int i = 0; i < align2->GetNumSequences(); i++)

1215

result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));

1216

if (!enableAlignOrder)

1217

result->SortByLabel();

1218

1219

// free temporary alignment

1220

delete alignment.first;

1221

1222

return result;

1223

}

1224

1225

/////////////////////////////////////////////////////////////////

1226

// DoRelaxation()

1227

//

1228

// Performs one round of the consistency transformation. The

1229

// formula used is:

1230

// 1

1231

// P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])

1232

// |S| z in S k

1233

//

1234

// where S = {x, y, all other sequences...}

1235

//

1236

/////////////////////////////////////////////////////////////////

1237

1238

SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,

1239

SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){

1240

const int numSeqs = sequences->GetNumSequences();

1241

1242

SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));

1243

1244

// for every pair of sequences

1245

for (int i = 0; i < numSeqs; i++){

1246

for (int j = i+1; j < numSeqs; j++){

1247

Sequence *seq1 = sequences->GetSequence (i);

1248

Sequence *seq2 = sequences->GetSequence (j);

1249

1250

if (enableVerbose)

1251

cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "

1252

<< "(" << j+1 << ") " << seq2->GetHeader() << ": ";

1253

1254

// get the original posterior matrix

1255

VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);

1256

VF &posterior = *posteriorPtr;

1257

1258

const int seq1Length = seq1->GetLength();

1259

const int seq2Length = seq2->GetLength();

1260

1261

VF *oldSumsPtr = new VF(seq1Length + seq2Length + 2,0);

1262

VF &oldSums = *oldSumsPtr;

1263

VF *newSumsPtr = new VF(seq1Length + seq2Length + 2,0);

1264

VF &newSums = *newSumsPtr;

1265

1266

for (int k = 0, kl = 0; k <= seq1Length; k++) {

1267

for (int l = 0; l <= seq2Length; l++) {

1268

oldSums[k] += posterior[kl];

1269

oldSums[seq1Length + 1 + l] += posterior[kl++];

1270

}

1271

}

1272

1273

// contribution from the summation where z = x and z = y

1274

for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];

1275

1276

if (enableVerbose)

1277

cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";

1278

1279

// contribution from all other sequences

1280

for (int k = 0; k < numSeqs; k++) if (k != i && k != j){

1281

Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);

1282

}

1283

1284

// now renormalization

1285

for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;

1286

1287

for (int k = 0, kl = 0; k <= seq1Length; k++) {

1288

for (int l = 0; l <= seq2Length; l++) {

1289

newSums[k] += posterior[kl];

1290

newSums[seq1Length + 1 + l] += posterior[kl++];

1291

}

1292

}

1293

1294

int gapPostBase = (seq1Length+1) * (seq2Length+1);

1295

for (int k = 0; k < seq1Length + seq2Length + 2; k++) {

1296

if (oldSums[k] < POSTERIOR_CUTOFF) {

1297

if (newSums[k] > 1)

1298

cerr << "negative new gap posterior!\n";

1299

else {

1300

if (enableVerbose)

1301

cerr << setprecision(5) << posterior[gapPostBase + k] << "->" << setprecision(5) << 1 - newSums[k] << ", ";

1302

posterior[gapPostBase + k] = 1 - newSums[k];

1303

}

1304

}

1305

else {

1306

posterior[gapPostBase + k] *= newSums[k] / oldSums[k];

1307

if (enableVerbose && newSums[k] > oldSums[k])

1308

cerr << setprecision(5) << newSums[k] / oldSums[k] << ", ";

1309

}

1310

}

1311

1312

if (enableVerbose)

1313

cerr << endl;

1314

1315

// save the new posterior matrix

1316

newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);

1317

newSparseMatrices[j][i] = newSparseMatrices[i][j]->ComputeTranspose();

1318

1319

if (enableVerbose)

1320

cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";

1321

1322

delete posteriorPtr;

1323

delete oldSumsPtr;

1324

delete newSumsPtr;

1325

1326

if (enableVerbose)

1327

cerr << "done." << endl;

1328

}

1329

}

1330

1331

return newSparseMatrices;

1332

}

1333

1334

/////////////////////////////////////////////////////////////////

1335

// Relax()

1336

//

1337

// Computes the consistency transformation for a single sequence

1338

// z, and adds the transformed matrix to "posterior".

1339

/////////////////////////////////////////////////////////////////

1340

1341

void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){

1342

1343

assert (matXZ);

1344

assert (matZY);

1345

1346

int lengthX = matXZ->GetSeq1Length();

1347

int lengthY = matZY->GetSeq2Length();

1348

assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());

1349

1350

// for every x[i]

1351

for (int i = 1; i <= lengthX; i++){

1352

SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);

1353

SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);

1354

1355

VF::iterator base = posterior.begin() + i * (lengthY + 1);

1356

1357

// iterate through all x[i]-z[k]

1358

while (XZptr != XZend){

1359

SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);

1360

SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);

1361

const float XZval = XZptr->second;

1362

1363

// iterate through all z[k]-y[j]

1364

while (ZYptr != ZYend){

1365

base[ZYptr->first] += XZval * ZYptr->second;

1366

ZYptr++;

1367

}

1368

XZptr++;

1369

}

1370

}

1371

}

1372

1373

/////////////////////////////////////////////////////////////////

1374

// GetSubtree

1375

//

1376

// Returns set containing all leaf labels of the current subtree.

1377

/////////////////////////////////////////////////////////////////

1378

1379

set<int> GetSubtree (const TreeNode *tree){

1380

set<int> s;

1381

1382

if (tree->GetSequenceLabel() == -1){

1383

s = GetSubtree (tree->GetLeftChild());

1384

set<int> t = GetSubtree (tree->GetRightChild());

1385

1386

for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)

1387

s.insert (*iter);

1388

}

1389

else {

1390

s.insert (tree->GetSequenceLabel());

1391

}

1392

1393

return s;

1394

}

1395

1396

/////////////////////////////////////////////////////////////////

1397

// TreeBasedBiPartitioning

1398

//

1399

// Uses the iterative refinement scheme from MUSCLE.

1400

/////////////////////////////////////////////////////////////////

1401

1402

void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1403

const ProbabilisticModel &model, MultiSequence* &alignment,

1404

const TreeNode *tree){

1405

// check if this is a node of the alignment tree

1406

if (tree->GetSequenceLabel() == -1){

1407

TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());

1408

TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());

1409

1410

set<int> leftSubtree = GetSubtree (tree->GetLeftChild());

1411

set<int> rightSubtree = GetSubtree (tree->GetRightChild());

1412

set<int> leftSubtreeComplement, rightSubtreeComplement;

1413

1414

// calculate complement of each subtree

1415

for (int i = 0; i < alignment->GetNumSequences(); i++){

1416

if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);

1417

if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);

1418

}

1419

1420

// perform realignments for edge to left child

1421

if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){

1422

MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);

1423

MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);

1424

delete alignment;

1425

alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);

1426

}

1427

1428

// perform realignments for edge to right child

1429

if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){

1430

MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);

1431

MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);

1432

delete alignment;

1433

alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);

1434

}

1435

}

1436

}

1437

1438

/////////////////////////////////////////////////////////////////

1439

// DoIterativeRefinement()

1440

//

1441

// Performs a single round of randomized partionining iterative

1442

// refinement.

1443

/////////////////////////////////////////////////////////////////

1444

1445

void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,

1446

const ProbabilisticModel &model, MultiSequence* &alignment){

1447

set<int> groupOne, groupTwo;

1448

1449

// create two separate groups

1450

for (int i = 0; i < alignment->GetNumSequences(); i++){

1451

if (rand() % 2)

1452

groupOne.insert (i);

1453

else

1454

groupTwo.insert (i);

1455

}

1456

1457

if (groupOne.empty() || groupTwo.empty()) return;

1458

1459

// project into the two groups

1460

MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);

1461

MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);

1462

delete alignment;

1463

1464

// realign

1465

alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);

1466

}

1467

1468

/////////////////////////////////////////////////////////////////

1469

// WriteAnnotation()

1470

//

1471

// Computes annotation for multiple alignment and write values

1472

// to a file.

1473

/////////////////////////////////////////////////////////////////

1474

1475

void WriteAnnotation (MultiSequence *alignment,

1476

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){

1477

ofstream outfile (annotationFilename.c_str());

1478

1479

if (outfile.fail()){

1480

cerr << "ERROR: Unable to write annotation file." << endl;

1481

exit (1);

1482

}

1483

1484

const int alignLength = alignment->GetSequence(0)->GetLength();

1485

const int numSeqs = alignment->GetNumSequences();

1486

1487

SafeVector<int> position (numSeqs, 0);

1488

SafeVector<SafeVector<char>::iterator> seqs (numSeqs);

1489

for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();

1490

SafeVector<pair<int,int> > active;

1491

active.reserve (numSeqs);

1492

1493

// for every column

1494

for (int i = 1; i <= alignLength; i++){

1495

1496

// find all aligned residues in this particular column

1497

active.clear();

1498

for (int j = 0; j < numSeqs; j++){

1499

if (seqs[j][i] != '-'){

1500

active.push_back (make_pair(j, ++position[j]));

1501

}

1502

}

1503

1504

outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;

1505

}

1506

1507

outfile.close();

1508

}

1509

1510

/////////////////////////////////////////////////////////////////

1511

// ComputeScore()

1512

//

1513

// Computes the annotation score for a particular column.

1514

/////////////////////////////////////////////////////////////////

1515

1516

int ComputeScore (const SafeVector<pair<int, int> > &active,

1517

const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){

1518

1519

if (active.size() <= 1) return 0;

1520

1521

// ALTERNATIVE #1: Compute the average alignment score.

1522

1523

float val = 0;

1524

for (int i = 0; i < (int) active.size(); i++){

1525

for (int j = i+1; j < (int) active.size(); j++){

1526

val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);

1527

}

1528

}

1529

1530

return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));

1531

1532

}