1
/////////////////////////////////////////////////////////////////
4
// Main routines for AMAP program.
5
/////////////////////////////////////////////////////////////////
7
#include "SafeVector.h"
8
#include "MultiSequence.h"
9
#include "MultiSequenceDag.h"
11
#include "ScoreType.h"
12
#include "ProbabilisticModel.h"
13
#include "EvolutionaryTree.h"
14
#include "SparseMatrix.h"
27
string parametersInputFilename = "";
28
string parametersOutputFilename = "no training";
29
string annotationFilename = "";
31
bool enableTraining = false;
32
bool enableVerbose = false;
33
bool enableAllPairs = false;
34
bool enableAnnotation = false;
35
bool enableViterbi = false;
36
bool enableClustalWOutput = false;
37
bool enableTrainEmissions = false;
38
bool enableAlignOrder = false;
39
bool enableDagAlignment = true;
40
bool enableEdgeReordering = true;
42
bool onlyPrintPosteriors = false;
43
int numConsistencyReps = 0;
44
int numPreTrainingReps = 0;
45
int numIterativeRefinementReps = 0;
48
float gapOpenPenalty = 0;
49
float gapContinuePenalty = 0;
50
VF initDistrib (NumMatrixTypes);
51
VF gapOpen (2*NumInsertStates);
52
VF gapExtend (2*NumInsertStates);
53
VVF emitPairs (256, VF (256, 1e-10));
54
VF emitSingle (256, 1e-5);
55
string alphabet = alphabetDefault;
56
float gapFactor = gapFactorDefault;
57
float edgeWeightThreshold = 0;
59
const int MIN_PRETRAINING_REPS = 0;
60
const int MAX_PRETRAINING_REPS = 20;
61
const int MIN_CONSISTENCY_REPS = 0;
62
const int MAX_CONSISTENCY_REPS = 5;
63
const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
64
const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
66
/////////////////////////////////////////////////////////////////
67
// Function prototypes
68
/////////////////////////////////////////////////////////////////
71
void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
72
const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
73
MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,
74
VVF &emitPairs, VF &emitSingle);
75
SafeVector<string> ParseParams (int argc, char **argv);
76
void ReadParameters ();
77
MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
78
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
79
const ProbabilisticModel &model);
80
MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
81
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
82
const ProbabilisticModel &model);
83
SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
84
SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
85
void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
87
set<int> GetSubtree (const TreeNode *tree);
88
void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
89
const ProbabilisticModel &model, MultiSequence* &alignment,
90
const TreeNode *tree);
91
void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
92
const ProbabilisticModel &model, MultiSequence* &alignment);
93
void WriteAnnotation (MultiSequence *alignment,
94
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
95
int ComputeScore (const SafeVector<pair<int, int> > &active,
96
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
99
/////////////////////////////////////////////////////////////////
102
// Calls all initialization routines and runs the AMAP
104
/////////////////////////////////////////////////////////////////
106
int main (int argc, char **argv){
108
// print AMAP heading
111
// parse program parameters
112
SafeVector<string> sequenceNames = ParseParams (argc, argv);
114
PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
116
// now, we'll process all the files given as input. If we are given
117
// several filenames as input, then we'll load all of those sequences
118
// simultaneously, as long as we're not training. On the other hand,
119
// if we are training, then we'll treat each file as a separate
122
// if we are training
125
// build new model for aligning
126
ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
128
// prepare to average parameters
129
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
130
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
131
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
132
if (enableTrainEmissions){
133
for (int i = 0; i < (int) emitPairs.size(); i++)
134
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
135
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
138
// align each file individually
139
for (int i = 0; i < (int) sequenceNames.size(); i++){
141
VF thisInitDistrib (NumMatrixTypes);
142
VF thisGapOpen (2*NumInsertStates);
143
VF thisGapExtend (2*NumInsertStates);
144
VVF thisEmitPairs (256, VF (256, 1e-10));
145
VF thisEmitSingle (256, 1e-5);
147
// load sequence file
148
MultiSequence *sequences = new MultiSequence(); assert (sequences);
149
cerr << "Loading sequence file: " << sequenceNames[i] << endl;
150
sequences->LoadMFA (sequenceNames[i], true);
153
MultiSequence *tmpMsa = DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
156
// add in contribution of the derived parameters
157
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
158
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
159
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
160
if (enableTrainEmissions){
161
for (int i = 0; i < (int) emitPairs.size(); i++)
162
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
163
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
169
// compute new parameters and print them out
170
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
171
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
172
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
173
if (enableTrainEmissions){
174
for (int i = 0; i < (int) emitPairs.size(); i++)
175
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
176
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
179
PrintParameters ("Trained parameter set:",
180
initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
181
parametersOutputFilename.c_str());
184
// if we are not training, we must simply want to align some sequences
187
// load all files together
188
MultiSequence *sequences = new MultiSequence(); assert (sequences);
189
for (int i = 0; i < (int) sequenceNames.size(); i++){
190
cerr << "Loading sequence file: " << sequenceNames[i] << endl;
191
sequences->LoadMFA (sequenceNames[i], true);
194
// do all "pre-training" repetitions first
195
for (int ct = 0; ct < numPreTrainingReps; ct++){
196
enableTraining = true;
198
// build new model for aligning
199
ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
200
emitPairs, emitSingle);
202
// do initial alignments
203
DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
205
// print new parameters
206
PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
208
enableTraining = false;
211
// now, we can perform the alignments and write them out
212
MultiSequence *alignment = DoAlign (sequences,
213
ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),
214
initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
216
if (onlyPrintPosteriors)
218
if (!enableAllPairs){
219
if (enableClustalWOutput)
220
alignment->WriteALN (cout);
222
alignment->WriteMFA (cout);
229
/////////////////////////////////////////////////////////////////
232
// Prints heading for AMAP program.
233
/////////////////////////////////////////////////////////////////
235
void PrintHeading (){
237
<< "AMAP version " << VERSION << " - align multiple protein sequences and print to standard output" << endl
238
<< "PROBCONS Written by Chuong Do" << endl
239
<< "AMAP algorithm implemented by Ariel Schwartz" << endl
243
/////////////////////////////////////////////////////////////////
246
// Prints AMAP parameters to STDERR. If a filename is
247
// specified, then the parameters are also written to the file.
248
/////////////////////////////////////////////////////////////////
250
void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
251
const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
253
// print parameters to the screen
254
cerr << message << endl
255
<< " initDistrib[] = { ";
256
for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
258
<< " gapOpen[] = { ";
259
for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
261
<< " gapExtend[] = { ";
262
for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
267
for (int i = 0; i < 5; i++){
268
for (int j = 0; j <= i; j++){
269
cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
274
// if a file name is specified
277
// attempt to open the file for writing
278
FILE *file = fopen (filename, "w");
280
cerr << "ERROR: Unable to write parameter file: " << filename << endl;
284
// if successful, then write the parameters to the file
285
for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
286
for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
287
for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
288
fprintf (file, "%s\n", alphabet.c_str());
289
for (int i = 0; i < (int) alphabet.size(); i++){
290
for (int j = 0; j <= i; j++)
291
fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
292
fprintf (file, "\n");
294
for (int i = 0; i < (int) alphabet.size(); i++)
295
fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
296
fprintf (file, "\n");
301
/////////////////////////////////////////////////////////////////
304
// First computes all pairwise posterior probability matrices.
305
// Then, computes new parameters if training, or a final
306
// alignment, otherwise.
307
/////////////////////////////////////////////////////////////////
309
MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
310
VF &gapExtend, VVF &emitPairs, VF &emitSingle){
314
const int numSeqs = sequences->GetNumSequences();
315
VVF distances (numSeqs, VF (numSeqs, 0));
316
SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
317
SafeVector<SafeVector<SparseMatrix *> > originalSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
320
// prepare to average parameters
321
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
322
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
323
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
324
if (enableTrainEmissions){
325
for (int i = 0; i < (int) emitPairs.size(); i++)
326
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
327
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
332
// skip posterior calculations if we just want to do Viterbi alignments
334
cerr << "Computing posterior matrices" << endl;
335
// do all pairwise alignments for posterior probability matrices
336
for (int a = 0; a < numSeqs-1; a++){
337
for (int b = a+1; b < numSeqs; b++){
338
Sequence *seq1 = sequences->GetSequence (a);
339
Sequence *seq2 = sequences->GetSequence (b);
343
cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
344
<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
346
// compute forward and backward probabilities
347
VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
348
VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
350
// if we are training, then we'll simply want to compute the
351
// expected counts for each region within the matrix separately;
352
// otherwise, we'll need to put all of the regions together and
353
// assemble a posterior probability match matrix
355
// so, if we're training
358
// compute new parameters
359
VF thisInitDistrib (NumMatrixTypes);
360
VF thisGapOpen (2*NumInsertStates);
361
VF thisGapExtend (2*NumInsertStates);
362
VVF thisEmitPairs (256, VF (256, 1e-10));
363
VF thisEmitSingle (256, 1e-5);
365
model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
367
// add in contribution of the derived parameters
368
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
369
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
370
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
371
if (enableTrainEmissions){
372
for (int i = 0; i < (int) emitPairs.size(); i++)
373
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
374
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
377
// let us know that we're done.
378
if (enableVerbose) cerr << "done." << endl;
381
// compute posterior probability matrix
382
VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
384
// compute sparse representations
385
originalSparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
386
originalSparseMatrices[b][a] = originalSparseMatrices[a][b]->ComputeTranspose();
388
if (!enableAllPairs){
389
if (!enableDagAlignment) {
390
// perform the pairwise sequence alignment
391
pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
395
// compute "expected accuracy" distance for evolutionary tree computation
396
float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
397
//float distance = (gapFactor == 0) ? alignment.second / min (seq1->GetLength(), seq2->GetLength()):
398
// alignment.second / (seq1->GetLength() + seq2->GetLength()) * gapFactor;
399
distances[a][b] = distances[b][a] = distance;
402
cerr << setprecision (10) << distance << endl;
403
// if (distance == 1)
404
// cerr << setprecision(10) << (*posterior)[4 * (seq1->GetLength() + 1) + 4] << endl;
405
// originalSparseMatrices[a][b]->Print(cerr);
407
delete alignment.first;
411
// let us know that we're done.
412
if (enableVerbose) cerr << "done." << endl;
424
// now average out parameters derived
427
// compute new parameters
428
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
429
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
430
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
432
if (enableTrainEmissions){
433
for (int i = 0; i < (int) emitPairs.size(); i++)
434
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
435
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
439
// see if we still want to do some alignments
444
sparseMatrices = originalSparseMatrices;
446
// perform the consistency transformation the desired number of times
447
for (int r = 0; r < numConsistencyReps; r++){
448
SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
450
// now replace the old posterior matrices
451
for (int i = 0; i < numSeqs; i++){
452
for (int j = 0; j < numSeqs; j++){
454
// don't delete the original sparse matrices
455
if (r > 0) delete sparseMatrices[i][j];
456
sparseMatrices[i][j] = newSparseMatrices[i][j];
462
MultiSequence *finalAlignment = NULL;
464
if (onlyPrintPosteriors) {
465
for (int i = 0; i < numSeqs; i++){
466
string seq1name = sequences->GetSequence(i)->GetHeader();
467
for (int j = i + 1; j < numSeqs; j++){
468
cout << "Sparse Matrix: " << i << "," << j << endl;
469
cout << "Sequence names: " << seq1name << ", " << sequences->GetSequence(j)->GetHeader() << endl;
470
sparseMatrices[i][j]->Print(cout);
474
else if (enableAllPairs){
475
for (int a = 0; a < numSeqs-1; a++){
476
for (int b = a+1; b < numSeqs; b++){
477
Sequence *seq1 = sequences->GetSequence (a);
478
Sequence *seq2 = sequences->GetSequence (b);
481
cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
482
<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
485
// perform the pairwise sequence alignment
486
pair<SafeVector<char> *, float> alignment;
488
alignment = model.ComputeViterbiAlignment (seq1, seq2);
491
// build posterior matrix
492
VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
493
int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
494
for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
496
alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior, gapFactor);
500
// write pairwise alignments
501
string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
502
ofstream outfile (name.c_str());
504
MultiSequence *result = new MultiSequence();
505
result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
506
result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
507
if (enableClustalWOutput)
508
result->WriteALN (outfile);
510
result->WriteMFA (outfile);
514
delete alignment.first;
519
// now if we still need to do a final multiple alignment
525
if (!enableDagAlignment) {
526
// compute the evolutionary tree
527
TreeNode *tree = TreeNode::ComputeTree (distances);
529
tree->Print (cerr, sequences);
532
// make the final alignment
533
finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
537
cerr << "Building DAG" << endl;
538
MultiSequenceDag mds(sequences,false);
539
cerr << "Aligning sequences with DAG alignment" << endl;
540
finalAlignment = mds.AlignDag(sparseMatrices, gapFactor, enableVerbose, enableEdgeReordering, useTgf, edgeWeightThreshold);
543
if (enableAnnotation){
544
WriteAnnotation (finalAlignment, originalSparseMatrices);
550
// delete sparse matrices
551
for (int a = 0; a < numSeqs-1; a++){
552
for (int b = a+1; b < numSeqs; b++){
553
delete originalSparseMatrices[a][b];
554
delete originalSparseMatrices[b][a];
556
if (numConsistencyReps > 0){
557
delete sparseMatrices[a][b];
558
delete sparseMatrices[b][a];
564
return finalAlignment;
570
/////////////////////////////////////////////////////////////////
573
// Attempts to parse an integer from the character string given.
574
// Returns true only if no parsing error occurs.
575
/////////////////////////////////////////////////////////////////
577
bool GetInteger (char *data, int *val){
584
retVal = strtol (data, &endPtr, 0);
585
if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
586
if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
587
if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
592
/////////////////////////////////////////////////////////////////
595
// Attempts to parse a float from the character string given.
596
// Returns true only if no parsing error occurs.
597
/////////////////////////////////////////////////////////////////
599
bool GetFloat (char *data, float *val){
606
retVal = strtod (data, &endPtr);
607
if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
608
if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
609
*val = (float) retVal;
613
/////////////////////////////////////////////////////////////////
616
// Parse all command-line options.
617
/////////////////////////////////////////////////////////////////
619
SafeVector<string> ParseParams (int argc, char **argv){
623
cerr << "AMAP comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
624
<< "you are welcome to redistribute it under certain conditions. See the" << endl
625
<< "files README and README.PROBCONS for details." << endl
628
<< " amap [OPTION]... [MFAFILE]..." << endl
630
<< "Description:" << endl
631
<< " Align sequences in MFAFILE(s) and print result to standard output" << endl
633
<< " -clustalw" << endl
634
<< " use CLUSTALW output format instead of MFA" << endl
636
<< " -c, --consistency REPS" << endl
637
<< " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
638
<< " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
640
<< " -ir, --iterative-refinement REPS" << endl
641
<< " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
642
<< " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
644
<< " -pre, --pre-training REPS" << endl
645
<< " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
646
<< " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
649
<< " generate all-pairs pairwise alignments" << endl
651
<< " -viterbi" << endl
652
<< " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
654
<< " -v, --verbose" << endl
655
<< " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
657
<< " -annot FILENAME" << endl
658
<< " write annotation for multiple alignment to FILENAME" << endl
660
<< " -t, --train FILENAME" << endl
661
<< " compute EM transition probabilities, store in FILENAME (default: "
662
<< parametersOutputFilename << ")" << endl
664
<< " -e, --emissions" << endl
665
<< " also reestimate emission probabilities (default: "
666
<< (enableTrainEmissions ? "on" : "off") << ")" << endl
668
<< " -p, --paramfile FILENAME" << endl
669
<< " read parameters from FILENAME (default: "
670
<< parametersInputFilename << ")" << endl
672
<< " -a, --alignment-order" << endl
673
<< " print sequences in alignment order rather than input order (default: "
674
<< (enableAlignOrder ? "on" : "off") << ")" << endl
676
<< " -g, --gap-factor GF" << endl
677
<< " use GF as the gap-factor parameter, set to 0 for best sensitivity, higher values for better specificity (default: "
678
<< gapFactor << ")" << endl
680
<< " -w, --edge-weight-threshold W" << endl
681
<< " stop the sequence annealing process when best edge has lower weight than W," << endl
682
<< " set to 0 for best sensitivity, higher values for better specificity (default: "
683
<< edgeWeightThreshold << ")" << endl
685
<< " -prog, --progressive" << endl
686
<< " use progresive alignment instead of sequence annealing alignment (default: "
687
<< (!enableDagAlignment ? "on" : "off") << ")" << endl
689
<< " -noreorder, --no-edge-reordering" << endl
690
<< " disable reordring of edges during sequence annealing alignment (default: "
691
<< (!enableEdgeReordering ? "on" : "off") << ")" << endl
693
<< " -maxstep, --use-max-stepsize" << endl
694
<< " use maximum improvement step size instead of tGf edge ranking (default: "
695
<< (!useTgf ? "on" : "off") << ")" << endl
697
<< " -print, --print-posteriors" << endl
698
<< " only print the posterior probability matrices (default: "
699
<< (onlyPrintPosteriors ? "on" : "off") << ")" << endl
704
SafeVector<string> sequenceNames;
708
for (int i = 1; i < argc; i++){
709
if (argv[i][0] == '-'){
712
if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
713
enableTraining = true;
715
parametersOutputFilename = string (argv[++i]);
717
cerr << "ERROR: Filename expected for option " << argv[i] << endl;
723
else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
724
enableTrainEmissions = true;
728
else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
730
parametersInputFilename = string (argv[++i]);
732
cerr << "ERROR: Filename expected for option " << argv[i] << endl;
737
// number of consistency transformations
738
else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
740
if (!GetInteger (argv[++i], &tempInt)){
741
cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
745
if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
746
cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
747
<< MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
751
numConsistencyReps = tempInt;
755
cerr << "ERROR: Integer expected for option " << argv[i] << endl;
760
// number of randomized partitioning iterative refinement passes
761
else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
763
if (!GetInteger (argv[++i], &tempInt)){
764
cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
768
if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
769
cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
770
<< MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
774
numIterativeRefinementReps = tempInt;
778
cerr << "ERROR: Integer expected for option " << argv[i] << endl;
783
// number of EM pre-training rounds
784
else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
786
if (!GetInteger (argv[++i], &tempInt)){
787
cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
791
if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
792
cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
793
<< MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
797
numPreTrainingReps = tempInt;
801
cerr << "ERROR: Integer expected for option " << argv[i] << endl;
807
else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
809
if (!GetFloat (argv[++i], &tempFloat)){
810
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
815
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
819
gapOpenPenalty = tempFloat;
823
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
828
// gap extension penalty
829
else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
831
if (!GetFloat (argv[++i], &tempFloat)){
832
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
837
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
841
gapContinuePenalty = tempFloat;
845
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
850
// all-pairs pairwise alignments
851
else if (!strcmp (argv[i], "-pairs")){
852
enableAllPairs = true;
855
// all-pairs pairwise Viterbi alignments
856
else if (!strcmp (argv[i], "-viterbi")){
857
enableAllPairs = true;
858
enableViterbi = true;
862
else if (!strcmp (argv[i], "-annot")){
863
enableAnnotation = true;
865
annotationFilename = argv[++i];
867
cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
872
// clustalw output format
873
else if (!strcmp (argv[i], "-clustalw")){
874
enableClustalWOutput = true;
878
else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
880
if (!GetFloat (argv[++i], &tempFloat)){
881
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
885
if (tempFloat < 0 || tempFloat > 1){
886
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
894
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
900
else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
901
enableVerbose = true;
905
else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
906
enableAlignOrder = true;
910
else if (!strcmp (argv[i], "-prog") || !strcmp (argv[i], "--progressive")){
911
enableDagAlignment = false;
915
else if (!strcmp (argv[i], "-noreorder") || !strcmp (argv[i], "--no-edge-reordering")){
916
enableEdgeReordering = false;
919
// edge ranking method
920
else if (!strcmp (argv[i], "-maxstep") || !strcmp (argv[i], "--use-max-stepsize")){
925
else if (!strcmp (argv[i], "-print") || !strcmp (argv[i], "--print-posteriors")){
926
onlyPrintPosteriors = true;
930
else if (!strcmp (argv[i], "-g") || !strcmp (argv[i], "--gap-factor")){
932
if (!GetFloat (argv[++i], &tempFloat)){
933
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
938
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be negative." << endl;
942
gapFactor = tempFloat;
946
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
951
// edge weight threshold
952
else if (!strcmp (argv[i], "-w") || !strcmp (argv[i], "--edge-weight-threshold")){
954
if (!GetFloat (argv[++i], &tempFloat)){
955
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
960
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be negative." << endl;
964
edgeWeightThreshold = tempFloat;
968
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
975
cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
980
sequenceNames.push_back (string (argv[i]));
984
if (enableTrainEmissions && !enableTraining){
985
cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
989
return sequenceNames;
992
/////////////////////////////////////////////////////////////////
995
// Read initial distribution, transition, and emission
996
// parameters from a file.
997
/////////////////////////////////////////////////////////////////
999
void ReadParameters (){
1003
emitPairs = VVF (256, VF (256, 1e-10));
1004
emitSingle = VF (256, 1e-5);
1006
// read initial state distribution and transition parameters
1008
if (NumInsertStates == 1){
1009
for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
1010
for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
1011
for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
1013
else if (NumInsertStates == 2){
1014
for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
1015
for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
1016
for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
1019
cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
1020
<< " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
1024
alphabet = alphabetDefault;
1026
for (int i = 0; i < (int) alphabet.length(); i++){
1027
emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
1028
emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
1029
for (int j = 0; j <= i; j++){
1030
emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1031
emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1032
emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1033
emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1034
emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1035
emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1036
emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1037
emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1041
if (parametersInputFilename != string ("")){
1042
data.open (parametersInputFilename.c_str());
1044
cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
1049
for (int i = 0; i < 3; i++){
1050
if (!getline (data, line[i])){
1051
cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
1055
istringstream data2;
1056
data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
1057
data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
1058
data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
1060
if (!getline (data, line[0])){
1062
cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
1066
// read alphabet as concatenation of all characters on alphabet line
1069
data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
1071
for (int i = 0; i < (int) alphabet.size(); i++){
1072
for (int j = 0; j <= i; j++){
1075
emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1076
emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1077
emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1078
emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1079
emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1080
emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1081
emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1082
emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1086
for (int i = 0; i < (int) alphabet.size(); i++){
1089
emitSingle[(unsigned char) tolower(alphabet[i])] = val;
1090
emitSingle[(unsigned char) toupper(alphabet[i])] = val;
1096
/////////////////////////////////////////////////////////////////
1099
// Process the tree recursively. Returns the aligned sequences
1100
// corresponding to a node or leaf of the tree.
1101
/////////////////////////////////////////////////////////////////
1103
MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
1104
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1105
const ProbabilisticModel &model){
1106
MultiSequence *result;
1108
// check if this is a node of the alignment tree
1109
if (tree->GetSequenceLabel() == -1){
1110
MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
1111
MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
1114
assert (alignRight);
1116
result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
1123
// otherwise, this is a leaf of the alignment tree
1125
result = new MultiSequence(); assert (result);
1126
result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1132
/////////////////////////////////////////////////////////////////
1133
// ComputeFinalAlignment()
1135
// Compute the final alignment by calling ProcessTree(), then
1136
// performing iterative refinement as needed.
1137
/////////////////////////////////////////////////////////////////
1139
MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
1140
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1141
const ProbabilisticModel &model){
1143
MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
1145
if (enableAlignOrder){
1146
alignment->SaveOrdering();
1147
enableAlignOrder = false;
1150
// tree-based refinement
1151
// TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
1153
// iterative refinement
1154
for (int i = 0; i < numIterativeRefinementReps; i++)
1155
DoIterativeRefinement (sparseMatrices, model, alignment);
1159
// return final alignment
1163
/////////////////////////////////////////////////////////////////
1164
// AlignAlignments()
1166
// Returns the alignment of two MultiSequence objects.
1167
/////////////////////////////////////////////////////////////////
1169
MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
1170
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1171
const ProbabilisticModel &model){
1173
// print some info about the alignment
1175
for (int i = 0; i < align1->GetNumSequences(); i++)
1176
cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1178
for (int i = 0; i < align2->GetNumSequences(); i++)
1179
cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1183
VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff, gapFactor);
1184
pair<SafeVector<char> *, float> alignment;
1186
// choose the alignment routine depending on the "cosmetic" gap penalties used
1187
if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
1188
alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior, gapFactor);
1190
alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1191
*posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1192
gapOpenPenalty, gapContinuePenalty);
1193
// if (enableVerbose)
1194
// cerr << "finished computing alignment\n";
1200
// compute total length of sequences
1202
for (int i = 0; i < align1->GetNumSequences(); i++)
1203
for (int j = 0; j < align2->GetNumSequences(); j++)
1204
totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1206
// give an "accuracy" measure for the alignment
1207
cerr << alignment.second / totLength << endl;
1210
// now build final alignment
1211
MultiSequence *result = new MultiSequence();
1212
for (int i = 0; i < align1->GetNumSequences(); i++)
1213
result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1214
for (int i = 0; i < align2->GetNumSequences(); i++)
1215
result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1216
if (!enableAlignOrder)
1217
result->SortByLabel();
1219
// free temporary alignment
1220
delete alignment.first;
1225
/////////////////////////////////////////////////////////////////
1228
// Performs one round of the consistency transformation. The
1231
// P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
1234
// where S = {x, y, all other sequences...}
1236
/////////////////////////////////////////////////////////////////
1238
SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
1239
SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1240
const int numSeqs = sequences->GetNumSequences();
1242
SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
1244
// for every pair of sequences
1245
for (int i = 0; i < numSeqs; i++){
1246
for (int j = i+1; j < numSeqs; j++){
1247
Sequence *seq1 = sequences->GetSequence (i);
1248
Sequence *seq2 = sequences->GetSequence (j);
1251
cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1252
<< "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1254
// get the original posterior matrix
1255
VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1256
VF &posterior = *posteriorPtr;
1258
const int seq1Length = seq1->GetLength();
1259
const int seq2Length = seq2->GetLength();
1261
VF *oldSumsPtr = new VF(seq1Length + seq2Length + 2,0);
1262
VF &oldSums = *oldSumsPtr;
1263
VF *newSumsPtr = new VF(seq1Length + seq2Length + 2,0);
1264
VF &newSums = *newSumsPtr;
1266
for (int k = 0, kl = 0; k <= seq1Length; k++) {
1267
for (int l = 0; l <= seq2Length; l++) {
1268
oldSums[k] += posterior[kl];
1269
oldSums[seq1Length + 1 + l] += posterior[kl++];
1273
// contribution from the summation where z = x and z = y
1274
for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1277
cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1279
// contribution from all other sequences
1280
for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1281
Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1284
// now renormalization
1285
for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1287
for (int k = 0, kl = 0; k <= seq1Length; k++) {
1288
for (int l = 0; l <= seq2Length; l++) {
1289
newSums[k] += posterior[kl];
1290
newSums[seq1Length + 1 + l] += posterior[kl++];
1294
int gapPostBase = (seq1Length+1) * (seq2Length+1);
1295
for (int k = 0; k < seq1Length + seq2Length + 2; k++) {
1296
if (oldSums[k] < POSTERIOR_CUTOFF) {
1298
cerr << "negative new gap posterior!\n";
1301
cerr << setprecision(5) << posterior[gapPostBase + k] << "->" << setprecision(5) << 1 - newSums[k] << ", ";
1302
posterior[gapPostBase + k] = 1 - newSums[k];
1306
posterior[gapPostBase + k] *= newSums[k] / oldSums[k];
1307
if (enableVerbose && newSums[k] > oldSums[k])
1308
cerr << setprecision(5) << newSums[k] / oldSums[k] << ", ";
1315
// save the new posterior matrix
1316
newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1317
newSparseMatrices[j][i] = newSparseMatrices[i][j]->ComputeTranspose();
1320
cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1322
delete posteriorPtr;
1327
cerr << "done." << endl;
1331
return newSparseMatrices;
1334
/////////////////////////////////////////////////////////////////
1337
// Computes the consistency transformation for a single sequence
1338
// z, and adds the transformed matrix to "posterior".
1339
/////////////////////////////////////////////////////////////////
1341
void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
1346
int lengthX = matXZ->GetSeq1Length();
1347
int lengthY = matZY->GetSeq2Length();
1348
assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1351
for (int i = 1; i <= lengthX; i++){
1352
SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1353
SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1355
VF::iterator base = posterior.begin() + i * (lengthY + 1);
1357
// iterate through all x[i]-z[k]
1358
while (XZptr != XZend){
1359
SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1360
SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1361
const float XZval = XZptr->second;
1363
// iterate through all z[k]-y[j]
1364
while (ZYptr != ZYend){
1365
base[ZYptr->first] += XZval * ZYptr->second;
1373
/////////////////////////////////////////////////////////////////
1376
// Returns set containing all leaf labels of the current subtree.
1377
/////////////////////////////////////////////////////////////////
1379
set<int> GetSubtree (const TreeNode *tree){
1382
if (tree->GetSequenceLabel() == -1){
1383
s = GetSubtree (tree->GetLeftChild());
1384
set<int> t = GetSubtree (tree->GetRightChild());
1386
for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1390
s.insert (tree->GetSequenceLabel());
1396
/////////////////////////////////////////////////////////////////
1397
// TreeBasedBiPartitioning
1399
// Uses the iterative refinement scheme from MUSCLE.
1400
/////////////////////////////////////////////////////////////////
1402
void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1403
const ProbabilisticModel &model, MultiSequence* &alignment,
1404
const TreeNode *tree){
1405
// check if this is a node of the alignment tree
1406
if (tree->GetSequenceLabel() == -1){
1407
TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());
1408
TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());
1410
set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1411
set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1412
set<int> leftSubtreeComplement, rightSubtreeComplement;
1414
// calculate complement of each subtree
1415
for (int i = 0; i < alignment->GetNumSequences(); i++){
1416
if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1417
if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1420
// perform realignments for edge to left child
1421
if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1422
MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1423
MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1425
alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1428
// perform realignments for edge to right child
1429
if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1430
MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1431
MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1433
alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1438
/////////////////////////////////////////////////////////////////
1439
// DoIterativeRefinement()
1441
// Performs a single round of randomized partionining iterative
1443
/////////////////////////////////////////////////////////////////
1445
void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1446
const ProbabilisticModel &model, MultiSequence* &alignment){
1447
set<int> groupOne, groupTwo;
1449
// create two separate groups
1450
for (int i = 0; i < alignment->GetNumSequences(); i++){
1452
groupOne.insert (i);
1454
groupTwo.insert (i);
1457
if (groupOne.empty() || groupTwo.empty()) return;
1459
// project into the two groups
1460
MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1461
MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1465
alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1468
/////////////////////////////////////////////////////////////////
1469
// WriteAnnotation()
1471
// Computes annotation for multiple alignment and write values
1473
/////////////////////////////////////////////////////////////////
1475
void WriteAnnotation (MultiSequence *alignment,
1476
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1477
ofstream outfile (annotationFilename.c_str());
1479
if (outfile.fail()){
1480
cerr << "ERROR: Unable to write annotation file." << endl;
1484
const int alignLength = alignment->GetSequence(0)->GetLength();
1485
const int numSeqs = alignment->GetNumSequences();
1487
SafeVector<int> position (numSeqs, 0);
1488
SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1489
for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1490
SafeVector<pair<int,int> > active;
1491
active.reserve (numSeqs);
1494
for (int i = 1; i <= alignLength; i++){
1496
// find all aligned residues in this particular column
1498
for (int j = 0; j < numSeqs; j++){
1499
if (seqs[j][i] != '-'){
1500
active.push_back (make_pair(j, ++position[j]));
1504
outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1510
/////////////////////////////////////////////////////////////////
1513
// Computes the annotation score for a particular column.
1514
/////////////////////////////////////////////////////////////////
1516
int ComputeScore (const SafeVector<pair<int, int> > &active,
1517
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1519
if (active.size() <= 1) return 0;
1521
// ALTERNATIVE #1: Compute the average alignment score.
1524
for (int i = 0; i < (int) active.size(); i++){
1525
for (int j = i+1; j < (int) active.size(); j++){
1526
val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1530
return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));