1
/////////////////////////////////////////////////////////////////
4
// Main routines for AMAP program.
5
/////////////////////////////////////////////////////////////////
7
#include "SafeVector.h"
8
#include "MultiSequence.h"
9
#include "MultiSequenceDag.h"
11
#include "ScoreType.h"
12
#include "ProbabilisticModel.h"
13
#include "EvolutionaryTree.h"
14
#include "SparseMatrix.h"
27
string parametersInputFilename = "";
28
string parametersOutputFilename = "no training";
29
string annotationFilename = "";
31
bool enableTraining = false;
32
bool enableVerbose = false;
33
bool enableAllPairs = false;
34
bool enableAnnotation = false;
35
bool enableViterbi = false;
36
bool enableClustalWOutput = false;
37
bool enableTrainEmissions = false;
38
bool enableAlignOrder = false;
39
bool enableDagAlignment = true;
40
bool enableEdgeReordering = true;
42
bool onlyPrintPosteriors = false;
43
bool outputForGUI = false;
45
int numConsistencyReps = 0;
46
int numPreTrainingReps = 0;
47
int numIterativeRefinementReps = 0;
50
float guiStartWeight = std::numeric_limits<float>::max();
52
float gapOpenPenalty = 0;
53
float gapContinuePenalty = 0;
54
VF initDistrib (NumMatrixTypes);
55
VF gapOpen (2*NumInsertStates);
56
VF gapExtend (2*NumInsertStates);
57
VVF emitPairs (256, VF (256, 1e-10));
58
VF emitSingle (256, 1e-5);
59
string alphabet = alphabetDefault;
60
float gapFactor = gapFactorDefault;
61
float edgeWeightThreshold = 0;
63
const int MIN_PRETRAINING_REPS = 0;
64
const int MAX_PRETRAINING_REPS = 20;
65
const int MIN_CONSISTENCY_REPS = 0;
66
const int MAX_CONSISTENCY_REPS = 5;
67
const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
68
const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
70
/////////////////////////////////////////////////////////////////
71
// Function prototypes
72
/////////////////////////////////////////////////////////////////
75
void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
76
const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
77
MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,
78
VVF &emitPairs, VF &emitSingle);
79
SafeVector<string> ParseParams (int argc, char **argv);
80
void ReadParameters ();
81
MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
82
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
83
const ProbabilisticModel &model);
84
MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
85
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
86
const ProbabilisticModel &model);
87
SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
88
SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
89
void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
91
set<int> GetSubtree (const TreeNode *tree);
92
void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
93
const ProbabilisticModel &model, MultiSequence* &alignment,
94
const TreeNode *tree);
95
void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
96
const ProbabilisticModel &model, MultiSequence* &alignment);
97
void WriteAnnotation (MultiSequence *alignment,
98
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
99
int ComputeScore (const SafeVector<pair<int, int> > &active,
100
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
103
/////////////////////////////////////////////////////////////////
106
// Calls all initialization routines and runs the AMAP
108
/////////////////////////////////////////////////////////////////
110
int main (int argc, char **argv){
112
// print AMAP heading
115
// parse program parameters
116
SafeVector<string> sequenceNames = ParseParams (argc, argv);
118
PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
120
// now, we'll process all the files given as input. If we are given
121
// several filenames as input, then we'll load all of those sequences
122
// simultaneously, as long as we're not training. On the other hand,
123
// if we are training, then we'll treat each file as a separate
126
// if we are training
129
// build new model for aligning
130
ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
132
// prepare to average parameters
133
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
134
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
135
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
136
if (enableTrainEmissions){
137
for (int i = 0; i < (int) emitPairs.size(); i++)
138
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
139
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
142
// align each file individually
143
for (int i = 0; i < (int) sequenceNames.size(); i++){
145
VF thisInitDistrib (NumMatrixTypes);
146
VF thisGapOpen (2*NumInsertStates);
147
VF thisGapExtend (2*NumInsertStates);
148
VVF thisEmitPairs (256, VF (256, 1e-10));
149
VF thisEmitSingle (256, 1e-5);
151
// load sequence file
152
MultiSequence *sequences = new MultiSequence(); assert (sequences);
153
cerr << "Loading sequence file: " << sequenceNames[i] << endl;
154
sequences->LoadMFA (sequenceNames[i], true);
157
MultiSequence *tmpMsa = DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
160
// add in contribution of the derived parameters
161
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
162
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
163
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
164
if (enableTrainEmissions){
165
for (int i = 0; i < (int) emitPairs.size(); i++)
166
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
167
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
173
// compute new parameters and print them out
174
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
175
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
176
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
177
if (enableTrainEmissions){
178
for (int i = 0; i < (int) emitPairs.size(); i++)
179
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
180
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
183
PrintParameters ("Trained parameter set:",
184
initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
185
parametersOutputFilename.c_str());
188
// if we are not training, we must simply want to align some sequences
191
bool allStartWithM = true;
192
// load all files together
193
MultiSequence *sequences = new MultiSequence(); assert (sequences);
194
for (int i = 0; i < (int) sequenceNames.size(); i++){
195
cerr << "Loading sequence file: " << sequenceNames[i] << endl;
196
sequences->LoadMFA (sequenceNames[i], true);
199
for (int i = 0; i < sequences->GetNumSequences(); i++){
200
char firstChar = *(++(sequences->GetSequence(i)->GetDataPtr()));
201
allStartWithM = firstChar == 'M' ? allStartWithM : false;
205
for (int i = 0; i < sequences->GetNumSequences(); i++){
206
int seqLength = sequences->GetSequence(0)->GetLength();
207
Sequence* newSeq = sequences->GetSequence(0)->GetRange(2,seqLength);
208
sequences->RemoveSequence(0);
209
sequences->AddSequence(newSeq);
213
// do all "pre-training" repetitions first
214
for (int ct = 0; ct < numPreTrainingReps; ct++){
215
enableTraining = true;
217
// build new model for aligning
218
ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
219
emitPairs, emitSingle);
221
// do initial alignments
222
DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
224
// print new parameters
225
PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
227
enableTraining = false;
230
// now, we can perform the alignments and write them out
231
MultiSequence *alignment = DoAlign (sequences,
232
ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),
233
initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
236
for (int i = 0; i < alignment->GetNumSequences(); i++){
237
SafeVector<char>* newData = new SafeVector<char>;
238
Sequence* currSeq = alignment->GetSequence(0);
239
SafeVector<char>::iterator dataItr = currSeq->GetDataPtr();
240
newData->push_back(*dataItr++);
241
newData->push_back('M');
242
for (int j = 0; j < currSeq->GetLength(); j++) {
243
newData->push_back(*dataItr++);
245
Sequence *newSeq = new Sequence(newData, currSeq->GetHeader(), currSeq->GetLength() + 1, currSeq->GetSortLabel(), currSeq->GetLabel());
246
alignment->AddSequence(newSeq);
247
alignment->RemoveSequence(0);
251
if (onlyPrintPosteriors)
253
if (!enableAllPairs && !outputForGUI){
254
if (enableClustalWOutput)
255
alignment->WriteALN (cout);
257
alignment->WriteMFA (cout);
264
/////////////////////////////////////////////////////////////////
267
// Prints heading for AMAP program.
268
/////////////////////////////////////////////////////////////////
270
void PrintHeading (){
272
<< "AMAP version " << VERSION << " - align multiple protein sequences and print to standard output" << endl
273
<< "PROBCONS Written by Chuong Do" << endl
274
<< "AMAP algorithm implemented by Ariel Schwartz" << endl
278
/////////////////////////////////////////////////////////////////
281
// Prints AMAP parameters to STDERR. If a filename is
282
// specified, then the parameters are also written to the file.
283
/////////////////////////////////////////////////////////////////
285
void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
286
const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
288
// print parameters to the screen
289
cerr << message << endl
290
<< " initDistrib[] = { ";
291
for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
293
<< " gapOpen[] = { ";
294
for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
296
<< " gapExtend[] = { ";
297
for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
302
for (int i = 0; i < 5; i++){
303
for (int j = 0; j <= i; j++){
304
cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
309
// if a file name is specified
312
// attempt to open the file for writing
313
FILE *file = fopen (filename, "w");
315
cerr << "ERROR: Unable to write parameter file: " << filename << endl;
319
// if successful, then write the parameters to the file
320
for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
321
for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
322
for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
323
fprintf (file, "%s\n", alphabet.c_str());
324
for (int i = 0; i < (int) alphabet.size(); i++){
325
for (int j = 0; j <= i; j++)
326
fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
327
fprintf (file, "\n");
329
for (int i = 0; i < (int) alphabet.size(); i++)
330
fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
331
fprintf (file, "\n");
336
/////////////////////////////////////////////////////////////////
339
// First computes all pairwise posterior probability matrices.
340
// Then, computes new parameters if training, or a final
341
// alignment, otherwise.
342
/////////////////////////////////////////////////////////////////
344
MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
345
VF &gapExtend, VVF &emitPairs, VF &emitSingle){
349
const int numSeqs = sequences->GetNumSequences();
350
VVF distances (numSeqs, VF (numSeqs, 0));
351
SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
352
SafeVector<SafeVector<SparseMatrix *> > originalSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
355
// prepare to average parameters
356
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
357
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
358
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
359
if (enableTrainEmissions){
360
for (int i = 0; i < (int) emitPairs.size(); i++)
361
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
362
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
367
// skip posterior calculations if we just want to do Viterbi alignments
369
cerr << "Computing posterior matrices" << endl;
370
// do all pairwise alignments for posterior probability matrices
371
for (int a = 0; a < numSeqs-1; a++){
372
for (int b = a+1; b < numSeqs; b++){
373
Sequence *seq1 = sequences->GetSequence (a);
374
Sequence *seq2 = sequences->GetSequence (b);
378
cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
379
<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
381
// compute forward and backward probabilities
382
VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
383
VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
385
// if we are training, then we'll simply want to compute the
386
// expected counts for each region within the matrix separately;
387
// otherwise, we'll need to put all of the regions together and
388
// assemble a posterior probability match matrix
390
// so, if we're training
393
// compute new parameters
394
VF thisInitDistrib (NumMatrixTypes);
395
VF thisGapOpen (2*NumInsertStates);
396
VF thisGapExtend (2*NumInsertStates);
397
VVF thisEmitPairs (256, VF (256, 1e-10));
398
VF thisEmitSingle (256, 1e-5);
400
model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
402
// add in contribution of the derived parameters
403
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
404
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
405
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
406
if (enableTrainEmissions){
407
for (int i = 0; i < (int) emitPairs.size(); i++)
408
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
409
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
412
// let us know that we're done.
413
if (enableVerbose) cerr << "done." << endl;
416
// compute posterior probability matrix
417
VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
419
// compute sparse representations
420
originalSparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
421
originalSparseMatrices[b][a] = originalSparseMatrices[a][b]->ComputeTranspose();
423
if (!enableAllPairs){
424
if (!enableDagAlignment) {
425
// perform the pairwise sequence alignment
426
pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
430
// compute "expected accuracy" distance for evolutionary tree computation
431
float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
432
//float distance = (gapFactor == 0) ? alignment.second / min (seq1->GetLength(), seq2->GetLength()):
433
// alignment.second / (seq1->GetLength() + seq2->GetLength()) * gapFactor;
434
distances[a][b] = distances[b][a] = distance;
437
cerr << setprecision (10) << distance << endl;
438
// if (distance == 1)
439
// cerr << setprecision(10) << (*posterior)[4 * (seq1->GetLength() + 1) + 4] << endl;
440
// originalSparseMatrices[a][b]->Print(cerr);
442
delete alignment.first;
446
// let us know that we're done.
447
if (enableVerbose) cerr << "done." << endl;
459
// now average out parameters derived
462
// compute new parameters
463
for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
464
for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
465
for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
467
if (enableTrainEmissions){
468
for (int i = 0; i < (int) emitPairs.size(); i++)
469
for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
470
for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
474
// see if we still want to do some alignments
479
sparseMatrices = originalSparseMatrices;
481
// perform the consistency transformation the desired number of times
482
for (int r = 0; r < numConsistencyReps; r++){
483
SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
485
// now replace the old posterior matrices
486
for (int i = 0; i < numSeqs; i++){
487
for (int j = 0; j < numSeqs; j++){
489
// don't delete the original sparse matrices
490
if (r > 0) delete sparseMatrices[i][j];
491
sparseMatrices[i][j] = newSparseMatrices[i][j];
497
MultiSequence *finalAlignment = NULL;
499
if (onlyPrintPosteriors) {
500
for (int i = 0; i < numSeqs; i++){
501
string seq1name = sequences->GetSequence(i)->GetHeader();
502
for (int j = i + 1; j < numSeqs; j++){
503
cout << "Sparse Matrix: " << i << "," << j << endl;
504
cout << "Sequence names: " << seq1name << ", " << sequences->GetSequence(j)->GetHeader() << endl;
505
sparseMatrices[i][j]->Print(cout);
509
else if (enableAllPairs){
510
for (int a = 0; a < numSeqs-1; a++){
511
for (int b = a+1; b < numSeqs; b++){
512
Sequence *seq1 = sequences->GetSequence (a);
513
Sequence *seq2 = sequences->GetSequence (b);
516
cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
517
<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
520
// perform the pairwise sequence alignment
521
pair<SafeVector<char> *, float> alignment;
523
alignment = model.ComputeViterbiAlignment (seq1, seq2);
526
// build posterior matrix
527
VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
528
int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
529
for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
531
alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior, gapFactor);
535
// write pairwise alignments
536
string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
537
ofstream outfile (name.c_str());
539
MultiSequence *result = new MultiSequence();
540
result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
541
result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
542
if (enableClustalWOutput)
543
result->WriteALN (outfile);
545
result->WriteMFA (outfile);
549
delete alignment.first;
554
// now if we still need to do a final multiple alignment
560
if (!enableDagAlignment) {
561
// compute the evolutionary tree
562
TreeNode *tree = TreeNode::ComputeTree (distances);
564
tree->Print (cerr, sequences);
567
// make the final alignment
568
finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
572
cerr << "Building DAG" << endl;
573
MultiSequenceDag mds(sequences,false,outputForGUI,guiStartWeight,guiStepSize);
574
cerr << "Starting the sequence annealing process" << endl;
575
finalAlignment = mds.AlignDag(sparseMatrices, gapFactor, enableVerbose, enableEdgeReordering, useTgf, edgeWeightThreshold);
578
if (enableAnnotation){
579
WriteAnnotation (finalAlignment, originalSparseMatrices);
585
// delete sparse matrices
586
for (int a = 0; a < numSeqs-1; a++){
587
for (int b = a+1; b < numSeqs; b++){
588
delete originalSparseMatrices[a][b];
589
delete originalSparseMatrices[b][a];
591
if (numConsistencyReps > 0){
592
delete sparseMatrices[a][b];
593
delete sparseMatrices[b][a];
599
return finalAlignment;
605
/////////////////////////////////////////////////////////////////
608
// Attempts to parse an integer from the character string given.
609
// Returns true only if no parsing error occurs.
610
/////////////////////////////////////////////////////////////////
612
bool GetInteger (char *data, int *val){
619
retVal = strtol (data, &endPtr, 0);
620
if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
621
if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
622
if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
627
/////////////////////////////////////////////////////////////////
630
// Attempts to parse a float from the character string given.
631
// Returns true only if no parsing error occurs.
632
/////////////////////////////////////////////////////////////////
634
bool GetFloat (char *data, float *val){
641
retVal = strtod (data, &endPtr);
642
if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
643
if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
644
*val = (float) retVal;
648
/////////////////////////////////////////////////////////////////
651
// Parse all command-line options.
652
/////////////////////////////////////////////////////////////////
654
SafeVector<string> ParseParams (int argc, char **argv){
658
cerr << "AMAP comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
659
<< "you are welcome to redistribute it under certain conditions. See the" << endl
660
<< "files README and README.PROBCONS for details." << endl
663
<< " amap [OPTION]... [MFAFILE]..." << endl
665
<< "Description:" << endl
666
<< " Align sequences in MFAFILE(s) and print result to standard output" << endl
668
<< " -clustalw" << endl
669
<< " use CLUSTALW output format instead of MFA" << endl
671
<< " -c, --consistency REPS" << endl
672
<< " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
673
<< " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
675
<< " -ir, --iterative-refinement REPS" << endl
676
<< " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
677
<< " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
679
<< " -pre, --pre-training REPS" << endl
680
<< " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
681
<< " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
684
<< " generate all-pairs pairwise alignments" << endl
686
<< " -viterbi" << endl
687
<< " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
689
<< " -v, --verbose" << endl
690
<< " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
692
<< " -annot FILENAME" << endl
693
<< " write annotation for multiple alignment to FILENAME" << endl
695
<< " -t, --train FILENAME" << endl
696
<< " compute EM transition probabilities, store in FILENAME (default: "
697
<< parametersOutputFilename << ")" << endl
699
<< " -e, --emissions" << endl
700
<< " also reestimate emission probabilities (default: "
701
<< (enableTrainEmissions ? "on" : "off") << ")" << endl
703
<< " -p, --paramfile FILENAME" << endl
704
<< " read parameters from FILENAME (default: "
705
<< parametersInputFilename << ")" << endl
707
<< " -a, --alignment-order" << endl
708
<< " print sequences in alignment order rather than input order (default: "
709
<< (enableAlignOrder ? "on" : "off") << ")" << endl
711
<< " -g, --gap-factor GF" << endl
712
<< " use GF as the gap-factor parameter, set to 0 for best sensitivity, higher values for better specificity (default: "
713
<< gapFactor << ")" << endl
715
<< " -w, --edge-weight-threshold W" << endl
716
<< " stop the sequence annealing process when best edge has lower weight than W," << endl
717
<< " set to 0 for best sensitivity, higher values for better specificity (default: "
718
<< edgeWeightThreshold << ")" << endl
720
<< " -prog, --progressive" << endl
721
<< " use progresive alignment instead of sequence annealing alignment (default: "
722
<< (!enableDagAlignment ? "on" : "off") << ")" << endl
724
<< " -noreorder, --no-edge-reordering" << endl
725
<< " disable reordring of edges during sequence annealing alignment (default: "
726
<< (!enableEdgeReordering ? "on" : "off") << ")" << endl
728
<< " -maxstep, --use-max-stepsize" << endl
729
<< " use maximum improvement step size instead of tGf edge ranking (default: "
730
<< (!useTgf ? "on" : "off") << ")" << endl
732
<< " -print, --print-posteriors" << endl
733
<< " only print the posterior probability matrices (default: "
734
<< (onlyPrintPosteriors ? "on" : "off") << ")" << endl
736
<< " -gui [START] [STEP]" << endl
737
<< " print output for the AMAP Display Java based GUI (default: "
738
<< (outputForGUI ? "on" : "off") << ") " << endl
739
<< " starting at weight START (default: infinity) with step size STEP (default: " << guiStepSize << ")" << endl
744
SafeVector<string> sequenceNames;
748
for (int i = 1; i < argc; i++){
749
if (argv[i][0] == '-'){
752
if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
753
enableTraining = true;
755
parametersOutputFilename = string (argv[++i]);
757
cerr << "ERROR: Filename expected for option " << argv[i] << endl;
763
else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
764
enableTrainEmissions = true;
768
else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
770
parametersInputFilename = string (argv[++i]);
772
cerr << "ERROR: Filename expected for option " << argv[i] << endl;
777
// number of consistency transformations
778
else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
780
if (!GetInteger (argv[++i], &tempInt)){
781
cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
785
if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
786
cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
787
<< MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
791
numConsistencyReps = tempInt;
795
cerr << "ERROR: Integer expected for option " << argv[i] << endl;
800
// number of randomized partitioning iterative refinement passes
801
else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
803
if (!GetInteger (argv[++i], &tempInt)){
804
cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
808
if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
809
cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
810
<< MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
814
numIterativeRefinementReps = tempInt;
818
cerr << "ERROR: Integer expected for option " << argv[i] << endl;
823
// number of EM pre-training rounds
824
else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
826
if (!GetInteger (argv[++i], &tempInt)){
827
cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
831
if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
832
cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
833
<< MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
837
numPreTrainingReps = tempInt;
841
cerr << "ERROR: Integer expected for option " << argv[i] << endl;
847
else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
849
if (!GetFloat (argv[++i], &tempFloat)){
850
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
855
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
859
gapOpenPenalty = tempFloat;
863
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
868
// gap extension penalty
869
else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
871
if (!GetFloat (argv[++i], &tempFloat)){
872
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
877
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
881
gapContinuePenalty = tempFloat;
885
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
890
// all-pairs pairwise alignments
891
else if (!strcmp (argv[i], "-pairs")){
892
enableAllPairs = true;
895
// all-pairs pairwise Viterbi alignments
896
else if (!strcmp (argv[i], "-viterbi")){
897
enableAllPairs = true;
898
enableViterbi = true;
902
else if (!strcmp (argv[i], "-annot")){
903
enableAnnotation = true;
905
annotationFilename = argv[++i];
907
cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
912
// clustalw output format
913
else if (!strcmp (argv[i], "-clustalw")){
914
enableClustalWOutput = true;
918
else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
920
if (!GetFloat (argv[++i], &tempFloat)){
921
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
925
if (tempFloat < 0 || tempFloat > 1){
926
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
934
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
940
else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
941
enableVerbose = true;
944
// output for AMAP Display
945
else if (!strcmp (argv[i], "-gui")) {
948
if (GetFloat (argv[i + 1], &tempFloat)){
950
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be non-negative." << endl;
954
guiStartWeight = tempFloat;
957
if (GetInteger (argv[i + 1], &tempInt)){
959
cerr << "ERROR: For option " << argv[i-1] << ", integer must be positive." << endl;
963
guiStepSize = tempInt;
974
else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
975
enableAlignOrder = true;
979
else if (!strcmp (argv[i], "-prog") || !strcmp (argv[i], "--progressive")){
980
enableDagAlignment = false;
984
else if (!strcmp (argv[i], "-noreorder") || !strcmp (argv[i], "--no-edge-reordering")){
985
enableEdgeReordering = false;
988
// edge ranking method
989
else if (!strcmp (argv[i], "-maxstep") || !strcmp (argv[i], "--use-max-stepsize")){
994
else if (!strcmp (argv[i], "-print") || !strcmp (argv[i], "--print-posteriors")){
995
onlyPrintPosteriors = true;
999
else if (!strcmp (argv[i], "-g") || !strcmp (argv[i], "--gap-factor")){
1001
if (!GetFloat (argv[++i], &tempFloat)){
1002
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1007
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be negative." << endl;
1011
gapFactor = tempFloat;
1015
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1020
// edge weight threshold
1021
else if (!strcmp (argv[i], "-w") || !strcmp (argv[i], "--edge-weight-threshold")){
1023
if (!GetFloat (argv[++i], &tempFloat)){
1024
cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1029
cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be negative." << endl;
1033
edgeWeightThreshold = tempFloat;
1037
cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1044
cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
1049
sequenceNames.push_back (string (argv[i]));
1053
if (enableTrainEmissions && !enableTraining){
1054
cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
1058
return sequenceNames;
1061
/////////////////////////////////////////////////////////////////
1064
// Read initial distribution, transition, and emission
1065
// parameters from a file.
1066
/////////////////////////////////////////////////////////////////
1068
void ReadParameters (){
1072
emitPairs = VVF (256, VF (256, 1e-10));
1073
emitSingle = VF (256, 1e-5);
1075
// read initial state distribution and transition parameters
1077
if (NumInsertStates == 1){
1078
for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
1079
for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
1080
for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
1082
else if (NumInsertStates == 2){
1083
for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
1084
for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
1085
for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
1088
cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
1089
<< " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
1093
alphabet = alphabetDefault;
1095
for (int i = 0; i < (int) alphabet.length(); i++){
1096
emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
1097
emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
1098
for (int j = 0; j <= i; j++){
1099
emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1100
emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1101
emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1102
emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1103
emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1104
emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1105
emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1106
emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1110
if (parametersInputFilename != string ("")){
1111
data.open (parametersInputFilename.c_str());
1113
cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
1118
for (int i = 0; i < 3; i++){
1119
if (!getline (data, line[i])){
1120
cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
1124
istringstream data2;
1125
data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
1126
data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
1127
data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
1129
if (!getline (data, line[0])){
1131
cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
1135
// read alphabet as concatenation of all characters on alphabet line
1138
data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
1140
for (int i = 0; i < (int) alphabet.size(); i++){
1141
for (int j = 0; j <= i; j++){
1144
emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1145
emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1146
emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1147
emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1148
emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1149
emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1150
emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1151
emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1155
for (int i = 0; i < (int) alphabet.size(); i++){
1158
emitSingle[(unsigned char) tolower(alphabet[i])] = val;
1159
emitSingle[(unsigned char) toupper(alphabet[i])] = val;
1165
/////////////////////////////////////////////////////////////////
1168
// Process the tree recursively. Returns the aligned sequences
1169
// corresponding to a node or leaf of the tree.
1170
/////////////////////////////////////////////////////////////////
1172
MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
1173
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1174
const ProbabilisticModel &model){
1175
MultiSequence *result;
1177
// check if this is a node of the alignment tree
1178
if (tree->GetSequenceLabel() == -1){
1179
MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
1180
MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
1183
assert (alignRight);
1185
result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
1192
// otherwise, this is a leaf of the alignment tree
1194
result = new MultiSequence(); assert (result);
1195
result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1201
/////////////////////////////////////////////////////////////////
1202
// ComputeFinalAlignment()
1204
// Compute the final alignment by calling ProcessTree(), then
1205
// performing iterative refinement as needed.
1206
/////////////////////////////////////////////////////////////////
1208
MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
1209
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1210
const ProbabilisticModel &model){
1212
MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
1214
if (enableAlignOrder){
1215
alignment->SaveOrdering();
1216
enableAlignOrder = false;
1219
// tree-based refinement
1220
// TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
1222
// iterative refinement
1223
for (int i = 0; i < numIterativeRefinementReps; i++)
1224
DoIterativeRefinement (sparseMatrices, model, alignment);
1228
// return final alignment
1232
/////////////////////////////////////////////////////////////////
1233
// AlignAlignments()
1235
// Returns the alignment of two MultiSequence objects.
1236
/////////////////////////////////////////////////////////////////
1238
MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
1239
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1240
const ProbabilisticModel &model){
1242
// print some info about the alignment
1244
for (int i = 0; i < align1->GetNumSequences(); i++)
1245
cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1247
for (int i = 0; i < align2->GetNumSequences(); i++)
1248
cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1252
VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff, gapFactor);
1253
pair<SafeVector<char> *, float> alignment;
1255
// choose the alignment routine depending on the "cosmetic" gap penalties used
1256
if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
1257
alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior, gapFactor);
1259
alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1260
*posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1261
gapOpenPenalty, gapContinuePenalty);
1262
// if (enableVerbose)
1263
// cerr << "finished computing alignment\n";
1269
// compute total length of sequences
1271
for (int i = 0; i < align1->GetNumSequences(); i++)
1272
for (int j = 0; j < align2->GetNumSequences(); j++)
1273
totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1275
// give an "accuracy" measure for the alignment
1276
cerr << alignment.second / totLength << endl;
1279
// now build final alignment
1280
MultiSequence *result = new MultiSequence();
1281
for (int i = 0; i < align1->GetNumSequences(); i++)
1282
result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1283
for (int i = 0; i < align2->GetNumSequences(); i++)
1284
result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1285
if (!enableAlignOrder)
1286
result->SortByLabel();
1288
// free temporary alignment
1289
delete alignment.first;
1294
/////////////////////////////////////////////////////////////////
1297
// Performs one round of the consistency transformation. The
1300
// P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
1303
// where S = {x, y, all other sequences...}
1305
/////////////////////////////////////////////////////////////////
1307
SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
1308
SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1309
const int numSeqs = sequences->GetNumSequences();
1311
SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
1313
// for every pair of sequences
1314
for (int i = 0; i < numSeqs; i++){
1315
for (int j = i+1; j < numSeqs; j++){
1316
Sequence *seq1 = sequences->GetSequence (i);
1317
Sequence *seq2 = sequences->GetSequence (j);
1320
cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1321
<< "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1323
// get the original posterior matrix
1324
VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1325
VF &posterior = *posteriorPtr;
1327
const int seq1Length = seq1->GetLength();
1328
const int seq2Length = seq2->GetLength();
1330
VF *oldSumsPtr = new VF(seq1Length + seq2Length + 2,0);
1331
VF &oldSums = *oldSumsPtr;
1332
VF *newSumsPtr = new VF(seq1Length + seq2Length + 2,0);
1333
VF &newSums = *newSumsPtr;
1335
for (int k = 0, kl = 0; k <= seq1Length; k++) {
1336
for (int l = 0; l <= seq2Length; l++) {
1337
oldSums[k] += posterior[kl];
1338
oldSums[seq1Length + 1 + l] += posterior[kl++];
1342
// contribution from the summation where z = x and z = y
1343
for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1346
cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1348
// contribution from all other sequences
1349
for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1350
Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1353
// now renormalization
1354
for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1356
for (int k = 0, kl = 0; k <= seq1Length; k++) {
1357
for (int l = 0; l <= seq2Length; l++) {
1358
newSums[k] += posterior[kl];
1359
newSums[seq1Length + 1 + l] += posterior[kl++];
1363
int gapPostBase = (seq1Length+1) * (seq2Length+1);
1364
for (int k = 0; k < seq1Length + seq2Length + 2; k++) {
1365
if (oldSums[k] < POSTERIOR_CUTOFF) {
1367
cerr << "negative new gap posterior!\n";
1370
cerr << setprecision(5) << posterior[gapPostBase + k] << "->" << setprecision(5) << 1 - newSums[k] << ", ";
1371
posterior[gapPostBase + k] = 1 - newSums[k];
1375
posterior[gapPostBase + k] *= newSums[k] / oldSums[k];
1376
if (enableVerbose && newSums[k] > oldSums[k])
1377
cerr << setprecision(5) << newSums[k] / oldSums[k] << ", ";
1384
// save the new posterior matrix
1385
newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1386
newSparseMatrices[j][i] = newSparseMatrices[i][j]->ComputeTranspose();
1389
cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1391
delete posteriorPtr;
1396
cerr << "done." << endl;
1400
return newSparseMatrices;
1403
/////////////////////////////////////////////////////////////////
1406
// Computes the consistency transformation for a single sequence
1407
// z, and adds the transformed matrix to "posterior".
1408
/////////////////////////////////////////////////////////////////
1410
void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
1415
int lengthX = matXZ->GetSeq1Length();
1416
int lengthY = matZY->GetSeq2Length();
1417
assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1420
for (int i = 1; i <= lengthX; i++){
1421
SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1422
SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1424
VF::iterator base = posterior.begin() + i * (lengthY + 1);
1426
// iterate through all x[i]-z[k]
1427
while (XZptr != XZend){
1428
SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1429
SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1430
const float XZval = XZptr->second;
1432
// iterate through all z[k]-y[j]
1433
while (ZYptr != ZYend){
1434
base[ZYptr->first] += XZval * ZYptr->second;
1442
/////////////////////////////////////////////////////////////////
1445
// Returns set containing all leaf labels of the current subtree.
1446
/////////////////////////////////////////////////////////////////
1448
set<int> GetSubtree (const TreeNode *tree){
1451
if (tree->GetSequenceLabel() == -1){
1452
s = GetSubtree (tree->GetLeftChild());
1453
set<int> t = GetSubtree (tree->GetRightChild());
1455
for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1459
s.insert (tree->GetSequenceLabel());
1465
/////////////////////////////////////////////////////////////////
1466
// TreeBasedBiPartitioning
1468
// Uses the iterative refinement scheme from MUSCLE.
1469
/////////////////////////////////////////////////////////////////
1471
void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1472
const ProbabilisticModel &model, MultiSequence* &alignment,
1473
const TreeNode *tree){
1474
// check if this is a node of the alignment tree
1475
if (tree->GetSequenceLabel() == -1){
1476
TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());
1477
TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());
1479
set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1480
set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1481
set<int> leftSubtreeComplement, rightSubtreeComplement;
1483
// calculate complement of each subtree
1484
for (int i = 0; i < alignment->GetNumSequences(); i++){
1485
if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1486
if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1489
// perform realignments for edge to left child
1490
if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1491
MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1492
MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1494
alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1497
// perform realignments for edge to right child
1498
if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1499
MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1500
MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1502
alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1507
/////////////////////////////////////////////////////////////////
1508
// DoIterativeRefinement()
1510
// Performs a single round of randomized partionining iterative
1512
/////////////////////////////////////////////////////////////////
1514
void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1515
const ProbabilisticModel &model, MultiSequence* &alignment){
1516
set<int> groupOne, groupTwo;
1518
// create two separate groups
1519
for (int i = 0; i < alignment->GetNumSequences(); i++){
1521
groupOne.insert (i);
1523
groupTwo.insert (i);
1526
if (groupOne.empty() || groupTwo.empty()) return;
1528
// project into the two groups
1529
MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1530
MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1534
alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1537
/////////////////////////////////////////////////////////////////
1538
// WriteAnnotation()
1540
// Computes annotation for multiple alignment and write values
1542
/////////////////////////////////////////////////////////////////
1544
void WriteAnnotation (MultiSequence *alignment,
1545
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1546
ofstream outfile (annotationFilename.c_str());
1548
if (outfile.fail()){
1549
cerr << "ERROR: Unable to write annotation file." << endl;
1553
const int alignLength = alignment->GetSequence(0)->GetLength();
1554
const int numSeqs = alignment->GetNumSequences();
1556
SafeVector<int> position (numSeqs, 0);
1557
SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1558
for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1559
SafeVector<pair<int,int> > active;
1560
active.reserve (numSeqs);
1563
for (int i = 1; i <= alignLength; i++){
1565
// find all aligned residues in this particular column
1567
for (int j = 0; j < numSeqs; j++){
1568
if (seqs[j][i] != '-'){
1569
active.push_back (make_pair(j, ++position[j]));
1573
outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1579
/////////////////////////////////////////////////////////////////
1582
// Computes the annotation score for a particular column.
1583
/////////////////////////////////////////////////////////////////
1585
int ComputeScore (const SafeVector<pair<int, int> > &active,
1586
const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1588
if (active.size() <= 1) return 0;
1590
// ALTERNATIVE #1: Compute the average alignment score.
1593
for (int i = 0; i < (int) active.size(); i++){
1594
for (int j = i+1; j < (int) active.size(); j++){
1595
val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1599
return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));