4
* Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
22
void Tree::calcSeqWeights(int firstSeq, int lastSeq, vector<int>* sweight)
24
if((int)sweight->size() < lastSeq - 1)
26
sweight->resize(lastSeq - 1);
33
* If there are more than three sequences....
35
_nSeqs = lastSeq - firstSeq;
36
if ((_nSeqs >= 2) && (clustalw::userParameters->getDistanceTree() == true) &&
37
(clustalw::userParameters->getNoWeights() == false))
40
* Calculate sequence weights based on Phylip tree.
43
weight = new int[lastSeq + 1];
45
for (i = firstSeq; i < lastSeq; i++)
47
weight[i] = calcWeight(i);
51
* Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
55
for (i = firstSeq; i < lastSeq; i++)
62
for (i = firstSeq; i < lastSeq; i++)
69
for (i = firstSeq; i < lastSeq; i++)
71
(*sweight)[i] = (weight[i] * INT_SCALE_FACTOR) / sum;
72
if ((*sweight)[i] < 1)
85
* Otherwise, use identity weights.
87
temp = INT_SCALE_FACTOR / _nSeqs;
88
// AW 2009-05-28: goes wrong if we have more than
89
// INT_SCALE_FACTOR seqs. if so, set to 1, just as above
93
for (i = firstSeq; i < lastSeq; i++)
104
* @param treeFileName
109
int Tree::readTree(clustalw::Alignment* alignPtr, const string& treeFileName, int firstSeq, int lastSeq)
112
if(alignPtr == NULL || firstSeq < 0 || lastSeq < 1)
117
string name1 = "", name2 = "";
126
// Need to check what happens if I try to open a file that doesnt exist!
127
if(treeFileName.empty())
129
return 0; // Cannot open, empty string!
133
file.open(treeFileName.c_str(), ifstream::in);
136
clustalw::utilityObject->error("Cannot open output file [%s]\n", treeFileName.c_str());
141
charFromFile = file.get();
143
if (charFromFile != '(')
145
clustalw::utilityObject->error("Wrong format in tree file %s\n", treeFileName.c_str());
148
file.seekg(0, ios::beg); // Put get pointer back to the begining!
150
clustalw::userParameters->setDistanceTree(true);
153
// Allocate memory for tree
156
nptr = new TreeNode*[3 * (lastSeq - firstSeq + 1)];
157
ptrs = new TreeNode*[3 * (lastSeq - firstSeq + 1)];
158
lptr = new TreeNode*[lastSeq - firstSeq + 1];
159
olptr = new TreeNode*[lastSeq + 1];
162
setInfo(seqTree, NULL, 0, string(""), 0.0);
164
createTree(seqTree, NULL, &file);
167
if (numSeq != lastSeq - firstSeq)
170
ss << "tree not compatible with alignment\n(" << lastSeq - firstSeq
171
<< " sequences in alignment and "<< numSeq <<" in tree\n";
174
clustalw::utilityObject->error(errorMes.c_str());
178
// If the tree is unrooted, reroot the tree - ie. minimise the difference
179
// between the mean root->leaf distances for the left and right branches of
182
if (clustalw::userParameters->getDistanceTree() == false)
184
if (rootedTree == false)
186
clustalw::utilityObject->error("input tree is unrooted and has no distances.\nCannot align sequences");
191
if (rootedTree == false)
193
root = reRoot(seqTree, lastSeq - firstSeq + 1);
200
// calculate the 'order' of each node.
207
// If there are more than three sequences....
208
// assign the sequence nodes (in the same order as in the alignment file)
209
for (i = firstSeq; i < lastSeq; i++)
211
nameLength = alignPtr->getName(i + 1).length();
212
nameSeq = alignPtr->getName(i + 1);
215
if (nameLength > clustalw::MAXNAMES)
218
ss << "name " << nameSeq << " is too long for PHYLIP tree format (max "
219
<< clustalw::MAXNAMES << " chars)";
222
clustalw::utilityObject->warning(msg.c_str());// Mark change 17-5-07
225
for (k = 0; k < nameLength && k < clustalw::MAXNAMES; k++)
228
if ((c > 0x40) && (c < 0x5b))
240
for (j = 0; j < numSeq; j++)
243
for (k = 0; k < (int)lptr[j]->name.length() && k < clustalw::MAXNAMES; k++)
245
c = lptr[j]->name[k];
246
if ((c > 0x40) && (c < 0x5b))
254
if (name1.compare(name2) == 0)
263
utilityObject->error("tree not compatible with alignment:\n %s not found\n", name2.c_str());
278
auto_ptr<AlignmentSteps> Tree::createSets(int firstSeq, int lastSeq)
280
auto_ptr<AlignmentSteps> progAlignSteps;
281
progAlignSteps.reset(new AlignmentSteps);
286
_nSeqs = lastSeq - firstSeq;
289
// If there are more than three sequences....
290
groups = new int[_nSeqs + 1];
291
groupSeqs(root, groups, _nSeqs, progAlignSteps.get());
300
groups = new int[_nSeqs + 1];
301
for (i = 0; i < _nSeqs - 1; i++)
303
for (j = 0; j < _nSeqs; j++)
317
progAlignSteps->saveSet(_nSeqs, groups);
323
return progAlignSteps;
327
* calcSimilarities changes the distMat.
332
int Tree::calcSimilarities(clustalw::Alignment* alignPtr, clustalw::DistMatrix* distMat)
334
int depth = 0, i, j, k, n;
336
int nerrs, seq1[MAXERRS], seq2[MAXERRS];
337
TreeNode *p, **pathToRoot;
339
float *distToNode, badDist[MAXERRS];
343
int nSeqs = alignPtr->getNumSeqs();
345
pathToRoot = new TreeNode*[nSeqs];
346
distToNode = new float[nSeqs];
347
dmat = new double*[nSeqs];
349
for (i = 0; i < nSeqs; i++)
351
dmat[i] = new double[nSeqs];
357
* for each leaf, determine all nodes between the leaf and the root;
359
for (i = 0; i < nSeqs; i++)
361
depth = 0;dist = 0.0;
365
pathToRoot[depth] = p;
367
distToNode[depth] = dist;
375
for (j = 0; j < i; j++)
380
* find the common ancestor.
384
while ((found == false) && (p->parent != NULL))
386
for (k = 0; k < depth; k++)
387
if (p->parent == pathToRoot[k])
397
dmat[i][j] = dist + distToNode[n - 1];
402
for (i = 0; i < nSeqs; i++)
405
for (j = 0; j < i; j++)
407
if (dmat[i][j] < 0.01)
411
if (dmat[i][j] > 1.0)
413
if (dmat[i][j] > 1.1 && nerrs < MAXERRS)
417
badDist[nerrs] = dmat[i][j];
426
string errMess = "The following sequences are too divergent to be aligned:\n";
428
for (i = 0; i < nerrs && i < 5; i++)
430
err1 << " " << alignPtr->getName(seq1[i] + 1) << " and "
431
<< alignPtr->getName(seq2[i] + 1) << " (distance "
432
<< setprecision(3) << badDist[i] << ")\n";
434
errMess += err1.str();
435
errMess += "(All distances should be between 0.0 and 1.0)\n";
436
errMess += "This may not be fatal but you have been warned!\n";
437
errMess += "SUGGESTION: Remove one or more problem sequences and try again";
439
if (clustalw::userParameters->getInteractive())
441
reply = clustalw::utilityObject->promptForYesNo(errMess.c_str(), "Continue ");
447
if ((reply != 'y') && (reply != 'Y'))
455
for (i = 0; i < nSeqs; i++)
457
for (j = 0; j < i; j++)
459
dmat[i][j] = (*distMat)(i + 1, j + 1);
467
for (i = 0; i < nSeqs; i++)
469
distMat->SetAt(i + 1, i + 1, 0.0);
470
for (j = 0; j < i; j++)
472
value = 100.0 - (dmat[i][j]) * 100.0;
473
distMat->SetAt(i + 1, j + 1, value);
474
distMat->SetAt(j + 1, i + 1, value);
478
for (i = 0; i < nSeqs; i++)
489
/** *************************************************************************
490
* Private functions!!!!!!!!!!!!!!! *
491
****************************************************************************/
499
void Tree::createTree(clustalw::TreeNode* ptree, clustalw::TreeNode* parent, ifstream* file)
508
// is this a node or a leaf ?
510
charFromFile = file->get();
511
if (charFromFile == '(')
513
// this must be a node....
516
ptrs[ntotal] = nptr[nnodes] = ptree;
520
createNode(ptree, parent);
523
createTree(p, ptree, file);
525
if (charFromFile == ',')
528
createTree(p, ptree, file);
529
if (charFromFile == ',')
531
ptree = insertNode(ptree);
532
ptrs[ntotal] = nptr[nnodes] = ptree;
536
createTree(p, ptree, file);
542
charFromFile = file->get();
545
// ...otherwise, this is a leaf
550
ptrs[ntotal++] = lptr[numSeq++] = ptree;
551
// get the sequence name
553
name += charFromFile;
554
charFromFile = file->get();
557
while ((charFromFile != ':') && (charFromFile != ',') && (charFromFile != ')'))
561
name += charFromFile;
564
charFromFile = file->get();
567
if (charFromFile != ':')
569
clustalw::userParameters->setDistanceTree(false);
574
// get the distance information
577
if (charFromFile == ':')
582
charFromFile = file->get();
584
setInfo(ptree, parent, type, name, dist);
592
void Tree::createNode(TreeNode* pptr, TreeNode* parent)
595
pptr->parent = parent;
607
TreeNode* Tree::insertNode(TreeNode* pptr)
613
createNode(newnode, pptr->parent);
615
newnode->left = pptr;
616
pptr->parent = newnode;
618
setInfo(newnode, pptr->parent, NODE, "", 0.0);
627
void Tree::clearTree(clustalw::TreeNode* p)
644
void Tree::clearTreeNodes(clustalw::TreeNode* p)
652
clearTreeNodes(p->left);
654
if (p->right != NULL)
656
clearTreeNodes(p->right);
674
void Tree::debugPrintAllNodes(int nseqs)
676
clustalw::TreeNode *p;
680
cerr << "\nDEBUG: reportAllNodes\n";
681
for (i = 0; i < ntotal; i++) {
683
// ios::sync_with_stdio();
685
// same design as TreeNode
686
if (p->parent == NULL)
687
diff = calcRootMean(p, &maxDist);
689
diff = calcMean(p, &maxDist, nseqs);
691
"i=%d p=%p: parent=%p left=%p right=%p dist=%f diff=%f\n",
692
i, (void*)p, (void*)p->parent, (void*)p->left, (void*)p->right,
707
clustalw::TreeNode* Tree::reRoot(clustalw::TreeNode* ptree, int nseqs)
709
clustalw::TreeNode *p, *rootNode, *rootPtr;
710
float diff, minDiff = 0.0, minDepth = 1.0, maxDist;
714
// find the difference between the means of leaf->node
715
// distances on the left and on the right of each node
717
for (i = 0; i < ntotal; i++)
720
if (p->parent == NULL)
722
/* AW Bug 94: p->parent must be chosen as rootNode
723
(non-optimized executables (-O0) never do), otherwise
725
Is p->parent == NULL valid at all?
726
Simplest thing for now is to continue here. Tree code
727
needs serious dismantling anyway. See debugPrintAllNodes
731
//diff = calcRootMean(p, &maxDist);
735
diff = calcMean(p, &maxDist, nseqs);
738
if ((diff == 0) || ((diff > 0) && (diff < 2 *p->dist)))
740
if ((maxDist < minDepth) || (first == true))
752
// insert a new node as the ancestor of the node which produces the shallowest
754
/* AW Bug 94: could also be prevented here */
755
if (rootPtr == ptree)
757
minDiff = rootPtr->left->dist + rootPtr->right->dist;
758
rootPtr = rootPtr->right;
760
rootNode = insertRoot(rootPtr, minDiff);
762
diff = calcRootMean(rootNode, &maxDist);
773
clustalw::TreeNode* Tree::insertRoot(clustalw::TreeNode* p, float diff)
775
clustalw::TreeNode *newp, *prev, *q, *t;
776
float dist, prevDist, td;
780
if (p->parent==NULL) {
781
// AW bug 94: question remains if access here should be handled differently
782
cerr << "\n\n*** INTERNAL ERROR: Tree::insertRoot: TreeNode p->parent is NULL\n";
783
cerr << "To help us fix this bug, please send sequence file and used options to clustalw@ucd.ie\n";
806
t->dist = dist - p->dist;
820
t->right = t->parent;
842
q->right = q->parent;
853
* remove the old root node
861
q->parent = prev->parent;
863
if (prev->parent->left == prev)
865
prev->parent->left = q;
869
prev->parent->right = q;
879
q->parent = prev->parent;
881
if (prev->parent->left == prev)
883
prev->parent->left = q;
887
prev->parent->right = q;
902
float Tree::calcRootMean(clustalw::TreeNode* root, float *maxDist)
904
float dist, leftSum = 0.0, rightSum = 0.0, leftMean, rightMean, diff;
905
clustalw::TreeNode* p;
907
int numLeft, numRight;
910
// for each leaf, determine whether the leaf is left or right of the root.
912
dist = (*maxDist) = 0;
913
numLeft = numRight = 0;
914
for (i = 0; i < numSeq; i++)
918
while (p->parent != root)
935
if (direction == LEFT)
945
if (dist > (*maxDist))
951
leftMean = leftSum / numLeft;
952
rightMean = rightSum / numRight;
954
diff = leftMean - rightMean;
965
float Tree::calcMean(clustalw::TreeNode* nptr, float *maxDist, int nSeqs)
967
float dist, leftSum = 0.0, rightSum = 0.0, leftMean, rightMean, diff;
968
clustalw::TreeNode* p;
969
clustalw::TreeNode** pathToRoot;
971
int depth = 0, i, j, n = 0;
972
int numLeft, numRight;
976
pathToRoot = new clustalw::TreeNode*[nSeqs];
977
distToNode = new float[nSeqs];
978
// determine all nodes between the selected node and the root;
981
(*maxDist) = dist = 0.0;
982
numLeft = numRight = 0;
986
pathToRoot[depth] = p;
988
distToNode[depth] = dist;
993
// for each leaf, determine whether the leaf is left or right of the node.
994
// (RIGHT = descendant, LEFT = not descendant)
996
for (i = 0; i < numSeq; i++)
1009
// find the common ancestor.
1013
while ((found == false) && (p->parent != NULL))
1015
for (j = 0; j < depth; j++)
1016
if (p->parent == pathToRoot[j])
1031
if (direction == LEFT)
1034
leftSum += distToNode[n - 1];
1043
if (dist > (*maxDist))
1049
delete [] distToNode;
1051
delete [] pathToRoot;
1054
leftMean = leftSum / numLeft;
1055
rightMean = rightSum / numRight;
1057
diff = leftMean - rightMean;
1064
void Tree::orderNodes()
1067
clustalw::TreeNode* p;
1069
for (i = 0; i < numSeq; i++)
1085
int Tree::calcWeight(int leaf)
1087
clustalw::TreeNode* p;
1091
while (p->parent != NULL)
1093
weight += p->dist / p->order;
1099
return ((int)weight);
1103
* skipSpace is used to skip all the spaces at the begining of a file. The next read
1104
* will give a character other than a space.
1107
void Tree::skipSpace(ifstream* file)
1127
void Tree::groupSeqs(clustalw::TreeNode* p, int *nextGroups, int nSeqs, AlignmentSteps* stepsPtr)
1132
tmpGroups = new int[nSeqs + 1];
1133
for (i = 0; i < nSeqs; i++)
1138
if (p->left != NULL)
1140
if (p->left->leaf == NODE)
1142
groupSeqs(p->left, nextGroups, nSeqs, stepsPtr);
1144
for (i = 0; i < nSeqs; i++)
1145
if (nextGroups[i] != 0)
1152
markGroup1(p->left, tmpGroups, nSeqs);
1157
if (p->right != NULL)
1159
if (p->right->leaf == NODE)
1161
groupSeqs(p->right, nextGroups, nSeqs, stepsPtr);
1162
for (i = 0; i < nSeqs; i++)
1163
if (nextGroups[i] != 0)
1170
markGroup2(p->right, tmpGroups, nSeqs);
1172
stepsPtr->saveSet(nSeqs, tmpGroups);
1175
for (i = 0; i < nSeqs; i++)
1177
nextGroups[i] = tmpGroups[i];
1180
delete [] tmpGroups;
1190
void Tree::markGroup1(clustalw::TreeNode* p, int *groups, int n)
1194
for (i = 0; i < n; i++)
1213
void Tree::markGroup2(clustalw::TreeNode* p, int *groups, int n)
1217
for (i = 0; i < n; i++)
1223
else if (groups[i] != 0)
1234
clustalw::TreeNode* Tree::avail()
1236
clustalw::TreeNode* p;
1255
void Tree::setInfo(TreeNode* p, TreeNode* parent, int pleaf, string pname, float
1263
if (p->leaf == true)