2
Copyright, Ziheng Yang, April 1995.
4
cl -Ot -O2 evolver.c tools.c
5
cl -Ot -O2 -DCodonNSbranches -FeevolverNSbranches.exe evolver.c tools.c
6
cl -Ot -O2 -DCodonNSsites -FeevolverNSsites.exe evolver.c tools.c
7
cl -Ot -O2 -DCodonNSbranchsites -FeevolverNSbranchsites.exe evolver.c tools.c
9
cc -fast -o evolver evolver.c tools.c -lm
10
cc -O4 -DCodonNSbranches -o evolverNSbranches evolver.c tools.c -lm
11
cc -O4 -DCodonNSsites -o evolverNSsites evolver.c tools.c -lm
12
cc -O4 -DCodonNSbranchsites -o evolverNSbranchsites evolver.c tools.c -lm
18
evolver 9 <MasterTreeFile> <TreesFile>
22
#define CodonNSbranches
24
#define CodonNSbranchsites
30
#define NBRANCH (NS*2-2)
37
char *z[2*NS-1], spname[NS][LSPNAME+1], daafile[96], cleandata, readpattern;
38
int ns, ls, npatt, np, ntime, ncode, clock, rooted, model, icode;
39
int seqtype, *pose, ncatG, NSsites;
40
double *fpatt, kappa, omega, alpha, pi[64], *conP, daa[20*20];
41
double freqK[NCATG], rK[NCATG];
42
char *siteID; /* used if ncatG>1 */
43
double *siterates; /* rates for gamma or omega for site or branch-site models */
44
double *omegaBS, *QfactorBS; /* omega IDs for branch-site models */
47
int nbranch, nnode, root, branches[NBRANCH][2];
50
int father, nson, sons[MAXNSONS], ibranch;
51
double branch, age, omega, label, *conP;
52
char *nodeStr, fossil;
56
extern int GeneticCode[][64], noisy;
57
int LASTROUND=0; /* not used */
63
#include "treespace.c"
65
void TreeDistances(FILE* fout);
66
void Simulate(char*ctlf);
67
void MakeSeq(char*z, int ls);
68
int EigenQbase(double rates[], double pi[],
69
double Root[],double U[],double V[],double Q[]);
70
int EigenQcodon (int getstats, double kappa,double omega,double pi[],
71
double Root[], double U[], double V[], double Q[]);
72
int EigenQaa(double pi[],double Root[], double U[], double V[],double Q[]);
73
void CladeProbabilities (char treefile[]);
74
void CladeSupport (char tree1f[], char treesf[], int burnin);
75
int between_f_and_x(void);
76
void LabelClades(FILE *fout);
78
void Rell2MLtree(int argc, char *argv[]);
82
char *MCctlf0[]={"MCbase.dat","MCcodon.dat","MCaa.dat"};
83
char *seqf[3]={"mc.paml", "mc.paml", "mc.nex"};
85
enum {JC69, K80, F81, F84, HKY85, T92, TN93, REV} BaseModels;
86
char *basemodels[]={"JC69","K80","F81","F84","HKY85","T92","TN93","REV"};
87
enum {Poisson, EqualInput, Empirical, Empirical_F} AAModels;
88
char *aamodels[]={"Poisson", "EqualInput", "Empirical", "Empirical_F"};
91
double PMat[NCODE*NCODE], U[NCODE*NCODE], V[NCODE*NCODE], Root[NCODE];
92
static double Qfactor=-1, Qrates[5]; /* Qrates[] hold kappa's for nucleotides */
95
int main (int argc, char*argv[])
97
char *MCctlf=NULL, outf[96]="evolver.out", file1[96]="truetree",file2[96]="rst1";
98
int i, option=6, ntree=1,rooted, BD=0, burnin=0;
99
double bfactor=1, birth=-1,death=-1,sample=-1,mut=-1, *space;
100
FILE *fout=gfopen(outf,"w");
102
/* Rell2MLtree(argc, argv); */
104
printf("EVOLVER in %s\n", VerStr);
105
com.alpha=0; com.cleandata=1; com.model=0; com.NSsites=0;
107
if(argc==1) printf("Results for options 1-4 & 8 go into %s\n",outf);
108
else if(argc!=3 && argc!=4 && argc!=5) {
109
puts("Usage: \n\tevolver \n\tevolver option# MyDataFile"); exit(-1);
112
sscanf(argv[1],"%d",&option);
114
if(option<5 || option>7) error2("command line option not right.");
117
sscanf(argv[1],"%d",&option);
118
if(option!=9) error2("option not good?");
119
strcpy(file1, argv[2]);
120
strcpy(file2, argv[3]);
121
if(argc>4) sscanf(argv[4],"%d",&burnin);
126
printf("\n\t(1) Get random UNROOTED trees?\n");
127
printf("\t(2) Get random ROOTED trees?\n");
128
printf("\t(3) List all UNROOTED trees?\n");
129
printf("\t(4) List all ROOTED trees?\n");
130
printf("\t(5) Simulate nucleotide data sets (use %s)?\n",MCctlf0[0]);
131
printf("\t(6) Simulate codon data sets (use %s)?\n",MCctlf0[1]);
132
printf("\t(7) Simulate amino acid data sets (use %s)?\n",MCctlf0[2]);
133
printf("\t(8) Calculate identical bi-partitions between trees?\n");
134
printf("\t(9) Calculate clade support values (read 2 treefiles)?\n");
135
printf("\t(11) Label clades?\n");
136
printf("\t(0) Quit?\n");
137
#if defined (CodonNSbranches)
138
option=6; com.model=1;
139
MCctlf = (argc==3 ? argv[2] : "MCcodonNSbranches.dat");
140
#elif defined (CodonNSsites)
141
option=6; com.NSsites=3;
142
MCctlf = (argc==3 ? argv[2] : "MCcodonNSsites.dat");
143
#elif defined (CodonNSbranchsites)
144
option=6; com.model=1; com.NSsites=3;
145
MCctlf = (argc==3 ? argv[2] : "MCcodonNSbranchsites.dat");
149
scanf("%d", &option);
151
if(option==0) exit(0);
152
if(option>=5 && option<=7) break;
154
printf ("No. of species: ");
155
scanf ("%d", &com.ns);
157
if(com.ns>NS) error2 ("Too many species. Raise NS.");
158
if((space=(double*)malloc(10000*sizeof(double)))==NULL) error2("oom");
159
rooted = !(option%2);
161
printf("\nnumber of trees & random number seed? ");
162
scanf("%d%d",&ntree,&i);
163
SetSeed(i==-1?(int)time(NULL):i);
164
printf ("Want branch lengths from the birth-death process (0/1)? ");
168
if(com.ns<3) error2("no need to do this?");
169
i = (com.ns*2-1)*sizeof(struct TREEN);
170
if((nodes=(struct TREEN*)malloc(i)) == NULL)
174
case(1): /* random UNROOTED trees */
175
case(2): /* random ROOTED trees */
176
for(i=0; i<com.ns; i++) /* default spname */
177
sprintf(com.spname[i],"S%d",i+1);
179
printf ("\nbirth rate, death rate, sampling fraction, and ");
180
printf ("mutation rate (tree height)?\n");
181
scanf ("%lf%lf%lf%lf", &birth, &death, &sample, &mut);
183
for(i=0;i<ntree;i++) {
184
RandomLHistory (rooted, space);
185
if(BD) BranchLengthBD (1, birth, death, sample, mut);
186
if(com.ns<20&&ntree<10) { OutTreeN(F0,0,BD); puts("\n"); }
187
OutTreeN(fout,1,BD); FPN(fout);
190
for (i=0; i<com.ns-2-!rooted; i++)
191
Ib[i] = (int)((3.+i)*rndu());
192
MakeTreeIb (com.ns, Ib, rooted);
197
ListTrees(fout, com.ns, rooted);
199
case(8): TreeDistances(fout); break;
201
case(9): CladeSupport(file1, file2, burnin); break;
203
case(9): CladeProbabilities("/papers/BPPJC3sB/Karol.trees"); break;
205
case(10): between_f_and_x(); break;
206
case(11): LabelClades(fout); break;
211
com.seqtype=option-5; /* 0, 1, 2 for bases, codons, & amino acids */
212
Simulate(MCctlf?MCctlf:MCctlf0[option-5]);
217
int between_f_and_x (void)
219
/* this helps with the exponential transform for frequency parameters */
224
printf("\ndirection (0:x=>f; 1:f=>x; -1:end) & #classes? ");
226
if(fromf==-1) return(0);
227
scanf("%d", &n); if(n>100) error2("too many classes");
228
printf("input the first %d values for %s? ",n-1,(fromf?"f":"x"));
229
FOR(i,n-1) scanf("%lf",&x[i]);
230
x[n-1]=(fromf?1-sum(x,n-1):0);
231
f_and_x(x, x, n, fromf, 1);
237
void LabelClades(FILE *fout)
239
/* This reads in a tree and scan species names to check whether they form a
240
paraphyletic group and then label the clade.
241
It assumes that the tree is unrooted, and so goes through two rounds to check
242
whether the remaining seqs form a monophyletic clade.
245
int unrooted=1,iclade, sizeclade, mrca, paraphyl, is, imrca, i,j,k, lasts, haslength;
246
char key[96]="A", treef[64]="/A/F/flu/HA.all.prankcodon.tre", *p,chosen[NS], *endstr="end";
247
int *anc[NS-1], loc, bitmask, SI=sizeof(int)*8;
250
printf("Tree file name? ");
252
printf("Treat tree as unrooted (0 no, 1 yes)? ");
253
scanf ("%d", &unrooted);
255
ftree = gfopen (treef,"r");
256
fscanf (ftree, "%d%d", &com.ns, &j);
257
if(com.ns<=0) error2("need ns in tree file");
260
i = (com.ns*2-1)*sizeof(struct TREEN);
261
if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
262
for(i=0; i<com.ns*2-1; i++) nodes[i].nodeStr = NULL;
263
for(i=0; i<com.ns-1; i++) {
264
anc[i] = (int*)malloc((com.ns/SI+1)*sizeof(int));
265
if(anc[i]==NULL) error2("oom");
267
ReadTreeN(ftree, &haslength, &j, 1, 0);
269
if(debug) { OutTreeN(F0, 1, PrNodeNum); FPN(F0); }
271
for(iclade=0; iclade<com.ns-1; iclade++) {
272
printf("\nString for selecting sequences (followed by non-digit) (end to end)? ");
274
if(strcmp(endstr, key) == 0)
276
for(i=0; i<com.ns; i++)
281
for(i=0; i<com.ns; i++) {
282
if( (p=strstr(com.spname[i], key))
288
for(i=0; i<com.ns; i++)
289
if(strstr(com.spname[i], key)) chosen[i] = 1;
292
/* look for MRCA, going through two rounds, assuming unrooted tree */
293
for(imrca=0; imrca<1+unrooted; imrca++) {
295
for(i=0; i<com.ns; i++) chosen[i] = 1 - chosen[i];
297
for(i=0,sizeclade=0; i<com.ns; i++)
303
if(sizeclade <= 1 || sizeclade >= com.ns-1) {
304
puts("unable to form a clade. <2 seqs.");
307
for(i=0; i<com.ns-1; i++) for(j=0; j<com.ns/SI+1; j++)
309
for(is=0; is<com.ns; is++) {
310
if(chosen[is]==0) continue;
311
loc = is/SI; bitmask = 1 << (is%SI);
312
for(j=nodes[is].father; j!=-1; j=nodes[j].father) {
313
anc[j-com.ns][loc] |= bitmask;
315
for(i=0,k=0; i<com.ns; i++)
316
if(anc[j-com.ns][i/SI] & (1<<(i%SI)))
324
if(imrca==0 && mrca!=tree.root) /* 1st round is enough */
328
if(sizeclade <= 1 || sizeclade >= com.ns-1 || mrca==tree.root) {
329
printf("Unable to label. Ignored.");
334
for(is=0; is<com.ns-1; is++) {
335
printf("\nnode %4d: ", is+com.ns);
336
for(j=0; j<com.ns; j++) {
337
loc = j/SI; bitmask = 1 << (j%SI);
338
printf(" %d", (anc[is][loc] & bitmask) != 0);
342
printf("\nClade #%d (%s): %d seqs selected, MRCA is %d\n", iclade+1, key, sizeclade, mrca+1);
343
for(is=0,paraphyl=0; is<com.ns; is++) {
345
for(j=nodes[is].father; j!=-1; j=nodes[j].father)
346
if(j==mrca) { paraphyl++; break; }
349
printf("\nThis clade is paraphyletic, & includes %d other sequences\n", paraphyl);
351
nodes[mrca].label = iclade+1;
352
if(debug) OutTreeN(F0, 1, haslength|PrLabel);
355
for(i=0; i<com.ns-1; i++) free(anc[i]);
356
OutTreeN(fout, 1, haslength|PrLabel); FPN(fout);
357
printf("Printed final tree with labels in evolver.out\n");
361
void TreeDistanceDistribution (FILE* fout)
363
/* This calculates figure 3.7 of Yang (2006).
364
This reads the file of all trees (such as 7s.all.trees), and calculates the
365
distribution of partition distance in all pairwise comparisons.
367
int i,j,ntree, k,*nib, parti2B[NS], nsame, IBsame[NS], lenpart=0;
368
char treef[64]="5s.all.trees", *partition;
370
double mPD[NS], PD1[NS]; /* distribution of partition distances */
372
puts("Tree file name?");
375
ftree=gfopen (treef,"r");
376
fscanf (ftree, "%d%d", &com.ns, &ntree);
377
printf("%2d sequences %2d trees.\n", com.ns, ntree);
378
i=(com.ns*2-1)*sizeof(struct TREEN);
379
if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
381
lenpart = (com.ns-1)*com.ns*sizeof(char);
383
printf("\n%d bytes of space requested.\n", i);
384
partition = (char*)malloc(i);
385
nib = (int*)malloc(ntree*sizeof(int));
386
if (partition==NULL || nib==NULL) error2("out of memory");
388
puts("\ntree #: mean prop of tree pairs with 0 1 2 ... shared bipartitions\n");
389
fputs("\ntree #: prop of tree pairs with 0 1 2 ... shared bipartitions\n",fout);
390
for (i=0; i<ntree; i++) {
391
ReadTreeN (ftree, &j, &k, 0, 1);
392
nib[i]=tree.nbranch-com.ns;
393
BranchPartition(partition+i*lenpart, parti2B);
395
for(k=0; k<com.ns-3; k++) mPD[k]=0;
396
for (i=0; i<ntree; i++,FPN(fout)) {
397
for(k=0; k<com.ns-3; k++) PD1[k]=0;
398
for (j=0; j<ntree; j++) {
400
nsame=NSameBranch(partition+i*lenpart,partition+j*lenpart, nib[i],nib[j],IBsame);
403
for(k=0; k<com.ns-3; k++) PD1[k] /= (ntree-1.);
404
for(k=0; k<com.ns-3; k++) mPD[k] = (mPD[k]*i+PD1[k])/(i+1.);
405
printf("%8d (%5.1f%%):", i+1,(i+1.)/ntree*100);
406
for(k=0; k<com.ns-3; k++) printf(" %7.4f", mPD[k]);
407
fprintf(fout, "%8d:", i+1); for(k=0; k<com.ns-3; k++) fprintf(fout, " %7.4f", PD1[k]);
408
printf("%s", (com.ns<8||(i+1)%100==0 ? "\n" : "\r"));
410
free(partition); free(nodes); free(nib); fclose(ftree);
415
void TreeDistances (FILE* fout)
417
int i,j,ntree, k,*nib, parti2B[NS], nsame, IBsame[NS],nIBsame[NS], lenpart=0;
418
char treef[64]="5s.all.trees", *partition;
420
double psame, mp, vp;
423
TreeDistanceDistribution(fout);
426
puts("\nNumber of identical bi-partitions between trees.\nTree file name?");
429
ftree=gfopen (treef,"r");
430
fscanf (ftree, "%d%d", &com.ns, &ntree);
431
printf("%2d sequences %2d trees.\n", com.ns, ntree);
432
i=(com.ns*2-1)*sizeof(struct TREEN);
433
if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
435
if(ntree<2) error2("ntree");
436
printf ("\n%d species, %d trees\n", com.ns, ntree);
437
puts("\n\t1: first vs. rest?\n\t2: all pairwise comparisons?\n");
441
lenpart=(com.ns-1)*com.ns*sizeof(char);
442
i=(k==1?2:ntree)*lenpart;
443
printf("\n%d bytes of space requested.\n", i);
444
partition=(char*)malloc(i);
445
nib=(int*)malloc(ntree*sizeof(int));
446
if (partition==NULL || nib==NULL) error2("out of memory");
448
if(k==2) { /* pairwise comparisons */
449
fputs("Number of identical bi-partitions in pairwise comparisons\n",fout);
450
for (i=0; i<ntree; i++) {
451
ReadTreeN (ftree, &j, &k, 0, 1);
452
nib[i]=tree.nbranch-com.ns;
453
BranchPartition(partition+i*lenpart, parti2B);
455
for (i=0; i<ntree; i++,FPN(F0),FPN(fout)) {
456
printf("%2d (%2d):", i+1,nib[i]);
457
fprintf(fout,"%2d (%2d):", i+1,nib[i]);
458
for (j=0; j<i; j++) {
459
nsame=NSameBranch(partition+i*lenpart,partition+j*lenpart, nib[i],nib[j],IBsame);
460
printf(" %2d", nsame);
461
fprintf(fout," %2d", nsame);
465
else { /* first vs. others */
466
ReadTreeN (ftree, &j, &k, 0, 1);
467
nib[0]=tree.nbranch-com.ns;
468
if (nib[0]==0) error2("1st tree is a star tree..");
469
BranchPartition (partition, parti2B);
470
fputs ("Comparing the first tree with the others\nFirst tree:\n",fout);
471
OutTreeN(fout,0,0); FPN(fout); OutTreeB(fout); FPN(fout);
472
fputs ("\nInternal branches in the first tree:\n",fout);
475
fprintf(fout,"%3d (%2d..%-2d): ( ",
476
i+1,tree.branches[k][0]+1,tree.branches[k][1]+1);
477
FOR(j,com.ns) if(partition[i*com.ns+j]) fprintf(fout,"%d ",j+1);
480
if(nodes[tree.root].nson<=2)
481
fputs("\nRooted tree, results may not be correct.\n",fout);
482
fputs("\nCorrect internal branches compared with the 1st tree:\n",fout);
483
FOR(k,nib[0]) nIBsame[k]=0;
484
for (i=1,mp=vp=0; i<ntree; i++,FPN(fout)) {
485
ReadTreeN (ftree, &j, &k, 0, 1);
486
nib[1]=tree.nbranch-com.ns;
487
BranchPartition (partition+lenpart, parti2B);
488
nsame=NSameBranch (partition,partition+lenpart, nib[0],nib[1],IBsame);
490
psame=nsame/(double)nib[0];
491
FOR(k,nib[0]) nIBsame[k]+=IBsame[k];
492
fprintf(fout,"1 vs. %3d: %4d: ", i+1,nsame);
493
FOR(k,nib[0]) if(IBsame[k]) fprintf(fout," %2d", k+1);
494
printf("1 vs. %5d: %6d/%d %10.4f\n", i+1,nsame,nib[0],psame);
495
vp += square(psame - mp)*(i-1.)/i;
496
mp=(mp*(i-1.) + psame)/i;
498
vp=(ntree<=2 ? 0 : sqrt(vp/((ntree-1-1)*(ntree-1.))));
499
fprintf(fout,"\nmean and S.E. of proportion of identical partitions\n");
500
fprintf(fout,"between the 1st and all the other %d trees ", ntree-1);
501
fprintf(fout,"(ignore these if not revelant):\n %.4f +- %.4f\n", mp, vp);
502
fprintf(fout,"\nNumbers of times, out of %d, ", ntree-1);
503
fprintf(fout,"interior branches of tree 1 are present");
504
fputs("\n(This may be bootstrap support for nodes in tree 1)\n",fout);
506
i=tree.branches[parti2B[k]][0]+1; j=tree.branches[parti2B[k]][1]+1;
507
fprintf(fout,"%3d (%2d..%-2d): %6d (%5.1f%%)\n",
508
k+1,i,j,nIBsame[k],nIBsame[k]*100./(ntree-1.));
511
free(partition); free(nodes); free(nib); fclose(ftree);
517
int EigenQbase(double rates[], double pi[],
518
double Root[],double U[],double V[],double Q[])
520
/* Construct the rate matrix Q[] for nucleotide model REV.
526
for (i=0,k=0; i<3; i++) for (j=i+1; j<4; j++)
527
if (i*4+j!=11) Q[i*4+j]=Q[j*4+i]=rates[k++];
528
for (i=0,Q[3*4+2]=Q[2*4+3]=1; i<4; i++) FOR (j,4) Q[i*4+j] *= pi[j];
529
for (i=0,mr=0; i<4; i++)
530
{ Q[i*4+i]=0; Q[i*4+i]=-sum(Q+i*4, 4); mr-=pi[i]*Q[i*4+i]; }
533
eigenQREV(Q, com.pi, 4, Root, U, V, space);
538
static double freqK_NS=-1;
540
int EigenQcodon (int getstats, double kappa, double omega, double pi[],
541
double Root[], double U[], double V[], double Q[])
543
/* Construct the rate matrix Q[].
544
64 codons are used, and stop codons have 0 freqs.
546
int n=com.ncode, i,j,k, c[2],ndiff,pos=0,from[3],to[3];
547
double mr, space[64];
549
for(i=0; i<n*n; i++) Q[i] = 0;
550
for (i=0; i<n; i++) FOR (j,i) {
551
from[0]=i/16; from[1]=(i/4)%4; from[2]=i%4;
552
to[0]=j/16; to[1]=(j/4)%4; to[2]=j%4;
553
c[0]=GeneticCode[com.icode][i]; c[1]=GeneticCode[com.icode][j];
554
if (c[0]==-1 || c[1]==-1) continue;
555
for (k=0,ndiff=0; k<3; k++) if (from[k]!=to[k]) { ndiff++; pos=k; }
556
if (ndiff!=1) continue;
558
if ((from[pos]+to[pos]-1)*(from[pos]+to[pos]-5)==0) Q[i*n+j]*=kappa;
559
if(c[0]!=c[1]) Q[i*n+j]*=omega;
562
for(i=0; i<n; i++) for(j=0; j<n; j++)
563
Q[i*n+j] *= com.pi[j];
564
for(i=0,mr=0;i<n;i++) {
565
Q[i*n+i] = -sum(Q+i*n,n);
566
mr -= pi[i]*Q[i*n+i];
570
Qfactor += freqK_NS * mr;
572
if(com.ncatG==0) FOR(i,n*n) Q[i]*=1/mr;
573
else FOR(i,n*n) Q[i]*=Qfactor; /* NSsites models */
574
eigenQREV(Q, com.pi, n, Root, U, V, space);
581
int EigenQaa (double pi[], double Root[], double U[], double V[], double Q[])
583
/* Construct the rate matrix Q[]
586
double mr, space[20];
590
case (Poisson) : case (EqualInput) :
591
fillxc (Q, 1., n*n); break;
592
case (Empirical) : case (Empirical_F):
593
FOR(i,n) FOR(j,i) Q[i*n+j]=Q[j*n+i]=com.daa[i*n+j]/100;
596
FOR (i,n) FOR (j,n) Q[i*n+j]*=com.pi[j];
597
for (i=0,mr=0; i<n; i++) {
598
Q[i*n+i]=0; Q[i*n+i]=-sum(Q+i*n,n); mr-=com.pi[i]*Q[i*n+i];
601
eigenQREV(Q, com.pi, n, Root, U, V, space);
602
FOR(i,n) Root[i]=Root[i]/mr;
608
int GetDaa (FILE* fout, double daa[])
610
/* Get the amino acid substitution rate matrix (grantham, dayhoff, jones, etc).
616
fdaa=gfopen(com.daafile, "r");
617
printf("\nReading rate matrix from %s\n", com.daafile);
619
for (i=0; i<n; i++) for (j=0,daa[i*n+i]=0; j<i; j++) {
620
fscanf(fdaa, "%lf", &daa[i*n+j]);
621
daa[j*n+i]=daa[i*n+j];
623
if (com.model==Empirical) {
624
FOR(i,n) if(fscanf(fdaa,"%lf",&com.pi[i])!=1) error2("err aaRatefile");
625
if (fabs(1-sum(com.pi,20))>1e-4) error2("\nSum of aa freq. != 1\n");
630
fprintf (fout, "\n%s\n", com.daafile);
632
fprintf (fout, "\n%4s", getAAstr(aa3,i));
633
FOR (j,i) fprintf (fout, "%5.0f", daa[i*n+j]);
644
void MakeSeq(char*z, int ls)
646
/* generate a random sequence of nucleotides, codons, or amino acids by
647
sampling com.pi[], or read the ancestral sequence from the file RootSeq.txt
650
int i,j,h, n=com.ncode, ch, n31=(com.seqtype==1?3:1), lst;
651
double p[64],r, small=1e-5;
652
char *pch=(com.seqtype==2?AAs:BASEs);
653
char rootseqf[]="RootSeq.txt", codon[4]=" ";
654
FILE *fseq=(FILE*)fopen(rootseqf,"r");
658
if(times++==0) printf("Reading sequence at the root from file.\n\n");
659
if(com.siterates && com.ncatG>1)
660
error2("sequence for root doesn't work for site-class models");
663
for(i=0; i<n31; i++) {
664
while((ch=fgetc(fseq)) !=EOF && !isalpha(ch)) ;
665
if(ch==EOF) error2("EOF when reading root sequence.");
667
codon[i]=(char)(ch=CodeChara((char)ch, com.seqtype));
669
if(com.seqtype==1) ch = codon[0]*16 + codon[1]*4 + codon[2];
671
printf("error when reading site %d\n", lst+1);
672
if(com.seqtype==1 && com.pi[ch]==0)
673
printf("you seem to have a stop codon in the root sequence\n");
676
if(lst==com.ls) break;
681
for(j=0; j<n; j++) p[j] = com.pi[j];
682
for(j=1; j<n; j++) p[j] += p[j-1];
683
if(fabs(p[n-1]-1) > small)
684
{ printf("\nsum pi = %.6f != 1!\n", p[n-1]); exit(-1); }
685
for(h=0; h<com.ls; h++) {
686
for(j=0,r=rndu();j<n-1;j++)
695
void Evolve1 (int inode)
697
/* evolve sequence com.z[tree.root] along the tree to generate com.z[],
698
using nodes[].branch, nodes[].omega, & com.model
699
Needs com.z[0,1,...,nnode-1], while com.z[0] -- com.z[ns-1] constitute
701
For codon sequences, com.siterates[] has w's for NSsites and NSbranchsite models.
703
int is, h,i,j, ison, from, n=com.ncode, longseq=100000;
706
for (is=0; is<nodes[inode].nson; is++) {
707
ison=nodes[inode].sons[is];
708
memcpy(com.z[ison],com.z[inode],com.ls*sizeof(char));
709
t=nodes[ison].branch;
711
if(com.seqtype==1 && com.model && com.NSsites) { /* branch-site models */
712
Qfactor=com.QfactorBS[ison];
713
for(h=0; h<com.ls; h++)
714
com.siterates[h] = com.omegaBS[ison*com.ncatG+com.siteID[h]];
717
for(h=0; h<com.ls; h++) {
718
/* decide whether to recalcualte PMat[]. */
719
if (h==0 || (com.siterates && com.siterates[h]!=com.siterates[h-1])) {
720
rw = (com.siterates?com.siterates[h]:1);
722
switch(com.seqtype) {
725
PMatTN93(PMat, t*Qfactor*rw*Qrates[0],
726
t*Qfactor*rw*Qrates[1], t*Qfactor*rw, com.pi);
727
else if(com.model==REV)
728
PMatUVRoot(PMat, t*rw, com.ncode, U,V,Root);
731
case (CODONseq): /* Watch out for NSsites model */
732
if(com.model || com.NSsites) { /* no need to update UVRoot if M0 */
733
if(com.model && com.NSsites==0) /* branch */
734
rw = nodes[ison].omega; /* should be equal to com.rK[nodes[].label] */
736
EigenQcodon(0, com.kappa, rw, com.pi, Root,U,V, PMat);
738
PMatUVRoot(PMat, t, com.ncode, U, V, Root);
742
PMatUVRoot(PMat, t*rw, com.ncode, U, V, Root);
747
PMat[i*n+j] += PMat[i*n+j-1];
749
for(j=0,from=com.z[ison][h],rw=rndu(); j<n-1; j++)
750
if(rw < PMat[from*n+j]) break;
754
if(com.ls>longseq) printf("\r nodes %2d -> %2d, evolving . . ", inode+1, ison+1);
756
if(nodes[ison].nson) Evolve1(ison);
759
if(inode==tree.root && com.ls>longseq) printf("\r%s", strc(50,' '));
763
int PatternWeightSimple (int CollapsJC)
765
/* This is modified from PatternWeight() and collaps sites into patterns,
766
for nucleotide, amino acid, or codon sequences.
767
This relies on \0 being the end of the string so that sequences should not be
768
encoded before this routine is called.
769
com.pose[i] has labels for genes as input and maps sites to patterns in return.
770
com.fpatt, a vector of doubles, wastes space as site pattern counts are integers.
771
Sequences z[ns*ls] are copied into patterns zt[ls*lpatt], and bsearch is used
772
twice to avoid excessive copying, to count npatt first & to generate fpatt etc.
774
int maxnpatt=com.ls, h, ip,l,u, j, k, same;
775
/* int n31 = (com.seqtype==CODONseq ? 3 : 1); */
777
int lpatt = com.ns*n31+1; /* extra 0 used for easy debugging, can be voided */
778
int *p2s; /* point patterns to sites in zt */
780
double nc = (com.seqtype == 1 ? 64 : com.ncode) + !com.cleandata+1;
784
/* (A) Collect and sort patterns. Get com.npatt.
785
Move sequences com.z[ns][ls] into sites zt[ls*lpatt].
786
Use p2s to map patterns to sites in zt to avoid copying.
789
if((com.seqtype==1 && com.ns<5) || (com.seqtype!=1 && com.ns<7))
790
maxnpatt = (int)(pow(nc, (double)com.ns) + 0.5);
791
if(maxnpatt>com.ls) maxnpatt = com.ls;
792
p2s = (int*)malloc(maxnpatt*sizeof(int));
793
zt = (char*)malloc((com.ns+1)*com.ls*n31*sizeof(char));
794
if(p2s==NULL || zt==NULL) error2("oom p2s or zt");
795
memset(zt, 0, (com.ns+1)*com.ls*n31*sizeof(char));
796
for(j=0; j<com.ns; j++)
797
for(h=0; h<com.ls; h++)
799
zt[h*lpatt+j*n31+k] = com.z[j][h*n31+k];
801
com.npatt = l = u = ip = 0;
802
for(h=0; h<com.ls; h++) {
803
if(debug) printf("\nh %3d %s", h, zt+h*lpatt);
804
/* bsearch in existing patterns. Knuth 1998 Vol3 Ed2 p.410
805
ip is the loc for match or insertion. [l,u] is the search interval.
808
if(h != 0) { /* not 1st pattern? */
809
for(l=0, u=com.npatt-1; ; ) {
812
k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);
814
else if(k>0) l = ip + 1;
815
else { same = 1; break; }
819
if(com.npatt>maxnpatt)
820
error2("npatt > maxnpatt");
821
if(l > ip) ip++; /* last comparison in bsearch had k > 0. */
822
/* Insert new pattern at ip. This is the expensive step. */
825
memmove(p2s+ip+1, p2s+ip, (com.npatt-ip)*sizeof(int));
828
for(j=com.npatt; j>ip; j--)
836
printf(": %3d (%c ilu %3d%3d%3d) ", com.npatt, DS[same], ip, l, u);
837
for(j=0; j<com.npatt; j++)
838
printf(" %s", zt+p2s[j]*lpatt);
843
/* (B) count pattern frequencies */
844
com.fpatt = (double*)realloc(com.fpatt, com.npatt*sizeof(double));
845
if(com.fpatt==NULL) error2("oom fpatt");
846
for(ip=0; ip<com.npatt; ip++) com.fpatt[ip] = 0;
847
for(h=0; h<com.ls; h++) {
848
for(same=0, l=0, u=com.npatt-1; ; ) {
851
k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);
853
else if(k>0) l = ip + 1;
854
else { same = 1; break; }
857
error2("ghost pattern?");
861
for(j=0; j<com.ns; j++) {
863
com.z[j] = (char*)realloc(com.z[j], com.npatt*n31*sizeof(char));
865
for(ip=0,p=com.z[j]; ip<com.npatt; ip++)
867
*p++ = zt[p2s[ip]*lpatt+j*n31+k];
875
void Simulate (char*ctlf)
877
/* simulate nr data sets of nucleotide, codon, or AA sequences.
878
ls: number of nucleotides, codons, or AAs in each sequence.
879
All 64 codons are used for codon sequences.
880
When com.alpha or com.ncatG>1, sites are randomized after sequences are
882
space[com.ls] is used to hold site marks.
883
format (0: paml sites; 1: paml patterns; 2: paup nex)
885
char *ancf="ancestral.txt", *siteIDf="siterates.txt";
886
FILE *fin, *fseq, *ftree=NULL, *fanc=NULL, *fsiteID=NULL;
887
char *paupstart="paupstart",*paupblock="paupblock",*paupend="paupend";
889
int lline=32000, i,j,k, ir,n,nr, fixtree=1, sspace=10000, rooted=1;
890
int h=0,format=0, b[3]={0}, nrate=1, counts[NCATG];
892
char *tmpseq=NULL, *pc;
893
double birth=0, death=0, sample=1, mut=1, tlength, *space, *blengthBS;
894
double T,C,A,G,Y,R, Falias[NCATG];
898
printf("\nReading options from data file %s\n", ctlf);
899
com.ncode=n=(com.seqtype==0 ? 4 : (com.seqtype==1?64:20));
900
fin=(FILE*)gfopen(ctlf,"r");
901
fscanf(fin, "%d", &format); fgets(line, lline, fin);
902
printf("\nSimulated data will go into %s.\n", seqf[format]);
903
if(format==2) printf("%s, %s, & %s will be appended if existent.\n",
904
paupstart,paupblock,paupend);
906
fscanf (fin, "%d", &i);
907
fgets(line, lline, fin);
908
SetSeed(i<=0?(int)time(NULL)*2+1:i);
909
fscanf (fin, "%d%d%d", &com.ns, &com.ls, &nr);
910
fgets(line, lline, fin);
911
i=(com.ns*2-1)*sizeof(struct TREEN);
912
if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
914
if(com.ns>NS) error2("too many seqs?");
915
printf ("\n%d seqs, %d sites, %d replicate(s)\n", com.ns, com.ls, nr);
916
k=(com.ns*com.ls* (com.seqtype==CODONseq?4:1) *nr)/1000+1;
917
printf ("Seq file will be about %dK bytes.\n",k);
918
for(i=0; i<com.ns; i++) /* default spname */
919
sprintf(com.spname[i],"S%d",i+1);
922
fscanf(fin,"%lf",&tlength); fgets(line, lline, fin);
923
if(ReadTreeN(fin,&i,&j, 1, 1)) /* might overwrite spname */
924
error2("err tree..");
926
if(i==0) error2("use : to specify branch lengths in tree");
927
for(i=0,T=0; i<tree.nnode; i++)
928
if(i!=tree.root) T += nodes[i].branch;
930
for(i=0; i<tree.nnode; i++)
931
if(i!=tree.root) nodes[i].branch *= tlength/T;
933
printf("tree length = %.3f\n", (tlength>0?tlength:T));
935
printf("\nModel tree & branch lengths:\n");
936
/* OutTreeN(F0,1,1); FPN(F0); */
937
OutTreeN(F0,0,1); FPN(F0);
939
if(com.seqtype==CODONseq && com.model && !com.NSsites) { /* branch model */
940
FOR(i,tree.nnode) nodes[i].omega=nodes[i].label;
941
FPN(F0); OutTreeN(F0, 1, PrBranch&PrLabel); FPN(F0);
944
else { /* random trees, broken or need testing? */
945
printf ("\nbirth rate, death rate, sampling fraction, mutation rate (tree height)?\n");
946
fscanf (fin, "%lf%lf%lf%lf", &birth, &death, &sample, &mut);
947
fgets(line, lline, fin);
948
printf("%9.4f %9.4f %9.4f %9.4f\n", birth, death, sample, mut);
951
if(com.seqtype==BASEseq) {
952
fscanf(fin,"%d",&com.model);
953
fgets(line, lline, fin);
954
if(com.model<0 || com.model>REV) error2("model err");
955
if(com.model==T92) error2("T92: please use HKY85 with T=A and C=G.");
957
printf("\nModel: %s\n", basemodels[com.model]);
958
if(com.model==REV) nrate=5;
959
else if(com.model==TN93) nrate=2;
960
FOR(i,nrate) fscanf(fin,"%lf",&Qrates[i]);
961
fgets(line, lline, fin);
962
if(nrate<=2) FOR(i,nrate) printf("kappa %9.5f\n",Qrates[i]); FPN(F0);
964
printf("a & b & c & d & e: ");
965
FOR(i,nrate) printf("%9.5f",Qrates[i]); FPN(F0);
967
if((com.model==JC69 || com.model==F81)&&Qrates[0]!=1)
968
error2("kappa should be 1 for this model");
970
else if(com.seqtype==CODONseq) {
972
getcodon(CODONs[i], i);
973
if(com.model==0 && com.NSsites) { /* site model */
974
fscanf(fin,"%d",&com.ncatG); fgets(line, lline, fin);
975
if(com.ncatG>NCATG) error2("ncatG>NCATG");
976
FOR(i,com.ncatG) fscanf(fin,"%lf",&com.freqK[i]); fgets(line, lline, fin);
977
FOR(i,com.ncatG) fscanf(fin,"%lf",&com.rK[i]); fgets(line, lline, fin);
978
printf("\n\ndN/dS (w) for site classes (K=%d)", com.ncatG);
979
printf("\nf: "); FOR(i,com.ncatG) printf("%9.5f",com.freqK[i]);
980
printf("\nw: "); FOR(i,com.ncatG) printf("%9.5f",com.rK[i]); FPN(F0);
982
else if(com.model && com.NSsites) { /* branchsite model */
983
fscanf(fin,"%d",&com.ncatG); fgets(line, lline, fin);
984
if(com.ncatG>min2(NCATG,127)) error2("ncatG too large");
985
FOR(i,com.ncatG) fscanf(fin,"%lf",&com.freqK[i]); fgets(line,lline,fin);
986
printf("\n%d site classes.\nFreqs: ", com.ncatG);
987
FOR(i,com.ncatG) printf("%9.5f",com.freqK[i]);
989
if((com.omegaBS=(double*)malloc((com.ncatG+2)*tree.nnode*sizeof(double)))==NULL)
991
com.QfactorBS = com.omegaBS + com.ncatG*tree.nnode;
992
blengthBS = com.QfactorBS + tree.nnode;
994
for(i=0; i<tree.nnode; i++)
995
blengthBS[i] = nodes[i].branch;
996
for(k=0; k<com.ncatG; k++) {
997
ReadTreeN(fin, &i, &j, 0, 1);
998
if(i) error2("do not include branch lengths except in the first tree.");
999
if(!j) error2("Use # to specify omega's for branches");
1000
for(i=0; i<tree.nnode; i++) com.omegaBS[i*com.ncatG+k]=nodes[i].label;
1002
for(i=0; i<tree.nnode; i++)
1003
{ nodes[i].branch=blengthBS[i]; nodes[i].label=nodes[i].omega=0; }
1004
for(i=0; i<tree.nnode; i++) { /* print out omega as node labels. */
1005
nodes[i].nodeStr=pc=(char*)malloc(20*com.ncatG*sizeof(char));
1006
sprintf(pc, "'[%.2f", com.omegaBS[i*com.ncatG+0]);
1007
for(k=1,pc+=strlen(pc); k<com.ncatG; k++,pc+=strlen(pc))
1008
sprintf(pc, ", %.2f", com.omegaBS[i*com.ncatG+k]);
1011
FPN(F0); OutTreeN(F0,1,PrBranch|PrLabel); FPN(F0);
1013
else if(!com.model) { /* M0 */
1014
fscanf(fin,"%lf",&com.omega);
1015
fgets(line, lline, fin);
1016
printf("omega = %9.5f\n",com.omega);
1017
for(i=0; i<tree.nbranch; i++)
1018
nodes[tree.branches[i][1]].omega = com.omega;
1021
fscanf(fin,"%lf",&com.kappa); fgets(line, lline, fin);
1022
printf("kappa = %9.5f\n",com.kappa);
1025
if(com.seqtype==BASEseq || com.seqtype==AAseq) {
1026
fscanf(fin,"%lf%d", &com.alpha, &com.ncatG);
1027
fgets(line, lline, fin);
1029
printf("Gamma rates, alpha =%.4f (K=%d)\n",com.alpha,com.ncatG);
1032
puts("Rates are constant over sites.");
1035
if(com.alpha || com.ncatG) { /* this is used for codon NSsites as well. */
1037
if(com.seqtype==1 && com.model && com.NSsites) k*=tree.nnode;
1038
if((com.siterates=(double*)malloc(k*sizeof(double)))==NULL) error2("oom1");
1039
if((siteorder=(int*)malloc(com.ls*sizeof(int)))==NULL) error2("oom2");
1043
if(com.seqtype==AAseq) { /* get aa substitution model and rate matrix */
1044
fscanf(fin,"%d",&com.model);
1045
printf("\nmodel: %s",aamodels[com.model]);
1046
if(com.model>=2) { fscanf(fin,"%s",com.daafile); GetDaa(NULL,com.daa); }
1047
fgets(line, lline, fin);
1049
/* get freqs com.pi[] */
1050
if((com.seqtype==BASEseq && com.model>K80) ||
1051
com.seqtype==CODONseq ||
1052
(com.seqtype==AAseq && (com.model==1 || com.model==3)))
1053
FOR(k,com.ncode) fscanf(fin,"%lf",&com.pi[k]);
1054
else if(com.model==0 || (com.seqtype==BASEseq && com.model<=K80))
1055
fillxc(com.pi,1./com.ncode,com.ncode);
1057
printf("sum pi = 1 = %.6f:", sum(com.pi,com.ncode));
1058
matout2(F0, com.pi, 1, com.ncode, 7, 4);
1059
if(com.seqtype==BASEseq) {
1061
T=com.pi[0]; C=com.pi[1]; A=com.pi[2]; G=com.pi[3]; Y=T+C; R=A+G;
1062
if (com.model==F84) {
1063
Qrates[1]=1+Qrates[0]/R; /* kappa2 */
1064
Qrates[0]=1+Qrates[0]/Y; /* kappa1 */
1066
else if (com.model<=HKY85) Qrates[1]=Qrates[0];
1067
Qfactor = 1/(2*T*C*Qrates[0] + 2*A*G*Qrates[1] + 2*Y*R);
1070
if(com.model==REV) EigenQbase(Qrates, com.pi, Root,U,V,PMat);
1073
/* get Qfactor for NSsites & NSbranchsite models */
1074
if(com.seqtype==CODONseq && com.NSsites) {
1075
if(!com.model) { /* site models */
1076
for(k=0,Qfactor=0; k<com.ncatG; k++) {
1077
freqK_NS=com.freqK[k];
1078
EigenQcodon(1, com.kappa,com.rK[k],com.pi, NULL,NULL,NULL, PMat);
1081
printf("Qfactor for NSsites model = %9.5f\n", Qfactor);
1083
else { /* branch-site models */
1084
for(i=0; i<tree.nnode; i++) {
1085
if(i==tree.root) { com.QfactorBS[i]=-1; continue; }
1086
for(k=0,Qfactor=0; k<com.ncatG; k++) {
1087
freqK_NS=com.freqK[k];
1088
EigenQcodon(1, com.kappa,com.omegaBS[i*com.ncatG+k],com.pi, NULL,NULL,NULL, PMat);
1090
com.QfactorBS[i]=1/Qfactor; Qfactor=0;
1091
printf("node %2d: Qfactor = %9.5f\n", i+1, com.QfactorBS[i]);
1095
if(com.seqtype==CODONseq && com.ncatG<=1 && com.model==0)
1096
EigenQcodon(0, com.kappa,com.omega, com.pi, Root, U, V, PMat);
1097
else if(com.seqtype==AAseq)
1098
EigenQaa(com.pi, Root, U, V,PMat);
1100
puts("\nAll parameters are read. Ready to simulate\n");
1101
for(j=0; j<com.ns*2-1; j++)
1102
com.z[j] = (char*)malloc(com.ls*sizeof(char));
1103
sspace = max2(sspace, com.ls*(int)sizeof(double));
1104
space = (double*)malloc(sspace);
1105
if(com.alpha || com.ncatG) tmpseq=(char*)space;
1106
if (com.z[com.ns*2-1-1]==NULL) error2("oom for seqs");
1108
printf("oom for space, %d bytes needed.", sspace);
1112
fseq = gfopen(seqf[format],"w");
1113
if(format==2) appendfile(fseq,paupstart);
1115
fanc = (FILE*)gfopen(ancf,"w");
1117
fputs("\nAncestral sequences generated during simulation ",fanc);
1118
fprintf(fanc,"(check against %s)\n",seqf[format]);
1119
OutTreeN(fanc,0,0); FPN(fanc); OutTreeB(fanc); FPN(fanc);
1121
if(com.alpha || com.NSsites) {
1122
fsiteID=(FILE*)gfopen(siteIDf,"w");
1123
if(com.seqtype==1) fprintf(fsiteID, "\nSite class IDs\n");
1124
else fprintf(fsiteID, "\nRates for sites\n");
1125
if(com.seqtype==CODONseq && com.NSsites) {
1126
if(!com.model) matout(fsiteID,com.rK, 1,com.ncatG);
1127
if((com.siteID=(char*)malloc(com.ls*sizeof(char)))==NULL)
1128
error2("oom siteID");
1132
for (ir=0; ir<nr; ir++) {
1133
if (!fixtree) { /* right now tree is fixed */
1134
RandomLHistory (rooted, space);
1135
if (rooted && com.ns<10) j = GetIofLHistory ();
1136
BranchLengthBD (1, birth, death, sample, mut);
1138
printf ("\ntree used: ");
1143
MakeSeq(com.z[tree.root], com.ls);
1146
Rates4Sites(com.siterates,com.alpha,com.ncatG,com.ls, 0,space);
1147
else if(com.seqtype==1 && com.NSsites) { /* for NSsites */
1148
/* the table for the alias algorithm is the same, but ncatG is small. */
1149
MultiNomialAliasSetTable(com.ncatG, com.freqK, Falias, Lalias, space);
1150
MultiNomialAlias(com.ls, com.ncatG, Falias, Lalias, counts);
1152
for (i=0,h=0; i<com.ncatG; i++)
1153
for (j=0; j<counts[i]; j++) {
1154
com.siteID[h]=(char)i;
1155
com.siterates[h++]=com.rK[i]; /* overwritten later for branchsite */
1161
/* randomize sites for site-class model */
1162
if(com.siterates && com.ncatG>1) {
1163
if(format==1 && ir==0)
1164
puts("\nrequested site pattern counts as output for site-class model.\n");
1165
randorder(siteorder, com.ls, (int*)space);
1166
for(j=0; j<tree.nnode; j++) {
1167
memcpy(tmpseq,com.z[j],com.ls*sizeof(char));
1168
for(h=0; h<com.ls; h++) com.z[j][h]=tmpseq[siteorder[h]];
1170
if(com.alpha || com.ncatG>1) {
1171
memcpy(space,com.siterates,com.ls*sizeof(double));
1172
for(h=0; h<com.ls; h++) com.siterates[h]=space[siteorder[h]];
1175
memcpy((char*)space,com.siteID,com.ls*sizeof(char));
1176
for(h=0; h<com.ls; h++) com.siteID[h]=*((char*)space+siteorder[h]);
1180
/* print sequences*/
1182
for(i=0; i<com.ns; i++) for(h=0; h<com.ls; h++) com.z[i][h] ++;
1183
PatternWeightSimple(0);
1184
for(i=0; i<com.ns; i++) for(h=0; h<com.npatt; h++) com.z[i][h] --;
1186
if(format==2) fprintf(fseq,"\n\n[Replicate # %d]\n", ir+1);
1187
printSeqs(fseq, NULL, NULL, format); /* printsma not usable as it codes into 0,1,...,60. */
1188
if(format==2 && !fixtree) {
1189
fprintf(fseq,"\nbegin tree;\n tree true_tree = [&U] ");
1190
OutTreeN(fseq,1,1); fputs(";\n",fseq);
1191
fprintf(fseq,"end;\n\n");
1193
if(format==2) appendfile(fseq,paupblock);
1195
/* print ancestral seqs, rates for sites. */
1197
j = (com.seqtype==CODONseq?3*com.ls:com.ls);
1198
fprintf(fanc,"[replicate %d]\n",ir+1);
1202
{ OutTreeN(fanc,1,1); FPN(fanc); FPN(fanc); }
1205
fprintf(fanc,"%6d %6d\n",tree.nnode-com.ns,j);
1206
for(j=com.ns; j<tree.nnode; j++,FPN(fanc)) {
1207
fprintf(fanc,"node%-26d ", j+1);
1208
print1seq(fanc, com.z[j], com.ls, NULL);
1213
if(com.seqtype==CODONseq && com.NSsites && com.model==0) { /* site model */
1215
if(com.rK[com.ncatG-1]>1)
1216
FOR(h,com.ls) if(com.rK[com.siteID[h]]>1) k++;
1217
fprintf(fsiteID, "\n[replicate %d: %2d]\n",ir+1, k);
1218
if(k) for(h=0,k=0; h<com.ls; h++) {
1219
if(com.rK[com.siteID[h]]>1) {
1220
fprintf(fsiteID,"%4d ",h+1);
1221
if(++k%15==0) FPN(fsiteID);
1226
else if(com.seqtype==CODONseq && com.NSsites && com.model) { /* branchsite */
1227
fprintf(fsiteID, "\n[replicate %d]\n",ir+1);
1228
for(h=0; h<com.ls; h++) {
1229
fprintf(fsiteID," %4d ", com.siteID[h]+1);
1230
if(h==com.ls-1 || (h+1)%15==0) FPN(fsiteID);
1233
else { /* gamma rates */
1234
fprintf(fsiteID,"\n[replicate %d]\n",ir+1);
1235
for(h=0; h<com.ls; h++) {
1236
fprintf(fsiteID,"%7.4f ",com.siterates[h]);
1237
if(h==com.ls-1 || (h+1)%10==0) FPN(fsiteID);
1244
printf ("\rdid data set %d %s", ir+1, (com.ls>100000||nr<100? "\n" : ""));
1246
if(format==2) appendfile(fseq,paupend);
1248
fclose(fseq); if(!fixtree) fclose(fanc);
1249
if(com.alpha || com.NSsites) fclose(fsiteID);
1250
for(j=0; j<com.ns*2-1; j++) free(com.z[j]);
1252
if(com.model && com.NSsites) /* branch-site model */
1253
for(i=0; i<tree.nnode; i++) free(nodes[i].nodeStr);
1255
if(com.alpha || com.ncatG) {
1256
free(com.siterates); com.siterates=NULL;
1258
if(com.siteID) free(com.siteID); com.siteID=NULL;
1260
if(com.seqtype==1 && com.model && com.NSsites) free(com.omegaBS);
1267
int GetSpnamesFromMB (FILE *fmb, char line[], int lline)
1269
/* This reads species names from MrBayes output file fmb, like the following.
1271
Taxon 1 -> 1_Arabidopsis_thaliana
1272
Taxon 2 -> 2_Taxus_baccata
1275
char *p=NULL, *mbstr1="Taxon ", *mbstr2="->";
1277
puts("Reading species names from mb output file.\n");
1279
for(ispecies=0; ; ) {
1280
if(fgets(line, lline, fmb)==NULL) return(-1);
1281
if(strstr(line, mbstr1) && strstr(line, mbstr2)) {
1282
p=strstr(line, mbstr1)+5;
1283
sscanf(p, "%d", &ispecies);
1284
p=strstr(line, mbstr2)+3;
1285
if(com.spname[ispecies-1][0])
1286
error2("species name already read?");
1288
for(j=0; isgraph(*p)&&j<lline; ) com.spname[ispecies-1][j++] = *p++;
1289
com.spname[ispecies-1][j]=0;
1291
printf("\tTaxon %2d: %s\n", ispecies, com.spname[ispecies-1]);
1302
char *GrepLine (FILE*fin, char*query, char* line, int lline)
1304
/* This greps infile to search for query[], and returns NULL or line[].
1310
if(fgets(line, lline, fin)==NULL) return(NULL);
1311
if(strstr(line, query)) return(line);
1317
void CladeProbabilities (char treefile[])
1319
/* This reads a tree from treefile and then scans a set of MrBayes output files
1320
(mbfiles) to retrieve posterior probabilities for every clade in that tree.
1321
It first scans the first mb output file to get the species names.
1324
6 -- ...........................************* 8001 1.000 0.005 (0.000)
1325
7 -- ....................******************** 8001 1.000 0.006 (0.000)
1328
int lline=100000, i,j,k, nib, inode, parti2B[NS];
1329
char line[100000], *partition, *p;
1330
char symbol[2]=".*", cladestr[NS+1]={0};
1331
FILE *ftree, *fmb[20];
1335
char *mbfiles[]={"mb-1e-5.out", "mb-2e-5.out", "mb-3e-5.out", "mb-4e-5.out",
1336
"mb-5e-5.out", "mb-6e-5.out", "mb-7e-5.out", "mb-8e-5.out",
1337
"mb-9e-5.out", "mb-1e-4.out", "mb-2e-4.out", "mb-3e-4.out",
1338
"mb-5e-4.out", "mb-1e-3.out", "mb-1e-2.out"};
1341
char *mbfiles[]={"mb-1e-4.out", "mb-1e-1.out"};
1343
printf("tree file is %s\nmb output files:\n", treefile);
1344
ftree=gfopen(treefile,"r");
1345
for(k=0; k<nmbfiles; k++)
1346
fmb[k]=gfopen(mbfiles[k],"r");
1347
for(k=0; k<nmbfiles; k++) printf("\t%s\n", mbfiles[k]);
1349
GetSpnamesFromMB(fmb[0], line, lline); /* read species names from mb output */
1351
fscanf (ftree, "%d%d", &i, &k);
1352
if(i && i!=com.ns) error2("do you mean to specify ns in the tree file?");
1353
i=(com.ns*2-1)*sizeof(struct TREEN);
1354
if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
1355
ReadTreeN (ftree, &i, &j, 0, 1);
1357
FPN(F0); OutTreeN(F0, 0, 0); FPN(F0); FPN(F0);
1358
nib=tree.nbranch-com.ns;
1359
for(i=0;i<tree.nnode;i++) {
1360
nodes[i].nodeStr = NULL;
1361
if(i>com.ns) nodes[i].nodeStr=(char*)malloc(100*sizeof(char));
1364
partition=(char*)malloc(nib*com.ns*sizeof(char));
1365
if (partition==NULL) error2("oom");
1366
if((Pclade=(double*)malloc(nib*nmbfiles*sizeof(double)))==NULL)
1368
for(i=0;i<nib*nmbfiles; i++) Pclade[i]=0;
1370
BranchPartition(partition, parti2B);
1372
for(i=0; i<nib; i++) {
1373
inode=tree.branches[parti2B[i]][1];
1374
if(partition[i*com.ns+0])
1375
for(j=0; j<com.ns; j++) cladestr[j]=symbol[1-partition[i*com.ns+j]];
1377
for(j=0; j<com.ns; j++) cladestr[j]=symbol[partition[i*com.ns+j]];
1378
printf("#%2d branch %2d node %2d %s", i+1, parti2B[i], inode, cladestr);
1380
for(k=0; k<nmbfiles; k++) {
1381
if(GrepLine(fmb[k], cladestr, line, lline)) {
1382
p=strstr(line,cladestr);
1383
sscanf(p+com.ns, "%lf%lf\0", &t, &Pclade[i*nmbfiles+k]);
1386
for(k=0; k<nmbfiles; k++) printf("%6.2f", Pclade[i*nmbfiles+k]);
1388
for(k=0,p=nodes[inode].nodeStr; k<nmbfiles; k++) {
1389
sprintf(p, "%3.0f%s", Pclade[i*nmbfiles+k]*100,(k<nmbfiles-1?"/":""));
1393
FPN(F0); OutTreeN(F0,1,PrLabel); FPN(F0);
1395
for(i=0; i<tree.nnode; i++) free(nodes[i].nodeStr);
1396
free(nodes); free(partition); free(Pclade);
1398
for(k=0; k<nmbfiles; k++) fclose(fmb[k]);
1403
void CladeSupport(char tree1f[], char tree2f[], int burnin)
1405
/* This reads one tree from tree1f and then scans many trees in tree2f to
1406
calculate (bootstrap and Bayesian) support values.
1408
int i,j,k, ntree, nib1,nib2, intree;
1409
char *partition1, *partition2;
1410
double Pclade[NS]={0};
1413
printf("Calculate support values for clades on the master tree\n");
1414
if(!isgraph(tree1f[0])) {
1415
printf("input two tree file names\n");
1416
scanf("%s%s", tree1f, tree2f);
1418
f1=gfopen(tree1f,"r");
1419
f2=gfopen(tree2f,"r");
1421
fscanf(f1, "%d%d", &com.ns, &k);
1422
if(k>1) puts("only the first tree in the master tree file is used.");
1423
i=(com.ns*2-1)*sizeof(struct TREEN);
1424
if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
1425
for(i=0; i<com.ns*2-1; i++) nodes[i].nodeStr=NULL;
1427
ReadTreeN (f1, &i, &j, 1, 1);
1428
printf("master tree\n"); OutTreeN(F0,0,0); FPN(F0); FPN(F0);
1430
nib1=tree.nbranch-com.ns;
1431
partition1=(char*)malloc(2*(com.ns-1)*com.ns*sizeof(char));
1432
if(partition1==NULL) error2("oom");
1433
partition2=partition1+(com.ns-1)*com.ns;
1435
BranchPartition(partition1, NULL);
1437
for(ntree=-burnin; ; ntree++) {
1438
fscanf(f2, "%d%d", &i, &j);
1439
if(ReadTreeN (f2, &i, &j, 0, 1)) break;
1440
printf("\nreading tree %5d ", ntree+1);
1441
if(com.ns<15) OutTreeN(F0, 0, 0);
1442
if(ntree<0) continue;
1443
nib2=tree.nbranch-com.ns;
1444
BranchPartition(partition2, NULL);
1445
for(i=0; i<nib1; i++) {
1446
for(j=0,intree=0; j<nib2; j++) {
1447
for(k=0; k<com.ns; k++)
1448
if(partition1[i*com.ns+k]!=partition2[j*com.ns+k]) break;
1449
if(k==com.ns) { intree=1; break; }
1451
if(intree) Pclade[i]++;
1455
/* for(i=0; i<nib1; i++) Pclade[i]/=ntree; */
1457
fscanf(f1, "%d%d", &com.ns, &k);
1458
ReadTreeN (f1, &i, &j, 1, 1);
1459
for(i=0,nib1=0; i<tree.nbranch; i++)
1460
if(tree.branches[i][1] >= com.ns)
1461
nodes[tree.branches[i][1]].label = Pclade[nib1++];
1463
if(burnin) printf("\n\n%d burn in, %d trees used\n", burnin, ntree);
1464
else printf("\n\n%d trees used\n", ntree);
1465
matout2(F0, Pclade, 1, nib1, 6, 2);
1466
FPN(F0); OutTreeN(F0,0,PrLabel); FPN(F0);
1467
OutTreeN(F0,1,PrLabel); FPN(F0);
1469
free(nodes); free(partition1); fclose(f1); fclose(f2);
1474
void Rell2MLtree(int argc, char *argv[])
1476
/* for CodonTree project. This retrieves the ML tree by examining the RELL
1477
results for the 51 trees.
1479
int ngene=106, ntree=51, ig,i,j,itree,MLtree, lline=10000;
1483
if(argc!=3) error2("Usage Rell2MLtree treefile rellfile");
1484
printf("extracting ML tree from rell output\n");
1485
ft=gfopen(argv[1],"r");
1486
fr=gfopen(argv[2],"r");
1487
fo=gfopen("t2.trees","w");
1490
i=(com.ns*2-1)*sizeof(struct TREEN);
1491
if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
1493
for(ig=0; ig<ngene; ig++) {
1494
fscanf(fr, "%d", &MLtree);
1495
fgets(line, lline, fr);
1497
fscanf(ft, "%d%d", &i, &j);
1498
if(i!=com.ns) error2("tree file error.");
1499
for(itree=0; itree<ntree; itree++) {
1500
if(ReadTreeN (ft, &i, &j, 0, 1)) break;
1501
if(itree==MLtree-1) break;
1503
OutTreeN(F0, 0, 0); FPN(F0);
1504
OutTreeN(fo, 0, 0); FPN(fo);